root/gclib/tophat_cpp/tophat
# Line 24 | Line 24
24   import signal
25   from datetime import datetime, date, time
26   from shutil import copy
27 + import logging
28  
29   use_message = '''
30   TopHat maps short sequences from spliced transcripts to whole genomes.
# Line 156 | Line 157
157   output_dir = "./tophat_out/"
158   logging_dir = output_dir + "logs/"
159   run_log = None
160 + tophat_log = None  #main log file handle
161 + tophat_logger = None # main logging object
162   run_cmd = None
163   tmp_dir = output_dir + "tmp/"
164   bin_dir = sys.path[0] + "/"
# Line 173 | Line 176
176   fail_str = "\t[FAILED]\n"
177   gtf_juncs = None #file name with junctions extracted from given GFF file
178  
179 +
180 +
181 + def init_logger(log_fname):
182 +    global tophat_logger
183 +    tophat_logger = logging.getLogger('project')
184 +    formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
185 +    # level = logging.__dict__.get(options.loglevel.upper(),logging.DEBUG)
186 +    tophat_logger.setLevel(logging.DEBUG)
187 +
188 +    hstream = logging.StreamHandler(sys.stderr)
189 +    hstream.setFormatter(formatter)
190 +    tophat_logger.addHandler(hstream)
191 +    #
192 +    # Output logging information to file
193 +    if os.path.isfile(log_fname):
194 +        os.remove(log_fname)
195 +    global tophat_log
196 +    logfh = logging.FileHandler(log_fname)
197 +    logfh.setFormatter(formatter)
198 +    tophat_logger.addHandler(logfh)
199 +    tophat_log=logfh.stream
200 +
201 +
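[Editor's note] The new init_logger() above attaches the same Formatter to a StreamHandler on sys.stderr and a FileHandler, so every record is echoed to the console and persisted in the log file; tophat_log keeps the file handle so the th_logp() helper added later in this changeset can write raw lines into the same file. A minimal, self-contained sketch of that dual-handler pattern (the logger name and file name below are illustrative, not TopHat's):

    import logging, sys

    def make_dual_logger(log_fname):
        logger = logging.getLogger('demo')   # illustrative name; TopHat uses 'project'
        logger.setLevel(logging.DEBUG)
        fmt = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
        console = logging.StreamHandler(sys.stderr)   # one copy of each record to the terminal
        console.setFormatter(fmt)
        logger.addHandler(console)
        logfile = logging.FileHandler(log_fname)      # and a persistent copy on disk
        logfile.setFormatter(fmt)
        logger.addHandler(logfile)
        return logger, logfile.stream                 # raw stream handle, like tophat_log

    log, log_stream = make_dual_logger("demo.log")
    log.info("this line appears on stderr and in demo.log")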
202   # TopHatParams captures all of the runtime parameters used by TopHat, and many
203   # of these are passed as command line options to executables run by the pipeline
204  
# Line 634 | Line 660
660          self.splice_constraints.check()
661          self.read_params.check()
662          self.system_params.check()
637        self.bowtie2_params.check()
663          if self.segment_length < 10:
664              die("Error: arg to --segment-length must be at least 10")
665          if self.segment_mismatches < 0 or self.segment_mismatches > 3:
# Line 644 | Line 669
669  
670          if self.read_params.color:
671              if self.bowtie2:
672 <                die("Error: bowtie2 in colorspace is not yet supported\nuse --bowtie1 option")
672 >                th_log("Warning: bowtie2 in colorspace is not yet supported; --bowtie1 option assumed.")
673 >                self.bowtie2=False
674              if self.fusion_search:
675                  die("Error: fusion-search in colorspace is not yet supported")
676              if self.butterfly_search:
677                  die("Error: butterfly-search in colorspace is not yet supported")
678  
679 +        self.bowtie2_params.check()
680 +
681          library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
682  
683          if self.read_params.library_type and self.read_params.library_type not in library_types:
# Line 1006 | Line 1034
1034  
1035   # The TopHat logging formatter
1036   def th_log(out_str):
1037 <    print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1037 >  #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1038 >  if tophat_logger:
1039 >       tophat_logger.info(out_str)
1040 >
1041 > def th_logp(out_str=""):
1042 >  print >> sys.stderr, out_str
1043 >  if tophat_log:
1044 >        print >> tophat_log, out_str
1045 >
1046 > def die(msg=None):
1047 >  if msg is not None:
1048 >    th_logp(msg)
1049 >  sys.exit(1)
1050  
1051   # Ensures that the output, logging, and temp directories are present. If not,
1052   # they are created
1053   def prepare_output_dir():
1054  
1055 <    th_log("Preparing output location "+output_dir)
1055 >    #th_log("Preparing output location "+output_dir)
1056      if os.path.exists(output_dir):
1057          pass
1058      else:
# Line 1041 | Line 1081
1081  
1082   # Check that the Bowtie index specified by the user is present and all files
1083   # are there.
1084 < def check_bowtie_index(idx_prefix):
1084 > def check_bowtie_index(idx_prefix, is_bowtie2):
1085      th_log("Checking for Bowtie index files")
1086 +    idxext="ebwt"
1087 +    bowtie_ver=""
1088 +    if is_bowtie2:
1089 +        idxext="bt2"
1090 +        bowtie_ver="2 "
1091  
1092 <    idx_fwd_1 = idx_prefix + ".1.ebwt"
1093 <    idx_fwd_2 = idx_prefix + ".2.ebwt"
1094 <    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1095 <    idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1092 >    idx_fwd_1 = idx_prefix + ".1."+idxext
1093 >    idx_fwd_2 = idx_prefix + ".2."+idxext
1094 >    idx_rev_1 = idx_prefix + ".rev.1."+idxext
1095 >    idx_rev_2 = idx_prefix + ".rev.2."+idxext
1096  
1097      if os.path.exists(idx_fwd_1) and \
1098         os.path.exists(idx_fwd_2) and \
# Line 1055 | Line 1100
1100         os.path.exists(idx_rev_2):
1101          return
1102      else:
1103 <        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1104 <        if bowtie_idx_env_var == None:
1105 <            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1106 <        idx_prefix = bowtie_idx_env_var + idx_prefix
1107 <
1108 <        idx_fwd_1 = idx_prefix + ".1.ebwt"
1109 <        idx_fwd_2 = idx_prefix + ".2.ebwt"
1110 <        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1111 <        idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1067 <
1068 <        if os.path.exists(idx_fwd_1) and \
1069 <           os.path.exists(idx_fwd_2) and \
1070 <           os.path.exists(idx_rev_1) and \
1071 <           os.path.exists(idx_rev_2):
1103 >        bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
1104 >        bwtidx_env = os.environ.get("BOWTIE_INDEXES")
1105 >
1106 >        if bwtidx_env == None:
1107 >            die(bwtidxerr)
1108 >        if os.path.exists(bwtidx_env+idx_fwd_1) and \
1109 >           os.path.exists(bwtidx_env+idx_fwd_2) and \
1110 >           os.path.exists(bwtidx_env+idx_rev_1) and \
1111 >           os.path.exists(bwtidx_env+idx_rev_2):
1112              return
1113          else:
1114 <            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1114 >            die(bwtidxerr)
1115  
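[Editor's note] With the new is_bowtie2 flag, check_bowtie_index() looks for the four index files (.1, .2, .rev.1, .rev.2) with the extension that matches the aligner (.ebwt for Bowtie 1, .bt2 for Bowtie 2) and, if they are not found next to the prefix, retries under the BOWTIE_INDEXES environment variable. A rough sketch of that lookup, with a hypothetical helper name:

    import os

    def locate_bowtie_index(idx_prefix, is_bowtie2):
        ext = "bt2" if is_bowtie2 else "ebwt"
        names = ["%s.%s.%s" % (idx_prefix, part, ext)
                 for part in ("1", "2", "rev.1", "rev.2")]
        if all(os.path.exists(f) for f in names):
            return names
        base = os.environ.get("BOWTIE_INDEXES")
        if base:
            # as above, the variable is simply prepended (bwtidx_env + idx_fwd_1)
            alt = [base + f for f in names]
            if all(os.path.exists(f) for f in alt):
                return alt
        return None   # caller would die() with the "Could not find Bowtie index" message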
1116   # Reconstructs the multifasta file from which the Bowtie index was created, if
1117   # it's not already there.
# Line 1088 | Line 1128
1128          inspect_cmd = [prog_path("bowtie-inspect"),
1129                         idx_prefix]
1130  
1131 <        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
1131 >        th_logp("  Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
1132          ret = subprocess.call(inspect_cmd,
1133                                stdout=tmp_fasta_file,
1134                                stderr=inspect_log)
# Line 1118 | Line 1158
1158              if os.path.exists(idx_fasta):
1159                  return idx_fasta
1160  
1161 <        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
1161 >        th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
1162          idx_fa = bowtie_idx_to_fa(idx_prefix)
1163          return idx_fa
1164  
1165   # Check that both the Bowtie index and the genome's fasta file are present
1166 < def check_index(idx_prefix):
1167 <    check_bowtie_index(idx_prefix)
1166 > def check_index(idx_prefix, is_bowtie2):
1167 >    check_bowtie_index(idx_prefix, is_bowtie2)
1168      ref_fasta_file = check_fasta(idx_prefix)
1169  
1170      return (ref_fasta_file, None)
# Line 1256 | Line 1296
1296         die(errmsg)
1297  
1298   # Make sure Bowtie is installed and is recent enough to be useful
1299 < def check_bowtie(is_bowtie2):
1300 <    log_msg = "Checking for Bowtie"
1301 <    if is_bowtie2:
1302 <        log_msg += "2"
1299 > def check_bowtie(params):
1300 >    bowtie_req=""
1301 >    if params.bowtie2:
1302 >        bowtie_req="2"
1303 >    log_msg = "Checking for Bowtie" + bowtie_req
1304      th_log(log_msg)
1305  
1306 <    bowtie_bin = "bowtie"
1266 <    if is_bowtie2:
1267 <        bowtie_bin += "2"
1306 >    bowtie_bin = "bowtie"+bowtie_req
1307  
1308      global bowtie_path
1309      bowtie_path=prog_path(bowtie_bin)
1310 <    bowtie_version = get_bowtie_version(is_bowtie2)
1310 >    bowtie_version = get_bowtie_version(params.bowtie2)
1311 >    if params.bowtie2 and bowtie_version == None:
1312 >        #try to fallback on bowtie 1
1313 >        params.bowtie2=False
1314 >        bowtie_version=get_bowtie_version(params.bowtie2)
1315      if bowtie_version == None:
1316 <        die("Error: Bowtie not found on this system")
1317 <    elif is_bowtie2:
1316 >           die("Error: Bowtie not found on this system.")
1317 >    if params.bowtie2:
1318          if bowtie_version[0] < 3 and bowtie_version[1] < 1 and bowtie_version[2] < 1 and bowtie_version[3] < 5:
1319              die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1320 <    elif not is_bowtie2:
1320 >    else:
1321          if bowtie_version[0] < 1 and bowtie_version[1] < 12 and bowtie_version[2] < 3:
1322              die("Error: TopHat requires Bowtie 0.12.3 or later")
1323 <    print >> sys.stderr, "\t version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1323 >    th_logp("\t\t  Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))
1324  
1325  
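[Editor's note] check_bowtie() now receives the whole params object so it can fall back to Bowtie 1 when the bowtie2 binary is absent, and the minimum-version checks gate on the parsed version tuple. For reference, version gates of this kind can be written as a single lexicographic tuple comparison; a hedged sketch, not TopHat's exact test:

    def version_at_least(found, required):
        # tuples compare lexicographically: (2, 0, 0, 5) >= (2, 0, 0, 5) is True
        return tuple(found) >= tuple(required)

    assert version_at_least((2, 0, 0, 5), (2, 0, 0, 5))        # Bowtie 2.0.0-beta5: OK
    assert not version_at_least((0, 12, 2, 0), (0, 12, 3, 0))  # older than Bowtie 0.12.3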
1326   # Retrieve a tuple containing the system's version of samtools. Parsed from
# Line 1313 | Line 1356
1356          die("Error: Samtools not found on this system")
1357      elif  samtools_version_arr[1] < 1 or samtools_version_arr[2] < 7:
1358          die("Error: TopHat requires Samtools 0.1.7 or later")
1359 <    print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1359 >    th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))
1360 >
1361  
1362  
1363   class FastxReader:
# Line 1464 | Line 1508
1508           return self.lastline
1509    def ungetLine(self):
1510        if self.lastline is None:
1511 <         print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
1511 >         th_logp("Warning: FastxReader called ungetLine() with no prior line!")
1512        self.bufline=self.lastline
1513        self.lastline=None
1514   #< class FastxReader
# Line 1591 | Line 1635
1635              if not seqid: break
1636              toread-=1
1637              if seq_len < 20:
1638 <                  print >> sys.stderr, "Warning: found a read < 20bp in", f_name
1638 >                  th_logp("Warning: found a read < 20bp in "+f_name)
1639              else:
1640                  min_seed_len = min(seq_len, min_seed_len)
1641                  max_seed_len = max(seq_len, max_seed_len)
# Line 1605 | Line 1649
1649      #else:
1650      #    seed_len = max_seed_len
1651      #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1652 <    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
1652 >    th_logp("\tformat:\t\t %s" % fileformat)
1653      if fileformat == "fastq":
1654          quality_scale = "phred33 (default)"
1655          if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1656              quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1657          elif params.read_params.phred64_quals:
1658              quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1659 <        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
1659 >        th_logp("\tquality scale:\t %s" % quality_scale)
1660      elif fileformat == "fasta":
1661          if params.read_params.color:
1662              params.read_params.integer_quals = True
# Line 1690 | Line 1734
1734                 raise Exception()
1735             except Exception, e:
1736               die(fail_str+"Error retrieving prep_reads info.")
1737 <           print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" %  (side, self.min_len, self.out_count)
1737 >           th_logp("\t%s reads: min. length=%s, count=%s" %  (side, self.min_len, self.out_count))
1738             if self.min_len < 20:
1739 <               print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1739 >               th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places")
1740  
1741  
1742   def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
# Line 1819 | Line 1863
1863              params.bowtie_alignment_option = "-v"
1864  
1865      if t_mapping:
1866 <       print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with %s %s" % (start_time.strftime("%c"),
1867 <                         readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1866 >       th_log("Mapping %s against transcriptome %s with %s %s" % (readfile_basename,
1867 >                     bwt_idx_name, bowtie_str, extra_output))
1868      else:
1869 <       print >> sys.stderr, "[%s] Mapping %s against %s with %s %s" % (start_time.strftime("%c"),
1870 <                         readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1869 >       th_log("Mapping %s against %s with %s %s" % (readfile_basename,
1870 >                     bwt_idx_name, bowtie_str, extra_output))
1871      bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1872      #bwt_mapped=mapped_reads
1873      unmapped_reads_out=None
# Line 1858 | Line 1902
1902          bam_input=False
1903          if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1904             bam_input=True
1905 <           unzip_cmd=[ prog_path('bam2fastx') ]
1905 >           unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
1906             if reads_format:
1907                unzip_cmd.append("--" + reads_format)
1908             unzip_cmd+=[reads_list[0]]
# Line 2116 | Line 2160
2160                                    stdout=gtf_juncs_out)
2161          # cvg_islands returned an error
2162          if retcode == 1:
2163 <            print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
2163 >            th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2164              return (False, gtf_juncs_out_name)
2165          elif retcode != 0:
2166              die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
# Line 2573 | Line 2617
2617      #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2618      #if len(maps) == 0:
2619      #    return None
2576    #print >> sys.stderr, left_maps
2620      slash = left_maps[0].rfind('/')
2621      juncs_out = ""
2622      if slash != -1:
# Line 2608 | Line 2651
2651      # cvg_islands not found
2652      except OSError, o:
2653          if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2654 <           print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2654 >           th_logp(fail_str + "Error: closure_juncs not found on this system")
2655          die(str(o))
2656      return [juncs_out]
2657  
# Line 2667 | Line 2710
2710      # cvg_islands not found
2711      except OSError, o:
2712          if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2713 <           print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2713 >           th_logp(fail_str + "Error: segment_juncs not found on this system")
2714          die(str(o))
2715  
2716      return [juncs_out, insertions_out, deletions_out, fusions_out]
# Line 3006 | Line 3049
3049         if num_segs>1: max_seg_len += (read_len % segment_len)
3050  
3051      if num_segs <= 1:
3052 <         print >> sys.stderr, "Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
3052 >         th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3053  
3054      # Using the num_segs value returned by check_reads(),
3055      # decide which junction discovery strategy to use
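[Editor's note] The segment bookkeeping above splits each read into num_segs pieces of segment_len bases, and (line 3049) lets the last segment absorb the remainder when the read length is not a multiple of the segment length. A small worked example, assuming num_segs is the integer quotient of read length by segment length as in the surrounding code (values illustrative):

    read_len, segment_len = 75, 25          # illustrative values
    num_segs = read_len // segment_len      # 3 segments of 25bp
    max_seg_len = segment_len
    if num_segs > 1:
        max_seg_len += read_len % segment_len   # last segment absorbs the remainder (0bp here)

    # with read_len = 80: num_segs = 3, remainder 5, so max_seg_len becomes 30
    # with read_len = 40: num_segs = 1, which triggers the single-segment warning above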
# Line 3175 | Line 3218
3218          # print >> sys.stderr, "Warning: deletions database is empty!"
3219      if len(possible_juncs) == 0:
3220          possible_juncs.append(os.devnull)
3221 <        print >> sys.stderr, "Warning: junction database is empty!"
3221 >        th_logp("Warning: junction database is empty!")
3222      if len(possible_fusions) == 0:
3223          possible_fusions.append(os.devnull)
3224      if junc_idx_prefix:
# Line 3296 | Line 3339
3339  
3340      return maps
3341  
3299 def die(msg=None):
3300   if msg is not None:
3301      print >> sys.stderr, msg
3302      sys.exit(1)
3303
3342   # rough equivalent of the 'which' command to find external programs
3343   # (current script path is tested first, then PATH envvar)
3344   def which(program):
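[Editor's note] The comment above describes which(): a rough equivalent of the shell's which command that checks the directory of the running script first and then each entry of PATH. A sketch of that search order under those assumptions (the helper name matches, the body is illustrative):

    import os, sys

    def which(program):
        def is_executable(path):
            return os.path.isfile(path) and os.access(path, os.X_OK)
        # 1) look next to the script itself (bin_dir is sys.path[0] + "/" in this file)
        candidate = os.path.join(sys.path[0], program)
        if is_executable(candidate):
            return candidate
        # 2) fall back to each directory in the PATH environment variable
        for d in os.environ.get("PATH", "").split(os.pathsep):
            candidate = os.path.join(d, program)
            if is_executable(candidate):
                return candidate
        return None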
# Line 3368 | Line 3406
3406              if params.read_params.quals:
3407                  left_quals_list = args[2]
3408  
3371        print >> sys.stderr
3372        th_log("Beginning TopHat run (v"+get_version()+")")
3373        print >> sys.stderr, "-----------------------------------------------"
3374
3409          start_time = datetime.now()
3410          prepare_output_dir()
3411 +        init_logger(logging_dir + "tophat.log")
3412 +
3413 +        th_logp()
3414 +        th_log("Beginning TopHat run (v"+get_version()+")")
3415 +        th_logp("-----------------------------------------------")
3416  
3417          global run_log
3418          run_log = open(logging_dir + "run.log", "w", 0)
# Line 3381 | Line 3420
3420          run_cmd = " ".join(argv)
3421          print >> run_log, run_cmd
3422  
3423 +        check_bowtie(params)
3424 +        check_samtools()
3425 +
3426          # Validate all the input files, check all prereqs before committing
3427          # to the run
3428          if params.gff_annotation:
# Line 3418 | Line 3460
3460  
3461                if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
3462                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3463 <              check_bowtie_index(params.transcriptome_index)
3463 >              check_bowtie_index(params.transcriptome_index, params.bowtie2)
3464             params.gff_annotation=t_gff
3465             #end @ transcriptome_index given
3466  
3467 <        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
3467 >        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)
3468  
3427        check_bowtie(params.bowtie2)
3428        check_samtools()
3469          th_log("Generating SAM header for "+bwt_idx_prefix)
3470          sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
3471          #if not params.skip_check_reads:
# Line 3553 | Line 3593
3593  
3594          finish_time = datetime.now()
3595          duration = finish_time - start_time
3596 <        print >> sys.stderr,"-----------------------------------------------"
3596 >        th_logp("-----------------------------------------------")
3597          th_log("Run complete: %s elapsed" %  formatTD(duration))
3598  
3599      except Usage, err:
3600 <        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
3601 <        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
3600 >        th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
3601 >        th_logp("    for detailed help see http://tophat.cbcb.umd.edu/manual.html")
3602          return 2
3603  
3604  

Diff Legend

Removed lines (old line number only, no marker)
+ Added lines
< Changed lines (old version)
> Changed lines (new version)