ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
(Generate patch)
# Line 89 | Line 89
89                                                   files for color space reads]
90  
91   Advanced Options:
92 +    --report-secondary-alignments
93      --genome-read-mismatches       <int>       [ default: 2                ]
94      -N/--read-mismatches           <int>       [ default: 2                ]
95      --segment-mismatches           <int>       [ default: 2                ]
# Line 105 | Line 106
106      --no-convert-bam                           (Do not convert to bam format.
107                                                  Output is <output_dir>accepted_hit.sam.
108                                                  Implies --no-sort-bam)
109 +    --keep-fasta-order
110 +    --allow-partial-mapping
111  
112   Bowtie2 related options:
113    Preset options in --end-to-end mode (local alignment is not used in TopHat2)
# Line 168 | Line 171
171   samtools_path = None
172   bowtie_path = None
173   fail_str = "\t[FAILED]\n"
171
172 sam_header = tmp_dir + "stub_header.sam"
173
174   gtf_juncs = None #file name with junctions extracted from given GFF file
175  
176
176   # TopHatParams captures all of the runtime paramaters used by TopHat, and many
177   # of these are passed as command line options to exectubles run by the pipeline
178  
179   # This class and its nested classes also do options parsing through parse_options()
180   # and option validation via the member function check()
181  
183
182   class BowtieOutputFiles:
183      def __init__(self,
184            mappings=None,
# Line 297 | Line 295
295                       quals,
296                       integer_quals,
297                       color,
300                     color_out,
298                       library_type,
299                       seed_length,
300                       reads_format,
# Line 316 | Line 313
313              self.quals = quals
314              self.integer_quals = integer_quals
315              self.color = color
319            self.color_out = color_out
316              self.library_type = library_type
317              self.seed_length = seed_length
318              self.reads_format = reads_format
# Line 343 | Line 339
339                      self.integer_quals = True
340                  elif option in ("-C", "--color"):
341                      self.color = True
346                elif option == "--color-out":
347                    self.color_out = True
342                  elif option == "--library-type":
343                      self.library_type = value
344                  elif option in ("-s", "--seed-length"):
# Line 567 | Line 561
561                                             False,               # quals
562                                             None,                # integer quals
563                                             False,               # SOLiD - color space
570                                           False,               # SOLiD - color out instead of base pair,
564                                             "",                  # library type (e.g. "illumina-stranded-pair-end")
565                                             None,                # seed_length
566                                             "fastq",             # quality_format
# Line 609 | Line 602
602          self.find_GFF_juncs = True
603          self.max_hits = 20
604          self.t_max_hits = 60
605 +        self.max_seg_hits = 40
606          self.prefilter_multi = False
607          self.genome_read_mismatches = 2
608          self.max_read_mismatches = 2
# Line 624 | Line 618
618          self.closure_search = False
619          self.microexon_search = False
620          self.butterfly_search = None
621 +        self.report_secondary_alignments = False
622 +
623 +        self.keep_fasta_order = False
624 +        self.partial_mapping = False
625 +        
626          self.fusion_search = False
627          self.fusion_anchor_length = 20
628          self.fusion_min_dist = 10000000
# Line 685 | Line 684
684                 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
685                 "--output-dir", output_dir,
686                 "--max-multihits", str(self.max_hits),
687 +               "--max-seg-multihits", str(self.max_seg_hits),
688                 "--segment-length", str(self.segment_length),
689                 "--segment-mismatches", str(self.segment_mismatches),
690                 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
# Line 695 | Line 695
695                 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
696                 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
697                 "--max-mismatches", str(self.max_read_mismatches),
698               "--sam-header", sam_header,
698                 "--max-insertion-length", str(self.max_insertion_length),
699                 "--max-deletion-length", str(self.max_deletion_length)]
700  
# Line 735 | Line 734
734              cmd.append("--integer-quals")
735          if self.read_params.color:
736              cmd.append("--color")
738            if self.read_params.color_out:
739                cmd.append("--color-out")
737          if self.read_params.library_type:
738              cmd.extend(["--library-type", self.read_params.library_type])
739          if self.read_params.read_group_id:
# Line 761 | Line 758
758                                           "quals",
759                                           "integer-quals",
760                                           "color",
764                                         "color-out",
761                                           "library-type=",
762                                           "num-threads=",
763                                           "splice-mismatches=",
# Line 828 | Line 824
824                                           "deletions=",
825                                           "no-sort-bam",
826                                           "no-convert-bam",
827 +                                         "report-secondary-alignments",
828 +                                         "keep-fasta-order",
829 +                                         "allow-partial-mapping",
830                                           "b2-very-fast",
831                                           "b2-fast",
832                                           "b2-sensitive",
# Line 862 | Line 861
861          global output_dir
862          global logging_dir
863          global tmp_dir
865        global sam_header
864  
865          custom_tmp_dir = None
866          custom_out_dir = None
# Line 877 | Line 875
875                  self.bowtie2 = False
876              if option in ("-g", "--max-multihits"):
877                  self.max_hits = int(value)
878 +                self.max_seg_hits = max(10, self.max_hits * 2)
879              if option in ("-x", "--transcriptome-max-hits"):
880                  self.t_max_hits = int(value)
881              if option in ("-G", "--GTF"):
# Line 907 | Line 906
906                  self.fusion_multipairs = int(value)
907              if option == "--no-gtf-juncs":
908                  self.find_GFF_juncs = False
910            #if option == "--skip-check-reads":
911            #    self.skip_check_reads = True
909              if option == "--no-coverage-search":
910                  self.coverage_search = False
911              if option == "--coverage-search":
# Line 943 | Line 940
940                  self.raw_insertions = value
941              if option == "--deletions":
942                  self.raw_deletions = value
943 +            if option == "--report-secondary-alignments":
944 +                self.report_secondary_alignments = True
945 +            if option == "--keep-fasta-order":
946 +                self.keep_fasta_order = True
947 +            if option == "--allow-partial-mapping":
948 +                self.partial_mapping = True
949              if option in ("-o", "--output-dir"):
950                  custom_out_dir = value + "/"
951              if option == "--tmp-dir":
# Line 1167 | Line 1170
1170             errmsg+="Error: bowtie not found on this system"
1171         die(errmsg)
1172  
1173 < def get_index_sam_header(is_bowtie2, read_params, idx_prefix, sort = True):
1173 > def get_index_sam_header(params, idx_prefix):
1174      try:
1175 <        bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
1176 <        bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")
1177 <
1175 >        temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1176 >        temp_sam_header_file = open(temp_sam_header_filename, "w")
1177 >        
1178          bowtie_header_cmd = [bowtie_path]
1179 <        if not is_bowtie2:
1179 >
1180 >        read_params = params.read_params
1181 >        if not params.bowtie2:
1182              bowtie_header_cmd += ["--sam"]
1183  
1184          if read_params.color:
# Line 1181 | Line 1186
1186  
1187          bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1188          subprocess.call(bowtie_header_cmd,
1189 <                   stdout=bowtie_sam_header_file,
1189 >                   stdout=temp_sam_header_file,
1190                     stderr=open('/dev/null'))
1191  
1192 <        bowtie_sam_header_file.close()
1193 <
1189 <        if not sort:
1190 <            return bowtie_sam_header_filename
1192 >        temp_sam_header_file.close()
1193 >        temp_sam_header_file = open(temp_sam_header_filename, "r")
1194  
1195 <        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")
1196 <        sam_header_file = open(sam_header, "w")
1195 >        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1196 >        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1197  
1198          preamble = []
1199          sq_dict_lines = []
1200  
1201 <        for line in bowtie_sam_header_file.readlines():
1201 >        for line in temp_sam_header_file.readlines():
1202              line = line.strip()
1203              if line.find("@SQ") != -1:
1204                  # Sequence dictionary record
# Line 1213 | Line 1216
1216                  continue
1217              else:
1218                  preamble.append(line)
1219 <        #for line in preamble:
1220 <        #    print >> sam_header_file, line
1218 <        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1219 >
1220 >        print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1221          if read_params.read_group_id and read_params.sample_id:
1222              rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1223                                              read_params.sample_id)
# Line 1234 | Line 1236
1236              if read_params.seq_platform:
1237                  rg_str += "\tPL:%s" % read_params.seq_platform
1238  
1239 <            print >> sam_header_file, rg_str
1239 >            print >> bowtie_sam_header_file, rg_str
1240  
1241 <        sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1241 >        if not params.keep_fasta_order:
1242 >            sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1243 >            
1244          for [name, line] in sq_dict_lines:
1245 <            print >> sam_header_file, line
1246 <        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1243 <
1244 <        sam_header_file.close()
1245 >            print >> bowtie_sam_header_file, line
1246 >        print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1247  
1248 <        copy(sam_header, bowtie_sam_header_filename)
1249 <        return sam_header
1248 >        bowtie_sam_header_file.close()
1249 >        temp_sam_header_file.close()
1250 >        return bowtie_sam_header_filename
1251  
1252      except OSError, o:
1253         errmsg=fail_str+str(o)+"\n"
# Line 1621 | Line 1624
1624                                     params.read_params.quals,
1625                                     params.read_params.integer_quals,
1626                                     params.read_params.color,
1624                                   params.read_params.color_out,
1627                                     params.read_params.library_type,
1628                                     #seed_len,
1629                                     params.read_params.seed_length,
# Line 1812 | Line 1814
1814          bowtie_str += "2"
1815  
1816      if seg_mapping:
1815        # we multiply max_hits by 2 when mapping segments
1816        # this multiplication factor (2) is also assumed in segment_juncs and long_spanning_reads,
1817        # which means "2" should not be changed without modifying those sub-programs.
1818        params.max_hits *= 2
1817          if not params.bowtie2:
1818              backup_bowtie_alignment_option = params.bowtie_alignment_option
1819              params.bowtie_alignment_option = "-v"
# Line 1863 | Line 1861
1861             unzip_cmd=[ prog_path('bam2fastx') ]
1862             if reads_format:
1863                unzip_cmd.append("--" + reads_format)
1864 <           unzip_cmd+=[reads_list]
1864 >           unzip_cmd+=[reads_list[0]]
1865  
1866          if use_zpacker and (unzip_cmd is None):
1867            unzip_cmd=[ params.system_params.zipper ]
# Line 1924 | Line 1922
1922  
1923          fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1924  
1927        max_hits = params.max_hits
1925          if t_mapping:
1926 <          max_hits = params.t_max_hits
1926 >            max_hits = params.t_max_hits
1927 >        elif seg_mapping:
1928 >            max_hits = params.max_seg_hits
1929 >        else:
1930 >            max_hits = params.max_hits
1931 >            
1932          if num_mismatches > 3:
1933             num_mismatches = 3
1934  
# Line 2004 | Line 2006
2006          shellcmd=""
2007          unzip_proc=None
2008          samzip_proc=None
2009 +
2010          if multihits_out:
2011             #special prefilter bowtie run: we use prep_reads on the fly
2012             #in order to get multi-mapped reads to exclude later
# Line 2027 | Line 2030
2030                 if bam_input:
2031                     unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
2032                     shellcmd=' '.join(unzip_cmd) + "|"
2033 <               bowtie_cmd += [reads_file]
2033 >               else:
2034 >                   bowtie_cmd += [reads_file]
2035                 bowtie_proc = subprocess.Popen(bowtie_cmd,
2036                                       stdout=subprocess.PIPE,
2037                                       stderr=bwt_log)
# Line 2088 | Line 2092
2092          open(multihits_out, "w").close()
2093  
2094      if seg_mapping:
2091        params.max_hits /= 2
2095          if not params.bowtie2:
2096              params.bowtie_alignment_option = backup_bowtie_alignment_option
2097  
# Line 2176 | Line 2179
2179      deletions_file_list = ",".join(external_deletions)
2180      fusions_file_list = ",".join(external_fusions)
2181  
2182 +    # do not use insertions and deletions in case of Bowtie2
2183 +    if is_bowtie2:
2184 +        insertions_file_list = "/dev/null"
2185 +        deletions_file_list = "/dev/null"
2186 +
2187      juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2188  
2189      external_splices_out_prefix  = tmp_dir + juncs_prefix
# Line 2297 | Line 2305
2305      alignments_output_filename = tmp_dir + "accepted_hits"
2306  
2307      report_cmd.extend(params.cmd())
2308 +    report_cmd += ["--sam-header", sam_header_filename]
2309 +    if params.report_secondary_alignments:
2310 +        report_cmd += ["--report-secondary-alignments"]
2311 +    
2312      report_cmd.extend(["--samtools="+samtools_path])
2313      report_cmd.extend([ref_fasta,
2314                         junctions,
# Line 2551 | Line 2563
2563   # Find possible splice junctions using the "closure search" strategy, and report
2564   # them in closures.juncs.  Calls the executable closure_juncs
2565   def junctions_from_closures(params,
2566 +                            sam_header_filename,
2567                              left_maps,
2568                              right_maps,
2569                              ref_fasta):
# Line 2578 | Line 2591
2591      right_maps = ','.join(right_maps)
2592  
2593      juncs_cmd.extend(params.cmd())
2594 <    juncs_cmd.extend([juncs_out,
2594 >    juncs_cmd.extend(["--sam-header", sam_header_filename,
2595 >                      juncs_out,
2596                        fusions_out,
2597                        ref_fasta,
2598                        left_maps,
# Line 2603 | Line 2617
2617   # segment.insertions, and segment.deletions.  Calls the executable
2618   # segment_juncs
2619   def junctions_from_segments(params,
2620 +                            sam_header_filename,
2621                              left_reads,
2622                              left_reads_map,
2623                              left_seg_maps,
# Line 2626 | Line 2641
2641      segj_cmd = [prog_path("segment_juncs")]
2642  
2643      segj_cmd.extend(params.cmd())
2644 <    segj_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2645 <                      ref_fasta,
2646 <                      juncs_out,
2647 <                      insertions_out,
2648 <                      deletions_out,
2649 <                      fusions_out,
2650 <                      left_reads,
2651 <                      left_reads_map,
2652 <                      left_maps])
2644 >    segj_cmd.extend(["--sam-header", sam_header_filename,
2645 >                     "--ium-reads", ",".join(unmapped_reads),
2646 >                     ref_fasta,
2647 >                     juncs_out,
2648 >                     insertions_out,
2649 >                     deletions_out,
2650 >                     fusions_out,
2651 >                     left_reads,
2652 >                     left_reads_map,
2653 >                     left_maps])
2654      if right_seg_maps:
2655          right_maps = ','.join(right_seg_maps)
2656          segj_cmd.extend([right_reads, right_reads_map, right_maps])
# Line 2687 | Line 2703
2703      align_cmd = [prog_path("long_spanning_reads")]
2704  
2705      align_cmd.extend(params.cmd())
2706 +    align_cmd += ["--sam-header", sam_header_filename]
2707 +
2708 +    b2_params = params.bowtie2_params
2709 +    max_penalty, min_penalty = b2_params.mp.split(',')
2710 +    align_cmd += ["--bowtie2-max-penalty", max_penalty,
2711 +                  "--bowtie2-min-penalty", min_penalty]
2712 +
2713 +    align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2714 +
2715 +    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2716 +    align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2717 +                  "--bowtie2-read-gap-cont", read_gap_cont]
2718 +
2719 +    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2720 +    align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2721 +                  "--bowtie2-ref-gap-cont", ref_gap_cont]
2722  
2723      align_cmd.append(ref_fasta)
2724      align_cmd.extend([reads,
# Line 2723 | Line 2755
2755              self.segs = segs
2756  
2757   # Map2GTF stuff
2758 < def m2g_convert_coords(params, gtf_fname, reads, out_fname):
2758 > def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2759      """ajjkljlks
2760  
2761      Arguments:
# Line 2734 | Line 2766
2766      """
2767      m2g_cmd = [prog_path("map2gtf")]
2768      m2g_cmd.extend(params.cmd())
2769 +    m2g_cmd += ["--sam-header", sam_header_filename]
2770      m2g_cmd.append(gtf_fname)
2771      m2g_cmd.append(reads) #could be BAM file
2772      m2g_cmd.append(out_fname)
# Line 2783 | Line 2816
2816          err_msg = fail_str + str(o)
2817          die(err_msg + "\n")
2818  
2819 < def map2gtf(params, ref_fasta, left_reads, right_reads):
2819 > def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
2820      """ Main GTF mapping function
2821  
2822      Arguments:
# Line 2810 | Line 2843
2843           t_out_dir=params.transcriptome_outdir+"/"
2844         m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
2845         gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2813       # trans_hash = transcripts_to_fasta(transcripts,
2814       #                                   ref_fasta,
2815       #                                   m2g_ref_fasta)
2846         m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
2847 +
2848 +    get_index_sam_header(params, m2g_bwt_idx)
2849 +    
2850      mapped_gtf_list = []
2851      unmapped_gtf_list = []
2852      # do the initial mapping in GTF coordinates
# Line 2845 | Line 2878
2878          fbasename = getFileBaseName(reads)
2879          sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2880          m2g_convert_coords(params,
2881 +                           genome_sam_header_filename,
2882                             params.gff_annotation,
2883                             reads,
2884                             sam_out_fname)
# Line 2945 | Line 2979
2979  
2980      if params.gff_annotation:
2981          (mapped_gtf_list, unmapped_gtf_list) = \
2982 <            map2gtf(params, ref_fasta, left_reads, right_reads)
2982 >            map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
2983  
2984          m2g_left_maps, m2g_right_maps = mapped_gtf_list
2985          m2g_maps = [m2g_left_maps, m2g_right_maps]
# Line 3097 | Line 3131
3131          #      or perhaps the GTF file directly
3132          #      -> this could improve alternative junction detection?
3133          juncs = junctions_from_segments(params,
3134 +                                        sam_header_filename,
3135                                          left_reads,
3136                                          left_reads_map,
3137                                          left_seg_maps,
# Line 3120 | Line 3155
3155          # discover addtional junctions
3156          if params.closure_search and left_reads and right_reads:
3157              juncs = junctions_from_closures(params,
3158 +                                            sam_header_filename,
3159                                              [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
3160                                              [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
3161                                              ref_fasta)
# Line 3154 | Line 3190
3190                                            ref_fasta,
3191                                            params.read_params.color)
3192  
3193 <        juncs_bwt_samheader = get_index_sam_header(params.bowtie2, params.read_params, juncs_bwt_idx, False)
3193 >        juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3194  
3195      # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3196      # index with Bowtie
# Line 3219 | Line 3255
3255              # the correct spliced alignment.
3256  
3257              try:
3258 <                merge_cmd = [ prog_path("bam_merge"), "--index-outfile", merged_map + ".index",
3259 <                              merged_map ]
3258 >                merge_cmd = [prog_path("bam_merge"),
3259 >                             "--index-outfile", merged_map + ".index",
3260 >                             "--sam-header", sam_header_filename,
3261 >                             merged_map]
3262  
3263                  if num_segs > 1:
3264                      merge_cmd += [ maps[ri].unspliced_sam ]
# Line 3389 | Line 3427
3427          check_bowtie(params.bowtie2)
3428          check_samtools()
3429          th_log("Generating SAM header for "+bwt_idx_prefix)
3430 <        sam_header_filename = get_index_sam_header(params.bowtie2, params.read_params, bwt_idx_prefix)
3430 >        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
3431          #if not params.skip_check_reads:
3432          reads_list = left_reads_list
3433          if right_reads_list:
# Line 3404 | Line 3442
3442          if params.gff_annotation and params.find_GFF_juncs:
3443              test_input_file(params.gff_annotation)
3444              (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3445 <            ##-- we don't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
3445 >            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
3446              ## but we still need them loaded in gtf_juncs for later splice verification
3447 <            #if found_juncs and not params.gff_annotation:
3448 <            #    user_supplied_juncs.append(gtf_juncs)
3447 >            if found_juncs:
3448 >                ## and not params.gff_annotation:
3449 >                user_supplied_juncs.append(gtf_juncs)
3450              #else:
3451              #    gtf_juncs = None
3452          if params.raw_junctions:

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines