ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 4 months ago) by gpertea
File size: 147456 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 #!/usr/bin/env python
2    
3     # encoding: utf-8
4     """
5     tophat.py
6    
7     Created by Cole Trapnell on 2008-12-25.
8     Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 gpertea 154 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 gpertea 29 """
11     import sys
12     try:
13     import psyco
14     psyco.full()
15     except ImportError:
16     pass
17    
18     import getopt
19     import subprocess
20     import errno
21     import os
22     import warnings
23     import re
24     import signal
25     from datetime import datetime, date, time
26 gpertea 154 from shutil import copy
27 gpertea 29
# Usage/help text printed for -h/--help and wrapped in Usage() on option errors.
28     use_message = '''
29     TopHat maps short sequences from spliced transcripts to whole genomes.
30    
31     Usage:
32     tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
33     [quals1,[quals2,...]] [quals1[,quals2,...]]
34    
35     Options:
36     -v/--version
37     -o/--output-dir <string> [ default: ./tophat_out ]
38 gpertea 154 --bowtie1 [ default: bowtie2 ]
39 gpertea 29 -a/--min-anchor <int> [ default: 8 ]
40     -m/--splice-mismatches <0-2> [ default: 0 ]
41     -i/--min-intron-length <int> [ default: 50 ]
42     -I/--max-intron-length <int> [ default: 500000 ]
43     -g/--max-multihits <int> [ default: 20 ]
44 gpertea 135 -x/--transcriptome-max-hits <int> [ default: 60 ]
45     -n/--transcriptome-mismatches <int> [ default: 1 ]
46     -M/--prefilter-multihits ( for -G/--GTF option, enable
47     an initial bowtie search
48     against the genome )
49 gpertea 29 --max-insertion-length <int> [ default: 3 ]
50     --max-deletion-length <int> [ default: 3 ]
51     --solexa-quals
52     --solexa1.3-quals (same as phred64-quals)
53     --phred64-quals (same as solexa1.3-quals)
54     -Q/--quals
55     --integer-quals
56     -C/--color (Solid - color space)
57     --color-out
58     --library-type <string> (fr-unstranded, fr-firststrand,
59     fr-secondstrand)
60 gpertea 154 -p/--num-threads <int> [ default: 1 ]
61     -G/--GTF <filename> (GTF/GFF with known transcripts)
62     --transcriptome-index <bwtidx> (transcriptome bowtie index)
63     -T/--transcriptome-only (map only to the transcriptome)
64 gpertea 29 -j/--raw-juncs <filename>
65     --insertions <filename>
66     --deletions <filename>
67     -r/--mate-inner-dist <int>
68 gpertea 154 --mate-std-dev <int> [ default: 20 ]
69 gpertea 29 --no-novel-juncs
70     --no-novel-indels
71 gpertea 154 --fusion-search
72     --fusion-anchor-length <int> [ default: 20 ]
73     --fusion-min-dist <int> [ default: 10000000 ]
74     --fusion-read-mismatches <int> [ default: 2 ]
75     --fusion-multireads <int> [ default: 2 ]
76     --fusion-multipairs <int> [ default: 2 ]
77 gpertea 29 --no-gtf-juncs
78     --no-coverage-search
79     --coverage-search
80     --no-closure-search
81     --closure-search
82     --microexon-search
83     --butterfly-search
84     --no-butterfly-search
85     --keep-tmp
86     --tmp-dir <dirname> [ default: <output_dir>/tmp ]
87     -z/--zpacker <program> [ default: gzip ]
88     -X/--unmapped-fifo [use mkfifo to compress more temporary
89     files for color space reads]
90    
91     Advanced Options:
92 gpertea 154 --genome-read-mismatches <int> [ default: 2 ]
93     -N/--read-mismatches <int> [ default: 2 ]
94 gpertea 29 --segment-mismatches <int> [ default: 2 ]
95     --segment-length <int> [ default: 25 ]
96     --bowtie-n [ default: bowtie -v ]
97     --min-closure-exon <int> [ default: 100 ]
98     --min-closure-intron <int> [ default: 50 ]
99     --max-closure-intron <int> [ default: 5000 ]
100     --min-coverage-intron <int> [ default: 50 ]
101     --max-coverage-intron <int> [ default: 20000 ]
102     --min-segment-intron <int> [ default: 50 ]
103     --max-segment-intron <int> [ default: 500000 ]
104 gpertea 154 --no-sort-bam (Output BAM is not coordinate-sorted)
105     --no-convert-bam (Do not convert to bam format.
106 gpertea 29 Output is <output_dir>accepted_hit.sam.
107 gpertea 154 Implies --no-sort-bam)
108 gpertea 29
109 gpertea 154 Bowtie2 related options:
110     Preset options in --end-to-end mode (local alignment is not used in TopHat2)
111     --b2-very-fast
112     --b2-fast
113     --b2-sensitive
114     --b2-very-sensitive
115    
116     Alignment options
117     --b2-N <int> [ default: 0 ]
118     --b2-L <int> [ default: 20 ]
119     --b2-i <func> [ default: S,1,1.25 ]
120     --b2-n-ceil <func> [ default: L,0,0.15 ]
121     --b2-gbar <int> [ default: 4 ]
122    
123     Scoring options
124     --b2-mp <int>,<int> [ default: 6,2 ]
125     --b2-np <int> [ default: 1 ]
126     --b2-rdg <int>,<int> [ default: 5,3 ]
127     --b2-rfg <int>,<int> [ default: 5,3 ]
128     --b2-score-min <func> [ default: L,-0.6,-0.6 ]
129    
130     Effort options
131     --b2-D <int> [ default: 15 ]
132     --b2-R <int> [ default: 2 ]
133    
134    
135 gpertea 29 SAM Header Options (for embedding sequencing run metadata in output):
136     --rg-id <string> (read group ID)
137     --rg-sample <string> (sample ID)
138     --rg-library <string> (library ID)
139     --rg-description <string> (descriptive string, no tabs allowed)
140     --rg-platform-unit <string> (e.g Illumina lane ID)
141     --rg-center <string> (sequencing center name)
142     --rg-date <string> (ISO 8601 date of the sequencing run)
143     --rg-platform <string> (Sequencing platform descriptor)
144     '''
145    
146 gpertea 154 # -F/--min-isoform-fraction <float> [ default: 0.15 ]
147     # (obsolete)
148    
class Usage(Exception):
    """Exception carrying a usage/help text for the top-level handler.

    The text to print is kept on the ``msg`` attribute.
    """

    def __init__(self, msg):
        # Store the message so the caller can print it and exit cleanly.
        self.msg = msg
152    
# --- Module-level state shared by the pipeline stages below ---
153     output_dir = "./tophat_out/" # default output directory (-o/--output-dir overrides)
154     logging_dir = output_dir + "logs/" # per-run log files
155     run_log = None # run log file object; set at runtime
156     run_cmd = None # command line recorded for the run; set at runtime
157     tmp_dir = output_dir + "tmp/" # scratch area (--tmp-dir overrides; see usage text)
158     bin_dir = sys.path[0] + "/" # directory containing this script
159     use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
160    
161     use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format
162    
163     use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
164     # enabled by -X/-unmapped-fifo option (unless -z0)
165 gpertea 135 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
166 gpertea 29 # unmapped reads into a compressed file
167    
168     samtools_path = None # full path to samtools; resolved at runtime
169     bowtie_path = None # full path to bowtie; resolved at runtime
170     fail_str = "\t[FAILED]\n" # suffix appended to failed status lines
171    
172     sam_header = tmp_dir + "stub_header.sam" # stub SAM header file path
173    
174 gpertea 135 gtf_juncs = None #file name with junctions extracted from given GFF file
175    
176    
177 gpertea 29 # TopHatParams captures all of the runtime paramaters used by TopHat, and many
178     # of these are passed as command line options to exectubles run by the pipeline
179    
180     # This class and its nested classes also do options parsing through parse_options()
181     # and option validation via the member function check()
182    
183 gpertea 135
class BowtieOutputFiles:
    """Record of the output file names produced by one Bowtie run."""

    def __init__(self, mappings=None, unmapped_reads=None, multihit_reads=None):
        # File holding the alignments reported by Bowtie.
        self.mappings = mappings
        # File collecting the reads Bowtie could not map.
        self.unmapped_reads = unmapped_reads
        # File collecting reads that exceeded the multihit limit.
        self.multihit_reads = multihit_reads
192    
193 gpertea 29 class TopHatParams:
194    
195     # SpliceConstraints is a group of runtime parameters that specify what
196     # constraints to put on junctions discovered by the program. These constraints
197     # are used to filter out spurious/false positive junctions.
198    
class SpliceConstraints:
    """Constraints used to filter out spurious/false-positive splice junctions.

    Bounds the junction anchor length, intron size range, anchor-region
    mismatches, and the minimum isoform fraction for reporting.
    """

    def __init__(self,
                 min_anchor_length,      # minimum anchor on each side of a junction
                 min_intron_length,      # shortest reportable intron
                 max_intron_length,      # longest reportable intron
                 splice_mismatches,      # mismatches tolerated in the anchor (0-2)
                 min_isoform_fraction):  # minimum isoform fraction (0.0-1.0)
        self.min_anchor_length = min_anchor_length
        self.min_intron_length = min_intron_length
        self.max_intron_length = max_intron_length
        self.splice_mismatches = splice_mismatches
        self.min_isoform_fraction = min_isoform_fraction

    def parse_options(self, opts):
        """Update the constraints from parsed getopt (option, value) pairs."""
        for option, value in opts:
            if option in ("-m", "--splice-mismatches"):
                self.splice_mismatches = int(value)
            elif option in ("-a", "--min-anchor"):
                self.min_anchor_length = int(value)
            elif option in ("-F", "--min-isoform-fraction"):
                self.min_isoform_fraction = float(value)
            elif option in ("-i", "--min-intron-length"):
                self.min_intron_length = int(value)
            elif option in ("-I", "--max-intron-length"):
                self.max_intron_length = int(value)

    def check(self):
        """Validate the constraint values; die() with a message on bad input."""
        if self.splice_mismatches not in [0, 1, 2]:
            die("Error: arg to --splice-mismatches must be 0, 1, or 2")
        # BUG FIX: the condition accepts 4, so the message must say "at
        # least 4"; it previously claimed "greater than 4".
        if self.min_anchor_length < 4:
            die("Error: arg to --min-anchor-len must be at least 4")
        if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
            die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
        if self.min_intron_length <= 0:
            die("Error: arg to --min-intron-length must be greater than 0")
        if self.max_intron_length <= 0:
            die("Error: arg to --max-intron-length must be greater than 0")
236    
237     # SystemParams is a group of runtime parameters that determine how to handle
238     # temporary files produced during a run and how many threads to use for threaded
239     # stages of the pipeline (e.g. Bowtie)
240    
class SystemParams:
    """Runtime settings: worker thread count, temp-file retention, and the
    compression program used for intermediate files."""

    def __init__(self, num_threads, keep_tmp):
        self.num_threads = num_threads  # threads for bowtie and other stages
        self.keep_tmp = keep_tmp        # keep intermediate files when True
        self.zipper = "gzip"            # compression program (-z/--zpacker)
        self.zipper_opts = []           # extra arguments for the zipper

    def parse_options(self, opts):
        """Consume system-related options and update the global packer flags."""
        global use_zpacker
        global use_BWT_FIFO
        for opt, val in opts:
            if opt in ("-p", "--num-threads"):
                self.num_threads = int(val)
            elif opt == "--keep-tmp":
                self.keep_tmp = True
            elif opt in ("-z", "--zpacker"):
                # A handful of sentinel values mean "no compression".
                if val.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                    val = ""
                self.zipper = val
            elif opt in ("-X", "--unmapped-fifo"):
                use_BWT_FIFO = True
        if self.zipper:
            use_zpacker = True
            # Parallel zippers get a thread-count argument, added only once.
            if self.num_threads > 1 and not self.zipper_opts:
                if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                    self.zipper_opts.append('-p' + str(self.num_threads))
        else:
            use_zpacker = False
            # The FIFO trick requires a compressor, so it is disabled too.
            if use_BWT_FIFO:
                use_BWT_FIFO = False

    def cmd(self):
        """Return these settings as command-line arguments for sub-invocations."""
        args = []
        if self.zipper:
            args.extend(['-z', self.zipper])
        if self.num_threads > 1:
            args.extend(['-p' + str(self.num_threads)])
        return args

    def check(self):
        """Validate thread count and locate the compression program."""
        if self.num_threads < 1:
            die("Error: arg to --num-threads must be greater than 0")
        if self.zipper:
            xzip = which(self.zipper)
            if not xzip:
                die("Error: cannot find compression program " + self.zipper)
289    
290     # ReadParams is a group of runtime parameters that specify various properties
291     # of the user's reads (e.g. which quality scale their are on, how long the
292     # fragments are, etc).
class ReadParams:
    """Properties of the input reads: quality scale, colorspace flags,
    paired-end geometry, and SAM read-group (@RG) metadata."""

    def __init__(self, solexa_quals, phred64_quals, quals, integer_quals,
                 color, color_out, library_type, seed_length, reads_format,
                 mate_inner_dist, mate_inner_dist_std_dev, read_group_id,
                 sample_id, library_id, description, seq_platform_unit,
                 seq_center, seq_run_date, seq_platform):
        # Quality-scale flags.
        self.solexa_quals = solexa_quals
        self.phred64_quals = phred64_quals
        self.quals = quals
        self.integer_quals = integer_quals
        # SOLiD colorspace handling.
        self.color = color
        self.color_out = color_out
        self.library_type = library_type
        self.seed_length = seed_length
        self.reads_format = reads_format
        # Paired-end fragment geometry.
        self.mate_inner_dist = mate_inner_dist
        self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
        # SAM read-group (@RG) metadata.
        self.read_group_id = read_group_id
        self.sample_id = sample_id
        self.library_id = library_id
        self.description = description
        self.seq_platform_unit = seq_platform_unit
        self.seq_center = seq_center
        self.seq_run_date = seq_run_date
        self.seq_platform = seq_platform

    def parse_options(self, opts):
        """Update read/quality/read-group settings from getopt (option, value) pairs."""
        # Options whose value is stored verbatim on the named attribute.
        str_setters = {
            "--library-type": "library_type",
            "--rg-id": "read_group_id",
            "--rg-sample": "sample_id",
            "--rg-library": "library_id",
            "--rg-description": "description",
            "--rg-platform-unit": "seq_platform_unit",
            "--rg-center": "seq_center",
            "--rg-date": "seq_run_date",
            "--rg-platform": "seq_platform",
        }
        for option, value in opts:
            if option == "--solexa-quals":
                self.solexa_quals = True
            elif option in ("--solexa1.3-quals", "--phred64-quals"):
                self.phred64_quals = True
            elif option in ("-Q", "--quals"):
                self.quals = True
            elif option == "--integer-quals":
                self.integer_quals = True
            elif option in ("-C", "--color"):
                self.color = True
            elif option == "--color-out":
                self.color_out = True
            elif option in ("-s", "--seed-length"):
                self.seed_length = int(value)
            elif option in ("-r", "--mate-inner-dist"):
                self.mate_inner_dist = int(value)
            elif option == "--mate-std-dev":
                self.mate_inner_dist_std_dev = int(value)
            elif option in str_setters:
                setattr(self, str_setters[option], value)

    def check(self):
        """Validate read parameters; die() with a message on bad values."""
        if self.seed_length and self.seed_length < 20:
            die("Error: arg to --seed-length must be at least 20")
        if self.mate_inner_dist_std_dev is not None and self.mate_inner_dist_std_dev < 0:
            die("Error: arg to --mate-std-dev must at least 0")
        # --rg-id and --rg-sample must be given together (or not at all).
        if bool(self.read_group_id) != bool(self.sample_id):
            die("Error: --rg-id and --rg-sample must be specified or omitted together")
381    
382     # SearchParams is a group of runtime parameters that specify how TopHat will
383     # search for splice junctions
384    
class SearchParams:
    """Length bounds that control the closure, coverage, and segment based
    junction-search stages."""

    def __init__(self,
                 min_closure_exon,
                 min_closure_intron,
                 max_closure_intron,
                 min_coverage_intron,
                 max_coverage_intron,
                 min_segment_intron,
                 max_segment_intron):
        self.min_closure_exon_length = min_closure_exon
        self.min_closure_intron_length = min_closure_intron
        self.max_closure_intron_length = max_closure_intron
        self.min_coverage_intron_length = min_coverage_intron
        self.max_coverage_intron_length = max_coverage_intron
        self.min_segment_intron_length = min_segment_intron
        self.max_segment_intron_length = max_segment_intron

    def parse_options(self, opts):
        """Update search bounds from parsed getopt (option, value) pairs."""
        for option, value in opts:
            if option == "--min-closure-exon":
                self.min_closure_exon_length = int(value)
            elif option == "--min-closure-intron":
                self.min_closure_intron_length = int(value)
            elif option == "--max-closure-intron":
                self.max_closure_intron_length = int(value)
            elif option == "--min-coverage-intron":
                self.min_coverage_intron_length = int(value)
            elif option == "--max-coverage-intron":
                self.max_coverage_intron_length = int(value)
            elif option == "--min-segment-intron":
                self.min_segment_intron_length = int(value)
            elif option == "--max-segment-intron":
                self.max_segment_intron_length = int(value)

    def check(self):
        """Validate that all bounds are non-negative; die() with a message otherwise.

        BUG FIX: the messages previously claimed a minimum of 20 while the
        conditions only reject negative values; the messages now match the
        enforced bound.
        """
        if self.min_closure_exon_length < 0:
            die("Error: arg to --min-closure-exon must be at least 0")
        if self.min_closure_intron_length < 0:
            die("Error: arg to --min-closure-intron must be at least 0")
        if self.max_closure_intron_length < 0:
            die("Error: arg to --max-closure-intron must be at least 0")
        if self.min_coverage_intron_length < 0:
            die("Error: arg to --min-coverage-intron must be at least 0")
        if self.max_coverage_intron_length < 0:
            die("Error: arg to --max-coverage-intron must be at least 0")
        if self.min_segment_intron_length < 0:
            die("Error: arg to --min-segment-intron must be at least 0")
        if self.max_segment_intron_length < 0:
            die("Error: arg to --max-segment-intron must be at least 0")
435    
class ReportParams:
    """Output-format switches for the final report (BAM conversion/sorting)."""

    def __init__(self):
        self.sort_bam = True     # coordinate-sort the output BAM
        self.convert_bam = True  # convert the SAM output to BAM

    def parse_options(self, opts):
        """Apply the --no-sort-bam / --no-convert-bam switches."""
        for opt, _value in opts:
            if opt == "--no-sort-bam":
                self.sort_bam = False
            elif opt == "--no-convert-bam":
                # SAM output cannot be coordinate-sorted here, so skipping
                # the conversion implies --no-sort-bam as well.
                self.convert_bam = False
                self.sort_bam = False
448    
class Bowtie2Params:
    """Options forwarded to bowtie2: presets, alignment, scoring, and effort."""

    def __init__(self):
        # Mutually exclusive end-to-end presets.
        self.very_fast = False
        self.fast = False
        self.sensitive = False
        self.very_sensitive = False

        # Alignment options.
        self.N = 0               # mismatches allowed in the seed (0 or 1)
        self.L = 20              # seed substring length
        self.i = "S,1,1.25"      # seed-interval function
        self.n_ceil = "L,0,0.15" # max-N function
        self.gbar = 4            # disallow gaps within this many bases of read ends

        # Scoring options.
        self.mp = "6,2"          # mismatch penalties (max,min)
        self.np = 1              # penalty for N positions
        self.rdg = "5,3"         # read gap open,extend penalties
        self.rfg = "5,3"         # reference gap open,extend penalties
        self.score_min = "L,-0.6,-0.6"  # minimum-score function

        # Effort options.
        self.D = 15              # consecutive seed-extension failures allowed
        self.R = 2               # re-seeding attempts for repetitive seeds

    def parse_options(self, opts):
        """Update bowtie2 settings from parsed getopt (option, value) pairs."""
        for option, value in opts:
            if option == "--b2-very-fast":
                self.very_fast = True
            elif option == "--b2-fast":
                self.fast = True
            elif option == "--b2-sensitive":
                self.sensitive = True
            elif option == "--b2-very-sensitive":
                self.very_sensitive = True
            elif option == "--b2-N":
                self.N = int(value)
            elif option == "--b2-L":
                # BUG FIX: previously hard-coded to 20, ignoring the user's value.
                self.L = int(value)
            elif option == "--b2-i":
                self.i = value
            elif option == "--b2-n-ceil":
                self.n_ceil = value
            elif option == "--b2-gbar":
                # BUG FIX: previously hard-coded to 4, ignoring the user's value.
                self.gbar = int(value)
            elif option == "--b2-mp":
                self.mp = value
            elif option == "--b2-np":
                self.np = int(value)
            elif option == "--b2-rdg":
                self.rdg = value
            elif option == "--b2-rfg":
                self.rfg = value
            elif option == "--b2-score-min":
                self.score_min = value
            elif option == "--b2-D":
                self.D = int(value)
            elif option == "--b2-R":
                self.R = int(value)

    def check(self):
        """Validate preset exclusivity and option formats; die() on bad input."""
        # At most one preset may be selected.
        more_than_once = False
        if self.very_fast:
            if self.fast or self.sensitive or self.very_sensitive:
                more_than_once = True
        else:
            if self.fast:
                if self.sensitive or self.very_sensitive:
                    more_than_once = True
            else:
                if self.sensitive and self.very_sensitive:
                    more_than_once = True
        if more_than_once:
            die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

        if self.N not in [0, 1]:
            die("Error: arg to --b2-N must be either 0 or 1")

        # <func> args use bowtie2's function syntax: C/L/S/G,const,coeff.
        function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
        if not re.search(function_re, self.i):
            die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
        if not re.search(function_re, self.n_ceil):
            die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
        if not re.search(function_re, self.score_min):
            die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

        # Penalty pairs are two comma-separated non-negative integers.
        pair_re = r'^\d+,\d+$'
        if not re.search(pair_re, self.mp):
            die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
        if not re.search(pair_re, self.rdg):
            die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")
        if not re.search(pair_re, self.rfg):
            die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
554    
555    
def __init__(self):
    """Initialize every parameter group with its built-in default value."""
    # Junction-filtering defaults.
    self.splice_constraints = self.SpliceConstraints(8,       # min_anchor
                                                     50,      # min_intron
                                                     500000,  # max_intron
                                                     0,       # splice_mismatches
                                                     0.15)    # min_isoform_frac

    # Output-file records for the optional prefilter bowtie runs (one per mate).
    self.preflt_data = [BowtieOutputFiles(), BowtieOutputFiles()]

    # Read/quality/read-group defaults; argument order matches ReadParams.__init__.
    self.read_params = self.ReadParams(False,    # solexa_quals
                                       False,    # phred64_quals
                                       False,    # quals
                                       None,     # integer_quals
                                       False,    # color (SOLiD)
                                       False,    # color_out
                                       "",       # library_type
                                       None,     # seed_length
                                       "fastq",  # reads_format
                                       None,     # mate_inner_dist
                                       20,       # mate_inner_dist_std_dev
                                       None,     # read_group_id
                                       None,     # sample_id
                                       None,     # library_id
                                       None,     # description
                                       None,     # seq_platform_unit
                                       None,     # seq_center
                                       None,     # seq_run_date
                                       None)     # seq_platform

    self.system_params = self.SystemParams(1,      # num_threads
                                           False)  # keep_tmp

    self.search_params = self.SearchParams(100,     # min_closure_exon
                                           50,      # min_closure_intron
                                           5000,    # max_closure_intron
                                           50,      # min_coverage_intron
                                           20000,   # max_coverage_intron
                                           50,      # min_segment_intron
                                           500000)  # max_segment_intron

    self.report_params = self.ReportParams()
    self.bowtie2_params = self.Bowtie2Params()

    # Aligner selection: bowtie2 by default; --bowtie1 switches it off.
    self.bowtie2 = True

    # Annotation / transcriptome mapping.
    self.gff_annotation = None
    self.transcriptome_only = False
    self.transcriptome_index = None
    self.transcriptome_outdir = None

    # Junction / indel / fusion discovery switches.
    self.raw_junctions = None
    self.find_novel_juncs = True
    self.find_novel_indels = True
    self.find_novel_fusions = True
    self.find_GFF_juncs = True

    # Alignment limits and mismatch budgets.
    self.max_hits = 20
    self.t_max_hits = 60
    self.prefilter_multi = False
    self.genome_read_mismatches = 2
    self.max_read_mismatches = 2
    self.t_mismatches = 1
    self.segment_length = 25
    self.segment_mismatches = 2
    self.bowtie_alignment_option = "-v"
    self.max_insertion_length = 3
    self.max_deletion_length = 3
    self.raw_insertions = None
    self.raw_deletions = None

    # Search-strategy switches (None = not set explicitly yet).
    self.coverage_search = None
    self.closure_search = False
    self.microexon_search = False
    self.butterfly_search = None

    # Fusion-search settings.
    self.fusion_search = False
    self.fusion_anchor_length = 20
    self.fusion_min_dist = 10000000
    self.fusion_read_mismatches = 2
    self.fusion_multireads = 2
    self.fusion_multipairs = 2
633 gpertea 29
def check(self):
    """Validate all option groups and their cross-option constraints.

    Delegates to each sub-parameter group's check(), then enforces
    relationships between options (colorspace restrictions, intron-length
    consistency, indel-length bounds). Calls die() with a message on error.
    """
    self.splice_constraints.check()
    self.read_params.check()
    self.system_params.check()
    self.bowtie2_params.check()
    if self.segment_length < 10:
        die("Error: arg to --segment-length must at least 10")
    if self.segment_mismatches < 0 or self.segment_mismatches > 3:
        die("Error: arg to --segment-mismatches must in [0, 3]")
    if self.t_mismatches < 0 or self.t_mismatches > 3:
        die("Error: arg to -n or --transcriptome-mismatches must in [0, 3]")

    # Features not yet supported in SOLiD colorspace mode.
    if self.read_params.color:
        if self.bowtie2:
            die("Error: bowtie2 in colorspace is not yet supported\nuse --bowtie1 option")
        if self.fusion_search:
            die("Error: fusion-search in colorspace is not yet supported")
        if self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

    library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

    if self.read_params.library_type and self.read_params.library_type not in library_types:
        die("Error: library-type should be one of: "+', '.join(library_types))

    # The per-stage intron caps can never exceed the overall maximum intron length.
    self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_closure_intron_length)

    self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_segment_intron_length)

    self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                        self.search_params.max_coverage_intron_length)

    # BUG FIX: the messages below previously concatenated ints directly to
    # strings, raising TypeError instead of printing the intended error;
    # the values are now wrapped in str().
    if self.max_insertion_length >= self.segment_length:
        die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

    if self.max_insertion_length < 0:
        die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

    if self.max_deletion_length >= self.splice_constraints.min_intron_length:
        die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

    if self.max_deletion_length < 0:
        die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")
679    
def cmd(self):
    """Assemble the shared command-line flags passed to the pipeline executables."""
    # Fixed value-carrying flags first, in their established order.
    args = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
            "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
            "--min-report-intron", str(self.splice_constraints.min_intron_length),
            "--max-report-intron", str(self.splice_constraints.max_intron_length),
            "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
            "--output-dir", output_dir,
            "--max-multihits", str(self.max_hits),
            "--segment-length", str(self.segment_length),
            "--segment-mismatches", str(self.segment_mismatches),
            "--min-closure-exon", str(self.search_params.min_closure_exon_length),
            "--min-closure-intron", str(self.search_params.min_closure_intron_length),
            "--max-closure-intron", str(self.search_params.max_closure_intron_length),
            "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
            "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
            "--min-segment-intron", str(self.search_params.min_segment_intron_length),
            "--max-segment-intron", str(self.search_params.max_segment_intron_length),
            "--max-mismatches", str(self.max_read_mismatches),
            "--sam-header", sam_header,
            "--max-insertion-length", str(self.max_insertion_length),
            "--max-deletion-length", str(self.max_deletion_length)]

    if not self.bowtie2:
        args.append("--bowtie1")

    if self.fusion_search:
        args += ["--fusion-search",
                 "--fusion-anchor-length", str(self.fusion_anchor_length),
                 "--fusion-min-dist", str(self.fusion_min_dist),
                 "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                 "--fusion-multireads", str(self.fusion_multireads),
                 "--fusion-multipairs", str(self.fusion_multipairs)]

    # System-level flags (-z / -p) are produced by SystemParams.
    args.extend(self.system_params.cmd())

    if self.read_params.mate_inner_dist is not None:
        args += ["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)]
    if self.gff_annotation is not None:
        args += ["--gtf-annotations", str(self.gff_annotation)]
    if gtf_juncs:
        args += ["--gtf-juncs", gtf_juncs]
    if self.closure_search == False:
        args.append("--no-closure-search")
    if not self.coverage_search:
        args.append("--no-coverage-search")
    if not self.microexon_search:
        args.append("--no-microexon-search")
    if self.butterfly_search:
        args.append("--butterfly-search")
    if self.read_params.solexa_quals:
        args.append("--solexa-quals")
    if self.read_params.quals:
        args.append("--quals")
    if self.read_params.integer_quals:
        args.append("--integer-quals")
    if self.read_params.color:
        args.append("--color")
    if self.read_params.color_out:
        args.append("--color-out")
    if self.read_params.library_type:
        args += ["--library-type", self.read_params.library_type]
    if self.read_params.read_group_id:
        args += ["--rg-id", self.read_params.read_group_id]
    if self.read_params.phred64_quals:
        args.append("--phred64-quals")
    return args
747    
748     # This is the master options parsing routine, which calls parse_options for
749     # the delegate classes (e.g. SpliceConstraints) that handle certain groups
750     # of options.
751     def parse_options(self, argv):
752     try:
753 gpertea 135 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
754 gpertea 29 ["version",
755     "help",
756     "output-dir=",
757 gpertea 154 "bowtie1",
758 gpertea 29 "solexa-quals",
759     "solexa1.3-quals",
760     "phred64-quals",
761     "quals",
762     "integer-quals",
763     "color",
764     "color-out",
765     "library-type=",
766     "num-threads=",
767     "splice-mismatches=",
768     "max-multihits=",
769     "min-isoform-fraction=",
770     "min-anchor-length=",
771     "min-intron-length=",
772     "max-intron-length=",
773     "GTF=",
774 gpertea 135 "transcriptome-only",
775     "transcriptome-max-hits=",
776     "transcriptome-index=",
777 gpertea 29 "raw-juncs=",
778     "no-novel-juncs",
779 gpertea 154 "allow-fusions",
780     "fusion-search",
781     "fusion-anchor-length=",
782     "fusion-min-dist=",
783     "fusion-read-mismatches=",
784     "fusion-multireads=",
785     "fusion-multipairs=",
786 gpertea 29 "no-novel-indels",
787     "no-gtf-juncs",
788     "mate-inner-dist=",
789     "mate-std-dev=",
790     "no-closure-search",
791     "no-coverage-search",
792     "closure-search",
793     "coverage-search",
794 gpertea 135 "prefilter-multihits",
795 gpertea 29 "microexon-search",
796     "min-closure-exon=",
797     "min-closure-intron=",
798     "max-closure-intron=",
799     "min-coverage-intron=",
800     "max-coverage-intron=",
801     "min-segment-intron=",
802     "max-segment-intron=",
803     "seed-length=",
804 gpertea 154 "genome-read-mismatches=",
805 gpertea 135 "read-mismatches=",
806 gpertea 154 "max-read-mismatches=",
807 gpertea 135 "transcriptome-mismatches=",
808 gpertea 29 "segment-length=",
809     "segment-mismatches=",
810     "bowtie-n",
811     "butterfly-search",
812     "no-butterfly-search",
813     "keep-tmp",
814     "rg-id=",
815     "rg-sample=",
816     "rg-library=",
817     "rg-description=",
818     "rg-platform-unit=",
819     "rg-center=",
820     "rg-date=",
821     "rg-platform=",
822     "tmp-dir=",
823     "zpacker=",
824     "unmapped-fifo",
825     "max-insertion-length=",
826     "max-deletion-length=",
827     "insertions=",
828     "deletions=",
829     "no-sort-bam",
830 gpertea 154 "no-convert-bam",
831     "b2-very-fast",
832     "b2-fast",
833     "b2-sensitive",
834     "b2-very-sensitive",
835     "b2-N=",
836     "b2-L=",
837     "b2-i=",
838     "b2-n-ceil=",
839     "b2-gbar=",
840     "b2-ma=",
841     "b2-mp=",
842     "b2-np=",
843     "b2-rdg=",
844     "b2-rfg=",
845     "b2-score-min=",
846     "b2-D=",
847     "b2-R="])
848 gpertea 29 except getopt.error, msg:
849     raise Usage(msg)
850    
851     self.splice_constraints.parse_options(opts)
852     self.system_params.parse_options(opts)
853     self.read_params.parse_options(opts)
854     self.search_params.parse_options(opts)
855     self.report_params.parse_options(opts)
856 gpertea 154 self.bowtie2_params.parse_options(opts)
857 gpertea 29 global use_BWT_FIFO
858     global use_BAM_Unmapped
859     if not self.read_params.color:
860     use_BWT_FIFO=False
861     use_BAM_Unmapped=True
862     global output_dir
863     global logging_dir
864     global tmp_dir
865     global sam_header
866    
867     custom_tmp_dir = None
868     custom_out_dir = None
869     # option processing
870     for option, value in opts:
871     if option in ("-v", "--version"):
872     print "TopHat v%s" % (get_version())
873     sys.exit(0)
874     if option in ("-h", "--help"):
875     raise Usage(use_message)
876 gpertea 154 if option == "--bowtie1":
877     self.bowtie2 = False
878 gpertea 29 if option in ("-g", "--max-multihits"):
879     self.max_hits = int(value)
880 gpertea 135 if option in ("-x", "--transcriptome-max-hits"):
881     self.t_max_hits = int(value)
882 gpertea 29 if option in ("-G", "--GTF"):
883     self.gff_annotation = value
884 gpertea 135 if option in ("-T", "--transcriptome"):
885     self.transcriptome_only = True
886 gpertea 154 if option == "--transcriptome-index":
887 gpertea 135 self.transcriptome_index = value
888     if option in("-M", "--prefilter-multihits"):
889     self.prefilter_multi = True
890 gpertea 29 if option in ("-j", "--raw-juncs"):
891     self.raw_junctions = value
892     if option == "--no-novel-juncs":
893     self.find_novel_juncs = False
894     if option == "--no-novel-indels":
895     self.find_novel_indels = False
896 gpertea 154 if option == "--fusion-search":
897     self.fusion_search = True
898     if option == "--fusion-anchor-length":
899     self.fusion_anchor_length = int(value)
900     if option == "--fusion-min-dist":
901     self.fusion_min_dist = int(value)
902     if option == "--fusion-read-mismatches":
903     self.fusion_read_mismatches = int(value)
904     if option == "--fusion-multireads":
905     self.fusion_multireads = int(value)
906     if option == "--fusion-multipairs":
907     self.fusion_multipairs = int(value)
908 gpertea 29 if option == "--no-gtf-juncs":
909     self.find_GFF_juncs = False
910 gpertea 135 #if option == "--skip-check-reads":
911     # self.skip_check_reads = True
912 gpertea 29 if option == "--no-coverage-search":
913     self.coverage_search = False
914 gpertea 135 if option == "--coverage-search":
915     self.coverage_search = True
916 gpertea 29 if option == "--no-closure-search":
917     self.closure_search = False
918     if option == "--closure-search":
919     self.closure_search = True
920     if option == "--microexon-search":
921     self.microexon_search = True
922     if option == "--butterfly-search":
923     self.butterfly_search = True
924     if option == "--no-butterfly-search":
925     self.butterfly_search = False
926 gpertea 154 if option == "--genome-read-mismatches":
927     self.genome_read_mismatches = int(value)
928     if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
929     self.max_read_mismatches = int(value)
930 gpertea 135 if option in ("-n", "--transcriptome-mismatches"):
931     self.t_mismatches = int(value)
932 gpertea 29 if option == "--segment-length":
933     self.segment_length = int(value)
934     if option == "--segment-mismatches":
935     self.segment_mismatches = int(value)
936     if option == "--bowtie-n":
937     self.bowtie_alignment_option = "-n"
938     if option == "--max-insertion-length":
939     self.max_insertion_length = int(value)
940     if option == "--max-deletion-length":
941     self.max_deletion_length = int(value)
942     if option == "--insertions":
943     self.raw_insertions = value
944     if option == "--deletions":
945     self.raw_deletions = value
946     if option in ("-o", "--output-dir"):
947     custom_out_dir = value + "/"
948     if option == "--tmp-dir":
949     custom_tmp_dir = value + "/"
950 gpertea 135 if self.transcriptome_only:
951     self.find_novel_juncs=False
952     self.find_novel_indels=False
953     if custom_out_dir:
954 gpertea 29 output_dir = custom_out_dir
955     logging_dir = output_dir + "logs/"
956     tmp_dir = output_dir + "tmp/"
957     sam_header = tmp_dir + "stub_header.sam"
958 gpertea 135 if custom_tmp_dir:
959 gpertea 29 tmp_dir = custom_tmp_dir
960     sam_header = tmp_dir + "stub_header.sam"
961     if len(args) < 2:
962     raise Usage(use_message)
963     return args
964    
965    
966 gpertea 154 # check if a file exists and has non-zero (or minimum) size
# True if filepath exists as a file of at least minfsize bytes
def fileExists(filepath, minfsize=1):
    return os.path.exists(filepath) and os.path.getsize(filepath) >= minfsize
972    
973    
def getFileDir(filepath):
    # Return the directory part of a full path, including the trailing '/'
    # (empty string when the path has no directory component).
    dirpart = os.path.dirname(filepath)
    return dirpart + '/' if dirpart else ''
979    
def getFileBaseName(filepath):
    """Base name of a reads file: strips the directory, one known
    sequence-file extension (.fq/.txt/.seq/.bwtout/.fa*), and, for
    compressed files (.z/.gz*/.bz*), one extra extension layer."""
    def is_seq_ext(ext):
        # recognized read/sequence extensions; '.fa' prefix covers .fa/.fasta/...
        return ext in ['.fq', '.txt', '.seq', '.bwtout'] or ext.startswith('.fa')
    fname = os.path.split(filepath)[1]
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if is_seq_ext(fx) and fbase:
        return fbase
    if fx == '.z' or fx.startswith('.gz') or fx.startswith('.bz'):
        # compressed file: peel one more extension off the inner name
        inner_base, inner_ext = os.path.splitext(fbase)
        if is_seq_ext(inner_ext.lower()) and inner_base:
            return inner_base
        return fbase
    return fbase if fbase else fname
998    
999     # Returns the current time in a nice format
def right_now():
    # Locale-formatted current local time, used in log lines
    return datetime.now().strftime("%c")
1003    
# The TopHat logging formatter: "[<timestamp>] <message>" on stderr
def th_log(out_str):
    sys.stderr.write("[%s] %s\n" % (right_now(), out_str))
1007    
1008 gpertea 29 # Ensures that the output, logging, and temp directories are present. If not,
1009     # they are created
1010     def prepare_output_dir():
1011    
1012 gpertea 135 th_log("Preparing output location "+output_dir)
1013 gpertea 29 if os.path.exists(output_dir):
1014     pass
1015     else:
1016     os.mkdir(output_dir)
1017    
1018     if os.path.exists(logging_dir):
1019     pass
1020     else:
1021     os.mkdir(logging_dir)
1022    
1023     if os.path.exists(tmp_dir):
1024     pass
1025     else:
1026     try:
1027     os.makedirs(tmp_dir)
1028     except OSError, o:
1029     die("\nError creating directory %s (%s)" % (tmp_dir, o))
1030    
1031    
# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe";
    # restoring SIG_DFL lets child pipe processes exit silently when their
    # reader goes away (the standard Unix behavior).
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
1038    
1039     # Check that the Bowtie index specified by the user is present and all files
1040     # are there.
def check_bowtie_index(idx_prefix):
    """Verify that all four Bowtie .ebwt index files exist for idx_prefix,
    retrying under $BOWTIE_INDEXES before giving up with die()."""
    th_log("Checking for Bowtie index files")

    def have_all_idx_files(prefix):
        # the complete index = forward 1/2 plus reverse 1/2
        for sfx in (".1.ebwt", ".2.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"):
            if not os.path.exists(prefix + sfx):
                return False
        return True

    if have_all_idx_files(idx_prefix):
        return
    # not found as given; try the BOWTIE_INDEXES directory
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var == None:
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
    idx_prefix = bowtie_idx_env_var + idx_prefix
    if have_all_idx_files(idx_prefix):
        return
    die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1072    
1073     # Reconstructs the multifasta file from which the Bowtie index was created, if
1074     # it's not already there.
1075     def bowtie_idx_to_fa(idx_prefix):
1076     idx_name = idx_prefix.split('/')[-1]
1077 gpertea 135 th_log("Reconstituting reference FASTA file from Bowtie index")
1078 gpertea 29
1079     try:
1080     tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1081     tmp_fasta_file = open(tmp_fasta_file_name, "w")
1082    
1083     inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1084    
1085     inspect_cmd = [prog_path("bowtie-inspect"),
1086     idx_prefix]
1087    
1088     print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
1089     ret = subprocess.call(inspect_cmd,
1090     stdout=tmp_fasta_file,
1091     stderr=inspect_log)
1092    
1093     # Bowtie reported an error
1094     if ret != 0:
1095     die(fail_str+"Error: bowtie-inspect returned an error")
1096    
1097     # Bowtie not found
1098     except OSError, o:
1099     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1100     die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1101    
1102     return tmp_fasta_file_name
1103    
1104     # Checks whether the multifasta file for the genome is present alongside the
1105     # Bowtie index files for it.
def check_fasta(idx_prefix):
    """Return the path of the genome FASTA matching the Bowtie index,
    looking next to the index and under $BOWTIE_INDEXES, and
    reconstructing it from the index as a last resort."""
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var:
        idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
        if os.path.exists(idx_fasta):
            return idx_fasta
    # not found anywhere -- rebuild it from the index itself
    print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
    return bowtie_idx_to_fa(idx_prefix)
1121    
1122     # Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    # Verify the Bowtie index files, then locate (or rebuild) the genome FASTA.
    check_bowtie_index(idx_prefix)
    return (check_fasta(idx_prefix), None)
1128    
1129     # Retrive a tuple containing the system's version of Bowtie. Parsed from
1130     # `bowtie --version`
def get_bowtie_version(is_bowtie2):
    # Returns the installed Bowtie version as a list of 4 ints
    # [major, minor, patch, sub] (sub = beta number for bowtie2, else 0);
    # dies if the bowtie binary cannot be launched.
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                  stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        # the output is searched through its repr(), so a newline appears as
        # the two characters backslash + 'n' (hence the "\\n" lookup below)
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]

            if is_bowtie2:
                # bowtie2 reports e.g. "2.0.0-beta5": keep the three numeric
                # fields plus the numeric tail of the "betaN" suffix
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

        # NOTE(review): if the version banner was not found above,
        # bowtie_version is still None here and len(None) raises TypeError
        # instead of a clean error -- TODO confirm intended.
        if len(bowtie_version) == 3:
            # pad to 4 fields so callers can compare versions uniformly
            bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
1169    
def get_index_sam_header(is_bowtie2, read_params, idx_prefix, sort = True):
    # Produce a SAM header for the given Bowtie index by "aligning" /dev/null,
    # then (unless sort=False) rewrite it with an @HD line declaring
    # coordinate sorting, an optional @RG line, name-sorted @SQ records and a
    # @PG line.  Returns the path of the header file to use.
    try:
        bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")

        bowtie_header_cmd = [bowtie_path]
        if not is_bowtie2:
            # bowtie1 needs --sam to emit SAM (bowtie2 always does)
            bowtie_header_cmd += ["--sam"]

        if read_params.color:
            bowtie_header_cmd.append('-C')

        # aligning /dev/null produces no alignments, only the header
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                   stdout=bowtie_sam_header_file,
                   stderr=open('/dev/null'))

        bowtie_sam_header_file.close()

        if not sort:
            # raw bowtie-generated header is good enough
            return bowtie_sam_header_filename

        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")
        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name,line])
            elif line.find("CL"):
                # NOTE(review): find() returns -1 (truthy) when "CL" is absent,
                # so this skips every line that does not START with "CL" and
                # `preamble` stays empty; harmless today only because preamble
                # is never written out (see commented-out loop below) -- the
                # condition was probably meant to be line.find("CL") != -1.
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
        if read_params.read_group_id and read_params.sample_id:
            # build the @RG line from whichever read-group fields were given
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        # @SQ records sorted by sequence name
        sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        # keep the bowtie header file in sync with the rewritten header
        copy(sam_header, bowtie_sam_header_filename)
        return sam_header

    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
1254    
1255     # Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(is_bowtie2):
    """Locate the bowtie (or bowtie2) binary, set the global bowtie_path,
    and die() unless the installed version is recent enough."""
    log_msg = "Checking for Bowtie"
    if is_bowtie2:
        log_msg += "2"
    th_log(log_msg)

    bowtie_bin = "bowtie"
    if is_bowtie2:
        bowtie_bin += "2"

    global bowtie_path
    bowtie_path=prog_path(bowtie_bin)
    bowtie_version = get_bowtie_version(is_bowtie2)
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    # BUGFIX: the old checks AND-ed per-field "<" comparisons, which is not a
    # valid version comparison (e.g. bowtie 0.12.2 was wrongly accepted since
    # its minor field 12 is not < 12); compare the version lists
    # lexicographically instead.
    if is_bowtie2:
        # [major, minor, patch, beta] must be >= 2.0.0-beta5
        if bowtie_version[:4] < [2, 0, 0, 5]:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    else:
        # [major, minor, patch] must be >= 0.12.3
        if bowtie_version[:3] < [0, 12, 3]:
            die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\t version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1278 gpertea 29
1279    
1280     # Retrive a tuple containing the system's version of samtools. Parsed from
1281     # `samtools`
1282     def get_samtools_version():
1283     try:
1284     # Launch Bowtie to capture its version info
1285     proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1286     samtools_out = proc.communicate()[1]
1287    
1288     # Find the version identifier
1289     version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
1290     samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1291     if version_match.group(4):
1292     samtools_version_arr.append(version_match.group(4))
1293     else:
1294     samtools_version_arr.append(0)
1295    
1296     return version_match.group(), samtools_version_arr
1297     except OSError, o:
1298     errmsg=fail_str+str(o)+"\n"
1299     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1300     errmsg+="Error: samtools not found on this system"
1301     die(errmsg)
1302    
1303     # Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate samtools, set the global samtools_path, and die() unless the
    installed version is at least 0.1.7."""
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # BUGFIX: the old test (arr[1] < 1 or arr[2] < 7) wrongly rejected newer
    # releases such as 0.2.0 or 1.0 (their patch field is < 7); compare the
    # [major, minor, patch] list lexicographically instead.
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1314    
1315    
class FastxReader:
    """Sequential reader for FASTA/FASTQ streams.

    Detects the record format from the first '@' or '>' line (skipping up
    to 10 custom header lines) and binds nextRecord to the matching parser.
    nextRecord() returns (seqid, sequence, seq_len, quals) tuples, with
    quals None for FASTA.  In color-space mode (is_color) the leading
    primer base is stripped from each sequence and a leading dummy quality
    value, when present, is dropped.
    """
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline=None        # one-line pushback buffer (see ungetLine)
        self.format=None         # 'fastq' or 'fasta' once detected
        self.ifile=i_file
        self.nextRecord=None     # bound below to nextFastq or nextFasta
        self.eof=None
        self.fname=fname         # used only in error messages
        self.lastline=None
        self.numrecords=0        # records successfully parsed so far
        self.isColor=0           # 1 for SOLiD color-space input
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        # NOTE(review): an empty input makes readline() return "" and the
        # self.lastline[0] below raises IndexError -- TODO confirm callers
        # never pass empty files.
        while hlines>0 and self.lastline[0] not in "@>" :
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        # push the first record's header line back so nextRecord() sees it
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        # empty seqid signals end of input
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")

            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #       % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
                #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                # color-space may carry one extra (dummy) leading qv,
                # hence the + self.isColor allowance
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
                #loop until qv has the same length as seq

            if self.isColor:
                # and qstrlen==seq_len :
                if qstrlen==seq_len:
                    #qual string may have a dummy qv at the beginning, should be stripped
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen!=seq_len-1:
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen :
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        ##--discard the primer base [NO]
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        # (a trailing None stands in for the missing quality string)
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            #more sequence lines, or the ">" quality marker line:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor : # -- discard primer base
            seq_len-=1
            seqstr=seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        # Read one line, honoring the one-line pushback buffer.
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline
    def ungetLine(self):
        # Push the most recently read line back so getLine() returns it again.
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader
1468    
def fa_write(fhandle, seq_id, seq):
    """
    Write to a file in the FASTA format, wrapping the sequence at 60
    characters per line.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    # BUGFIX: the old xrange(len(seq)/line_len + 1) loop emitted a spurious
    # empty line whenever len(seq) was an exact multiple of line_len (or
    # zero); a stepped range visits only non-empty chunks.
    for start in range(0, len(seq), line_len):
        fhandle.write(seq[start:start + line_len] + "\n")
1488    
class ZReader:
    """Transparent reader for possibly-compressed read files.

    Depending on the file name, reads either directly, through a
    decompression pipe (gzip/bzip2, upgraded to the configured parallel
    zipper when available), or through bam2fastx for .bam input.
    self.file is the readable stream clients consume.
    """
    def __init__(self, filename, sysparams, guess=True):
        self.fname=filename
        self.file=None       # the stream to read from (pipe stdout or plain file)
        self.fsrc=None       # underlying raw file when piping
        self.popen=None      # decompressor/bam2fastx child process, if any
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
            # daehwan - want to output mapped reads as well
            pipecmd=[prog_path("bam2fastx"), "--all", "-"]
        else:
            if guess:
                # pick a decompressor from the file extension
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                if len(pipecmd)>0:
                    # prefer the configured parallel zipper when it is the
                    # matching pigz/pbzip2 variant
                    if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but must still check if it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd=[sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)

            if pipecmd:
                # -cd = decompress to stdout (not applicable to bam2fastx)
                pipecmd+=['-cd']
        if pipecmd:
            try:
                self.fsrc=open(self.fname, 'rb')
                self.popen=subprocess.Popen(pipecmd,
                      preexec_fn=subprocess_setup,
                      stdin=self.fsrc,
                      stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdout
        else:
            # uncompressed input: read it directly
            self.file=open(filename)
    def close(self):
        # Close the raw source and the stream, then reap the child process.
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen=None
1541    
class ZWriter:
    """Transparent writer: compresses through the configured zipper when
    use_zpacker is set, otherwise writes a plain file.  Clients write to
    self.file and must call close()."""
    def __init__(self, filename, sysparams):
        self.fname=filename
        if not use_zpacker:
            # no compression requested: plain file output
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None
            return
        pipecmd=[sysparams.zipper,"-cf", "-"]
        self.ftarget=open(filename, "wb")
        try:
            self.popen=subprocess.Popen(pipecmd,
                  preexec_fn=subprocess_setup,
                  stdin=subprocess.PIPE,
                  stdout=self.ftarget, close_fds=True)
        except Exception:
            die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
        self.file=self.popen.stdin # client writes to this end of the pipe
    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            # waiting on the child is required to actually flush the pipes
            self.popen.wait()
            self.popen=None
1566    
1567     # check_reads_format() examines the first few records in the user files
1568     # to determines the file format
def check_reads_format(params, reads_files):
    # Samples the first few records of each (comma-separated) reads file to
    # detect FASTA vs FASTQ, reports format/quality scale on stderr, and
    # returns a fresh ReadParams with reads_format filled in.  Dies if the
    # files mix formats.
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #   die("Error: could not open file "+f_name)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA.  Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    #if seed_len != None:
    #    seed_len = max(seed_len, max_seed_len)
    #else:
    #    seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        # report which quality encoding the user selected (phred33 default)
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            # color-space FASTA carries numeric qualities
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                        params.read_params.phred64_quals,
                        params.read_params.quals,
                        params.read_params.integer_quals,
                        params.read_params.color,
                        params.read_params.color_out,
                        params.read_params.library_type,
                        #seed_len,
                        params.read_params.seed_length,
                        fileformat,
                        params.read_params.mate_inner_dist,
                        params.read_params.mate_inner_dist_std_dev,
                        params.read_params.read_group_id,
                        params.read_params.sample_id,
                        params.read_params.library_id,
                        params.read_params.description,
                        params.read_params.seq_platform_unit,
                        params.read_params.seq_center,
                        params.read_params.seq_run_date,
                        params.read_params.seq_platform)
1639    
def log_tail(logfile, lines=1):
    """Return the last `lines` lines of a log file as a single string.

    The file is scanned backwards in 1KB blocks so that a large log does
    not have to be read entirely. Opened in binary mode because
    end-relative seeks are only guaranteed to work on binary streams.

    Fixes over the previous version:
      - lines returned by readlines() already end in '\\n', so they are
        joined with '' (the old "\\n".join() double-spaced the output);
      - the partial-line skip (readline) is only done when we actually
        seeked into the middle of the file, not when starting at offset 0
        (the old code silently dropped the first line of short files);
      - the file handle is closed even if an exception occurs.
    """
    f = open(logfile, "rb")
    try:
        f.seek(0, 2)
        fbytes = f.tell()
        size = lines
        block = -1
        # back up one block at a time until enough newlines were seen
        while size > 0 and fbytes + block * 1024 > 0:
            f.seek(block * 1024, 2)
            size -= f.read(1024).count(b'\n')
            block -= 1
        if fbytes + block * 1024 > 0:
            f.seek(block * 1024, 2)
            f.readline()  # drop the partial line at the block boundary
        else:
            f.seek(0, 0)  # short file: start from the very beginning
        tail = f.readlines()[-lines:]
    finally:
        f.close()
    data = b''.join(tail)
    # Python 2: bytes is str, return as-is; Python 3: decode to text
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')
1665    
# Format a timedelta as a pretty "[D days ]HH:MM:SS" string.
def formatTD(td):
    """Render an elapsed-time timedelta for progress messages."""
    hh, rem = divmod(td.seconds, 3600)
    mm, ss = divmod(rem, 60)
    stamp = '%02d:%02d:%02d' % (hh, mm, ss)
    if td.days > 0:
        stamp = '%d days %s' % (td.days, stamp)
    return stamp
1678    
1679 gpertea 29 class PrepReadsInfo:
1680     def __init__(self, fname, side):
1681     try:
1682     f=open(fname,"r")
1683     self.min_len=int(f.readline().split("=")[-1])
1684     self.max_len=int(f.readline().split("=")[-1])
1685     self.in_count=int(f.readline().split("=")[-1])
1686     self.out_count=int(f.readline().split("=")[-1])
1687     if (self.out_count==0) or (self.max_len<16):
1688     raise Exception()
1689     except Exception, e:
1690     die(fail_str+"Error retrieving prep_reads info.")
1691     print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1692     if self.min_len < 20:
1693     print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1694    
1695 gpertea 135
def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
    """Assemble (and return) the argument list for a prep_reads run."""
    cmd = [prog_path("prep_reads")]
    cmd.extend(params.cmd())

    fmt = params.read_params.reads_format
    if fmt == "fastq":
        cmd.append("--fastq")
    elif fmt == "fasta":
        cmd.append("--fasta")
    # optional inputs/outputs, only emitted when provided
    if hits_to_filter:
        cmd.append("--flt-hits=" + hits_to_filter)
    if aux_file:
        cmd.append("--aux-outfile=" + aux_file)
    if index_file:
        cmd.append("--index-outfile=" + index_file)
    if filter_reads:
        cmd.append("--flt-reads=" + filter_reads)
    cmd.append(reads_list)
    if quals_list:
        cmd.append(quals_list)
    return cmd
1717    
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
    reads_suffix = ".fq"

    # for parallelization, we don't compress the read files
    do_use_zpacker = use_zpacker
    if params.system_params.num_threads > 1:
        do_use_zpacker = False

    if do_use_zpacker: reads_suffix += ".z"

    output_name=side+"_kept_reads"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    # start from a clean slate; a stale output file must not survive
    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    # aux info (counts/lengths) and the read index are written next to the output
    info_file=output_dir+output_name+".info"
    index_file=kept_reads_filename+".index"

    filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if do_use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        # shell_cmd is logged for reproducibility only; the actual pipeline
        # is built below with Popen (prep_reads | zipper > kept_reads)
        print >> run_log, shell_cmd
        if do_use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # check the upstream (prep_reads) status first, then the zipper's
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))

    kept_reads.close()
    return kept_reads_filename, PrepReadsInfo(info_file, side)
1784    
1785 gpertea 135
# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads,
           extra_output = "",
           mapping_type = "read_against_genome",
           multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
    """Run bowtie/bowtie2 on a read set, piping its SAM output through
    fix_map_ordering, and return (mapped_reads, unmapped_reads_out).

    The actual output extension (.bam / .sam[.z]) is appended to the
    `mapped_reads` prefix below. NOTE: for segment mapping this function
    temporarily doubles params.max_hits (and, for bowtie1, swaps the
    alignment option to -v) and restores both before returning.
    """
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    reads_file=reads_list[0]
    readfile_basename=getFileBaseName(reads_file)

    # classify the run: genome, transcriptome, or segment mapping
    g_mapping, t_mapping, seg_mapping = False, False, False
    if mapping_type == "read_against_transcriptome":
        t_mapping = True
    elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
        seg_mapping = True
    else:
        g_mapping = True

    bowtie_str = "Bowtie"
    if params.bowtie2:
        bowtie_str += "2"

    if seg_mapping:
        # we multiply max_hits by 2 when mapping segments
        # this multiplication factor (2) is also assumed in segment_juncs and long_spanning_reads,
        # which means "2" should not be changed without modifying those sub-programs.
        params.max_hits *= 2
        if not params.bowtie2:
            backup_bowtie_alignment_option = params.bowtie_alignment_option
            params.bowtie_alignment_option = "-v"

    if t_mapping:
        print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with %s %s" % (start_time.strftime("%c"),
                             readfile_basename, bwt_idx_name, bowtie_str, extra_output)
    else:
        print >> sys.stderr, "[%s] Mapping %s against %s with %s %s" % (start_time.strftime("%c"),
                             readfile_basename, bwt_idx_name, bowtie_str, extra_output)
    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    #bwt_mapped=mapped_reads
    unmapped_reads_out=None
    if unmapped_reads:
        unmapped_reads_out=unmapped_reads+".fq"
    # colorspace + compressed unmapped output goes through a FIFO + zipper child
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO:
        global unmapped_reads_fifo
        unmapped_reads_fifo=unmapped_reads+".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
        unmapped_reads_out+=".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            bowtie_cmd += ["--un", unmapped_reads_out]

        unzip_cmd=None
        bam_input=False
        if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
            bam_input=True
            unzip_cmd=[ prog_path('bam2fastx') ]
            if reads_format:
                unzip_cmd.append("--" + reads_format)
            # NOTE(review): this appends the reads_list *list* itself as one
            # argument; reads_file (the first element) looks intended -- confirm
            unzip_cmd+=[reads_list]

        if use_zpacker and (unzip_cmd is None):
            unzip_cmd=[ params.system_params.zipper ]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd+=['-cd']

        fifo_pid=None
        if use_FIFO:
            # fork a child that compresses whatever bowtie writes to the FIFO
            unm_zipcmd=[ params.system_params.zipper ]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd+=['-c']
            print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
            fifo_pid=os.fork()
            if fifo_pid==0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

        fix_map_cmd = [prog_path('fix_map_ordering')]

        if params.bowtie2:
            if t_mapping or g_mapping:
                # derive the minimum acceptable alignment score from the
                # mismatch penalty (min_penalty is parsed but unused here)
                max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
                max_penalty, min_penalty = int(max_penalty), int(min_penalty)
                min_score = (max_penalty - 1) * num_mismatches
                fix_map_cmd += ["--bowtie2-min-score", str(min_score)]

        samzip_cmd=None

        # daehwan - just use bam format for multi-threading.
        if unmapped_reads or params.system_params.num_threads >= 1:
            #mapping on the genome
            #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
            mapped_reads += ".bam"

            temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
            fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
            if not params.read_params.color and unmapped_reads:
                unmapped_reads_out = unmapped_reads+".bam"
                fix_map_cmd += [unmapped_reads_out]
        else:
            #mapping on segment_juncs (spliced potential junctions)
            # don't use SAM headers, could be slow -- better use compressed SAM directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                samzip_cmd=[ params.system_params.zipper ]
                samzip_cmd.extend(params.system_params.zipper_opts)
                samzip_cmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]

        max_hits = params.max_hits
        if t_mapping:
            max_hits = params.t_max_hits
        if num_mismatches > 3:
            num_mismatches = 3

        if params.bowtie2:
            if seg_mapping:
                # since bowtie2 does not suppress reads that map to many places,
                # we suppress those in segment_juncs and long_spanning_reads.
                bowtie_cmd += ["-k", str(max_hits + 1)]
            else:
                bowtie_cmd += ["-k", str(max_hits)]

            bowtie2_params = params.bowtie2_params
            if seg_mapping:
                # after intensive testing,
                # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
                # but a chance for further improvements remains.
                bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
                bowtie_cmd += ["-i", "C,2,0"]
                bowtie_cmd += ["-L", "20"]
                # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
            else:
                # presets take precedence over the individual tuning knobs
                bowtie2_preset = ""
                if bowtie2_params.very_fast:
                    bowtie2_preset = "--very-fast"
                elif bowtie2_params.fast:
                    bowtie2_preset = "--fast"
                elif bowtie2_params.sensitive:
                    bowtie2_preset = "--sensitive"
                elif bowtie2_params.very_sensitive:
                    bowtie2_preset = "--very-sensitive"

                if bowtie2_preset != "":
                    bowtie_cmd += [bowtie2_preset]
                else:
                    bowtie_cmd += ["-D", str(bowtie2_params.D),
                                   "-R", str(bowtie2_params.R),
                                   "-N", str(bowtie2_params.N),
                                   "-L", str(bowtie2_params.L),
                                   "-i", bowtie2_params.i]

                # "--n-ceil" is not correctly parsed in Bowtie2,
                # I (daehwan) already talked to Ben who will fix the problem.
                bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
                               "--gbar", str(bowtie2_params.gbar),
                               "--mp", bowtie2_params.mp,
                               "--np", str(bowtie2_params.np),
                               "--rdg", bowtie2_params.rdg,
                               "--rfg", bowtie2_params.rfg,
                               "--score-min", bowtie2_params.score_min]

        else:
            bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                           "-k", str(max_hits),
                           "-m", str(max_hits),
                           "-S"]

        bowtie_cmd += ["-p", str(params.system_params.num_threads)]

        if params.bowtie2: #always use headerless SAM file
            bowtie_cmd += ["--sam-no-hd"]
        else:
            bowtie_cmd += ["--sam-nohead"]

        if not params.bowtie2:
            if multihits_out:
                bowtie_cmd += ["--max", multihits_out]
            else:
                bowtie_cmd += ["--max", "/dev/null"]

        if params.bowtie2:
            bowtie_cmd += ["-x"]

        bowtie_cmd += [ bwt_idx_prefix ]
        bowtie_proc=None
        shellcmd=""
        unzip_proc=None
        samzip_proc=None
        if multihits_out:
            #special prefilter bowtie run: we use prep_reads on the fly
            #in order to get multi-mapped reads to exclude later
            prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
            preplog_fname=logging_dir + "prep_reads_prefilter.log"
            prepfilter_log = open(preplog_fname,"w")
            unzip_proc = subprocess.Popen(prep_cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=prepfilter_log)
            shellcmd=' '.join(prep_cmd) + "|"
        else:
            z_input=use_zpacker and reads_file.endswith(".z")
            if z_input:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_file, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
            else:
                #must be uncompressed fastq input (unmapped reads from a previous run)
                #or a BAM file with unmapped reads
                if bam_input:
                    unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                    shellcmd=' '.join(unzip_cmd) + "|"
                # NOTE(review): when bam_input is true, bowtie is launched here
                # with reads_file as an argument AND re-launched below reading
                # from unzip_proc's pipe; this looks unintended (an else branch
                # around the two statements below?) -- verify against upstream
                bowtie_cmd += [reads_file]
                bowtie_proc = subprocess.Popen(bowtie_cmd,
                                               stdout=subprocess.PIPE,
                                               stderr=bwt_log)
        if unzip_proc:
            #input is compressed OR prep_reads is used as a filter
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678

        shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
        if samzip_cmd:
            shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close() # needed?
            samzip_proc = subprocess.Popen(samzip_cmd,
                                           preexec_fn=subprocess_setup,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            #write BAM output directly
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            # stdout=open(mapped_reads, "w"))
            bowtie_proc.stdout.close()

        # shellcmd += " > " + mapped_reads
        print >> run_log, shellcmd
        # wait for the tail of the pipeline to finish
        if samzip_proc:
            samzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            # no unmapped output was produced: reap the idle zipper child
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass

    except OSError, o:
        die(fail_str+"Error: "+str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    # downstream code expects the multihits file to exist even when empty
    if multihits_out and not os.path.exists(multihits_out):
        open(multihits_out, "w").close()

    # restore the params we temporarily changed for segment mapping
    if seg_mapping:
        params.max_hits /= 2
        if not params.bowtie2:
            params.bowtie_alignment_option = backup_bowtie_alignment_option

    return (mapped_reads, unmapped_reads_out)
2096 gpertea 29
2097    
2098     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2099     def get_gtf_juncs(gff_annotation):
2100 gpertea 135 th_log("Reading known junctions from GTF file")
2101 gpertea 29 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2102    
2103     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2104    
2105     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2106     gtf_juncs_out = open(gtf_juncs_out_name, "w")
2107    
2108     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2109     try:
2110 gpertea 135 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2111 gpertea 29 retcode = subprocess.call(gtf_juncs_cmd,
2112     stderr=gtf_juncs_log,
2113     stdout=gtf_juncs_out)
2114     # cvg_islands returned an error
2115     if retcode == 1:
2116     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
2117     return (False, gtf_juncs_out_name)
2118     elif retcode != 0:
2119     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2120 gpertea 154
2121 gpertea 29 # cvg_islands not found
2122     except OSError, o:
2123     errmsg=fail_str+str(o)+"\n"
2124     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2125     errmsg+="Error: gtf_juncs not found on this system"
2126     die(errmsg)
2127     return (True, gtf_juncs_out_name)
2128    
2129     # Call bowtie-build on the FASTA file of sythetic splice junction sequences
2130 gpertea 154 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2131 gpertea 135 th_log("Indexing splices")
2132 gpertea 29 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2133    
2134     #user_splices_out_prefix = output_dir + "user_splices_idx"
2135    
2136 gpertea 154 if is_bowtie2:
2137     bowtie_build_cmd = [prog_path("bowtie2-build")]
2138     else:
2139     bowtie_build_cmd = [prog_path("bowtie-build")]
2140    
2141 gpertea 29 if color:
2142     bowtie_build_cmd += ["-C"]
2143    
2144     bowtie_build_cmd += [external_splice_prefix + ".fa",
2145     external_splice_prefix]
2146     try:
2147     print >> run_log, " ".join(bowtie_build_cmd)
2148     retcode = subprocess.call(bowtie_build_cmd,
2149     stdout=bowtie_build_log)
2150    
2151     if retcode != 0:
2152     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2153     except OSError, o:
2154     errmsg=fail_str+str(o)+"\n"
2155     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2156     errmsg+="Error: bowtie-build not found on this system"
2157     die(errmsg)
2158     return external_splice_prefix
2159    
2160     # Build a splice index from a .juncs file, suitable for use with specified read
2161     # (or read segment) lengths
2162 gpertea 154 def build_juncs_index(is_bowtie2,
2163     min_anchor_length,
2164 gpertea 135 max_seg_len,
2165 gpertea 29 juncs_prefix,
2166     external_juncs,
2167     external_insertions,
2168     external_deletions,
2169 gpertea 154 external_fusions,
2170 gpertea 29 reference_fasta,
2171     color):
2172 gpertea 135 th_log("Retrieving sequences for splices")
2173 gpertea 29
2174     juncs_file_list = ",".join(external_juncs)
2175     insertions_file_list = ",".join(external_insertions)
2176     deletions_file_list = ",".join(external_deletions)
2177 gpertea 154 fusions_file_list = ",".join(external_fusions)
2178 gpertea 29
2179     juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2180    
2181     external_splices_out_prefix = tmp_dir + juncs_prefix
2182     external_splices_out_name = external_splices_out_prefix + ".fa"
2183    
2184     external_splices_out = open(external_splices_out_name, "w")
2185     # juncs_db_cmd = [bin_dir + "juncs_db",
2186     juncs_db_cmd = [prog_path("juncs_db"),
2187     str(min_anchor_length),
2188 gpertea 135 str(max_seg_len),
2189 gpertea 29 juncs_file_list,
2190     insertions_file_list,
2191     deletions_file_list,
2192 gpertea 154 fusions_file_list,
2193 gpertea 29 reference_fasta]
2194     try:
2195     print >> run_log, " ".join(juncs_db_cmd)
2196     retcode = subprocess.call(juncs_db_cmd,
2197     stderr=juncs_db_log,
2198     stdout=external_splices_out)
2199    
2200     if retcode != 0:
2201     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2202     # juncs_db not found
2203     except OSError, o:
2204     errmsg=fail_str+str(o)+"\n"
2205     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2206     errmsg+="Error: juncs_db not found on this system"
2207     die(errmsg)
2208    
2209 gpertea 154 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2210 gpertea 29 return external_splices_out_prefix
2211    
2212 gpertea 154 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2213 gpertea 135 """ Build a bowtie index from a FASTA file.
2214    
2215     Arguments:
2216     - `fasta_fname`: File path to FASTA file.
2217     - `out_dir`: Output directory to place index in. (includes os.sep)
2218    
2219     Returns:
2220     - The path to the Bowtie index.
2221     """
2222     bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2223 gpertea 154
2224     if is_bowtie2:
2225     bowtie_idx_cmd = [prog_path("bowtie2-build")]
2226     else:
2227     bowtie_idx_cmd = [prog_path("bowtie-build")]
2228    
2229     if color:
2230     bowtie_idx_cmd += ["-C"]
2231    
2232     bowtie_idx_cmd += [fasta_fname,
2233     bwt_idx_path]
2234 gpertea 135 try:
2235     th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2236     print >> run_log, " ".join(bowtie_idx_cmd)
2237     retcode = subprocess.call(bowtie_idx_cmd,
2238     stdout=open(os.devnull, "w"),
2239     stderr=open(os.devnull, "w"))
2240     if retcode != 0:
2241     die(fail_str + "Error: Couldn't build bowtie index with err = "
2242     + str(retcode))
2243     except OSError, o:
2244     errmsg=fail_str+str(o)+"\n"
2245     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2246     errmsg+="Error: bowtie-build not found on this system"
2247     die(errmsg)
2248    
2249     return bwt_idx_path
2250    
# Print out the sam header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    # @RG is only emitted when both a read group id and a sample id are set
    if read_params.read_group_id and read_params.sample_id:
        rg_parts = ["@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                           read_params.sample_id)]
        # optional @RG tags, appended in fixed order when present
        for tag, value in [("LB", read_params.library_id),
                           ("DS", read_params.description),
                           ("PU", read_params.seq_platform_unit),
                           ("CN", read_params.seq_center),
                           ("PI", read_params.mate_inner_dist),
                           ("DT", read_params.seq_run_date),
                           ("PL", read_params.seq_platform)]:
            if value:
                rg_parts.append("\t%s:%s" % (tag, value))
        print >> sam_file, "".join(rg_parts)
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
2275    
# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
    """Produce the final output tracks (junctions/insertions/deletions BED,
    fusions.out and accepted_hits) by running tophat_reports, then sorting
    and merging the per-thread BAM parts with samtools.

    Returns the path of the junctions BED file.
    NOTE(review): gff_annotation is currently unused in this function.
    """
    th_log("Reporting output tracks")
    left_maps, right_maps = mappings
    left_reads, right_reads = readfiles
    # keep only map files that exist and are non-trivially sized
    left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)
    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    fusions = output_dir + "fusions.out"
    report_cmd = [report_cmdpath]

    # tophat_reports writes its alignment output under tmp_dir first
    alignments_output_filename = tmp_dir + "accepted_hits"

    report_cmd.extend(params.cmd())
    report_cmd.extend(["--samtools="+samtools_path])
    report_cmd.extend([ref_fasta,
                       junctions,
                       insertions,
                       deletions,
                       fusions,
                       alignments_output_filename,
                       left_maps,
                       left_reads])

    if len(right_maps) > 0 and right_reads:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)

    # -- tophat_reports now produces (uncompressed) BAM stream,
    try:
        if params.report_params.convert_bam:
            if params.report_params.sort_bam:
                print >> run_log, " ".join(report_cmd)
                # (despite the name, subprocess.call returns an exit status)
                report_proc=subprocess.call(report_cmd,
                                            preexec_fn=subprocess_setup,
                                            stderr=report_log)

                # collect the per-thread BAM parts (<prefix>0.bam, <prefix>1.bam, ...)
                # that tophat_reports is expected to have written
                bam_parts = []
                for i in range(params.system_params.num_threads):
                    bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
                    if os.path.exists(bam_part_filename):
                        bam_parts.append(bam_part_filename)
                    else:
                        break

                num_bam_parts = len(bam_parts)
                pids = [0 for i in range(num_bam_parts)]
                sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                # sort all parts in parallel, one forked child per part
                for i in range(num_bam_parts):
                    bamsort_cmd = [samtools_path,
                                   "sort",
                                   bam_parts[i],
                                   sorted_bam_parts[i]]

                    sorted_bam_parts[i] += ".bam"
                    print >> run_log, " ".join(bamsort_cmd)
                    pid = os.fork()
                    if pid == 0:
                        subprocess.call(bamsort_cmd,
                                        stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
                        os._exit(os.EX_OK)
                    else:
                        pids[i] = pid

                # wait for every sorting child to finish
                for i in range(len(pids)):
                    if pids[i] > 0:
                        os.waitpid(pids[i], 0)
                        pids[i] = 0

                # for bam_part in bam_parts:
                #     os.remove(bam_part)

                if num_bam_parts > 1:
                    # merge the sorted parts under the prepared SAM header
                    bammerge_cmd = [samtools_path,
                                    "merge",
                                    "-h",
                                    sam_header_filename,
                                    "%s.bam" % accepted_hits]
                    bammerge_cmd += sorted_bam_parts

                    print >> run_log, " ".join(bammerge_cmd)
                    subprocess.call(bammerge_cmd,
                                    stderr=open(logging_dir + "reports.samtools_merge.log", "w"))

                    for sorted_bam_part in sorted_bam_parts:
                        os.remove(sorted_bam_part)
                else:
                    # a single part needs no merge -- just move it into place
                    os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")

            else:
                # BAM output without sorting
                print >> run_log, " ".join(report_cmd)
                report_proc=subprocess.call(report_cmd,
                                            preexec_fn=subprocess_setup,
                                            stdout=open(accepted_hits,"wb"),
                                            stderr=report_log)
                os.rename(accepted_hits, output_dir + "accepted_hits.bam")
        else:
            # SAM output requested: capture the BAM stream, then convert it
            print >> run_log, " ".join(report_cmd)
            report_proc=subprocess.call(report_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdout=open(accepted_hits,"wb"),
                                        stderr=report_log)
            tmp_sam = output_dir + "accepted_hits.sam"

            bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
            print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
            bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
            tmp_sam_file = open(tmp_sam, "w")
            ret = subprocess.call(bam_to_sam_cmd,
                                  stdout=tmp_sam_file,
                                  stderr=bam_to_sam_log)
            tmp_sam_file.close()
            os.remove(accepted_hits)

    except OSError, o:
        die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    return junctions
2406 gpertea 29
2407    
2408     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2409     # for each segment This function needs to be fixed to support mixed read length
2410     # inputs
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    """Append ZWriter handles for segment files num_files_prev+1..num_files
    (named <prefix>_seg<i><extension>) to the out_segf list."""
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        seg_fname = prefix + ("_seg%d" % seg_idx) + extension
        out_segf.append(ZWriter(seg_fname, params.system_params))
2417    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split every read into consecutive segments of ~segment_length bases.

    One output file is written per segment index, named
    prefix + "_seg<i>" + extension; files are opened lazily (via
    open_output_files) the first time a read long enough to need segment i
    is seen.  Color-space reads (params.read_params.color) get special
    handling: the primer base is decoded and reads with an invalid color
    call at a segment boundary are dropped entirely.

    Returns the list of segment file names written.
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    # pick the output extension from the input format (plus compression)
    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"

    def convert_color_to_bp(color_seq):
        # Decode a color-space sequence (leading primer base followed by
        # color digits) into nucleotides; unknown/invalid colors become 'N'.
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # Inverse transform: encode a nucleotide sequence as color digits
        # ('4' marks any transition involving 'N' or '.').
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # Write one read, cut at 'offsets', to the per-segment files.
        # Segment i goes to out_segf[i]; its name line encodes
        # "<name>|<offset>:<segment_index>:<total_segments>".
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # drop reads that have an invalid color call at any
            # segment boundary
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # prepend the decoded base at this segment's start position
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # simple line-state parser: 2 states for FASTA, 4 for FASTQ records
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            # NOTE: Python 2 integer division intended here
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # otherwise fold the remainder into the last segment
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # open more output files if this read needs more segments
            # than any read seen so far
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            line = line.strip()  # FASTQ '+' separator line; ignored
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
2550    
2551     # Find possible splice junctions using the "closure search" strategy, and report
2552     # them in closures.juncs. Calls the executable closure_juncs
2553     def junctions_from_closures(params,
2554     left_maps,
2555     right_maps,
2556     ref_fasta):
2557 gpertea 135 th_log("Searching for junctions via mate-pair closures")
2558 gpertea 29
2559 gpertea 154
2560 gpertea 29 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2561     #if len(maps) == 0:
2562     # return None
2563     #print >> sys.stderr, left_maps
2564     slash = left_maps[0].rfind('/')
2565     juncs_out = ""
2566     if slash != -1:
2567     juncs_out += left_maps[0][:slash+1]
2568 gpertea 154 fusions_out = juncs_out
2569    
2570 gpertea 29 juncs_out += "closure.juncs"
2571 gpertea 154 fusions_out += "closure.fusions"
2572 gpertea 29
2573     juncs_log = open(logging_dir + "closure.log", "w")
2574     juncs_cmdpath=prog_path("closure_juncs")
2575     juncs_cmd = [juncs_cmdpath]
2576    
2577     left_maps = ','.join(left_maps)
2578     right_maps = ','.join(right_maps)
2579    
2580     juncs_cmd.extend(params.cmd())
2581     juncs_cmd.extend([juncs_out,
2582 gpertea 154 fusions_out,
2583 gpertea 29 ref_fasta,
2584     left_maps,
2585     right_maps])
2586     try:
2587     print >> run_log, ' '.join(juncs_cmd)
2588     retcode = subprocess.call(juncs_cmd,
2589     stderr=juncs_log)
2590    
2591     # spanning_reads returned an error
2592     if retcode != 0:
2593     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2594     # cvg_islands not found
2595     except OSError, o:
2596     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2597     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2598     die(str(o))
2599     return [juncs_out]
2600    
# Find possible junctions by examining coverage and split segments in the initial
# map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
# segment.insertions, and segment.deletions. Calls the executable
# segment_juncs
def junctions_from_segments(params,
                            left_reads,
                            left_reads_map,
                            left_seg_maps,
                            right_reads,
                            right_reads_map,
                            right_seg_maps,
                            unmapped_reads,
                            reads_format,
                            ref_fasta):
    """Run segment_juncs; return paths [juncs, insertions, deletions, fusions].

    Dies (with the log tail) if segment_juncs exits non-zero or cannot
    be found.
    """
    # only announce the search when the reads were actually split into
    # segments (segment map differs from the initial full-read map)
    if left_reads_map != left_seg_maps[0]:
        th_log("Searching for junctions via segment mapping")
    out_path=getFileDir(left_seg_maps[0])
    juncs_out=out_path+"segment.juncs"
    insertions_out=out_path+"segment.insertions"
    deletions_out =out_path+"segment.deletions"
    fusions_out = out_path+"segment.fusions"

    left_maps = ','.join(left_seg_maps)
    log_fname = logging_dir + "segment_juncs.log"
    segj_log = open(log_fname, "w")
    segj_cmd = [prog_path("segment_juncs")]

    segj_cmd.extend(params.cmd())
    segj_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
                     ref_fasta,
                     juncs_out,
                     insertions_out,
                     deletions_out,
                     fusions_out,
                     left_reads,
                     left_reads_map,
                     left_maps])
    # right-side arguments are appended only for paired-end runs
    if right_seg_maps:
        right_maps = ','.join(right_seg_maps)
        segj_cmd.extend([right_reads, right_reads_map, right_maps])
    try:
        print >> run_log, " ".join(segj_cmd)
        retcode = subprocess.call(segj_cmd,
                                  preexec_fn=subprocess_setup,
                                  stderr=segj_log)

        # segment_juncs returned an error
        if retcode != 0:
            die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))

    # segment_juncs executable not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
        die(str(o))

    return [juncs_out, insertions_out, deletions_out, fusions_out]
2658 gpertea 29
2659     # Joins mapped segments into full-length read alignments via the executable
2660     # long_spanning_reads
2661     def join_mapped_segments(params,
2662     sam_header_filename,
2663     reads,
2664     ref_fasta,
2665     possible_juncs,
2666     possible_insertions,
2667     possible_deletions,
2668 gpertea 154 possible_fusions,
2669 gpertea 29 contig_seg_maps,
2670     spliced_seg_maps,
2671     alignments_out_name):
2672 gpertea 135 rn=""
2673 gpertea 29 if len(contig_seg_maps)>1:
2674 gpertea 135 th_log("Joining segment hits")
2675     rn=".segs"
2676 gpertea 29 else:
2677 gpertea 135 th_log("Processing bowtie hits")
2678 gpertea 29 contig_seg_maps = ','.join(contig_seg_maps)
2679    
2680     possible_juncs = ','.join(possible_juncs)
2681     possible_insertions = ",".join(possible_insertions)
2682     possible_deletions = ",".join(possible_deletions)
2683 gpertea 154 possible_fusions = ",".join(possible_fusions)
2684    
2685 gpertea 135 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2686     align_log = open(log_fname, "w")
2687 gpertea 29 align_cmd = [prog_path("long_spanning_reads")]
2688    
2689     align_cmd.extend(params.cmd())
2690    
2691     align_cmd.append(ref_fasta)
2692 gpertea 154 align_cmd.extend([reads,
2693     possible_juncs,
2694     possible_insertions,
2695     possible_deletions,
2696     possible_fusions,
2697     alignments_out_name,
2698     contig_seg_maps])
2699 gpertea 29
2700 gpertea 135 if spliced_seg_maps:
2701 gpertea 29 spliced_seg_maps = ','.join(spliced_seg_maps)
2702     align_cmd.append(spliced_seg_maps)
2703    
2704     try:
2705 gpertea 154 print >> run_log, " ".join(align_cmd)
2706     join_proc=subprocess.call(align_cmd,
2707     stderr=align_log)
2708    
2709 gpertea 29 except OSError, o:
2710     die(fail_str+"Error: "+str(o))
2711    
# This class collects spliced and unspliced alignments for each of the
# left and right read files provided by the user.
class Maps:
    """Per-side bundle of mapping products.

    Attributes:
    - unspliced_sam: initial (unspliced) alignment file
    - seg_maps: per-segment alignment files
    - unmapped_segs: per-segment unmapped-read files
    - segs: the segment read files themselves
    """
    def __init__(self,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        (self.unspliced_sam,
         self.seg_maps,
         self.unmapped_segs,
         self.segs) = (unspliced_sam, seg_maps, unmapped_segs, segs)
2724    
2725 gpertea 135 # Map2GTF stuff
2726     def m2g_convert_coords(params, gtf_fname, reads, out_fname):
2727     """ajjkljlks
2728    
2729     Arguments:
2730     - `params`: TopHat parameters
2731     - `gtf_fname`: File name pointing to the annotation.
2732     - `reads`: The reads to convert coords (in Bowtie format).
2733     - `out_fname`: The file name pointing to the output.
2734     """
2735     m2g_cmd = [prog_path("map2gtf")]
2736     m2g_cmd.extend(params.cmd())
2737     m2g_cmd.append(gtf_fname)
2738     m2g_cmd.append(reads) #could be BAM file
2739     m2g_cmd.append(out_fname)
2740     fbasename = getFileBaseName(reads)
2741     m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2742     m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2743    
2744     try:
2745     th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2746     print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2747     ret = subprocess.call(m2g_cmd,
2748     stdout=open(m2g_log, "w"),
2749     stderr=open(m2g_err, "w"))
2750     if ret != 0:
2751     die(fail_str + " Error: map2gtf returned an error")
2752     except OSError, o:
2753     err_msg = fail_str + str(o)
2754     die(err_msg + "\n")
2755    
2756    
2757     def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2758     """ Make the transcriptome from a GTF.
2759    
2760     Arguments:
2761     - `trans_gtf`:
2762     - `genome`:
2763     - `out_fname`:
2764     """
2765     # TODO: Call gtf_to_fasta
2766     g2f_cmd = [prog_path("gtf_to_fasta")]
2767     g2f_cmd.extend(params.cmd())
2768     g2f_cmd.append(trans_gtf)
2769     g2f_cmd.append(genome)
2770     g2f_cmd.append(out_fname)
2771    
2772     g2f_log = logging_dir + "g2f.out"
2773     g2f_err = logging_dir + "g2f.err"
2774    
2775     try:
2776     print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2777     ret = subprocess.call(g2f_cmd,
2778     stdout = open(g2f_log, "w"),
2779     stderr = open(g2f_err, "w"))
2780     if ret != 0:
2781     die(fail_str + " Error: gtf_to_fasta returned an error.")
2782     except OSError, o:
2783     err_msg = fail_str + str(o)
2784     die(err_msg + "\n")
2785    
def map2gtf(params, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function.

    Builds (or reuses) a transcriptome Bowtie index, maps the reads
    against it, then converts the hits back to genomic coordinates.

    Arguments:
    - `params`: The TopHat parameters.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).

    Returns (sam_gtf_list, unmapped_gtf_list); each list is padded with
    None for the missing side of a single-end run.
    """
    test_input_file(params.gff_annotation)

    # th_log("Reading in GTF file: " + params.gff_annotation)
    # transcripts = gtf_to_transcripts(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    m2g_bwt_idx = None
    t_out_dir = tmp_dir
    # reuse a pre-built transcriptome index when one was supplied and
    # no output directory for new transcriptome files was requested
    if params.transcriptome_index and not params.transcriptome_outdir:
        m2g_bwt_idx = params.transcriptome_index
        th_log("Using pre-built transcriptome index..")
    else:
        th_log("Creating transcriptome data files..")
        if params.transcriptome_outdir:
            t_out_dir=params.transcriptome_outdir+"/"
        m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        # trans_hash = transcripts_to_fasta(transcripts,
        #                                   ref_fasta,
        #                                   m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF coordinates
    for reads in [left_reads, right_reads]:
        # skip a missing side or an effectively empty reads file
        if reads == None or os.path.getsize(reads) < 25 :
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        #if use_zpacker:
        #    mapped_gtf_out+=".z"

        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
        #if use_BWT_FIFO:
        #    unmapped_gtf += ".z"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", "read_against_transcriptome")
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # lift each side's transcriptome hits back to genomic coordinates
    sam_gtf_list = []
    for reads in mapped_gtf_list:
        fbasename = getFileBaseName(reads)
        sam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           params.gff_annotation,
                           reads,
                           sam_out_fname)
        sam_gtf_list.append(sam_out_fname)

    # pad single-end results so callers can always unpack two entries
    if len(sam_gtf_list) < 2:
        sam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (sam_gtf_list, unmapped_gtf_list)
2858     # end Map2GTF
2859    
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    """Recover mappings and unmapped reads from the prefilter stage.

    ri selects the read side (0=left, 1=right) in params.preflt_data.
    Writes the filtered mappings to out_mappings and the (optionally
    zpacker-compressed) unmapped reads to out_unmapped; returns the pair
    (out_mappings, out_unmapped).
    """
    ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
    ##
    #this is accomplished by a special prep_reads usage (triggered by --flt-hits)

    prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                            out_mappings,
                            target_reads,
                            params.preflt_data[ri].mappings)
    um_reads = open(out_unmapped, "wb")
    log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
    filter_log = open(log_fname,"w")

    # the shell_cmd string is only for logging; execution uses Popen/call
    shell_cmd = " ".join(prep_cmd)
    #add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + out_unmapped
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            # pipe prep_reads stdout through the compressor into um_reads
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stdout=um_reads)
            prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # report the first failing stage of the pipeline
            retcode=prep_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(prep_cmd,
                                      stdout=um_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    um_reads.close()

    return (out_mappings, out_unmapped)
2911    
2912    
# The main aligment routine of TopHat. This function executes most of the
# workflow producing a set of candidate alignments for each cDNA fragment in a
# pair of SAM alignment files (for paired end reads).
def spliced_alignment(params,
                      bwt_idx_prefix,
                      sam_header_filename,
                      ref_fasta,
                      read_len,
                      segment_len,
                      prepared_reads,
                      user_supplied_junctions,
                      user_supplied_insertions,
                      user_supplied_deletions):
    """Drive the spliced-alignment workflow for one read-pair set.

    Stages: optional transcriptome (Map2GTF) mapping, initial genomic
    Bowtie mapping, segment splitting and mapping, junction discovery,
    junction-index mapping, and the final join/merge into one BAM per
    side.  Returns maps, a 2-element list (left/right) whose entries are
    lists of final alignment file names.
    """
    possible_juncs = []
    possible_juncs.extend(user_supplied_junctions)

    possible_insertions = []
    possible_insertions.extend(user_supplied_insertions)
    possible_deletions = []
    possible_deletions.extend(user_supplied_deletions)
    possible_fusions = []

    left_reads, right_reads = prepared_reads

    maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data

    #single_segments = False

    # Before anything, map the reads using Map2GTF (if using annotation)
    m2g_maps = [ None, None ] # left, right
    initial_reads = [ left_reads, right_reads ]

    if params.gff_annotation:
        (mapped_gtf_list, unmapped_gtf_list) = \
            map2gtf(params, ref_fasta, left_reads, right_reads)

        m2g_left_maps, m2g_right_maps = mapped_gtf_list
        m2g_maps = [m2g_left_maps, m2g_right_maps]
        if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
            # The case where the user doesn't want to map to anything other
            # than the transcriptome OR we have no unmapped reads
            maps[0] = [m2g_left_maps]
            if right_reads:
                maps[1] = [m2g_right_maps]
            return maps
        # Feed the unmapped reads into spliced_alignment()
        initial_reads = unmapped_gtf_list[:]
        th_log("Resuming TopHat pipeline with unmapped reads")

    max_seg_len = segment_len #this is the ref seq span on either side of the junctions
                              #to be extracted into segment_juncs.fa

    num_segs = int(read_len / segment_len)
    if (read_len % segment_len) >= min(segment_len-2, 20):
        #remainder is shorter but long enough to become a new segment
        num_segs += 1
    else:
        # the last segment is longer
        if num_segs>1: max_seg_len += (read_len % segment_len)

    if num_segs <= 1:
        print >> sys.stderr, "Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"

    # Using the num_segs value returned by check_reads(),
    # decide which junction discovery strategy to use
    if num_segs < 3:
        #if params.butterfly_search != False:
        #   params.butterfly_search = True
        if params.coverage_search != False:
            params.coverage_search = True
        if num_segs == 1:
            segment_len = read_len
    else: #num_segs >= 3:
        # if we have at least three segments, just use split segment search,
        # which is the most sensitive and specific, fastest, and lightest-weight.
        # so unless specifically requested, disable the other junction searches
        if params.closure_search != True:
            params.closure_search = False
        if params.coverage_search != True:
            params.coverage_search = False
        if params.butterfly_search != True:
            params.butterfly_search = False

    # Perform the first part of the TopHat work flow on the left and right
    # reads of paired ends separately - we'll use the pairing information later
    have_left_IUM=False
    for ri in (0,1):
        reads=initial_reads[ri]
        if reads == None or not fileExists(reads,25):
            continue
        fbasename=getFileBaseName(reads)
        unspliced_out = tmp_dir + fbasename + ".mapped"
        unspliced_sam = None
        unmapped_reads = None
        #if use_zpacker: unspliced_out+=".z"
        unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
        if params.prefilter_multi:
            #unmapped_unspliced += ".z"
            # reuse the prefilter stage's mappings instead of re-mapping
            (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
        else:
            # Perform the initial Bowtie mapping of the full length reads
            (unspliced_sam, unmapped_reads) = bowtie(params,
                                                     bwt_idx_prefix,
                                                     [reads],
                                                     "fastq",
                                                     params.genome_read_mismatches,
                                                     unspliced_out,
                                                     unmapped_unspliced,
                                                     "",
                                                     "read_against_genome")

        # daehwan - check this out
        # params.bowtie2 = False
        # check_bowtie(params.bowtie2)

        seg_maps = []
        unmapped_segs = []
        segs = []
        # IUM = "initially unmapped" reads; only split/search if any exist
        have_IUM = fileExists(unmapped_reads,25)
        if ri==0 and have_IUM:
            have_left_IUM = True
        if num_segs > 1 and have_IUM:
            # split up the IUM reads into segments
            # unmapped_reads can be in BAM format
            read_segments = split_reads(unmapped_reads,
                                        tmp_dir + fbasename,
                                        False,
                                        params,
                                        segment_len)

            # Map each segment file independently with Bowtie
            for i in range(len(read_segments)):
                seg = read_segments[i]
                fbasename=getFileBaseName(seg)
                seg_out = tmp_dir + fbasename + ".bwtout"
                if use_zpacker: seg_out += ".z"
                unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
                if use_BWT_FIFO:
                    unmapped_seg += ".z"
                extra_output = "(%d/%d)" % (i+1, len(read_segments))
                (seg_map, unmapped) = bowtie(params,
                                             bwt_idx_prefix,
                                             [seg],
                                             "fastq",
                                             params.segment_mismatches,
                                             seg_out,
                                             unmapped_seg,
                                             extra_output,
                                             "seg_against_genome")
                seg_maps.append(seg_map)
                unmapped_segs.append(unmapped)
                segs.append(seg)

            # Collect the segment maps for left and right reads together
            maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
        else:
            # if there's only one segment, just collect the initial map as the only
            # map to be used downstream for coverage-based junction discovery
            read_segments = [reads]
            maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])

    # XXX: At this point if using M2G, have three sets of reads:
    # mapped to transcriptome, mapped to genome, and unmapped (potentially
    # spliced or poly-A tails) - hp

    # Unless the user asked not to discover new junctions, start that process
    # here
    left_reads_map = maps[0].unspliced_sam
    left_seg_maps = maps[0].seg_maps
    unmapped_reads = maps[0].unmapped_segs
    if right_reads:
        right_reads_map = maps[1].unspliced_sam
        right_seg_maps = maps[1].seg_maps
        unmapped_reads.extend(maps[1].unmapped_segs)
    else:
        right_reads_map = None
        right_seg_maps = None

    if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
        # Call segment_juncs to infer a list of possible splice junctions from
        # the regions of the genome covered in the initial and segment maps
        #if params.find_novel_juncs:
        #TODO: in m2g case, we might want to pass the m2g mappings as well,
        # or perhaps the GTF file directly
        # -> this could improve alternative junction detection?
        juncs = junctions_from_segments(params,
                                        left_reads,
                                        left_reads_map,
                                        left_seg_maps,
                                        right_reads,
                                        right_reads_map,
                                        right_seg_maps,
                                        unmapped_reads,
                                        "fastq",
                                        ref_fasta)
        # only keep the candidate databases that are non-empty
        if os.path.getsize(juncs[0]) != 0:
            possible_juncs.append(juncs[0])
        if params.find_novel_indels:
            if os.path.getsize(juncs[1]) != 0:
                possible_insertions.append(juncs[1])
            if os.path.getsize(juncs[2]) != 0:
                possible_deletions.append(juncs[2])
        if params.find_novel_fusions:
            if os.path.getsize(juncs[3]) != 0:
                possible_fusions.append(juncs[3])
        # Optionally, and for paired reads only, use a closure search to
        # discover additional junctions
        if params.closure_search and left_reads and right_reads:
            # NOTE(review): maps[initial_reads[left_reads]] indexes the list
            # 'maps' with a *filename*; this looks like it should be maps[0]
            # and maps[1] — confirm before enabling closure search.
            juncs = junctions_from_closures(params,
                                            [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
                                            [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
                                            ref_fasta)
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.extend(juncs)

    if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
        spliced_seg_maps = None
        junc_idx_prefix = None
    else:
        junc_idx_prefix = "segment_juncs"
        # substitute /dev/null for any empty candidate database so the
        # downstream tools always receive four file arguments
        if len(possible_insertions) == 0:
            possible_insertions.append(os.devnull)
            # print >> sys.stderr, "Warning: insertions database is empty!"
        if len(possible_deletions) == 0:
            possible_deletions.append(os.devnull)
            # print >> sys.stderr, "Warning: deletions database is empty!"
        if len(possible_juncs) == 0:
            possible_juncs.append(os.devnull)
            print >> sys.stderr, "Warning: junction database is empty!"
        if len(possible_fusions) == 0:
            possible_fusions.append(os.devnull)
    if junc_idx_prefix:
        # build a Bowtie index over the candidate junction/indel/fusion spans
        juncs_bwt_idx = build_juncs_index(params.bowtie2,
                                          3,
                                          max_seg_len,
                                          junc_idx_prefix,
                                          possible_juncs,
                                          possible_insertions,
                                          possible_deletions,
                                          possible_fusions,
                                          ref_fasta,
                                          params.read_params.color)

        juncs_bwt_samheader = get_index_sam_header(params.bowtie2, params.read_params, juncs_bwt_idx, False)

    # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
    # index with Bowtie
    # for reads in [left_reads, right_reads]:
    for ri in (0,1):
        reads = initial_reads[ri]
        spliced_seg_maps = []

        if reads == None or os.path.getsize(reads)<25:
            continue

        m2g_map = m2g_maps[ri]

        if junc_idx_prefix :
            i = 0
            for seg in maps[ri].segs:
                #search each segment
                fsegname = getFileBaseName(seg)
                seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
                if use_zpacker: seg_out += ".z"
                extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
                (seg_map, unmapped) = bowtie(params,
                                             tmp_dir + junc_idx_prefix,
                                             [seg],
                                             "fastq",
                                             params.segment_mismatches,
                                             seg_out,
                                             None,
                                             extra_output,
                                             "seg_against_junctions")
                spliced_seg_maps.append(seg_map)
                i += 1

        # Join the contigous and spliced segment hits into full length
        # read alignments
        rfname=getFileBaseName(reads)
        rfdir=getFileDir(reads)
        mapped_reads = rfdir + rfname + ".candidates.bam"
        # -- spliced mappings built from all segment mappings vs genome and junc_db
        merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
        #unspl_bwtfile = maps[ri].unspliced_bwt
        unspl_samfile = maps[ri].unspliced_sam
        join_mapped_segments(params,
                             sam_header_filename,
                             reads,
                             ref_fasta,
                             possible_juncs,
                             possible_insertions,
                             possible_deletions,
                             possible_fusions,
                             maps[ri].seg_maps,
                             spliced_seg_maps,
                             mapped_reads)

        if num_segs > 1 or m2g_map:
            # Merge the spliced and unspliced full length alignments into
            # a single SAM file.
            # The individual SAM files are all already sorted in
            # increasing read ID order.
            # NOTE: We also should be able to address bug #134 here, by replacing
            # contiguous alignments that poke into an intron by a small amount by
            # the correct spliced alignment.

            try:
                merge_cmd = [ prog_path("bam_merge"), "--index-outfile", merged_map + ".index",
                              merged_map ]

                if num_segs > 1:
                    merge_cmd += [ maps[ri].unspliced_sam ]
                if m2g_map:
                    merge_cmd += [ m2g_map ]

                # join_mapped_segments may have produced either a single BAM
                # or one numbered BAM per worker thread
                if os.path.exists(mapped_reads):
                    merge_cmd += [ mapped_reads ]
                else:
                    for bam_i in range(0, params.system_params.num_threads):
                        temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
                        if os.path.exists(temp_bam):
                            merge_cmd += [ temp_bam ]
                        else:
                            break

                print >> run_log, " ".join(merge_cmd)
                ret = subprocess.call( merge_cmd,
                                       stderr=open(logging_dir + "sam_merge.log", "w") )
                if ret != 0:
                    die(fail_str+"Error executing: "+" ".join(merge_cmd))
            except OSError, o:
                die(fail_str+"Error: "+str(o))
            maps[ri] = [merged_map]

            if not params.system_params.keep_tmp:
                os.remove(mapped_reads)
        else:
            # no segments or transcriptome mappings, so no merge is needed
            # because join_mapped_segments() produced the final BAM already
            os.rename(mapped_reads, merged_map)
            maps[ri] = [merged_map]
        if not params.system_params.keep_tmp:
            os.remove(unspl_samfile)

    return maps
3260    
def die(msg=None):
    """Abort the run: report *msg* (if any) on stderr and exit with status 1."""
    if msg is not None:
        sys.stderr.write(str(msg) + "\n")
    sys.exit(1)
3265    
3266     # rough equivalent of the 'which' command to find external programs
3267     # (current script path is tested first, then PATH envvar)
def which(program):
    """Rough equivalent of the shell 'which' command.

    A name containing a path component is checked directly; a bare name is
    first tried under the script's own bin_dir, then in every directory on
    the PATH environment variable. Returns the resolved path or None.
    """
    def can_run(candidate):
        # must be a regular file with the executable bit set
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    head, tail = os.path.split(program)
    if head:
        # explicit path given: accept it as-is when runnable
        if can_run(program):
            return program
    else:
        # bare name: prefer a copy shipped next to this script
        local = os.path.join(bin_dir, program)
        if can_run(local):
            return local
    # fall back to scanning the PATH
    for folder in os.environ["PATH"].split(os.pathsep):
        candidate = os.path.join(folder, program)
        if can_run(candidate):
            return candidate
    return None
3284    
def prog_path(program):
    """Resolve *program* to a full path via which(); die() if it cannot be found.

    Always returns a usable path string (never None), since a missing
    program aborts the whole run.
    """
    progpath = which(program)
    # idiom fix: compare to None with 'is', not '=='
    if progpath is None:
        die("Error locating program: "+program)
    return progpath
3290    
3291     # FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    """Return the TopHat release string reported in logs and --version."""
    version = "2.0.0"
    return version
3294 gpertea 29
def mlog(msg):
    """Emit a debug message to stderr with the [DBGLOG]: prefix."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
3297    
3298     def test_input_file(filename):
3299     try:
3300     test_file = open(filename, "r")
3301     except IOError, o:
3302     die("Error: Opening file %s" % filename)
3303     return
3304    
def main(argv=None):
    """Top-level driver for a TopHat run.

    Parses the command line, validates input files and external programs
    (bowtie, samtools), optionally builds/reuses transcriptome index data
    from a GFF/GTF annotation, prepares the reads, runs the spliced
    alignment pipeline and compiles the final reports.
    Returns None on success, or 2 on a usage error.
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # positional args: bowtie index prefix, left reads, then optionally
        # right reads and quality file lists (when --quals is in effect)
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            # paired-end run: a second read list was supplied
            if params.read_params.mate_inner_dist == None:
                # default the mate inner distance instead of failing
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        print >> sys.stderr
        th_log("Beginning TopHat run (v"+get_version()+")")
        print >> sys.stderr, "-----------------------------------------------"

        start_time = datetime.now()
        prepare_output_dir()

        # run.log records every external command launched; opened unbuffered
        # (third arg 0) so the log stays current if the run is killed
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
            if not os.path.exists(params.gff_annotation):
                die("Error: cannot find transcript file %s" % params.gff_annotation)
            if os.path.getsize(params.gff_annotation)<10:
                die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
            if params.gff_annotation:
                #gff file given, so transcriptome data will be written there
                gff_basename = getFileBaseName(params.gff_annotation)
                #just in case, check if it's not already there (-G/--GTF given again by mistake)
                tpath, tname = os.path.split(params.transcriptome_index)
                new_subdir=False
                if tpath in (".", "./") or not tpath :
                    if not os.path.exists(params.transcriptome_index):
                        os.makedirs(params.transcriptome_index)
                        new_subdir=True
                if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                    # index prefix was a directory: store the data under it,
                    # named after the annotation file
                    params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
                gff_out=params.transcriptome_index+".gff"
                # only (re)generate when the cached copy is absent or its size
                # differs from the annotation the user supplied
                if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
                    #generate the transcriptome data files
                    tpath, tname = os.path.split(params.transcriptome_index)
                    params.transcriptome_outdir=tpath
            t_gff=params.transcriptome_index+".gff"
            if params.transcriptome_outdir:
                #will create the transcriptome data files
                if not os.path.exists(params.transcriptome_outdir):
                    os.makedirs(params.transcriptome_outdir)
                copy(params.gff_annotation, t_gff)
            else:
                #try to use existing transcriptome data files

                if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
                check_bowtie_index(params.transcriptome_index)
            params.gff_annotation=t_gff
            #end @ transcriptome_index given

        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)

        check_bowtie(params.bowtie2)
        check_samtools()
        th_log("Generating SAM header for "+bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params.bowtie2, params.read_params, bwt_idx_prefix)
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we don't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            #if found_juncs and not params.gff_annotation:
            #    user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            # -M/--prefilter-multihits: initial bowtie pass against the
            # genome to set aside multi-mapped reads for each side
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            for ri in (0,1):
                reads_list=read_lists[ri]
                if not reads_list:
                    continue
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
                side_imap = tmp_dir + sides[ri]+"_im.bwtout"
                if use_zpacker:
                    side_imap+=".z"
                side_ium = tmp_dir + sides[ri]+"_ium.fq"
                if use_BWT_FIFO:
                    side_ium += ".z"
                th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
                rdlist=reads_list.split(',')
                bwt=bowtie(params, bwt_idx_prefix, rdlist,
                           params.read_params.reads_format,
                           params.genome_read_mismatches,
                           side_imap, side_ium,
                           "", "read_against_genome", params.preflt_data[ri].multihit_reads)
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        left_kept_reads, left_reads_info = prep_reads(params,
                                          left_reads_list,
                                          left_quals_list,
                                          "left",
                                          params.preflt_data[0].multihit_reads)
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                right_reads_list,
                                                right_quals_list,
                                                "right",
                                                params.preflt_data[1].multihit_reads)
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        # run the spliced alignment pipeline on the prepared reads
        input_reads = [left_kept_reads, right_kept_reads]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        if not params.system_params.keep_tmp:
            # clean up intermediate alignment/read files and the tmp dir
            for m in mappings[0]:
                os.remove(m)
            if left_kept_reads:
                os.remove(left_kept_reads)
            for m in mappings[1]:
                os.remove(m)
            if right_kept_reads:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        print >> sys.stderr,"-----------------------------------------------"
        th_log("Run complete: %s elapsed" % formatTD(duration))

    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
        return 2
3524    
3525    
# Script entry point: propagate main()'s return value as the process exit
# status (None on success, 2 on usage errors).
if __name__ == "__main__":
    sys.exit(main())

Properties

Name Value
svn:executable *