ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (8 years, 4 months ago) by gpertea
File size: 148993 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 29 #!/usr/bin/env python
2    
3     # encoding: utf-8
4     """
5     tophat.py
6    
7     Created by Cole Trapnell on 2008-12-25.
8     Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 gpertea 154 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 gpertea 29 """
11     import sys
12     try:
13     import psyco
14     psyco.full()
15     except ImportError:
16     pass
17    
18     import getopt
19     import subprocess
20     import errno
21     import os
22     import warnings
23     import re
24     import signal
25     from datetime import datetime, date, time
26 gpertea 154 from shutil import copy
27 gpertea 29
# Help/usage text printed for -h/--help (delivered via the Usage exception).
# NOTE(review): the original column alignment could not be recovered from the
# annotated listing this was extracted from; descriptions are aligned approximately.
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                     [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir <string> [ default: ./tophat_out ]
    --bowtie1 [ default: bowtie2 ]
    -a/--min-anchor <int> [ default: 8 ]
    -m/--splice-mismatches <0-2> [ default: 0 ]
    -i/--min-intron-length <int> [ default: 50 ]
    -I/--max-intron-length <int> [ default: 500000 ]
    -g/--max-multihits <int> [ default: 20 ]
    -x/--transcriptome-max-hits <int> [ default: 60 ]
    -n/--transcriptome-mismatches <int> [ default: 1 ]
    -M/--prefilter-multihits ( for -G/--GTF option, enable
                               an initial bowtie search
                               against the genome )
    --max-insertion-length <int> [ default: 3 ]
    --max-deletion-length <int> [ default: 3 ]
    --solexa-quals
    --solexa1.3-quals (same as phred64-quals)
    --phred64-quals (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color (Solid - color space)
    --color-out
    --library-type <string> (fr-unstranded, fr-firststrand,
                             fr-secondstrand)
    -p/--num-threads <int> [ default: 1 ]
    -G/--GTF <filename> (GTF/GFF with known transcripts)
    --transcriptome-index <bwtidx> (transcriptome bowtie index)
    -T/--transcriptome-only (map only to the transcriptome)
    -j/--raw-juncs <filename>
    --insertions <filename>
    --deletions <filename>
    -r/--mate-inner-dist <int>
    --mate-std-dev <int> [ default: 20 ]
    --no-novel-juncs
    --no-novel-indels
    --fusion-search
    --fusion-anchor-length <int> [ default: 20 ]
    --fusion-min-dist <int> [ default: 10000000 ]
    --fusion-read-mismatches <int> [ default: 2 ]
    --fusion-multireads <int> [ default: 2 ]
    --fusion-multipairs <int> [ default: 2 ]
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir <dirname> [ default: <output_dir>/tmp ]
    -z/--zpacker <program> [ default: gzip ]
    -X/--unmapped-fifo [use mkfifo to compress more temporary
                        files for color space reads]

Advanced Options:
    --report-secondary-alignments
    --genome-read-mismatches <int> [ default: 2 ]
    -N/--read-mismatches <int> [ default: 2 ]
    --segment-mismatches <int> [ default: 2 ]
    --segment-length <int> [ default: 25 ]
    --bowtie-n [ default: bowtie -v ]
    --min-closure-exon <int> [ default: 100 ]
    --min-closure-intron <int> [ default: 50 ]
    --max-closure-intron <int> [ default: 5000 ]
    --min-coverage-intron <int> [ default: 50 ]
    --max-coverage-intron <int> [ default: 20000 ]
    --min-segment-intron <int> [ default: 50 ]
    --max-segment-intron <int> [ default: 500000 ]
    --no-sort-bam (Output BAM is not coordinate-sorted)
    --no-convert-bam (Do not convert to bam format.
                      Output is <output_dir>accepted_hit.sam.
                      Implies --no-sort-bam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N <int> [ default: 0 ]
    --b2-L <int> [ default: 20 ]
    --b2-i <func> [ default: S,1,1.25 ]
    --b2-n-ceil <func> [ default: L,0,0.15 ]
    --b2-gbar <int> [ default: 4 ]

  Scoring options
    --b2-mp <int>,<int> [ default: 6,2 ]
    --b2-np <int> [ default: 1 ]
    --b2-rdg <int>,<int> [ default: 5,3 ]
    --b2-rfg <int>,<int> [ default: 5,3 ]
    --b2-score-min <func> [ default: L,-0.6,-0.6 ]

  Effort options
    --b2-D <int> [ default: 15 ]
    --b2-R <int> [ default: 2 ]


SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id <string> (read group ID)
    --rg-sample <string> (sample ID)
    --rg-library <string> (library ID)
    --rg-description <string> (descriptive string, no tabs allowed)
    --rg-platform-unit <string> (e.g Illumina lane ID)
    --rg-center <string> (sequencing center name)
    --rg-date <string> (ISO 8601 date of the sequencing run)
    --rg-platform <string> (Sequencing platform descriptor)
'''

# -F/--min-isoform-fraction <float> [ default: 0.15 ]
#     (obsolete)
151    
class Usage(Exception):
    """Raised to abort option parsing and show a message to the user.

    The caller catches this and prints `msg` (e.g. the use_message help
    text or a getopt error) before exiting.
    """
    def __init__(self, msg):
        # msg: the text to display; stored verbatim for the handler to print
        self.msg = msg
155    
# --- module-level runtime state (several of these are rewritten by
# --- TopHatParams.parse_options when -o/--tmp-dir etc. are given) ---
output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None   # log file handle/path; initialized later (not in this chunk)
run_cmd = None   # command line being logged; initialized later (not in this chunk)
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"  # directory containing this script
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
                     # enabled by -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None  # path to the samtools binary; presumably resolved at startup elsewhere
bowtie_path = None    # path to the bowtie binary; presumably resolved at startup elsewhere
fail_str = "\t[FAILED]\n"
gtf_juncs = None # file name with junctions extracted from given GFF file
175    
176 gpertea 29 # TopHatParams captures all of the runtime parameters used by TopHat, and many
177     # of these are passed as command line options to executables run by the pipeline
178    
179     # This class and its nested classes also do options parsing through parse_options()
180     # and option validation via the member function check()
181    
class BowtieOutputFiles:
    """Groups the three output streams produced by one bowtie run so they
    can be passed around as a single unit."""

    def __init__(self, mappings=None, unmapped_reads=None, multihit_reads=None):
        # All three default to None so an "empty" placeholder can be built
        # before the corresponding files exist.
        self.mappings = mappings
        self.unmapped_reads = unmapped_reads
        self.multihit_reads = multihit_reads
190    
191 gpertea 29 class TopHatParams:
192    
193     # SpliceConstraints is a group of runtime parameters that specify what
194     # constraints to put on junctions discovered by the program. These constraints
195     # are used to filter out spurious/false positive junctions.
196    
class SpliceConstraints:
    """Constraints on junctions discovered by the program, used to filter
    out spurious/false-positive junctions."""

    def __init__(self,
                 min_anchor_length,
                 min_intron_length,
                 max_intron_length,
                 splice_mismatches,
                 min_isoform_fraction):
        self.min_anchor_length = min_anchor_length
        self.min_intron_length = min_intron_length
        self.max_intron_length = max_intron_length
        self.splice_mismatches = splice_mismatches
        self.min_isoform_fraction = min_isoform_fraction

    def parse_options(self, opts):
        """Pick the splice-constraint options out of getopt's (option, value) pairs."""
        for option, value in opts:
            if option in ("-m", "--splice-mismatches"):
                self.splice_mismatches = int(value)
            elif option in ("-a", "--min-anchor"):
                self.min_anchor_length = int(value)
            elif option in ("-F", "--min-isoform-fraction"):
                self.min_isoform_fraction = float(value)
            elif option in ("-i", "--min-intron-length"):
                self.min_intron_length = int(value)
            elif option in ("-I", "--max-intron-length"):
                self.max_intron_length = int(value)

    def check(self):
        """Validate the constraint values; die()s with a message on error."""
        if self.splice_mismatches not in [0,1,2]:
            die("Error: arg to --splice-mismatches must be 0, 1, or 2")
        if self.min_anchor_length < 4:
            # bug fix: the check rejects values below 4 (4 itself is allowed),
            # but the message used to claim "greater than 4"
            die("Error: arg to --min-anchor-len must be at least 4")
        if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
            die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
        if self.min_intron_length <= 0:
            die("Error: arg to --min-intron-length must be greater than 0")
        if self.max_intron_length <= 0:
            die("Error: arg to --max-intron-length must be greater than 0")
234    
235     # SystemParams is a group of runtime parameters that determine how to handle
236     # temporary files produced during a run and how many threads to use for threaded
237     # stages of the pipeline (e.g. Bowtie)
238    
class SystemParams:
    """System/runtime settings: worker thread count, temp-file retention,
    and the external compression program used for intermediate files."""

    def __init__(self, num_threads, keep_tmp):
        self.num_threads = num_threads
        self.keep_tmp = keep_tmp
        self.zipper = "gzip"     # default compressor; overridden by -z/--zpacker
        self.zipper_opts = []    # extra flags for the compressor

    def parse_options(self, opts):
        global use_zpacker
        global use_BWT_FIFO
        for opt, val in opts:
            if opt in ("-p", "--num-threads"):
                self.num_threads = int(val)
            elif opt == "--keep-tmp":
                self.keep_tmp = True
            elif opt in ("-z", "--zpacker"):
                # any of these spellings means "disable compression"
                if val.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                    val = ""
                self.zipper = val
                #if not self.zipper:
                #   self.zipper='gzip'
            elif opt in ("-X", "--unmapped-fifo"):
                use_BWT_FIFO = True
        if self.zipper:
            use_zpacker = True
            # parallel compressors get their thread-count flag automatically
            if self.num_threads > 1 and not self.zipper_opts:
                if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                    self.zipper_opts.append('-p' + str(self.num_threads))
        else:
            use_zpacker = False
            if use_BWT_FIFO:
                use_BWT_FIFO = False

    def cmd(self):
        """Return the -z/-p flags that reproduce these settings downstream."""
        flags = []
        if self.zipper:
            flags.extend(['-z', self.zipper])
        if self.num_threads > 1:
            flags.extend(['-p' + str(self.num_threads)])
        return flags

    def check(self):
        """Validate the thread count and locate the compression program."""
        if self.num_threads < 1:
            die("Error: arg to --num-threads must be greater than 0")
        if self.zipper:
            xzip = which(self.zipper)
            if not xzip:
                die("Error: cannot find compression program " + self.zipper)
287    
288     # ReadParams is a group of runtime parameters that specify various properties
289     # of the user's reads (e.g. which quality scale their are on, how long the
290     # fragments are, etc).
class ReadParams:
    """Properties of the input reads: quality-score encoding, color space,
    library type, mate-pair geometry, and SAM read-group (RG) metadata."""

    def __init__(self,
                 solexa_quals,
                 phred64_quals,
                 quals,
                 integer_quals,
                 color,
                 library_type,
                 seed_length,
                 reads_format,
                 mate_inner_dist,
                 mate_inner_dist_std_dev,
                 read_group_id,
                 sample_id,
                 library_id,
                 description,
                 seq_platform_unit,
                 seq_center,
                 seq_run_date,
                 seq_platform):
        self.solexa_quals = solexa_quals
        self.phred64_quals = phred64_quals
        self.quals = quals
        self.integer_quals = integer_quals
        self.color = color
        self.library_type = library_type
        self.seed_length = seed_length
        self.reads_format = reads_format
        self.mate_inner_dist = mate_inner_dist
        self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
        # SAM @RG header fields
        self.read_group_id = read_group_id
        self.sample_id = sample_id
        self.library_id = library_id
        self.description = description
        self.seq_platform_unit = seq_platform_unit
        self.seq_center = seq_center
        self.seq_run_date = seq_run_date
        self.seq_platform = seq_platform

    def parse_options(self, opts):
        """Pick the read-related options out of getopt's (option, value) pairs."""
        for option, value in opts:
            if option == "--solexa-quals":
                self.solexa_quals = True
            elif option in ("--solexa1.3-quals", "--phred64-quals"):
                self.phred64_quals = True
            elif option in ("-Q", "--quals"):
                self.quals = True
            elif option == "--integer-quals":
                self.integer_quals = True
            elif option in ("-C", "--color"):
                self.color = True
            elif option == "--library-type":
                self.library_type = value
            elif option in ("-s", "--seed-length"):
                self.seed_length = int(value)
            elif option in ("-r", "--mate-inner-dist"):
                self.mate_inner_dist = int(value)
            elif option == "--mate-std-dev":
                self.mate_inner_dist_std_dev = int(value)
            elif option == "--rg-id":
                self.read_group_id = value
            elif option == "--rg-sample":
                self.sample_id = value
            elif option == "--rg-library":
                self.library_id = value
            elif option == "--rg-description":
                self.description = value
            elif option == "--rg-platform-unit":
                self.seq_platform_unit = value
            elif option == "--rg-center":
                self.seq_center = value
            elif option == "--rg-date":
                self.seq_run_date = value
            elif option == "--rg-platform":
                self.seq_platform = value

    def check(self):
        """Validate read parameters; die()s with a message on error."""
        if self.seed_length and self.seed_length < 20:
            die("Error: arg to --seed-length must be at least 20")

        # bug fix: message read "must at least 0"; also use idiomatic None test
        if self.mate_inner_dist_std_dev is not None and self.mate_inner_dist_std_dev < 0:
            die("Error: arg to --mate-std-dev must be at least 0")
        if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
            die("Error: --rg-id and --rg-sample must be specified or omitted together")
375    
376     # SearchParams is a group of runtime parameters that specify how TopHat will
377     # search for splice junctions
378    
class SearchParams:
    """Length bounds that control how TopHat searches for splice junctions
    (closure search, coverage search, and segment search)."""

    def __init__(self,
                 min_closure_exon,
                 min_closure_intron,
                 max_closure_intron,
                 min_coverage_intron,
                 max_coverage_intron,
                 min_segment_intron,
                 max_segment_intron):
        self.min_closure_exon_length = min_closure_exon
        self.min_closure_intron_length = min_closure_intron
        self.max_closure_intron_length = max_closure_intron
        self.min_coverage_intron_length = min_coverage_intron
        self.max_coverage_intron_length = max_coverage_intron
        self.min_segment_intron_length = min_segment_intron
        self.max_segment_intron_length = max_segment_intron

    def parse_options(self, opts):
        """Pick the search-length options out of getopt's (option, value) pairs."""
        for option, value in opts:
            if option == "--min-closure-exon":
                self.min_closure_exon_length = int(value)
            elif option == "--min-closure-intron":
                self.min_closure_intron_length = int(value)
            elif option == "--max-closure-intron":
                self.max_closure_intron_length = int(value)
            elif option == "--min-coverage-intron":
                self.min_coverage_intron_length = int(value)
            elif option == "--max-coverage-intron":
                self.max_coverage_intron_length = int(value)
            elif option == "--min-segment-intron":
                self.min_segment_intron_length = int(value)
            elif option == "--max-segment-intron":
                self.max_segment_intron_length = int(value)

    def check(self):
        """Reject negative lengths; die()s with a message on error.

        bug fix: the messages used to claim a minimum of 20 while the code
        only rejects negative values; messages now match the actual checks.
        """
        if self.min_closure_exon_length < 0:
            die("Error: arg to --min-closure-exon must be at least 0")
        if self.min_closure_intron_length < 0:
            die("Error: arg to --min-closure-intron must be at least 0")
        if self.max_closure_intron_length < 0:
            die("Error: arg to --max-closure-intron must be at least 0")
        if self.min_coverage_intron_length < 0:
            die("Error: arg to --min-coverage-intron must be at least 0")
        if self.max_coverage_intron_length < 0:
            die("Error: arg to --max-coverage-intron must be at least 0")
        if self.min_segment_intron_length < 0:
            die("Error: arg to --min-segment-intron must be at least 0")
        if self.max_segment_intron_length < 0:
            die("Error: arg to --max-segment-intron must be at least 0")
429    
class ReportParams:
    """Options controlling the final output: whether the result is converted
    to BAM and whether that BAM is coordinate-sorted."""

    def __init__(self):
        # default: produce a coordinate-sorted BAM
        self.sort_bam = True
        self.convert_bam = True

    def parse_options(self, opts):
        for opt, _unused in opts:
            if opt == "--no-sort-bam":
                self.sort_bam = False
            elif opt == "--no-convert-bam":
                # staying in SAM format implies no coordinate sorting
                self.convert_bam = False
                self.sort_bam = False
442    
class Bowtie2Params:
    """Bowtie2-specific settings mirroring bowtie2's own command line:
    presets, alignment options, scoring options and effort options
    (see the Bowtie 2 manual for the meaning of each value)."""

    def __init__(self):
        # preset selectors -- mutually exclusive, enforced in check()
        self.very_fast = False
        self.fast = False
        self.sensitive = False
        self.very_sensitive = False

        # alignment options
        self.N = 0
        self.L = 20
        self.i = "S,1,1.25"
        self.n_ceil = "L,0,0.15"
        self.gbar = 4

        # scoring options
        self.mp = "6,2"
        self.np = 1
        self.rdg = "5,3"
        self.rfg = "5,3"
        self.score_min = "L,-0.6,-0.6"

        # effort options
        self.D = 15
        self.R = 2

    def parse_options(self, opts):
        """Pick the --b2-* options out of getopt's (option, value) pairs."""
        for option, value in opts:
            if option == "--b2-very-fast":
                self.very_fast = True
            if option == "--b2-fast":
                self.fast = True
            if option == "--b2-sensitive":
                self.sensitive = True
            if option == "--b2-very-sensitive":
                self.very_sensitive = True

            if option == "--b2-N":
                self.N = int(value)
            if option == "--b2-L":
                # bug fix: was hard-coded to 20, silently ignoring the user's value
                self.L = int(value)
            if option == "--b2-i":
                self.i = value
            if option == "--b2-n-ceil":
                self.n_ceil = value
            if option == "--b2-gbar":
                # bug fix: was hard-coded to 4, silently ignoring the user's value
                self.gbar = int(value)

            if option == "--b2-mp":
                self.mp = value
            if option == "--b2-np":
                self.np = int(value)
            if option == "--b2-rdg":
                self.rdg = value
            if option == "--b2-rfg":
                self.rfg = value
            if option == "--b2-score-min":
                self.score_min = value

            if option == "--b2-D":
                self.D = int(value)
            if option == "--b2-R":
                self.R = int(value)

    def check(self):
        """Validate preset exclusivity and option value formats; die()s on error."""
        # at most one of the four preset options may be given
        more_than_once = False
        if self.very_fast:
            if self.fast or self.sensitive or self.very_sensitive:
                more_than_once = True
        else:
            if self.fast:
                if self.sensitive or self.very_sensitive:
                    more_than_once = True
            else:
                if self.sensitive and self.very_sensitive:
                    more_than_once = True

        if more_than_once:
            die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

        if not self.N in [0, 1]:
            die("Error: arg to --b2-N must be either 0 or 1")

        # bowtie2 "function" arguments look like L,0,0.15 / S,1,1.25 / etc.
        function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
        function_match = re.search(function_re, self.i)
        if not function_match:
            die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")

        function_match = re.search(function_re, self.n_ceil)
        if not function_match:
            die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")

        function_match = re.search(function_re, self.score_min)
        if not function_match:
            die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

        pair_re = r'^\d+,\d+$'
        pair_match = re.search(pair_re, self.mp)
        if not pair_match:
            die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")

        pair_match = re.search(pair_re, self.rdg)
        if not pair_match:
            # bug fix: example previously referred to --b2-mp
            die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-rdg 5,3)")

        pair_match = re.search(pair_re, self.rfg)
        if not pair_match:
            # bug fix: example previously referred to --b2-mp
            die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-rfg 5,3)")
548    
549    
def __init__(self):
    # Build every option group with its defaults; parse_options() then
    # overrides these from the command line.
    self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                     50,     # min_intron
                                                     500000, # max_intron
                                                     0,      # splice_mismatches
                                                     0.15)   # min_isoform_frac

    # bowtie output file sets for the optional -M prefilter passes
    self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]

    self.read_params = self.ReadParams(False,   # solexa_scale
                                       False,   # phred64_quals
                                       False,   # quals
                                       None,    # integer quals
                                       False,   # SOLiD - color space
                                       "",      # library type (e.g. "illumina-stranded-pair-end")
                                       None,    # seed_length
                                       "fastq", # quality_format
                                       None,    # mate inner distance
                                       20,      # mate inner dist std dev
                                       None,    # read group id
                                       None,    # sample id
                                       None,    # library id
                                       None,    # description
                                       None,    # platform unit (i.e. lane)
                                       None,    # sequencing center
                                       None,    # run date
                                       None)    # sequencing platform

    self.system_params = self.SystemParams(1,      # bowtie_threads (num_threads)
                                           False)  # keep_tmp

    self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                           50,     # min_closure_intron_length
                                           5000,   # max_closure_intron_length
                                           50,     # min_coverage_intron_length
                                           20000,  # max_coverage_intron_length
                                           50,     # min_segment_intron_length
                                           500000) # max_segment_intron_length

    self.report_params = self.ReportParams()

    self.bowtie2_params = self.Bowtie2Params()

    # bowtie2 is the default mapper; the --bowtie1 option turns this off
    self.bowtie2 = True
    self.gff_annotation = None
    self.transcriptome_only = False
    self.transcriptome_index = None
    self.transcriptome_outdir = None
    self.raw_junctions = None
    self.find_novel_juncs = True
    self.find_novel_indels = True
    self.find_novel_fusions = True
    self.find_GFF_juncs = True
    self.max_hits = 20            # -g/--max-multihits
    self.t_max_hits = 60          # -x/--transcriptome-max-hits
    self.max_seg_hits = 40        # passed downstream as --max-seg-multihits (see cmd())
    self.prefilter_multi = False  # -M/--prefilter-multihits
    self.genome_read_mismatches = 2
    self.max_read_mismatches = 2  # -N/--read-mismatches
    self.t_mismatches = 1         # -n/--transcriptome-mismatches
    self.segment_length = 25
    self.segment_mismatches = 2
    self.bowtie_alignment_option = "-v"
    self.max_insertion_length = 3
    self.max_deletion_length = 3
    self.raw_insertions = None
    self.raw_deletions = None
    self.coverage_search = None   # falsy => cmd() emits --no-coverage-search
    self.closure_search = False   # False => cmd() emits --no-closure-search
    self.microexon_search = False
    self.butterfly_search = None  # truthy => cmd() emits --butterfly-search
    self.report_secondary_alignments = False

    self.keep_fasta_order = False
    self.partial_mapping = False

    self.fusion_search = False
    self.fusion_anchor_length = 20
    self.fusion_min_dist = 10000000
    self.fusion_read_mismatches = 2
    self.fusion_multireads = 2
    self.fusion_multipairs = 2
632 gpertea 29
def check(self):
    """Cross-validate all option groups and combinations; die()s with an
    error message if any value or combination is invalid.  Also clamps the
    search-intron upper bounds to the global max intron length."""
    self.splice_constraints.check()
    self.read_params.check()
    self.system_params.check()
    self.bowtie2_params.check()
    if self.segment_length < 10:
        die("Error: arg to --segment-length must be at least 10")
    if self.segment_mismatches < 0 or self.segment_mismatches > 3:
        die("Error: arg to --segment-mismatches must be in [0, 3]")
    if self.t_mismatches < 0 or self.t_mismatches > 3:
        die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")

    # several features are not available for colorspace (SOLiD) reads
    if self.read_params.color:
        if self.bowtie2:
            die("Error: bowtie2 in colorspace is not yet supported\nuse --bowtie1 option")
        if self.fusion_search:
            die("Error: fusion-search in colorspace is not yet supported")
        if self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

    library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

    if self.read_params.library_type and self.read_params.library_type not in library_types:
        die("Error: library-type should be one of: "+', '.join(library_types))

    # no per-search intron bound may exceed the global maximum intron length
    self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_closure_intron_length)

    self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_segment_intron_length)

    self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                        self.search_params.max_coverage_intron_length)

    # bug fix: the four messages below concatenated ints onto strings, which
    # would raise a TypeError instead of printing the intended error
    if self.max_insertion_length >= self.segment_length:
        die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

    if self.max_insertion_length < 0:
        die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

    if self.max_deletion_length >= self.splice_constraints.min_intron_length:
        die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

    if self.max_deletion_length < 0:
        die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")
678    
def cmd(self):
    # Serialize the current parameter values into the flag list passed to
    # the downstream pipeline executables; reads the module-level globals
    # output_dir and gtf_juncs.
    cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
           "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
           "--min-report-intron", str(self.splice_constraints.min_intron_length),
           "--max-report-intron", str(self.splice_constraints.max_intron_length),
           "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
           "--output-dir", output_dir,
           "--max-multihits", str(self.max_hits),
           "--max-seg-multihits", str(self.max_seg_hits),
           "--segment-length", str(self.segment_length),
           "--segment-mismatches", str(self.segment_mismatches),
           "--min-closure-exon", str(self.search_params.min_closure_exon_length),
           "--min-closure-intron", str(self.search_params.min_closure_intron_length),
           "--max-closure-intron", str(self.search_params.max_closure_intron_length),
           "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
           "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
           "--min-segment-intron", str(self.search_params.min_segment_intron_length),
           "--max-segment-intron", str(self.search_params.max_segment_intron_length),
           "--max-mismatches", str(self.max_read_mismatches),
           "--max-insertion-length", str(self.max_insertion_length),
           "--max-deletion-length", str(self.max_deletion_length)]

    if not self.bowtie2:
        cmd.extend(["--bowtie1"])

    if self.fusion_search:
        cmd.extend(["--fusion-search",
                    "--fusion-anchor-length", str(self.fusion_anchor_length),
                    "--fusion-min-dist", str(self.fusion_min_dist),
                    "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                    "--fusion-multireads", str(self.fusion_multireads),
                    "--fusion-multipairs", str(self.fusion_multipairs)])

    # -z/-p flags contributed by SystemParams.cmd()
    cmd.extend(self.system_params.cmd())

    if self.read_params.mate_inner_dist != None:
        cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                    "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
    if self.gff_annotation != None:
        cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
    # gtf_juncs is the module-level global (junctions extracted from the GFF)
    if gtf_juncs:
        cmd.extend(["--gtf-juncs", gtf_juncs])
    # NOTE: deliberately `== False`, not `not ...`: a value of None would
    # suppress the flag, only an explicit False emits it
    if self.closure_search == False:
        cmd.append("--no-closure-search")
    if not self.coverage_search:
        cmd.append("--no-coverage-search")
    if not self.microexon_search:
        cmd.append("--no-microexon-search")
    if self.butterfly_search:
        cmd.append("--butterfly-search")
    if self.read_params.solexa_quals:
        cmd.append("--solexa-quals")
    if self.read_params.quals:
        cmd.append("--quals")
    if self.read_params.integer_quals:
        cmd.append("--integer-quals")
    if self.read_params.color:
        cmd.append("--color")
    if self.read_params.library_type:
        cmd.extend(["--library-type", self.read_params.library_type])
    if self.read_params.read_group_id:
        cmd.extend(["--rg-id", self.read_params.read_group_id])
    if self.read_params.phred64_quals:
        cmd.append("--phred64-quals")
    return cmd
744    
745     # This is the master options parsing routine, which calls parse_options for
746     # the delegate classes (e.g. SpliceConstraints) that handle certain groups
747     # of options.
748     def parse_options(self, argv):
749     try:
750 gpertea 135 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
751 gpertea 29 ["version",
752     "help",
753     "output-dir=",
754 gpertea 154 "bowtie1",
755 gpertea 29 "solexa-quals",
756     "solexa1.3-quals",
757     "phred64-quals",
758     "quals",
759     "integer-quals",
760     "color",
761     "library-type=",
762     "num-threads=",
763     "splice-mismatches=",
764     "max-multihits=",
765     "min-isoform-fraction=",
766     "min-anchor-length=",
767     "min-intron-length=",
768     "max-intron-length=",
769     "GTF=",
770 gpertea 135 "transcriptome-only",
771     "transcriptome-max-hits=",
772     "transcriptome-index=",
773 gpertea 29 "raw-juncs=",
774     "no-novel-juncs",
775 gpertea 154 "allow-fusions",
776     "fusion-search",
777     "fusion-anchor-length=",
778     "fusion-min-dist=",
779     "fusion-read-mismatches=",
780     "fusion-multireads=",
781     "fusion-multipairs=",
782 gpertea 29 "no-novel-indels",
783     "no-gtf-juncs",
784     "mate-inner-dist=",
785     "mate-std-dev=",
786     "no-closure-search",
787     "no-coverage-search",
788     "closure-search",
789     "coverage-search",
790 gpertea 135 "prefilter-multihits",
791 gpertea 29 "microexon-search",
792     "min-closure-exon=",
793     "min-closure-intron=",
794     "max-closure-intron=",
795     "min-coverage-intron=",
796     "max-coverage-intron=",
797     "min-segment-intron=",
798     "max-segment-intron=",
799     "seed-length=",
800 gpertea 154 "genome-read-mismatches=",
801 gpertea 135 "read-mismatches=",
802 gpertea 154 "max-read-mismatches=",
803 gpertea 135 "transcriptome-mismatches=",
804 gpertea 29 "segment-length=",
805     "segment-mismatches=",
806     "bowtie-n",
807     "butterfly-search",
808     "no-butterfly-search",
809     "keep-tmp",
810     "rg-id=",
811     "rg-sample=",
812     "rg-library=",
813     "rg-description=",
814     "rg-platform-unit=",
815     "rg-center=",
816     "rg-date=",
817     "rg-platform=",
818     "tmp-dir=",
819     "zpacker=",
820     "unmapped-fifo",
821     "max-insertion-length=",
822     "max-deletion-length=",
823     "insertions=",
824     "deletions=",
825     "no-sort-bam",
826 gpertea 154 "no-convert-bam",
827 gpertea 159 "report-secondary-alignments",
828     "keep-fasta-order",
829     "allow-partial-mapping",
830 gpertea 154 "b2-very-fast",
831     "b2-fast",
832     "b2-sensitive",
833     "b2-very-sensitive",
834     "b2-N=",
835     "b2-L=",
836     "b2-i=",
837     "b2-n-ceil=",
838     "b2-gbar=",
839     "b2-ma=",
840     "b2-mp=",
841     "b2-np=",
842     "b2-rdg=",
843     "b2-rfg=",
844     "b2-score-min=",
845     "b2-D=",
846     "b2-R="])
847 gpertea 29 except getopt.error, msg:
848     raise Usage(msg)
849    
850     self.splice_constraints.parse_options(opts)
851     self.system_params.parse_options(opts)
852     self.read_params.parse_options(opts)
853     self.search_params.parse_options(opts)
854     self.report_params.parse_options(opts)
855 gpertea 154 self.bowtie2_params.parse_options(opts)
856 gpertea 29 global use_BWT_FIFO
857     global use_BAM_Unmapped
858     if not self.read_params.color:
859     use_BWT_FIFO=False
860     use_BAM_Unmapped=True
861     global output_dir
862     global logging_dir
863     global tmp_dir
864    
865     custom_tmp_dir = None
866     custom_out_dir = None
867     # option processing
868     for option, value in opts:
869     if option in ("-v", "--version"):
870     print "TopHat v%s" % (get_version())
871     sys.exit(0)
872     if option in ("-h", "--help"):
873     raise Usage(use_message)
874 gpertea 154 if option == "--bowtie1":
875     self.bowtie2 = False
876 gpertea 29 if option in ("-g", "--max-multihits"):
877     self.max_hits = int(value)
878 gpertea 159 self.max_seg_hits = max(10, self.max_hits * 2)
879 gpertea 135 if option in ("-x", "--transcriptome-max-hits"):
880     self.t_max_hits = int(value)
881 gpertea 29 if option in ("-G", "--GTF"):
882     self.gff_annotation = value
883 gpertea 135 if option in ("-T", "--transcriptome"):
884     self.transcriptome_only = True
885 gpertea 154 if option == "--transcriptome-index":
886 gpertea 135 self.transcriptome_index = value
887     if option in("-M", "--prefilter-multihits"):
888     self.prefilter_multi = True
889 gpertea 29 if option in ("-j", "--raw-juncs"):
890     self.raw_junctions = value
891     if option == "--no-novel-juncs":
892     self.find_novel_juncs = False
893     if option == "--no-novel-indels":
894     self.find_novel_indels = False
895 gpertea 154 if option == "--fusion-search":
896     self.fusion_search = True
897     if option == "--fusion-anchor-length":
898     self.fusion_anchor_length = int(value)
899     if option == "--fusion-min-dist":
900     self.fusion_min_dist = int(value)
901     if option == "--fusion-read-mismatches":
902     self.fusion_read_mismatches = int(value)
903     if option == "--fusion-multireads":
904     self.fusion_multireads = int(value)
905     if option == "--fusion-multipairs":
906     self.fusion_multipairs = int(value)
907 gpertea 29 if option == "--no-gtf-juncs":
908     self.find_GFF_juncs = False
909     if option == "--no-coverage-search":
910     self.coverage_search = False
911 gpertea 135 if option == "--coverage-search":
912     self.coverage_search = True
913 gpertea 29 if option == "--no-closure-search":
914     self.closure_search = False
915     if option == "--closure-search":
916     self.closure_search = True
917     if option == "--microexon-search":
918     self.microexon_search = True
919     if option == "--butterfly-search":
920     self.butterfly_search = True
921     if option == "--no-butterfly-search":
922     self.butterfly_search = False
923 gpertea 154 if option == "--genome-read-mismatches":
924     self.genome_read_mismatches = int(value)
925     if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
926     self.max_read_mismatches = int(value)
927 gpertea 135 if option in ("-n", "--transcriptome-mismatches"):
928     self.t_mismatches = int(value)
929 gpertea 29 if option == "--segment-length":
930     self.segment_length = int(value)
931     if option == "--segment-mismatches":
932     self.segment_mismatches = int(value)
933     if option == "--bowtie-n":
934     self.bowtie_alignment_option = "-n"
935     if option == "--max-insertion-length":
936     self.max_insertion_length = int(value)
937     if option == "--max-deletion-length":
938     self.max_deletion_length = int(value)
939     if option == "--insertions":
940     self.raw_insertions = value
941     if option == "--deletions":
942     self.raw_deletions = value
943 gpertea 159 if option == "--report-secondary-alignments":
944     self.report_secondary_alignments = True
945     if option == "--keep-fasta-order":
946     self.keep_fasta_order = True
947     if option == "--allow-partial-mapping":
948     self.partial_mapping = True
949 gpertea 29 if option in ("-o", "--output-dir"):
950     custom_out_dir = value + "/"
951     if option == "--tmp-dir":
952     custom_tmp_dir = value + "/"
953 gpertea 135 if self.transcriptome_only:
954     self.find_novel_juncs=False
955     self.find_novel_indels=False
956     if custom_out_dir:
957 gpertea 29 output_dir = custom_out_dir
958     logging_dir = output_dir + "logs/"
959     tmp_dir = output_dir + "tmp/"
960     sam_header = tmp_dir + "stub_header.sam"
961 gpertea 135 if custom_tmp_dir:
962 gpertea 29 tmp_dir = custom_tmp_dir
963     sam_header = tmp_dir + "stub_header.sam"
964     if len(args) < 2:
965     raise Usage(use_message)
966     return args
967    
968    
969 gpertea 154 # check if a file exists and has non-zero (or minimum) size
970     def fileExists(filepath, minfsize=1):
971     if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
972     return True
973     else:
974     return False
975    
976    
def getFileDir(filepath):
    """Return the directory part of filepath including a trailing '/',
    or '' when filepath has no directory component."""
    dpath = os.path.dirname(filepath)
    return dpath + '/' if dpath else ''
982    
def getFileBaseName(filepath):
    """Return the base name of a read file: directory stripped, a known
    read-file extension (.fq/.txt/.seq/.bwtout/.fa*) removed, and one
    compression extension (.z/.gz*/.bz*) peeled off first if present."""
    _, fname = os.path.split(filepath)
    base, ext = os.path.splitext(fname)
    lext = ext.lower()
    # known sequence-file extensions (".fa" prefix covers .fa/.fasta/...)
    is_seq_ext = lambda e: e in ['.fq', '.txt', '.seq', '.bwtout'] or e.startswith('.fa')
    if is_seq_ext(lext) and base:
        return base
    if lext == '.z' or lext.startswith('.gz') or lext.startswith('.bz'):
        # compressed file: look at the extension underneath
        inner, iext = os.path.splitext(base)
        if is_seq_ext(iext.lower()) and inner:
            return inner
        return base
    return base if base else fname
1001    
# Returns the current time in a nice format
def right_now():
    """Return the current local time formatted with strftime('%c')."""
    return datetime.now().strftime("%c")
1006    
# The TopHat logging formatter: writes a timestamped message to stderr.
def th_log(out_str):
    print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1010    
# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():
    # uses the module-level output_dir/logging_dir/tmp_dir paths set by
    # parse_options
    th_log("Preparing output location "+output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            # makedirs (not mkdir): tmp_dir may be a custom nested path
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))
1034    
# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    """Restore default SIGPIPE handling in the child process.

    Python installs a SIGPIPE handler by default, which makes gzip and
    other de/compression pipes complain about "stdout: Broken pipe".
    """
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
1041    
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    """Die unless all four .ebwt files for idx_prefix exist, either at the
    given prefix or under the BOWTIE_INDEXES environment directory."""
    th_log("Checking for Bowtie index files")

    # the four files that make up a bowtie1 index
    suffixes = [".1.ebwt", ".2.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"]

    def have_all(prefix):
        # True when every index component exists for this prefix
        return all(os.path.exists(prefix + sfx) for sfx in suffixes)

    if have_all(idx_prefix):
        return
    # fall back to the BOWTIE_INDEXES directory, as bowtie itself does
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var is None:
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
    idx_prefix = bowtie_idx_env_var + idx_prefix
    if not have_all(idx_prefix):
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1075    
# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    # last path component of the prefix names the reconstructed file
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        # bowtie-inspect writes the FASTA to stdout; capture it to the tmp file
        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name
1106    
# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        # try the BOWTIE_INDEXES directory before giving up
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        # not found anywhere: rebuild the FASTA from the index itself
        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa
1124    
# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    """Validate the Bowtie index, then return (ref_fasta_path, None)."""
    check_bowtie_index(idx_prefix)
    return (check_fasta(idx_prefix), None)
1131    
# Retrive a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version(is_bowtie2):
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        # repr() flattens the output so '\n' can be searched as two chars
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]

            if is_bowtie2:
                # bowtie2 versions look like "2.0.0-beta5":
                # split off the beta suffix and keep its number as a 4th field
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

        # NOTE(review): if the version string was not found, bowtie_version
        # is still None here and len() will raise TypeError -- verify intent
        if len(bowtie_version) == 3:
            bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
1172    
1173 gpertea 159 def get_index_sam_header(params, idx_prefix):
1174 gpertea 29 try:
1175 gpertea 159 temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1176     temp_sam_header_file = open(temp_sam_header_filename, "w")
1177    
1178     bowtie_header_cmd = [bowtie_path]
1179 gpertea 29
1180 gpertea 159 read_params = params.read_params
1181     if not params.bowtie2:
1182 gpertea 154 bowtie_header_cmd += ["--sam"]
1183    
1184 gpertea 29 if read_params.color:
1185     bowtie_header_cmd.append('-C')
1186 gpertea 154
1187 gpertea 29 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1188     subprocess.call(bowtie_header_cmd,
1189 gpertea 159 stdout=temp_sam_header_file,
1190 gpertea 29 stderr=open('/dev/null'))
1191    
1192 gpertea 159 temp_sam_header_file.close()
1193     temp_sam_header_file = open(temp_sam_header_filename, "r")
1194 gpertea 29
1195 gpertea 159 bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1196     bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1197 gpertea 154
1198 gpertea 29 preamble = []
1199     sq_dict_lines = []
1200    
1201 gpertea 159 for line in temp_sam_header_file.readlines():
1202 gpertea 29 line = line.strip()
1203     if line.find("@SQ") != -1:
1204     # Sequence dictionary record
1205     cols = line.split('\t')
1206     seq_name = None
1207     for col in cols:
1208     fields = col.split(':')
1209     #print fields
1210     if len(fields) > 0 and fields[0] == "SN":
1211     seq_name = fields[1]
1212     if seq_name == None:
1213     die("Error: malformed sequence dictionary in sam header")
1214     sq_dict_lines.append([seq_name,line])
1215     elif line.find("CL"):
1216     continue
1217     else:
1218     preamble.append(line)
1219 gpertea 159
1220     print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1221 gpertea 29 if read_params.read_group_id and read_params.sample_id:
1222     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1223     read_params.sample_id)
1224     if read_params.library_id:
1225     rg_str += "\tLB:%s" % read_params.library_id
1226     if read_params.description:
1227     rg_str += "\tDS:%s" % read_params.description
1228     if read_params.seq_platform_unit:
1229     rg_str += "\tPU:%s" % read_params.seq_platform_unit
1230     if read_params.seq_center:
1231     rg_str += "\tCN:%s" % read_params.seq_center
1232     if read_params.mate_inner_dist:
1233     rg_str += "\tPI:%s" % read_params.mate_inner_dist
1234     if read_params.seq_run_date:
1235     rg_str += "\tDT:%s" % read_params.seq_run_date
1236     if read_params.seq_platform:
1237     rg_str += "\tPL:%s" % read_params.seq_platform
1238    
1239 gpertea 159 print >> bowtie_sam_header_file, rg_str
1240 gpertea 29
1241 gpertea 159 if not params.keep_fasta_order:
1242     sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1243    
1244 gpertea 29 for [name, line] in sq_dict_lines:
1245 gpertea 159 print >> bowtie_sam_header_file, line
1246     print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1247 gpertea 29
1248 gpertea 159 bowtie_sam_header_file.close()
1249     temp_sam_header_file.close()
1250     return bowtie_sam_header_filename
1251 gpertea 29
1252     except OSError, o:
1253     errmsg=fail_str+str(o)+"\n"
1254     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1255     errmsg+="Error: bowtie not found on this system"
1256     die(errmsg)
1257    
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(is_bowtie2):
    """Locate bowtie/bowtie2, verify its version, and set the global
    bowtie_path.  Dies when the binary is missing or too old."""
    log_msg = "Checking for Bowtie"
    if is_bowtie2:
        log_msg += "2"
    th_log(log_msg)

    bowtie_bin = "bowtie"
    if is_bowtie2:
        bowtie_bin += "2"

    global bowtie_path
    bowtie_path=prog_path(bowtie_bin)
    bowtie_version = get_bowtie_version(is_bowtie2)
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    # compare [major, minor, point, beta] lexicographically; the old
    # element-wise '<' chain was wrong (e.g. bowtie 0.11.9 slipped past
    # the 0.12.3 requirement because 9 < 3 is false)
    elif is_bowtie2:
        if bowtie_version < [2, 0, 0, 5]:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    elif bowtie_version < [0, 12, 3, 0]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\t version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1281 gpertea 29
1282    
# Retrive a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info (usage text on stderr)
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier, e.g. "Version: 0.1.7a"
        version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        # 4th element is the optional letter suffix (or 0 when absent)
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        # returns (full version string, [major, minor, point, suffix])
        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: samtools not found on this system"
        die(errmsg)
1305    
# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate samtools, verify it is at least 0.1.7, and set the global
    samtools_path.  Dies when missing or too old."""
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # compare [major, minor, point] lexicographically; the old test
    # ('arr[1] < 1 or arr[2] < 7') wrongly rejected e.g. 0.2.0
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1317    
1318    
class FastxReader:
    """Sequential reader for FASTA/FASTQ records, with optional SOLiD
    color-space handling.

    The format is auto-detected from the first non-header line in the
    constructor; nextRecord is then bound to nextFastq or nextFasta and
    always returns a (seq_id, sequence, seq_len, quals_or_None) tuple.
    An empty seq_id signals end of input.
    """
    def __init__(self, i_file, is_color=0, fname=''):
        # i_file: an already-open file-like object; fname is used only in
        # error messages
        self.bufline=None
        self.format=None
        self.ifile=i_file
        self.nextRecord=None
        self.eof=None
        self.fname=fname
        self.lastline=None
        self.numrecords=0
        self.isColor=0
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        while hlines>0 and self.lastline[0] not in "@>" :
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        # push the record-start line back so the first nextRecord() sees it
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")

            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #   raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #       % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
                #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                # color reads may carry one extra (dummy) leading quality value
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
                #loop until qv has the same length as seq

            if self.isColor:
                # and qstrlen==seq_len :
                if qstrlen==seq_len:
                    #qual string may have a dummy qv at the beginning, should be stripped
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen!=seq_len-1:
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen :
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        ##--discard the primer base [NO]
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            #more sequence lines, or the ">" quality marker line:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor : # -- discard primer base
            seq_len-=1
            seqstr=seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        # one-line pushback buffer lets nextFasta peek at record boundaries
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline
    def ungetLine(self):
        # push the most recently read line back; only one level is supported
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader
1471    
def fa_write(fhandle, seq_id, seq):
    """
    Write to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write

    Sequence lines are wrapped at 60 characters.  Iterating by stride
    fixes the old off-by-one that emitted a spurious blank line whenever
    len(seq) was an exact multiple of the line length (or seq was empty).
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    for start in range(0, len(seq), line_len):
        fhandle.write(seq[start:start + line_len] + "\n")
1491    
1492 gpertea 29 class ZReader:
1493     def __init__(self, filename, sysparams, guess=True):
1494     self.fname=filename
1495     self.file=None
1496     self.fsrc=None
1497     self.popen=None
1498     pipecmd=[]
1499 gpertea 31 s=filename.lower()
1500     if s.endswith(".bam"):
1501 gpertea 154 # daehwan - want to ouptut mapped reads as well
1502     pipecmd=[prog_path("bam2fastx"), "--all", "-"]
1503 gpertea 31 else:
1504 gpertea 135 if guess:
1505     if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1506 gpertea 31 pipecmd=['gzip']
1507 gpertea 135 else:
1508 gpertea 31 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1509     pipecmd=['bzip2']
1510 gpertea 135 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1511 gpertea 31 die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
1512 gpertea 135 if len(pipecmd)>0:
1513     if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1514 gpertea 31 pipecmd[0]=sysparams.zipper
1515     pipecmd.extend(sysparams.zipper_opts)
1516 gpertea 135 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1517 gpertea 31 pipecmd[0]=sysparams.zipper
1518     pipecmd.extend(sysparams.zipper_opts)
1519 gpertea 135 else: #not guessing, but must still check if it's a compressed file
1520     if use_zpacker and filename.endswith(".z"):
1521 gpertea 31 pipecmd=[sysparams.zipper]
1522     pipecmd.extend(sysparams.zipper_opts)
1523 gpertea 154
1524 gpertea 135 if pipecmd:
1525     pipecmd+=['-cd']
1526 gpertea 29 if pipecmd:
1527     try:
1528     self.fsrc=open(self.fname, 'rb')
1529     self.popen=subprocess.Popen(pipecmd,
1530     preexec_fn=subprocess_setup,
1531     stdin=self.fsrc,
1532     stdout=subprocess.PIPE, close_fds=True)
1533     except Exception:
1534     die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1535     self.file=self.popen.stdout
1536     else:
1537     self.file=open(filename)
1538     def close(self):
1539     if self.fsrc: self.fsrc.close()
1540     self.file.close()
1541     if self.popen:
1542     self.popen.wait()
1543     self.popen=None
1544    
class ZWriter:
    """Open filename for writing, compressing through the configured
    zipper when use_zpacker is set.  Clients write to self.file and must
    call close() to flush the pipe."""
    def __init__(self, filename, sysparams):
        self.fname=filename
        if not use_zpacker:
            # no compression: write straight to the file
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None
            return
        # compressed output: client -> zipper stdin -> target file
        self.ftarget=open(filename, "wb")
        pipecmd=[sysparams.zipper,"-cf", "-"]
        try:
            self.popen=subprocess.Popen(pipecmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=subprocess.PIPE,
                                        stdout=self.ftarget, close_fds=True)
        except Exception:
            die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
        self.file=self.popen.stdin # client writes to this end of the pipe
    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen=None
1569    
# check_reads_format() examines the first few records in the user files
# to determines the file format
def check_reads_format(params, reads_files):
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #   die("Error: could not open file "+f_name)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    # all files must agree on FASTA vs FASTQ
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    #if seed_len != None:
    #   seed_len = max(seed_len, max_seed_len)
    #else:
    #   seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        # report which quality encoding the options imply
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            # color-space FASTA carries integer quality values
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    # return a fresh ReadParams with the detected format filled in
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.library_type,
                                   #seed_len,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)
1641    
def log_tail(logfile, lines=1):
    """Return the last `lines` lines of a text log file as a single string.

    Each returned line keeps its own trailing newline. Reads backwards from
    the end of the file in growing chunks so large logs are not read whole.

    Fixes the original implementation, which glued the tail lines together
    with "\\n".join() even though readlines() already keeps the newline on
    each line -- producing spurious blank lines whenever lines > 1.
    """
    f = open(logfile, "r")
    try:
        f.seek(0, 2)            # jump to EOF to learn the file size
        fsize = f.tell()
        span = 1024
        while True:
            start = max(0, fsize - span)
            f.seek(start, 0)
            data = f.read()
            # stop when the whole file was read, or when the chunk holds
            # more newlines than requested (so the first, possibly partial,
            # line of the chunk can never end up in the returned tail)
            if start == 0 or data.count('\n') > lines:
                break
            span *= 2           # not enough lines yet: widen the window
        tail = data.splitlines(True)[-lines:]
    finally:
        f.close()
    return "".join(tail)
1667    
# Format a timedelta as a pretty string.
def formatTD(td):
    """Render a timedelta as 'HH:MM:SS', prefixed with the day count when nonzero."""
    hh, rem = divmod(td.seconds, 3600)
    mm, ss = divmod(rem, 60)
    if td.days > 0:
        return '%d days %02d:%02d:%02d' % (td.days, hh, mm, ss)
    return '%02d:%02d:%02d' % (hh, mm, ss)
1680    
1681 gpertea 29 class PrepReadsInfo:
1682     def __init__(self, fname, side):
1683     try:
1684     f=open(fname,"r")
1685     self.min_len=int(f.readline().split("=")[-1])
1686     self.max_len=int(f.readline().split("=")[-1])
1687     self.in_count=int(f.readline().split("=")[-1])
1688     self.out_count=int(f.readline().split("=")[-1])
1689     if (self.out_count==0) or (self.max_len<16):
1690     raise Exception()
1691     except Exception, e:
1692     die(fail_str+"Error retrieving prep_reads info.")
1693     print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1694     if self.min_len < 20:
1695     print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1696    
1697 gpertea 135
1698 gpertea 154 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
1699 gpertea 135 #generate a prep_reads cmd arguments
1700     filter_cmd = [prog_path("prep_reads")]
1701     filter_cmd.extend(params.cmd())
1702    
1703     if params.read_params.reads_format == "fastq":
1704     filter_cmd += ["--fastq"]
1705     elif params.read_params.reads_format == "fasta":
1706     filter_cmd += ["--fasta"]
1707     if hits_to_filter:
1708     filter_cmd += ["--flt-hits="+hits_to_filter]
1709     if aux_file:
1710     filter_cmd += ["--aux-outfile="+aux_file]
1711 gpertea 154 if index_file:
1712     filter_cmd += ["--index-outfile="+index_file]
1713 gpertea 135 if filter_reads:
1714     filter_cmd += ["--flt-reads="+filter_reads]
1715     filter_cmd.append(reads_list)
1716     if quals_list:
1717     filter_cmd.append(quals_list)
1718     return filter_cmd
1719    
1720 gpertea 29 # Calls the prep_reads executable, which prepares an internal read library.
1721     # The read library features reads with monotonically increasing integer IDs.
1722     # prep_reads also filters out very low complexy or garbage reads as well as
1723     # polyA reads.
1724     #--> returns the tuple (internal_read_library_file, read_info)
1725 gpertea 135 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1726 gpertea 29 reads_suffix = ".fq"
1727 gpertea 154
1728     # for parallelization, we don't compress the read files
1729     do_use_zpacker = use_zpacker
1730     if params.system_params.num_threads > 1:
1731     do_use_zpacker = False
1732    
1733     if do_use_zpacker: reads_suffix += ".z"
1734    
1735 gpertea 135 output_name=side+"_kept_reads"
1736 gpertea 29 kept_reads_filename = tmp_dir + output_name + reads_suffix
1737    
1738     if os.path.exists(kept_reads_filename):
1739     os.remove(kept_reads_filename)
1740     kept_reads = open(kept_reads_filename, "wb")
1741     log_fname=logging_dir + "prep_reads.log"
1742     filter_log = open(log_fname,"w")
1743    
1744     info_file=output_dir+output_name+".info"
1745 gpertea 154 index_file=kept_reads_filename+".index"
1746    
1747     filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
1748 gpertea 29 shell_cmd = ' '.join(filter_cmd)
1749     #finally, add the compression pipe
1750     zip_cmd=[]
1751 gpertea 154 if do_use_zpacker:
1752 gpertea 29 zip_cmd=[ params.system_params.zipper ]
1753     zip_cmd.extend(params.system_params.zipper_opts)
1754     zip_cmd.extend(['-c','-'])
1755     shell_cmd +=' | '+' '.join(zip_cmd)
1756     shell_cmd += ' >' +kept_reads_filename
1757     retcode=0
1758     try:
1759     print >> run_log, shell_cmd
1760 gpertea 154 if do_use_zpacker:
1761 gpertea 29 filter_proc = subprocess.Popen(filter_cmd,
1762     stdout=subprocess.PIPE,
1763     stderr=filter_log)
1764     zip_proc=subprocess.Popen(zip_cmd,
1765     preexec_fn=subprocess_setup,
1766     stdin=filter_proc.stdout,
1767     stdout=kept_reads)
1768     filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1769     zip_proc.communicate()
1770     retcode=filter_proc.poll()
1771     if retcode==0:
1772     retcode=zip_proc.poll()
1773     else:
1774     retcode = subprocess.call(filter_cmd,
1775     stdout=kept_reads,
1776     stderr=filter_log)
1777     if retcode:
1778     die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1779    
1780     except OSError, o:
1781     errmsg=fail_str+str(o)
1782     die(errmsg+"\n"+log_tail(log_fname))
1783 gpertea 154
1784 gpertea 29 kept_reads.close()
1785     return kept_reads_filename, PrepReadsInfo(info_file, side)
1786    
1787 gpertea 135
1788 gpertea 29 # Call bowtie
1789     def bowtie(params,
1790     bwt_idx_prefix,
1791     reads_list,
1792     reads_format,
1793     num_mismatches,
1794     mapped_reads,
1795 gpertea 135 unmapped_reads,
1796     extra_output = "",
1797 gpertea 154 mapping_type = "read_against_genome",
1798 gpertea 135 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1799 gpertea 29 start_time = datetime.now()
1800     bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1801 gpertea 135 reads_file=reads_list[0]
1802     readfile_basename=getFileBaseName(reads_file)
1803 gpertea 154
1804     g_mapping, t_mapping, seg_mapping = False, False, False
1805     if mapping_type == "read_against_transcriptome":
1806     t_mapping = True
1807     elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
1808     seg_mapping = True
1809     else:
1810     g_mapping = True
1811    
1812     bowtie_str = "Bowtie"
1813     if params.bowtie2:
1814     bowtie_str += "2"
1815    
1816     if seg_mapping:
1817     if not params.bowtie2:
1818     backup_bowtie_alignment_option = params.bowtie_alignment_option
1819     params.bowtie_alignment_option = "-v"
1820    
1821 gpertea 135 if t_mapping:
1822 gpertea 154 print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with %s %s" % (start_time.strftime("%c"),
1823     readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1824 gpertea 135 else:
1825 gpertea 154 print >> sys.stderr, "[%s] Mapping %s against %s with %s %s" % (start_time.strftime("%c"),
1826     readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1827 gpertea 29 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1828 gpertea 135 #bwt_mapped=mapped_reads
1829 gpertea 67 unmapped_reads_out=None
1830 gpertea 137 if unmapped_reads:
1831 gpertea 67 unmapped_reads_out=unmapped_reads+".fq"
1832 gpertea 29 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1833 gpertea 135 if use_FIFO:
1834 gpertea 29 global unmapped_reads_fifo
1835 gpertea 67 unmapped_reads_fifo=unmapped_reads+".fifo"
1836 gpertea 29 if os.path.exists(unmapped_reads_fifo):
1837     os.remove(unmapped_reads_fifo)
1838     try:
1839     os.mkfifo(unmapped_reads_fifo)
1840     except OSError, o:
1841     die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1842 gpertea 67 unmapped_reads_out+=".z"
1843 gpertea 29
1844     # Launch Bowtie
1845     try:
1846     bowtie_cmd = [bowtie_path]
1847     if reads_format == "fastq":
1848     bowtie_cmd += ["-q"]
1849     elif reads_format == "fasta":
1850     bowtie_cmd += ["-f"]
1851    
1852     if params.read_params.color:
1853     bowtie_cmd += ["-C", "--col-keepends"]
1854     if unmapped_reads:
1855 gpertea 135 bowtie_cmd += ["--un", unmapped_reads_out]
1856    
1857     unzip_cmd=None
1858     bam_input=False
1859 gpertea 154 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1860 gpertea 135 bam_input=True
1861     unzip_cmd=[ prog_path('bam2fastx') ]
1862     if reads_format:
1863 gpertea 154 unzip_cmd.append("--" + reads_format)
1864 gpertea 159 unzip_cmd+=[reads_list[0]]
1865 gpertea 137
1866     if use_zpacker and (unzip_cmd is None):
1867 gpertea 29 unzip_cmd=[ params.system_params.zipper ]
1868     unzip_cmd.extend(params.system_params.zipper_opts)
1869 gpertea 154 unzip_cmd+=['-cd']
1870 gpertea 29
1871     fifo_pid=None
1872     if use_FIFO:
1873 gpertea 135 unm_zipcmd=[ params.system_params.zipper ]
1874     unm_zipcmd.extend(params.system_params.zipper_opts)
1875     unm_zipcmd+=['-c']
1876     print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1877     fifo_pid=os.fork()
1878     if fifo_pid==0:
1879     def on_sig_exit(sig, func=None):
1880 gpertea 29 os._exit(os.EX_OK)
1881 gpertea 135 signal.signal(signal.SIGTERM, on_sig_exit)
1882     subprocess.call(unm_zipcmd,
1883     stdin=open(unmapped_reads_fifo, "r"),
1884     stdout=open(unmapped_reads_out, "wb"))
1885     os._exit(os.EX_OK)
1886 gpertea 29
1887     fix_map_cmd = [prog_path('fix_map_ordering')]
1888 gpertea 154
1889     if params.bowtie2:
1890     if t_mapping or g_mapping:
1891     max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1892     max_penalty, min_penalty = int(max_penalty), int(min_penalty)
1893     min_score = (max_penalty - 1) * num_mismatches
1894     fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1895    
1896 gpertea 137 samzip_cmd=None
1897 gpertea 154
1898     # daehwan - just use bam format for multi-threading.
1899     if unmapped_reads or params.system_params.num_threads >= 1:
1900 gpertea 29 #mapping on the genome
1901     #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1902     mapped_reads += ".bam"
1903 gpertea 154
1904     temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1905     fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1906     if not params.read_params.color and unmapped_reads:
1907     unmapped_reads_out = unmapped_reads+".bam"
1908     fix_map_cmd += [unmapped_reads_out]
1909 gpertea 29 else:
1910     #mapping on segment_juncs (spliced potential junctions)
1911 gpertea 135 # don't use SAM headers, could be slow -- better use compressed SAM directly
1912 gpertea 29 mapped_reads += ".sam"
1913     fix_map_cmd += ["-"]
1914     if use_zpacker:
1915     fix_map_cmd += ["-"]
1916 gpertea 137 samzip_cmd=[ params.system_params.zipper ]
1917     samzip_cmd.extend(params.system_params.zipper_opts)
1918     samzip_cmd += ["-c"]
1919 gpertea 29 mapped_reads += ".z"
1920     else:
1921     fix_map_cmd += [mapped_reads]
1922 gpertea 137
1923 gpertea 154 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1924    
1925 gpertea 135 if t_mapping:
1926 gpertea 159 max_hits = params.t_max_hits
1927     elif seg_mapping:
1928     max_hits = params.max_seg_hits
1929     else:
1930     max_hits = params.max_hits
1931    
1932 gpertea 154 if num_mismatches > 3:
1933     num_mismatches = 3
1934 gpertea 29
1935 gpertea 154 if params.bowtie2:
1936     if seg_mapping:
1937     # since bowtie2 does not suppress reads that map to many places,
1938     # we suppress those in segment_juncs and long_spanning_reads.
1939     bowtie_cmd += ["-k", str(max_hits + 1)]
1940     else:
1941     bowtie_cmd += ["-k", str(max_hits)]
1942    
1943     bowtie2_params = params.bowtie2_params
1944     if seg_mapping:
1945     # after intensive testing,
1946     # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
1947     # but a chance for further improvements remains.
1948     bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1949     bowtie_cmd += ["-i", "C,2,0"]
1950     bowtie_cmd += ["-L", "20"]
1951     # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
1952     else:
1953     bowtie2_preset = ""
1954     if bowtie2_params.very_fast:
1955     bowtie2_preset = "--very-fast"
1956     elif bowtie2_params.fast:
1957     bowtie2_preset = "--fast"
1958     elif bowtie2_params.sensitive:
1959     bowtie2_preset = "--sensitive"
1960     elif bowtie2_params.very_sensitive:
1961     bowtie2_preset = "--very-sensitive"
1962    
1963     if bowtie2_preset != "":
1964     bowtie_cmd += [bowtie2_preset]
1965     else:
1966     bowtie_cmd += ["-D", str(bowtie2_params.D),
1967     "-R", str(bowtie2_params.R),
1968     "-N", str(bowtie2_params.N),
1969     "-L", str(bowtie2_params.L),
1970     "-i", bowtie2_params.i]
1971    
1972     # "--n-ceil" is not correctly parsed in Bowtie2,
1973     # I (daehwan) already talked to Ben who will fix the problem.
1974     bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
1975     "--gbar", str(bowtie2_params.gbar),
1976     "--mp", bowtie2_params.mp,
1977     "--np", str(bowtie2_params.np),
1978     "--rdg", bowtie2_params.rdg,
1979     "--rfg", bowtie2_params.rfg,
1980     "--score-min", bowtie2_params.score_min]
1981    
1982 gpertea 135 else:
1983 gpertea 154 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1984     "-k", str(max_hits),
1985     "-m", str(max_hits),
1986     "-S"]
1987    
1988     bowtie_cmd += ["-p", str(params.system_params.num_threads)]
1989    
1990     if params.bowtie2: #always use headerless SAM file
1991     bowtie_cmd += ["--sam-no-hd"]
1992     else:
1993     bowtie_cmd += ["--sam-nohead"]
1994    
1995     if not params.bowtie2:
1996     if multihits_out:
1997     bowtie_cmd += ["--max", multihits_out]
1998     else:
1999     bowtie_cmd += ["--max", "/dev/null"]
2000    
2001     if params.bowtie2:
2002     bowtie_cmd += ["-x"]
2003    
2004 gpertea 135 bowtie_cmd += [ bwt_idx_prefix ]
2005 gpertea 29 bowtie_proc=None
2006     shellcmd=""
2007     unzip_proc=None
2008 gpertea 135 samzip_proc=None
2009 gpertea 159
2010 gpertea 135 if multihits_out:
2011     #special prefilter bowtie run: we use prep_reads on the fly
2012     #in order to get multi-mapped reads to exclude later
2013     prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2014     preplog_fname=logging_dir + "prep_reads_prefilter.log"
2015     prepfilter_log = open(preplog_fname,"w")
2016     unzip_proc = subprocess.Popen(prep_cmd,
2017     stdout=subprocess.PIPE,
2018     stderr=prepfilter_log)
2019     shellcmd=' '.join(prep_cmd) + "|"
2020     else:
2021     z_input=use_zpacker and reads_file.endswith(".z")
2022     if z_input:
2023     unzip_proc = subprocess.Popen(unzip_cmd,
2024     stdin=open(reads_file, "rb"),
2025     stdout=subprocess.PIPE)
2026     shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2027 gpertea 29 else:
2028 gpertea 135 #must be uncompressed fastq input (unmapped reads from a previous run)
2029     #or a BAM file with unmapped reads
2030     if bam_input:
2031     unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
2032     shellcmd=' '.join(unzip_cmd) + "|"
2033 gpertea 159 else:
2034     bowtie_cmd += [reads_file]
2035 gpertea 135 bowtie_proc = subprocess.Popen(bowtie_cmd,
2036     stdout=subprocess.PIPE,
2037     stderr=bwt_log)
2038     if unzip_proc:
2039     #input is compressed OR prep_reads is used as a filter
2040     bowtie_cmd += ['-']
2041     bowtie_proc = subprocess.Popen(bowtie_cmd,
2042     stdin=unzip_proc.stdout,
2043     stdout=subprocess.PIPE,
2044     stderr=bwt_log)
2045     unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2046    
2047     shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2048 gpertea 137 if samzip_cmd:
2049 gpertea 135 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2050     fix_order_proc = subprocess.Popen(fix_map_cmd,
2051 gpertea 154 stdin=bowtie_proc.stdout,
2052     stdout=subprocess.PIPE)
2053 gpertea 135 bowtie_proc.stdout.close() # needed?
2054     samzip_proc = subprocess.Popen(samzip_cmd,
2055     preexec_fn=subprocess_setup,
2056     stdin=fix_order_proc.stdout,
2057     stdout=open(mapped_reads, "wb"))
2058     fix_order_proc.stdout.close()
2059 gpertea 137 else:
2060 gpertea 135 #write BAM output directly
2061     fix_order_proc = subprocess.Popen(fix_map_cmd,
2062     stdin=bowtie_proc.stdout)
2063     # stdout=open(mapped_reads, "w"))
2064     bowtie_proc.stdout.close()
2065 gpertea 137
2066 gpertea 135 # shellcmd += " > " + mapped_reads
2067 gpertea 29 print >> run_log, shellcmd
2068 gpertea 135 if samzip_proc:
2069     samzip_proc.communicate()
2070 gpertea 29 else:
2071     fix_order_proc.communicate()
2072     if use_FIFO:
2073 gpertea 67 if fifo_pid and not os.path.exists(unmapped_reads_out):
2074 gpertea 29 try:
2075     os.kill(fifo_pid, signal.SIGTERM)
2076     except:
2077     pass
2078 gpertea 154
2079 gpertea 29 except OSError, o:
2080     die(fail_str+"Error: "+str(o))
2081    
2082     # Success
2083     #finish_time = datetime.now()
2084     #duration = finish_time - start_time
2085     #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2086     if use_FIFO:
2087     try:
2088     os.remove(unmapped_reads_fifo)
2089     except:
2090     pass
2091 gpertea 135 if multihits_out and not os.path.exists(multihits_out):
2092     open(multihits_out, "w").close()
2093 gpertea 29
2094 gpertea 154 if seg_mapping:
2095     if not params.bowtie2:
2096     params.bowtie_alignment_option = backup_bowtie_alignment_option
2097 gpertea 29
2098 gpertea 154 return (mapped_reads, unmapped_reads_out)
2099 gpertea 29
2100    
2101     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2102     def get_gtf_juncs(gff_annotation):
2103 gpertea 135 th_log("Reading known junctions from GTF file")
2104 gpertea 29 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2105    
2106     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2107    
2108     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2109     gtf_juncs_out = open(gtf_juncs_out_name, "w")
2110    
2111     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2112     try:
2113 gpertea 135 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2114 gpertea 29 retcode = subprocess.call(gtf_juncs_cmd,
2115     stderr=gtf_juncs_log,
2116     stdout=gtf_juncs_out)
2117     # cvg_islands returned an error
2118     if retcode == 1:
2119     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
2120     return (False, gtf_juncs_out_name)
2121     elif retcode != 0:
2122     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2123 gpertea 154
2124 gpertea 29 # cvg_islands not found
2125     except OSError, o:
2126     errmsg=fail_str+str(o)+"\n"
2127     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2128     errmsg+="Error: gtf_juncs not found on this system"
2129     die(errmsg)
2130     return (True, gtf_juncs_out_name)
2131    
2132     # Call bowtie-build on the FASTA file of sythetic splice junction sequences
2133 gpertea 154 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2134 gpertea 135 th_log("Indexing splices")
2135 gpertea 29 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2136    
2137     #user_splices_out_prefix = output_dir + "user_splices_idx"
2138    
2139 gpertea 154 if is_bowtie2:
2140     bowtie_build_cmd = [prog_path("bowtie2-build")]
2141     else:
2142     bowtie_build_cmd = [prog_path("bowtie-build")]
2143    
2144 gpertea 29 if color:
2145     bowtie_build_cmd += ["-C"]
2146    
2147     bowtie_build_cmd += [external_splice_prefix + ".fa",
2148     external_splice_prefix]
2149     try:
2150     print >> run_log, " ".join(bowtie_build_cmd)
2151     retcode = subprocess.call(bowtie_build_cmd,
2152     stdout=bowtie_build_log)
2153    
2154     if retcode != 0:
2155     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2156     except OSError, o:
2157     errmsg=fail_str+str(o)+"\n"
2158     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2159     errmsg+="Error: bowtie-build not found on this system"
2160     die(errmsg)
2161     return external_splice_prefix
2162    
2163     # Build a splice index from a .juncs file, suitable for use with specified read
2164     # (or read segment) lengths
2165 gpertea 154 def build_juncs_index(is_bowtie2,
2166     min_anchor_length,
2167 gpertea 135 max_seg_len,
2168 gpertea 29 juncs_prefix,
2169     external_juncs,
2170     external_insertions,
2171     external_deletions,
2172 gpertea 154 external_fusions,
2173 gpertea 29 reference_fasta,
2174     color):
2175 gpertea 135 th_log("Retrieving sequences for splices")
2176 gpertea 29
2177     juncs_file_list = ",".join(external_juncs)
2178     insertions_file_list = ",".join(external_insertions)
2179     deletions_file_list = ",".join(external_deletions)
2180 gpertea 154 fusions_file_list = ",".join(external_fusions)
2181 gpertea 29
2182 gpertea 159 # do not use insertions and deletions in case of Bowtie2
2183     if is_bowtie2:
2184     insertions_file_list = "/dev/null"
2185     deletions_file_list = "/dev/null"
2186    
2187 gpertea 29 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2188    
2189     external_splices_out_prefix = tmp_dir + juncs_prefix
2190     external_splices_out_name = external_splices_out_prefix + ".fa"
2191    
2192     external_splices_out = open(external_splices_out_name, "w")
2193     # juncs_db_cmd = [bin_dir + "juncs_db",
2194     juncs_db_cmd = [prog_path("juncs_db"),
2195     str(min_anchor_length),
2196 gpertea 135 str(max_seg_len),
2197 gpertea 29 juncs_file_list,
2198     insertions_file_list,
2199     deletions_file_list,
2200 gpertea 154 fusions_file_list,
2201 gpertea 29 reference_fasta]
2202     try:
2203     print >> run_log, " ".join(juncs_db_cmd)
2204     retcode = subprocess.call(juncs_db_cmd,
2205     stderr=juncs_db_log,
2206     stdout=external_splices_out)
2207    
2208     if retcode != 0:
2209     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2210     # juncs_db not found
2211     except OSError, o:
2212     errmsg=fail_str+str(o)+"\n"
2213     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2214     errmsg+="Error: juncs_db not found on this system"
2215     die(errmsg)
2216    
2217 gpertea 154 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2218 gpertea 29 return external_splices_out_prefix
2219    
2220 gpertea 154 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2221 gpertea 135 """ Build a bowtie index from a FASTA file.
2222    
2223     Arguments:
2224     - `fasta_fname`: File path to FASTA file.
2225     - `out_dir`: Output directory to place index in. (includes os.sep)
2226    
2227     Returns:
2228     - The path to the Bowtie index.
2229     """
2230     bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2231 gpertea 154
2232     if is_bowtie2:
2233     bowtie_idx_cmd = [prog_path("bowtie2-build")]
2234     else:
2235     bowtie_idx_cmd = [prog_path("bowtie-build")]
2236    
2237     if color:
2238     bowtie_idx_cmd += ["-C"]
2239    
2240     bowtie_idx_cmd += [fasta_fname,
2241     bwt_idx_path]
2242 gpertea 135 try:
2243     th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2244     print >> run_log, " ".join(bowtie_idx_cmd)
2245     retcode = subprocess.call(bowtie_idx_cmd,
2246     stdout=open(os.devnull, "w"),
2247     stderr=open(os.devnull, "w"))
2248     if retcode != 0:
2249     die(fail_str + "Error: Couldn't build bowtie index with err = "
2250     + str(retcode))
2251     except OSError, o:
2252     errmsg=fail_str+str(o)+"\n"
2253     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2254     errmsg+="Error: bowtie-build not found on this system"
2255     die(errmsg)
2256    
2257     return bwt_idx_path
2258    
2259 gpertea 29 # Print out the sam header, embedding the user's specified library properties.
2260     # FIXME: also needs SQ dictionary lines
2261     def write_sam_header(read_params, sam_file):
2262     print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2263     if read_params.read_group_id and read_params.sample_id:
2264     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2265     read_params.sample_id)
2266     if read_params.library_id:
2267     rg_str += "\tLB:%s" % read_params.library_id
2268     if read_params.description:
2269     rg_str += "\tDS:%s" % read_params.description
2270     if read_params.seq_platform_unit:
2271     rg_str += "\tPU:%s" % read_params.seq_platform_unit
2272     if read_params.seq_center:
2273     rg_str += "\tCN:%s" % read_params.seq_center
2274     if read_params.mate_inner_dist:
2275     rg_str += "\tPI:%s" % read_params.mate_inner_dist
2276     if read_params.seq_run_date:
2277     rg_str += "\tDT:%s" % read_params.seq_run_date
2278     if read_params.seq_platform:
2279     rg_str += "\tPL:%s" % read_params.seq_platform
2280    
2281     print >> sam_file, rg_str
2282     print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
2283    
# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
    # Runs the tophat_reports binary over the left/right mappings, then
    # (depending on report_params) sorts/merges the per-thread BAM parts or
    # converts the result to SAM. Returns the junctions.bed path.
    th_log("Reporting output tracks")
    left_maps, right_maps = mappings
    left_reads, right_reads = readfiles
    # keep only map files that exist and are non-trivial (>25 bytes)
    left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)
    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    fusions = output_dir + "fusions.out"
    report_cmd = [report_cmdpath]

    # tophat_reports writes its alignments under tmp_dir first; the final
    # accepted_hits.bam is produced below by sort/merge/rename
    alignments_output_filename = tmp_dir + "accepted_hits"

    report_cmd.extend(params.cmd())
    report_cmd += ["--sam-header", sam_header_filename]
    if params.report_secondary_alignments:
        report_cmd += ["--report-secondary-alignments"]

    report_cmd.extend(["--samtools="+samtools_path])
    # positional arguments expected by tophat_reports, in order
    report_cmd.extend([ref_fasta,
                       junctions,
                       insertions,
                       deletions,
                       fusions,
                       alignments_output_filename,
                       left_maps,
                       left_reads])

    if len(right_maps) > 0 and right_reads:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)

    # -- tophat_reports now produces (uncompressed) BAM stream,
    try:
        if params.report_params.convert_bam:
            if params.report_params.sort_bam:
                print >> run_log, " ".join(report_cmd)
                report_proc=subprocess.call(report_cmd,
                                    preexec_fn=subprocess_setup,
                                    stderr=report_log)

                # collect the per-thread BAM parts written by tophat_reports;
                # they are numbered consecutively from 0, so stop at the gap
                bam_parts = []
                for i in range(params.system_params.num_threads):
                    bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
                    if os.path.exists(bam_part_filename):
                        bam_parts.append(bam_part_filename)
                    else:
                        break

                # sort all parts in parallel, one forked child per part
                num_bam_parts = len(bam_parts)
                pids = [0 for i in range(num_bam_parts)]
                sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
                for i in range(num_bam_parts):
                    # samtools sort takes the output prefix (no .bam suffix)
                    bamsort_cmd = [samtools_path,
                                   "sort",
                                   bam_parts[i],
                                   sorted_bam_parts[i]]

                    sorted_bam_parts[i] += ".bam"
                    print >> run_log, " ".join(bamsort_cmd)
                    pid = os.fork()
                    if pid == 0:
                        subprocess.call(bamsort_cmd,
                                stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
                        os._exit(os.EX_OK)
                    else:
                        pids[i] = pid

                # wait for every sort child to finish
                for i in range(len(pids)):
                    if pids[i] > 0:
                        os.waitpid(pids[i], 0)
                        pids[i] = 0

                # for bam_part in bam_parts:
                #     os.remove(bam_part)

                if num_bam_parts > 1:
                    # merge the sorted parts under the final SAM header
                    bammerge_cmd = [samtools_path,
                                    "merge",
                                    "-h",
                                    sam_header_filename,
                                    "%s.bam" % accepted_hits]
                    bammerge_cmd += sorted_bam_parts

                    print >> run_log, " ".join(bammerge_cmd)
                    subprocess.call(bammerge_cmd,
                            stderr=open(logging_dir + "reports.samtools_merge.log", "w"))

                    for sorted_bam_part in sorted_bam_parts:
                        os.remove(sorted_bam_part)
                else:
                    # a single part just becomes the final output
                    os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")

            else:
                # BAM output requested without sorting
                print >> run_log, " ".join(report_cmd)
                report_proc=subprocess.call(report_cmd,
                                    preexec_fn=subprocess_setup,
                                    stdout=open(accepted_hits,"wb"),
                                    stderr=report_log)
                os.rename(accepted_hits, output_dir + "accepted_hits.bam")
        else:
            # SAM output requested: capture the BAM stream, then convert
            print >> run_log, " ".join(report_cmd)
            report_proc=subprocess.call(report_cmd,
                                preexec_fn=subprocess_setup,
                                stdout=open(accepted_hits,"wb"),
                                stderr=report_log)
            tmp_sam = output_dir + "accepted_hits.sam"

            bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
            print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
            bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
            tmp_sam_file = open(tmp_sam, "w")
            ret = subprocess.call(bam_to_sam_cmd,
                          stdout=tmp_sam_file,
                          stderr=bam_to_sam_log)
            tmp_sam_file.close()
            os.remove(accepted_hits)

    except OSError, o:
        die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    return junctions
2418 gpertea 29
2419    
2420     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2421     # for each segment This function needs to be fixed to support mixed read length
2422     # inputs
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    """Extend out_segf with ZWriters for segment files num_files_prev+1 .. num_files.

    Segment file names are 1-based: <prefix>_seg<N><extension>.
    """
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        seg_fname = "%s_seg%d%s" % (prefix, seg_idx, extension)
        out_segf.append(ZWriter(seg_fname, params.system_params))
2429    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split each read in a FASTQ/FASTA file into fixed-length segments.

    Writes one output file per segment index (created lazily via
    open_output_files as longer reads are encountered) and returns the list
    of segment file names. Segment read names are tagged with
    "|offset:seg_num:total_segs" so the segments can be re-joined later.
    NOTE: mixed read lengths are only partially supported (see caller comment).
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"

    def convert_color_to_bp(color_seq):
        # Decode a SOLiD color-space sequence (leading primer base + digits)
        # into nucleotides, one base at a time.
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # Inverse of convert_color_to_bp: encode adjacent base pairs as colors.
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # Write one read, split at the given offsets, to the per-segment files.
        # offsets has num_segments+1 entries; segment i spans offsets[i]..offsets[i+1].
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # Skip reads with an invalid color at any segment boundary.
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            # Tag the segment name with offset, segment index and total segments
            # so the joiner can reassemble the full read later.
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # Color-space segments get a decoded primer base prepended.
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # Four-state (FASTQ) or two-state (FASTA) line parser:
    # 0 = name line, 1 = sequence, 2 = '+' separator, 3 = qualities.
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length  # Python 2 integer division
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # Fold the short remainder into the final segment.
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # Lazily open additional segment files if this read needs more
            # segments than any previous read.
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            # FASTQ '+' separator line; nothing to keep.
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
2562    
2563     # Find possible splice junctions using the "closure search" strategy, and report
2564     # them in closures.juncs. Calls the executable closure_juncs
2565     def junctions_from_closures(params,
2566 gpertea 159 sam_header_filename,
2567 gpertea 29 left_maps,
2568     right_maps,
2569     ref_fasta):
2570 gpertea 135 th_log("Searching for junctions via mate-pair closures")
2571 gpertea 29
2572 gpertea 154
2573 gpertea 29 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2574     #if len(maps) == 0:
2575     # return None
2576     #print >> sys.stderr, left_maps
2577     slash = left_maps[0].rfind('/')
2578     juncs_out = ""
2579     if slash != -1:
2580     juncs_out += left_maps[0][:slash+1]
2581 gpertea 154 fusions_out = juncs_out
2582    
2583 gpertea 29 juncs_out += "closure.juncs"
2584 gpertea 154 fusions_out += "closure.fusions"
2585 gpertea 29
2586     juncs_log = open(logging_dir + "closure.log", "w")
2587     juncs_cmdpath=prog_path("closure_juncs")
2588     juncs_cmd = [juncs_cmdpath]
2589    
2590     left_maps = ','.join(left_maps)
2591     right_maps = ','.join(right_maps)
2592    
2593     juncs_cmd.extend(params.cmd())
2594 gpertea 159 juncs_cmd.extend(["--sam-header", sam_header_filename,
2595     juncs_out,
2596 gpertea 154 fusions_out,
2597 gpertea 29 ref_fasta,
2598     left_maps,
2599     right_maps])
2600     try:
2601     print >> run_log, ' '.join(juncs_cmd)
2602     retcode = subprocess.call(juncs_cmd,
2603     stderr=juncs_log)
2604    
2605     # spanning_reads returned an error
2606     if retcode != 0:
2607     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2608     # cvg_islands not found
2609     except OSError, o:
2610     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2611     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2612     die(str(o))
2613     return [juncs_out]
2614    
2615     # Find possible junctions by examining coverage and split segments in the initial
2616     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2617     # segment.insertions, and segment.deletions. Calls the executable
2618     # segment_juncs
def junctions_from_segments(params,
                            sam_header_filename,
                            left_reads,
                            left_reads_map,
                            left_seg_maps,
                            right_reads,
                            right_reads_map,
                            right_seg_maps,
                            unmapped_reads,
                            reads_format,
                            ref_fasta):
    """Find junctions, indels and fusions by examining segment mappings.

    Runs the external segment_juncs program on the initial and per-segment
    maps and returns [juncs_out, insertions_out, deletions_out, fusions_out]
    (files named segment.juncs / .insertions / .deletions / .fusions, placed
    alongside the first left segment map). Dies on failure.
    Note: reads_format is currently unused here.
    """
    # Only log when the segments differ from the initial full-read map
    # (i.e. the reads were actually split).
    if left_reads_map != left_seg_maps[0]:
        th_log("Searching for junctions via segment mapping")
    out_path=getFileDir(left_seg_maps[0])
    juncs_out=out_path+"segment.juncs"
    insertions_out=out_path+"segment.insertions"
    deletions_out =out_path+"segment.deletions"
    fusions_out = out_path+"segment.fusions"

    left_maps = ','.join(left_seg_maps)
    log_fname = logging_dir + "segment_juncs.log"
    segj_log = open(log_fname, "w")
    segj_cmd = [prog_path("segment_juncs")]

    segj_cmd.extend(params.cmd())
    # segment_juncs takes positional arguments; order matters here.
    segj_cmd.extend(["--sam-header", sam_header_filename,
                     "--ium-reads", ",".join(unmapped_reads),
                     ref_fasta,
                     juncs_out,
                     insertions_out,
                     deletions_out,
                     fusions_out,
                     left_reads,
                     left_reads_map,
                     left_maps])
    # Right-side arguments are only appended for paired-end runs.
    if right_seg_maps:
        right_maps = ','.join(right_seg_maps)
        segj_cmd.extend([right_reads, right_reads_map, right_maps])
    try:
        print >> run_log, " ".join(segj_cmd)
        retcode = subprocess.call(segj_cmd,
                                  preexec_fn=subprocess_setup,
                                  stderr=segj_log)

        # spanning_reads returned an error
        if retcode != 0:
            die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))

    # cvg_islands not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
        die(str(o))

    return [juncs_out, insertions_out, deletions_out, fusions_out]
2674 gpertea 29
2675     # Joins mapped segments into full-length read alignments via the executable
2676     # long_spanning_reads
def join_mapped_segments(params,
                         sam_header_filename,
                         reads,
                         ref_fasta,
                         possible_juncs,
                         possible_insertions,
                         possible_deletions,
                         possible_fusions,
                         contig_seg_maps,
                         spliced_seg_maps,
                         alignments_out_name):
    """Join per-segment hits into full-length read alignments.

    Invokes the external long_spanning_reads program with the candidate
    junction/indel/fusion databases and the contiguous (and optionally
    spliced) segment maps; alignments are written to alignments_out_name.
    Dies if the program cannot be started.
    """
    rn=""
    if len(contig_seg_maps)>1:
        th_log("Joining segment hits")
        rn=".segs"  # distinguishes the log file for the multi-segment case
    else:
        th_log("Processing bowtie hits")
    # long_spanning_reads expects comma-separated file lists.
    contig_seg_maps = ','.join(contig_seg_maps)

    possible_juncs = ','.join(possible_juncs)
    possible_insertions = ",".join(possible_insertions)
    possible_deletions = ",".join(possible_deletions)
    possible_fusions = ",".join(possible_fusions)

    log_fname=logging_dir + "long_spanning_reads"+rn+".log"
    align_log = open(log_fname, "w")
    align_cmd = [prog_path("long_spanning_reads")]

    align_cmd.extend(params.cmd())
    align_cmd += ["--sam-header", sam_header_filename]

    # Forward the bowtie2 scoring parameters (each stored as "a,b" strings).
    b2_params = params.bowtie2_params
    max_penalty, min_penalty = b2_params.mp.split(',')
    align_cmd += ["--bowtie2-max-penalty", max_penalty,
                  "--bowtie2-min-penalty", min_penalty]

    align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]

    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
    align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
                  "--bowtie2-read-gap-cont", read_gap_cont]

    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
    align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
                  "--bowtie2-ref-gap-cont", ref_gap_cont]

    # Positional arguments; order is significant for long_spanning_reads.
    align_cmd.append(ref_fasta)
    align_cmd.extend([reads,
                      possible_juncs,
                      possible_insertions,
                      possible_deletions,
                      possible_fusions,
                      alignments_out_name,
                      contig_seg_maps])

    # Spliced segment maps are an optional trailing argument.
    if spliced_seg_maps:
        spliced_seg_maps = ','.join(spliced_seg_maps)
        align_cmd.append(spliced_seg_maps)

    try:
        print >> run_log, " ".join(align_cmd)
        join_proc=subprocess.call(align_cmd,
                                  stderr=align_log)

    except OSError, o:
        die(fail_str+"Error: "+str(o))
2743    
2744     # This class collects spliced and unspliced alignments for each of the
2745     # left and right read files provided by the user.
class Maps:
    """Per-side (left or right) record of mapping artifacts.

    Attributes:
      unspliced_sam - alignments of the full-length reads
      seg_maps      - list of per-segment alignment files
      unmapped_segs - list of per-segment unmapped-read files
      segs          - list of the segment read files themselves
    """
    def __init__(self,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        # Plain record: just stash every argument as an attribute.
        self.segs = segs
        self.unmapped_segs = unmapped_segs
        self.seg_maps = seg_maps
        self.unspliced_sam = unspliced_sam
2756    
2757 gpertea 135 # Map2GTF stuff
def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
    """Convert transcriptome-coordinate alignments to genomic coordinates.

    Runs the external map2gtf program; dies if it fails or cannot start.

    Arguments:
    - `params`: TopHat parameters
    - `sam_header_filename`: SAM header file for the genome index
    - `gtf_fname`: File name pointing to the annotation.
    - `reads`: The reads to convert coords (in Bowtie format).
    - `out_fname`: The file name pointing to the output.
    """
    m2g_cmd = [prog_path("map2gtf")]
    m2g_cmd.extend(params.cmd())
    m2g_cmd += ["--sam-header", sam_header_filename]
    m2g_cmd.append(gtf_fname)
    m2g_cmd.append(reads) #could be BAM file
    m2g_cmd.append(out_fname)
    fbasename = getFileBaseName(reads)
    # map2gtf output/errors are logged per input file.
    m2g_log = logging_dir + "m2g_" + fbasename + ".out"
    m2g_err = logging_dir + "m2g_" + fbasename + ".err"

    try:
        th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
        print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
        ret = subprocess.call(m2g_cmd,
                              stdout=open(m2g_log, "w"),
                              stderr=open(m2g_err, "w"))
        if ret != 0:
            die(fail_str + " Error: map2gtf returned an error")
    except OSError, o:
        err_msg = fail_str + str(o)
        die(err_msg + "\n")
2788    
2789    
2790     def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2791     """ Make the transcriptome from a GTF.
2792    
2793     Arguments:
2794     - `trans_gtf`:
2795     - `genome`:
2796     - `out_fname`:
2797     """
2798     # TODO: Call gtf_to_fasta
2799     g2f_cmd = [prog_path("gtf_to_fasta")]
2800     g2f_cmd.extend(params.cmd())
2801     g2f_cmd.append(trans_gtf)
2802     g2f_cmd.append(genome)
2803     g2f_cmd.append(out_fname)
2804    
2805     g2f_log = logging_dir + "g2f.out"
2806     g2f_err = logging_dir + "g2f.err"
2807    
2808     try:
2809     print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2810     ret = subprocess.call(g2f_cmd,
2811     stdout = open(g2f_log, "w"),
2812     stderr = open(g2f_err, "w"))
2813     if ret != 0:
2814     die(fail_str + " Error: gtf_to_fasta returned an error.")
2815     except OSError, o:
2816     err_msg = fail_str + str(o)
2817     die(err_msg + "\n")
2818    
def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function: align reads against the transcriptome.

    Builds (or reuses) a transcriptome Bowtie index from the GTF annotation,
    maps each read file against it, then converts the hits back to genomic
    coordinates via map2gtf.

    Arguments:
    - `params`: The TopHat parameters.
    - `genome_sam_header_filename`: SAM header for the genome index.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).

    Returns (sam_gtf_list, unmapped_gtf_list), each padded with None to
    length 2 when only one side was mapped.
    """
    test_input_file(params.gff_annotation)

    # th_log("Reading in GTF file: " + params.gff_annotation)
    # transcripts = gtf_to_transcripts(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    m2g_bwt_idx = None
    t_out_dir = tmp_dir
    if params.transcriptome_index and not params.transcriptome_outdir:
        # Reuse the user-supplied, pre-built transcriptome index.
        m2g_bwt_idx = params.transcriptome_index
        th_log("Using pre-built transcriptome index..")
    else:
        # Generate the transcriptome FASTA and build a fresh index for it.
        th_log("Creating transcriptome data files..")
        if params.transcriptome_outdir:
            t_out_dir=params.transcriptome_outdir+"/"
        m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)

    get_index_sam_header(params, m2g_bwt_idx)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF coordinates
    for reads in [left_reads, right_reads]:
        # Skip a missing side, or a file too small to hold any real read.
        if reads == None or os.path.getsize(reads) < 25 :
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        #if use_zpacker:
        #    mapped_gtf_out+=".z"

        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
        #if use_BWT_FIFO:
        #    unmapped_gtf += ".z"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", "read_against_transcriptome")
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # Lift the transcriptome hits back onto genomic coordinates.
    sam_gtf_list = []
    for reads in mapped_gtf_list:
        fbasename = getFileBaseName(reads)
        sam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           genome_sam_header_filename,
                           params.gff_annotation,
                           reads,
                           sam_out_fname)
        sam_gtf_list.append(sam_out_fname)

    # Pad to two entries so callers can always unpack (left, right).
    if len(sam_gtf_list) < 2:
        sam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (sam_gtf_list, unmapped_gtf_list)
2892     # end Map2GTF
2893    
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    """Extract mappings and unmapped reads from the prefilter results.

    Uses a special prep_reads invocation (triggered by --flt-hits) to split
    the prefilter data for read side `ri` (0=left, 1=right) into
    out_mappings and out_unmapped; the unmapped output is optionally piped
    through the external compressor. Returns (out_mappings, out_unmapped).
    """
    ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
    ##
    #this is accomplished by a special prep_reads usage (triggered by --flt-hits)

    prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                            out_mappings,
                            target_reads,
                            params.preflt_data[ri].mappings)
    um_reads = open(out_unmapped, "wb")
    log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
    filter_log = open(log_fname,"w")

    # shell_cmd is only for logging; the actual pipeline is run via Popen.
    shell_cmd = " ".join(prep_cmd)
    #add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + out_unmapped
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            # prep_reads | zipper > out_unmapped
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stdout=um_reads)
            prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # Check the producer first, then the compressor.
            retcode=prep_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(prep_cmd,
                                      stdout=um_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    um_reads.close()

    return (out_mappings, out_unmapped)
2945    
2946    
2947 gpertea 29 # The main aligment routine of TopHat. This function executes most of the
2948     # workflow producing a set of candidate alignments for each cDNA fragment in a
2949     # pair of SAM alignment files (for paired end reads).
2950     def spliced_alignment(params,
2951     bwt_idx_prefix,
2952     sam_header_filename,
2953     ref_fasta,
2954     read_len,
2955     segment_len,
2956 gpertea 135 prepared_reads,
2957 gpertea 29 user_supplied_junctions,
2958     user_supplied_insertions,
2959     user_supplied_deletions):
2960    
2961     possible_juncs = []
2962     possible_juncs.extend(user_supplied_junctions)
2963    
2964     possible_insertions = []
2965     possible_insertions.extend(user_supplied_insertions)
2966     possible_deletions = []
2967     possible_deletions.extend(user_supplied_deletions)
2968 gpertea 154 possible_fusions = []
2969 gpertea 29
2970 gpertea 135 left_reads, right_reads = prepared_reads
2971    
2972     maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data
2973 gpertea 154
2974 gpertea 29 #single_segments = False
2975    
2976 gpertea 135 # Before anything, map the reads using Map2GTF (if using annotation)
2977     m2g_maps = [ None, None ] # left, right
2978     initial_reads = [ left_reads, right_reads ]
2979    
2980     if params.gff_annotation:
2981     (mapped_gtf_list, unmapped_gtf_list) = \
2982 gpertea 159 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
2983 gpertea 135
2984     m2g_left_maps, m2g_right_maps = mapped_gtf_list
2985     m2g_maps = [m2g_left_maps, m2g_right_maps]
2986 gpertea 154 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
2987 gpertea 135 # The case where the user doesn't want to map to anything other
2988     # than the transcriptome OR we have no unmapped reads
2989     maps[0] = [m2g_left_maps]
2990     if right_reads:
2991 gpertea 154 maps[1] = [m2g_right_maps]
2992 gpertea 135 return maps
2993     # Feed the unmapped reads into spliced_alignment()
2994     initial_reads = unmapped_gtf_list[:]
2995     th_log("Resuming TopHat pipeline with unmapped reads")
2996    
2997     max_seg_len = segment_len #this is the ref seq span on either side of the junctions
2998     #to be extracted into segment_juncs.fa
2999    
3000     num_segs = int(read_len / segment_len)
3001     if (read_len % segment_len) >= min(segment_len-2, 20):
3002     #remainder is shorter but long enough to become a new segment
3003     num_segs += 1
3004     else:
3005     # the last segment is longer
3006     if num_segs>1: max_seg_len += (read_len % segment_len)
3007    
3008     if num_segs <= 1:
3009 gpertea 154 print >> sys.stderr, "Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
3010 gpertea 135
3011     # Using the num_segs value returned by check_reads(),
3012     # decide which junction discovery strategy to use
3013 gpertea 154 if num_segs < 3:
3014 gpertea 135 #if params.butterfly_search != False:
3015     # params.butterfly_search = True
3016     if params.coverage_search != False:
3017 gpertea 154 params.coverage_search = True
3018 gpertea 135 if num_segs == 1:
3019     segment_len = read_len
3020     else: #num_segs >= 3:
3021     # if we have at least three segments, just use split segment search,
3022     # which is the most sensitive and specific, fastest, and lightest-weight.
3023     # so unless specifically requested, disable the other junction searches
3024     if params.closure_search != True:
3025     params.closure_search = False
3026     if params.coverage_search != True:
3027     params.coverage_search = False
3028     if params.butterfly_search != True:
3029     params.butterfly_search = False
3030 gpertea 154
3031 gpertea 29 # Perform the first part of the TopHat work flow on the left and right
3032     # reads of paired ends separately - we'll use the pairing information later
3033 gpertea 154 have_left_IUM=False
3034 gpertea 135 for ri in (0,1):
3035     reads=initial_reads[ri]
3036 gpertea 154 if reads == None or not fileExists(reads,25):
3037 gpertea 29 continue
3038     fbasename=getFileBaseName(reads)
3039     unspliced_out = tmp_dir + fbasename + ".mapped"
3040 gpertea 135 unspliced_sam = None
3041     unmapped_reads = None
3042     #if use_zpacker: unspliced_out+=".z"
3043 gpertea 29 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3044 gpertea 135 if params.prefilter_multi:
3045     #unmapped_unspliced += ".z"
3046     (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3047     else:
3048 gpertea 29 # Perform the initial Bowtie mapping of the full length reads
3049 gpertea 135 (unspliced_sam, unmapped_reads) = bowtie(params,
3050 gpertea 154 bwt_idx_prefix,
3051     [reads],
3052     "fastq",
3053     params.genome_read_mismatches,
3054     unspliced_out,
3055     unmapped_unspliced,
3056     "",
3057     "read_against_genome")
3058 gpertea 29
3059 gpertea 154 # daehwan - check this out
3060     # params.bowtie2 = False
3061     # check_bowtie(params.bowtie2)
3062    
3063 gpertea 29 seg_maps = []
3064     unmapped_segs = []
3065     segs = []
3066 gpertea 154 have_IUM = fileExists(unmapped_reads,25)
3067     if ri==0 and have_IUM:
3068     have_left_IUM = True
3069     if num_segs > 1 and have_IUM:
3070 gpertea 29 # split up the IUM reads into segments
3071 gpertea 31 # unmapped_reads can be in BAM format
3072     read_segments = split_reads(unmapped_reads,
3073 gpertea 29 tmp_dir + fbasename,
3074     False,
3075     params,
3076     segment_len)
3077    
3078     # Map each segment file independently with Bowtie
3079     for i in range(len(read_segments)):
3080     seg = read_segments[i]
3081     fbasename=getFileBaseName(seg)
3082 gpertea 135 seg_out = tmp_dir + fbasename + ".bwtout"
3083     if use_zpacker: seg_out += ".z"
3084     unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3085     if use_BWT_FIFO:
3086     unmapped_seg += ".z"
3087 gpertea 29 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3088 gpertea 154 (seg_map, unmapped) = bowtie(params,
3089     bwt_idx_prefix,
3090     [seg],
3091     "fastq",
3092     params.segment_mismatches,
3093     seg_out,
3094     unmapped_seg,
3095     extra_output,
3096     "seg_against_genome")
3097 gpertea 29 seg_maps.append(seg_map)
3098     unmapped_segs.append(unmapped)
3099     segs.append(seg)
3100    
3101     # Collect the segment maps for left and right reads together
3102 gpertea 135 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3103 gpertea 29 else:
3104     # if there's only one segment, just collect the initial map as the only
3105     # map to be used downstream for coverage-based junction discovery
3106     read_segments = [reads]
3107 gpertea 135 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3108 gpertea 29
3109 gpertea 135 # XXX: At this point if using M2G, have three sets of reads:
3110     # mapped to transcriptome, mapped to genome, and unmapped (potentially
3111     # spliced or poly-A tails) - hp
3112    
3113 gpertea 29 # Unless the user asked not to discover new junctions, start that process
3114     # here
3115 gpertea 135 left_reads_map = maps[0].unspliced_sam
3116     left_seg_maps = maps[0].seg_maps
3117     unmapped_reads = maps[0].unmapped_segs
3118     if right_reads:
3119     right_reads_map = maps[1].unspliced_sam
3120     right_seg_maps = maps[1].seg_maps
3121     unmapped_reads.extend(maps[1].unmapped_segs)
3122     else:
3123     right_reads_map = None
3124     right_seg_maps = None
3125 gpertea 29
3126 gpertea 154 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3127 gpertea 29 # Call segment_juncs to infer a list of possible splice junctions from
3128     # the regions of the genome covered in the initial and segment maps
3129 gpertea 135 #if params.find_novel_juncs:
3130     #TODO: in m2g case, we might want to pass the m2g mappings as well,
3131     # or perhaps the GTF file directly
3132     # -> this could improve alternative junction detection?
3133 gpertea 29 juncs = junctions_from_segments(params,
3134 gpertea 159 sam_header_filename,
3135 gpertea 29 left_reads,
3136     left_reads_map,
3137     left_seg_maps,
3138     right_reads,
3139     right_reads_map,
3140     right_seg_maps,
3141     unmapped_reads,
3142     "fastq",
3143     ref_fasta)
3144 gpertea 135 if os.path.getsize(juncs[0]) != 0:
3145 gpertea 29 possible_juncs.append(juncs[0])
3146     if params.find_novel_indels:
3147 gpertea 154 if os.path.getsize(juncs[1]) != 0:
3148     possible_insertions.append(juncs[1])
3149     if os.path.getsize(juncs[2]) != 0:
3150     possible_deletions.append(juncs[2])
3151     if params.find_novel_fusions:
3152     if os.path.getsize(juncs[3]) != 0:
3153     possible_fusions.append(juncs[3])
3154 gpertea 29 # Optionally, and for paired reads only, use a closure search to
3155     # discover addtional junctions
3156 gpertea 135 if params.closure_search and left_reads and right_reads:
3157 gpertea 29 juncs = junctions_from_closures(params,
3158 gpertea 159 sam_header_filename,
3159 gpertea 135 [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
3160     [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
3161 gpertea 29 ref_fasta)
3162     if os.path.getsize(juncs[0]) != 0:
3163     possible_juncs.extend(juncs)
3164    
3165 gpertea 154 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3166 gpertea 29 spliced_seg_maps = None
3167     junc_idx_prefix = None
3168     else:
3169     junc_idx_prefix = "segment_juncs"
3170     if len(possible_insertions) == 0:
3171     possible_insertions.append(os.devnull)
3172     # print >> sys.stderr, "Warning: insertions database is empty!"
3173     if len(possible_deletions) == 0:
3174     possible_deletions.append(os.devnull)
3175     # print >> sys.stderr, "Warning: deletions database is empty!"
3176     if len(possible_juncs) == 0:
3177     possible_juncs.append(os.devnull)
3178     print >> sys.stderr, "Warning: junction database is empty!"
3179 gpertea 154 if len(possible_fusions) == 0:
3180     possible_fusions.append(os.devnull)
3181 gpertea 135 if junc_idx_prefix:
3182 gpertea 154 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3183     3,
3184     max_seg_len,
3185     junc_idx_prefix,
3186     possible_juncs,
3187     possible_insertions,
3188     possible_deletions,
3189     possible_fusions,
3190     ref_fasta,
3191     params.read_params.color)
3192 gpertea 29
3193 gpertea 159 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3194 gpertea 154
3195 gpertea 29 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3196     # index with Bowtie
3197 gpertea 135 # for reads in [left_reads, right_reads]:
3198     for ri in (0,1):
3199     reads = initial_reads[ri]
3200 gpertea 29 spliced_seg_maps = []
3201 gpertea 135
3202     if reads == None or os.path.getsize(reads)<25:
3203 gpertea 29 continue
3204    
3205 gpertea 135 m2g_map = m2g_maps[ri]
3206    
3207     if junc_idx_prefix :
3208 gpertea 29 i = 0
3209 gpertea 135 for seg in maps[ri].segs:
3210     #search each segment
3211 gpertea 29 fsegname = getFileBaseName(seg)
3212 gpertea 135 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
3213     if use_zpacker: seg_out += ".z"
3214     extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3215 gpertea 154 (seg_map, unmapped) = bowtie(params,
3216     tmp_dir + junc_idx_prefix,
3217     [seg],
3218     "fastq",
3219     params.segment_mismatches,
3220     seg_out,
3221     None,
3222     extra_output,
3223     "seg_against_junctions")
3224 gpertea 29 spliced_seg_maps.append(seg_map)
3225     i += 1
3226    
3227 gpertea 135 # Join the contigous and spliced segment hits into full length
3228     # read alignments
3229     rfname=getFileBaseName(reads)
3230     rfdir=getFileDir(reads)
3231 gpertea 29 mapped_reads = rfdir + rfname + ".candidates.bam"
3232 gpertea 135 # -- spliced mappings built from all segment mappings vs genome and junc_db
3233 gpertea 29 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3234 gpertea 135 #unspl_bwtfile = maps[ri].unspliced_bwt
3235     unspl_samfile = maps[ri].unspliced_sam
3236 gpertea 29 join_mapped_segments(params,
3237     sam_header_filename,
3238     reads,
3239     ref_fasta,
3240     possible_juncs,
3241     possible_insertions,
3242     possible_deletions,
3243 gpertea 154 possible_fusions,
3244 gpertea 135 maps[ri].seg_maps,
3245 gpertea 29 spliced_seg_maps,
3246     mapped_reads)
3247    
3248 gpertea 135 if num_segs > 1 or m2g_map:
3249     # Merge the spliced and unspliced full length alignments into
3250     # a single SAM file.
3251 gpertea 29 # The individual SAM files are all already sorted in
3252     # increasing read ID order.
3253     # NOTE: We also should be able to address bug #134 here, by replacing
3254     # contiguous alignments that poke into an intron by a small amount by
3255     # the correct spliced alignment.
3256 gpertea 154
3257 gpertea 29 try:
3258 gpertea 159 merge_cmd = [prog_path("bam_merge"),
3259     "--index-outfile", merged_map + ".index",
3260     "--sam-header", sam_header_filename,
3261     merged_map]
3262 gpertea 154
3263 gpertea 135 if num_segs > 1:
3264     merge_cmd += [ maps[ri].unspliced_sam ]
3265     if m2g_map:
3266     merge_cmd += [ m2g_map ]
3267 gpertea 154
3268     if os.path.exists(mapped_reads):
3269     merge_cmd += [ mapped_reads ]
3270     else:
3271     for bam_i in range(0, params.system_params.num_threads):
3272     temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3273     if os.path.exists(temp_bam):
3274     merge_cmd += [ temp_bam ]
3275     else:
3276     break
3277    
3278 gpertea 29 print >> run_log, " ".join(merge_cmd)
3279     ret = subprocess.call( merge_cmd,
3280     stderr=open(logging_dir + "sam_merge.log", "w") )
3281     if ret != 0:
3282     die(fail_str+"Error executing: "+" ".join(merge_cmd))
3283     except OSError, o:
3284     die(fail_str+"Error: "+str(o))
3285 gpertea 135 maps[ri] = [merged_map]
3286 gpertea 29
3287     if not params.system_params.keep_tmp:
3288     os.remove(mapped_reads)
3289     else:
3290 gpertea 135 # no segments or transcriptome mappings, so no merge is needed
3291     # because join_mapped_segments() produced the final BAM already
3292 gpertea 29 os.rename(mapped_reads, merged_map)
3293 gpertea 135 maps[ri] = [merged_map]
3294     if not params.system_params.keep_tmp:
3295 gpertea 29 os.remove(unspl_samfile)
3296 gpertea 154
3297 gpertea 29 return maps
3298    
def die(msg=None):
    """Abort the whole run: optionally report *msg* on stderr, then exit(1)."""
    if msg is not None:
        sys.stderr.write("%s\n" % msg)
    sys.exit(1)
3303    
# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    """Locate *program* and return its full path, or None when not found.

    A path-qualified name is only checked directly; a bare name is tried
    next to this script (bin_dir) first, then in every $PATH directory.
    """
    def is_executable(fpath):
        # a hit must be a regular file with the executable bit set
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        # caller supplied an explicit path: accept it as-is or fail
        if is_executable(program):
            return program
    else:
        # bare name: script directory takes precedence over $PATH
        candidates = [os.path.join(bin_dir, program)]
        candidates += [os.path.join(pdir, program)
                       for pdir in os.environ["PATH"].split(os.pathsep)]
        for progpath in candidates:
            if is_executable(progpath):
                return progpath
    return None
3322    
def prog_path(program):
    """Return the full path of *program*, aborting the run via die()
    when it cannot be located (see which() for the search order)."""
    progpath = which(program)
    # idiom fix: compare against the None singleton with 'is', not '=='
    if progpath is None:
        die("Error locating program: " + program)
    return progpath
3328    
# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    """Report the TopHat release number as a string."""
    version = "2.0.0"
    return version
3332 gpertea 29
def mlog(msg):
    """Emit a debug-log line (prefixed with [DBGLOG]:) on stderr."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
3335    
3336     def test_input_file(filename):
3337     try:
3338     test_file = open(filename, "r")
3339     except IOError, o:
3340     die("Error: Opening file %s" % filename)
3341     return
3342    
def main(argv=None):
    """Top-level TopHat driver: parse options, validate all inputs, run the
    spliced-alignment pipeline and compile the final reports.

    Returns None on success and 2 on a command-line usage error; fatal
    runtime problems abort the whole process through die().
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # positional arguments: <bowtie_index> <reads1> [<reads2>] [quals...]
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            # paired-end run: default the mate inner distance when not given
            if params.read_params.mate_inner_dist == None:
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        print >> sys.stderr
        th_log("Beginning TopHat run (v"+get_version()+")")
        print >> sys.stderr, "-----------------------------------------------"

        start_time = datetime.now()
        prepare_output_dir()

        # run.log is opened unbuffered (third open() argument 0) so every
        # external command line is recorded even if the run is killed
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
            if not os.path.exists(params.gff_annotation):
                die("Error: cannot find transcript file %s" % params.gff_annotation)
            if os.path.getsize(params.gff_annotation)<10:
                die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
            if params.gff_annotation:
                #gff file given, so transcriptome data will be written there
                gff_basename = getFileBaseName(params.gff_annotation)
                #just in case, check if it's not already there (-G/--GTF given again by mistake)
                tpath, tname = os.path.split(params.transcriptome_index)
                new_subdir=False
                if tpath in (".", "./") or not tpath :
                    if not os.path.exists(params.transcriptome_index):
                        os.makedirs(params.transcriptome_index)
                        new_subdir=True
                if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                    params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
                gff_out=params.transcriptome_index+".gff"
                # skip rebuilding when the annotation copy already matches in size
                if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
                    #generate the transcriptome data files
                    tpath, tname = os.path.split(params.transcriptome_index)
                    params.transcriptome_outdir=tpath
            t_gff=params.transcriptome_index+".gff"
            if params.transcriptome_outdir:
                #will create the transcriptome data files
                if not os.path.exists(params.transcriptome_outdir):
                    os.makedirs(params.transcriptome_outdir)
                copy(params.gff_annotation, t_gff)
            else:
                #try to use existing transcriptome data files

                if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
                check_bowtie_index(params.transcriptome_index)
            params.gff_annotation=t_gff
        #end @ transcriptome_index given

        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)

        check_bowtie(params.bowtie2)
        check_samtools()
        th_log("Generating SAM header for "+bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        # collect user-provided junction/indel databases (file paths)
        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        # NOTE(review): user_supplied_fusions is initialized here but never
        # populated or passed on inside this function
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            if found_juncs:
                ## and not params.gff_annotation:
                user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            # optional -M pre-filter: map the raw reads against the genome
            # first so multi-mapped reads can be set aside per side
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            for ri in (0,1):
                reads_list=read_lists[ri]
                if not reads_list:
                    continue
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
                side_imap = tmp_dir + sides[ri]+"_im.bwtout"
                if use_zpacker:
                    side_imap+=".z"
                side_ium = tmp_dir + sides[ri]+"_ium.fq"
                if use_BWT_FIFO:
                    side_ium += ".z"
                th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
                rdlist=reads_list.split(',')
                bwt=bowtie(params, bwt_idx_prefix, rdlist,
                           params.read_params.reads_format,
                           params.genome_read_mismatches,
                           side_imap, side_ium,
                           "", "read_against_genome", params.preflt_data[ri].multihit_reads)
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left",
                                                      params.preflt_data[0].multihit_reads)
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right",
                                                            params.preflt_data[1].multihit_reads)
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        input_reads = [left_kept_reads, right_kept_reads]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        # clean up intermediate files unless --keep-tmp was requested
        if not params.system_params.keep_tmp:
            for m in mappings[0]:
                os.remove(m)
            if left_kept_reads:
                os.remove(left_kept_reads)
            for m in mappings[1]:
                os.remove(m)
            if right_kept_reads:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        print >> sys.stderr,"-----------------------------------------------"
        th_log("Run complete: %s elapsed" % formatTD(duration))

    except Usage, err:
        # bad command line: report and return a conventional usage exit code
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
        return 2
3563    
3564    
# script entry point: propagate main()'s return value as the process exit status
if __name__ == "__main__":
    sys.exit(main())

Properties

Name Value
svn:executable *