ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 167
Committed: Fri Feb 10 05:51:56 2012 UTC (8 years, 3 months ago) by gpertea
File size: 150082 byte(s)
Log Message:
bam_merge needs fixing!

Line User Rev File contents
#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
"""
import sys
# psyco is an optional (Python 2) JIT accelerator; silently skipped when absent
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time
from shutil import copy
import logging
# Usage/help text shown by the option parser.
# NOTE(review): the original column alignment inside this string was lost in
# extraction; the option text below is preserved verbatim, layout reconstructed.
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                     [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir <string> [ default: ./tophat_out ]
    --bowtie1 [ default: bowtie2 ]
    -a/--min-anchor <int> [ default: 8 ]
    -m/--splice-mismatches <0-2> [ default: 0 ]
    -i/--min-intron-length <int> [ default: 50 ]
    -I/--max-intron-length <int> [ default: 500000 ]
    -g/--max-multihits <int> [ default: 20 ]
    -x/--transcriptome-max-hits <int> [ default: 60 ]
    -n/--transcriptome-mismatches <int> [ default: 1 ]
    -M/--prefilter-multihits ( for -G/--GTF option, enable
                               an initial bowtie search
                               against the genome )
    --max-insertion-length <int> [ default: 3 ]
    --max-deletion-length <int> [ default: 3 ]
    --solexa-quals
    --solexa1.3-quals (same as phred64-quals)
    --phred64-quals (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color (Solid - color space)
    --color-out
    --library-type <string> (fr-unstranded, fr-firststrand,
                             fr-secondstrand)
    -p/--num-threads <int> [ default: 1 ]
    -G/--GTF <filename> (GTF/GFF with known transcripts)
    --transcriptome-index <bwtidx> (transcriptome bowtie index)
    -T/--transcriptome-only (map only to the transcriptome)
    -j/--raw-juncs <filename>
    --insertions <filename>
    --deletions <filename>
    -r/--mate-inner-dist <int>
    --mate-std-dev <int> [ default: 20 ]
    --no-novel-juncs
    --no-novel-indels
    --fusion-search
    --fusion-anchor-length <int> [ default: 20 ]
    --fusion-min-dist <int> [ default: 10000000 ]
    --fusion-read-mismatches <int> [ default: 2 ]
    --fusion-multireads <int> [ default: 2 ]
    --fusion-multipairs <int> [ default: 2 ]
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir <dirname> [ default: <output_dir>/tmp ]
    -z/--zpacker <program> [ default: gzip ]
    -X/--unmapped-fifo [use mkfifo to compress more temporary
                        files for color space reads]

Advanced Options:
    --report-secondary-alignments
    --genome-read-mismatches <int> [ default: 2 ]
    -N/--read-mismatches <int> [ default: 2 ]
    --segment-mismatches <int> [ default: 2 ]
    --segment-length <int> [ default: 25 ]
    --bowtie-n [ default: bowtie -v ]
    --min-closure-exon <int> [ default: 100 ]
    --min-closure-intron <int> [ default: 50 ]
    --max-closure-intron <int> [ default: 5000 ]
    --min-coverage-intron <int> [ default: 50 ]
    --max-coverage-intron <int> [ default: 20000 ]
    --min-segment-intron <int> [ default: 50 ]
    --max-segment-intron <int> [ default: 500000 ]
    --no-sort-bam (Output BAM is not coordinate-sorted)
    --no-convert-bam (Do not convert to bam format.
                      Output is <output_dir>accepted_hit.sam.
                      Implies --no-sort-bam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N <int> [ default: 0 ]
    --b2-L <int> [ default: 20 ]
    --b2-i <func> [ default: S,1,1.25 ]
    --b2-n-ceil <func> [ default: L,0,0.15 ]
    --b2-gbar <int> [ default: 4 ]

  Scoring options
    --b2-mp <int>,<int> [ default: 6,2 ]
    --b2-np <int> [ default: 1 ]
    --b2-rdg <int>,<int> [ default: 5,3 ]
    --b2-rfg <int>,<int> [ default: 5,3 ]
    --b2-score-min <func> [ default: L,-0.6,-0.6 ]

  Effort options
    --b2-D <int> [ default: 15 ]
    --b2-R <int> [ default: 2 ]


SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id <string> (read group ID)
    --rg-sample <string> (sample ID)
    --rg-library <string> (library ID)
    --rg-description <string> (descriptive string, no tabs allowed)
    --rg-platform-unit <string> (e.g Illumina lane ID)
    --rg-center <string> (sequencing center name)
    --rg-date <string> (ISO 8601 date of the sequencing run)
    --rg-platform <string> (Sequencing platform descriptor)
'''

# -F/--min-isoform-fraction <float> [ default: 0.15 ]
# (obsolete)
152    
class Usage(Exception):
    """Raised during option parsing to abort with a usage/error message.

    The message is kept on the instance (``msg``) rather than in ``args``,
    matching how the rest of the script reports errors.
    """
    def __init__(self, msg):
        self.msg = msg
156    
# ---- Global run state (initialized here, updated during option parsing) ----

output_dir = "./tophat_out/"        # -o/--output-dir target (note trailing slash)
logging_dir = output_dir + "logs/"  # per-run log files live under here
run_log = None                      # NOTE(review): presumably a command-log handle -- opened elsewhere
tophat_log = None #main log file handle
tophat_logger = None # main logging object
run_cmd = None                      # NOTE(review): presumably the run-log file name -- set elsewhere
tmp_dir = output_dir + "tmp/"       # scratch space; overridable via --tmp-dir
bin_dir = sys.path[0] + "/"         # directory containing this script
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
# enabled by -X/-unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
# unmapped reads into a compressed file

samtools_path = None  # resolved samtools binary path (filled in elsewhere)
bowtie_path = None    # resolved bowtie binary path (filled in elsewhere)
fail_str = "\t[FAILED]\n"
gtf_juncs = None #file name with junctions extracted from given GFF file

#mapping types:
# stage codes 1..4 used to tell bowtie invocations apart; per the names:
# reads vs Genome, reads vs Transcriptome, segments vs Genome,
# segments vs Junctions -- TODO confirm against the callers
_reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1,5)
183    
def init_logger(log_fname):
    """Configure the global 'project' logger.

    Attaches two handlers with a timestamped format: one writing to stderr
    and one writing to *log_fname* (any pre-existing file is removed first).
    Side effects: sets the module globals ``tophat_logger`` and ``tophat_log``
    (the latter is the underlying stream of the file handler).
    """
    global tophat_logger
    global tophat_log

    tophat_logger = logging.getLogger('project')
    tophat_logger.setLevel(logging.DEBUG)
    fmt = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')

    # everything also goes to the console (stderr)
    console_handler = logging.StreamHandler(sys.stderr)
    console_handler.setFormatter(fmt)
    tophat_logger.addHandler(console_handler)

    # start the log file fresh for this run
    if os.path.isfile(log_fname):
        os.remove(log_fname)
    file_handler = logging.FileHandler(log_fname)
    file_handler.setFormatter(fmt)
    tophat_logger.addHandler(file_handler)
    tophat_log = file_handler.stream
203    
204    
205 gpertea 29 # TopHatParams captures all of the runtime paramaters used by TopHat, and many
206     # of these are passed as command line options to exectubles run by the pipeline
207    
208     # This class and its nested classes also do options parsing through parse_options()
209     # and option validation via the member function check()
210    
class BowtieOutputFiles:
    """Bundle of the three files a Bowtie run can produce: the mappings
    themselves, the unmapped reads, and reads dropped for too many hits."""

    def __init__(self, mappings=None, unmapped_reads=None, multihit_reads=None):
        self.mappings = mappings
        self.unmapped_reads = unmapped_reads
        self.multihit_reads = multihit_reads
219    
220 gpertea 29 class TopHatParams:
221    
222     # SpliceConstraints is a group of runtime parameters that specify what
223     # constraints to put on junctions discovered by the program. These constraints
224     # are used to filter out spurious/false positive junctions.
225    
class SpliceConstraints:
    """Constraints used to filter out spurious/false-positive junctions."""

    def __init__(self, min_anchor_length, min_intron_length, max_intron_length,
                 splice_mismatches, min_isoform_fraction):
        self.min_anchor_length = min_anchor_length        # -a/--min-anchor
        self.min_intron_length = min_intron_length        # -i/--min-intron-length
        self.max_intron_length = max_intron_length        # -I/--max-intron-length
        self.splice_mismatches = splice_mismatches        # -m/--splice-mismatches
        self.min_isoform_fraction = min_isoform_fraction  # -F (obsolete)

    def parse_options(self, opts):
        """Pick the junction-filtering options out of a getopt pair list."""
        # option spelling -> attribute receiving int(value)
        as_int = {
            "-m": "splice_mismatches", "--splice-mismatches": "splice_mismatches",
            "-a": "min_anchor_length", "--min-anchor": "min_anchor_length",
            "-i": "min_intron_length", "--min-intron-length": "min_intron_length",
            "-I": "max_intron_length", "--max-intron-length": "max_intron_length",
        }
        for opt, val in opts:
            if opt in as_int:
                setattr(self, as_int[opt], int(val))
            elif opt in ("-F", "--min-isoform-fraction"):
                self.min_isoform_fraction = float(val)

    def check(self):
        """die() with a message on any out-of-range constraint."""
        if self.splice_mismatches not in [0, 1, 2]:
            die("Error: arg to --splice-mismatches must be 0, 1, or 2")
        if self.min_anchor_length < 4:
            die("Error: arg to --min-anchor-len must be greater than 4")
        if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
            die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
        if self.min_intron_length <= 0:
            die("Error: arg to --min-intron-length must be greater than 0")
        if self.max_intron_length <= 0:
            die("Error: arg to --max-intron-length must be greater than 0")
263    
264     # SystemParams is a group of runtime parameters that determine how to handle
265     # temporary files produced during a run and how many threads to use for threaded
266     # stages of the pipeline (e.g. Bowtie)
267    
class SystemParams:
    """Run-environment settings: thread count, temp-file retention and the
    external compression program used for intermediate files."""

    def __init__(self, num_threads, keep_tmp):
        self.num_threads = num_threads  # -p/--num-threads
        self.keep_tmp = keep_tmp        # --keep-tmp
        self.zipper = "gzip"            # -z/--zpacker program; "" disables
        self.zipper_opts = []           # extra zipper args (e.g. -p<threads>)

    def parse_options(self, opts):
        """Consume system options; also updates the use_zpacker/use_BWT_FIFO globals."""
        global use_zpacker
        global use_BWT_FIFO
        for opt, val in opts:
            if opt in ("-p", "--num-threads"):
                self.num_threads = int(val)
            elif opt == "--keep-tmp":
                self.keep_tmp = True
            elif opt in ("-z", "--zpacker"):
                # any of these spellings turns compression off
                if val.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                    val = ""
                self.zipper = val
            elif opt in ("-X", "--unmapped-fifo"):
                use_BWT_FIFO = True
        if self.zipper:
            use_zpacker = True
            # parallel compressors get a thread-count option
            if self.num_threads > 1 and not self.zipper_opts:
                if self.zipper.endswith(('pbzip2', 'pigz')):
                    self.zipper_opts.append('-p' + str(self.num_threads))
        else:
            use_zpacker = False
            if use_BWT_FIFO:
                use_BWT_FIFO = False

    def cmd(self):
        """Render these settings back into command-line form."""
        args = []
        if self.zipper:
            args.extend(['-z', self.zipper])
        if self.num_threads > 1:
            args.extend(['-p' + str(self.num_threads)])
        return args

    def check(self):
        """Abort if the settings are unusable."""
        if self.num_threads < 1:
            die("Error: arg to --num-threads must be greater than 0")
        if self.zipper:
            xzip = which(self.zipper)
            if not xzip:
                die("Error: cannot find compression program " + self.zipper)
316    
317     # ReadParams is a group of runtime parameters that specify various properties
318     # of the user's reads (e.g. which quality scale their are on, how long the
319     # fragments are, etc).
class ReadParams:
    """Properties of the input reads: quality encoding, color space,
    library type, mate-pair geometry and SAM read-group metadata."""

    def __init__(self,
                 solexa_quals,
                 phred64_quals,
                 quals,
                 integer_quals,
                 color,
                 library_type,
                 seed_length,
                 reads_format,
                 mate_inner_dist,
                 mate_inner_dist_std_dev,
                 read_group_id,
                 sample_id,
                 library_id,
                 description,
                 seq_platform_unit,
                 seq_center,
                 seq_run_date,
                 seq_platform):
        self.solexa_quals = solexa_quals
        self.phred64_quals = phred64_quals
        self.quals = quals
        self.integer_quals = integer_quals
        self.color = color
        self.library_type = library_type
        self.seed_length = seed_length
        self.reads_format = reads_format
        self.mate_inner_dist = mate_inner_dist
        self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
        self.read_group_id = read_group_id
        self.sample_id = sample_id
        self.library_id = library_id
        self.description = description
        self.seq_platform_unit = seq_platform_unit
        self.seq_center = seq_center
        self.seq_run_date = seq_run_date
        self.seq_platform = seq_platform

    def parse_options(self, opts):
        """Pick the read-related options out of a getopt pair list."""
        # boolean flags: option spelling -> attribute set to True
        flag_attrs = {
            "--solexa-quals": "solexa_quals",
            "--solexa1.3-quals": "phred64_quals",
            "--phred64-quals": "phred64_quals",
            "-Q": "quals",
            "--quals": "quals",
            "--integer-quals": "integer_quals",
            "-C": "color",
            "--color": "color",
        }
        # integer-valued options
        int_attrs = {
            "-s": "seed_length",
            "--seed-length": "seed_length",
            "-r": "mate_inner_dist",
            "--mate-inner-dist": "mate_inner_dist",
            "--mate-std-dev": "mate_inner_dist_std_dev",
        }
        # string-valued options (read-group metadata and library type)
        str_attrs = {
            "--library-type": "library_type",
            "--rg-id": "read_group_id",
            "--rg-sample": "sample_id",
            "--rg-library": "library_id",
            "--rg-description": "description",
            "--rg-platform-unit": "seq_platform_unit",
            "--rg-center": "seq_center",
            "--rg-date": "seq_run_date",
            "--rg-platform": "seq_platform",
        }
        for opt, val in opts:
            if opt in flag_attrs:
                setattr(self, flag_attrs[opt], True)
            elif opt in int_attrs:
                setattr(self, int_attrs[opt], int(val))
            elif opt in str_attrs:
                setattr(self, str_attrs[opt], val)

    def check(self):
        """die() on invalid or inconsistent read parameters."""
        if self.seed_length and self.seed_length < 20:
            die("Error: arg to --seed-length must be at least 20")

        if self.mate_inner_dist_std_dev is not None and self.mate_inner_dist_std_dev < 0:
            die("Error: arg to --mate-std-dev must at least 0")
        # --rg-id and --rg-sample are all-or-nothing
        if bool(self.read_group_id) != bool(self.sample_id):
            die("Error: --rg-id and --rg-sample must be specified or omitted together")
404    
405     # SearchParams is a group of runtime parameters that specify how TopHat will
406     # search for splice junctions
407    
class SearchParams:
    """Length bounds that control how TopHat searches for splice junctions
    (closure search, coverage search and segment search).

    Fix: every check() message claimed the bound was "at least 20" while the
    code actually enforces non-negativity; the messages now match the check.
    """

    def __init__(self,
                 min_closure_exon,
                 min_closure_intron,
                 max_closure_intron,
                 min_coverage_intron,
                 max_coverage_intron,
                 min_segment_intron,
                 max_segment_intron):
        self.min_closure_exon_length = min_closure_exon
        self.min_closure_intron_length = min_closure_intron
        self.max_closure_intron_length = max_closure_intron
        self.min_coverage_intron_length = min_coverage_intron
        self.max_coverage_intron_length = max_coverage_intron
        self.min_segment_intron_length = min_segment_intron
        self.max_segment_intron_length = max_segment_intron

    def parse_options(self, opts):
        """Pick the search-length options out of a getopt pair list."""
        for option, value in opts:
            if option == "--min-closure-exon":
                self.min_closure_exon_length = int(value)
            elif option == "--min-closure-intron":
                self.min_closure_intron_length = int(value)
            elif option == "--max-closure-intron":
                self.max_closure_intron_length = int(value)
            elif option == "--min-coverage-intron":
                self.min_coverage_intron_length = int(value)
            elif option == "--max-coverage-intron":
                self.max_coverage_intron_length = int(value)
            elif option == "--min-segment-intron":
                self.min_segment_intron_length = int(value)
            elif option == "--max-segment-intron":
                self.max_segment_intron_length = int(value)

    def check(self):
        """die() if any length is negative."""
        if self.min_closure_exon_length < 0:
            die("Error: arg to --min-closure-exon must be at least 0")
        if self.min_closure_intron_length < 0:
            die("Error: arg to --min-closure-intron must be at least 0")
        if self.max_closure_intron_length < 0:
            die("Error: arg to --max-closure-intron must be at least 0")
        if self.min_coverage_intron_length < 0:
            die("Error: arg to --min-coverage-intron must be at least 0")
        if self.max_coverage_intron_length < 0:
            die("Error: arg to --max-coverage-intron must be at least 0")
        if self.min_segment_intron_length < 0:
            die("Error: arg to --min-segment-intron must be at least 0")
        if self.max_segment_intron_length < 0:
            die("Error: arg to --max-segment-intron must be at least 0")
458    
class ReportParams:
    """Output-format switches: whether to convert the final SAM to BAM
    and whether to coordinate-sort it (--no-convert-bam implies no sort)."""

    def __init__(self):
        self.sort_bam = True
        self.convert_bam = True

    def parse_options(self, opts):
        for opt, _ in opts:
            if opt == "--no-sort-bam":
                self.sort_bam = False
            elif opt == "--no-convert-bam":
                # SAM output cannot be coordinate-sorted by this pipeline
                self.convert_bam = False
                self.sort_bam = False
471    
class Bowtie2Params:
    """Options forwarded to bowtie2: presets, alignment, scoring and effort.

    Bug fix: parse_options previously ignored the argument of --b2-L and
    --b2-gbar, re-assigning the defaults (20 and 4); both now use int(value).
    """

    def __init__(self):
        # preset selectors (--end-to-end mode); at most one may be chosen
        self.very_fast = False
        self.fast = False
        self.sensitive = False
        self.very_sensitive = False

        # alignment options (see bowtie2 manual for semantics)
        self.N = 0                # --b2-N: seed mismatches, must be 0 or 1
        self.L = 20               # --b2-L: seed length
        self.i = "S,1,1.25"       # --b2-i: seed-interval function
        self.n_ceil = "L,0,0.15"  # --b2-n-ceil: ambiguous-char ceiling function
        self.gbar = 4             # --b2-gbar

        # scoring options
        self.mp = "6,2"           # --b2-mp: mismatch penalties <int>,<int>
        self.np = 1               # --b2-np
        self.rdg = "5,3"          # --b2-rdg: read gap open,extend
        self.rfg = "5,3"          # --b2-rfg: reference gap open,extend
        self.score_min = "L,-0.6,-0.6"  # --b2-score-min: min-score function

        # effort options
        self.D = 15               # --b2-D
        self.R = 2                # --b2-R

    def parse_options(self, opts):
        """Pick the --b2-* options out of a getopt pair list."""
        for option, value in opts:
            if option == "--b2-very-fast":
                self.very_fast = True
            elif option == "--b2-fast":
                self.fast = True
            elif option == "--b2-sensitive":
                self.sensitive = True
            elif option == "--b2-very-sensitive":
                self.very_sensitive = True
            elif option == "--b2-N":
                self.N = int(value)
            elif option == "--b2-L":
                self.L = int(value)     # was: self.L = 20 (argument discarded)
            elif option == "--b2-i":
                self.i = value
            elif option == "--b2-n-ceil":
                self.n_ceil = value
            elif option == "--b2-gbar":
                self.gbar = int(value)  # was: self.gbar = 4 (argument discarded)
            elif option == "--b2-mp":
                self.mp = value
            elif option == "--b2-np":
                self.np = int(value)
            elif option == "--b2-rdg":
                self.rdg = value
            elif option == "--b2-rfg":
                self.rfg = value
            elif option == "--b2-score-min":
                self.score_min = value
            elif option == "--b2-D":
                self.D = int(value)
            elif option == "--b2-R":
                self.R = int(value)

    def check(self):
        """Validate the bowtie2 options; die()s with a message on any problem."""
        # at most one preset may be selected
        more_than_once = False
        if self.very_fast:
            if self.fast or self.sensitive or self.very_sensitive:
                more_than_once = True
        else:
            if self.fast:
                if self.sensitive or self.very_sensitive:
                    more_than_once = True
            else:
                if self.sensitive and self.very_sensitive:
                    more_than_once = True

        if more_than_once:
            die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

        if not self.N in [0, 1]:
            die("Error: arg to --b2-N must be either 0 or 1")

        # bowtie2 "functions" look like L,-0.6,-0.6 (type,constant,coefficient)
        function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
        function_match = re.search(function_re, self.i)
        if not function_match:
            die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")

        function_match = re.search(function_re, self.n_ceil)
        if not function_match:
            die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")

        function_match = re.search(function_re, self.score_min)
        if not function_match:
            die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

        pair_re = r'^\d+,\d+$'
        pair_match = re.search(pair_re, self.mp)
        if not pair_match:
            die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")

        pair_match = re.search(pair_re, self.rdg)
        if not pair_match:
            die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")

        pair_match = re.search(pair_re, self.rfg)
        if not pair_match:
            die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
577    
578    
    def __init__(self):
        """Build the full default parameter set; parse_options() then
        overrides individual values from the command line."""
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        # output-file groups for the two prefilter bowtie runs (-M option)
        self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]

        self.read_params = self.ReadParams(False,   # solexa_scale
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_threads)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()

        self.bowtie2_params = self.Bowtie2Params()

        self.bowtie2 = True               # bowtie2 by default; --bowtie1 turns it off
        self.gff_annotation = None        # -G/--GTF file
        self.transcriptome_only = False   # -T/--transcriptome-only
        self.transcriptome_index = None   # --transcriptome-index
        self.transcriptome_outdir = None
        self.raw_junctions = None         # -j/--raw-juncs file
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_novel_fusions = True
        self.find_GFF_juncs = True
        self.max_hits = 20                # -g/--max-multihits
        self.t_max_hits = 60              # -x/--transcriptome-max-hits
        self.max_seg_hits = 40
        self.prefilter_multi = False      # -M/--prefilter-multihits
        self.genome_read_mismatches = 2
        self.max_read_mismatches = 2      # -N/--read-mismatches
        self.t_mismatches = 1             # -n/--transcriptome-mismatches
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None        # --insertions file
        self.raw_deletions = None         # --deletions file
        # None means "decide automatically"; False/True are explicit choices
        self.coverage_search = None
        self.closure_search = False
        self.microexon_search = False
        self.butterfly_search = None
        self.report_secondary_alignments = False

        self.keep_fasta_order = False
        self.partial_mapping = False

        self.fusion_search = False        # --fusion-search and its sub-options
        self.fusion_anchor_length = 20
        self.fusion_min_dist = 10000000
        self.fusion_read_mismatches = 2
        self.fusion_multireads = 2
        self.fusion_multipairs = 2
661 gpertea 29
662     def check(self):
663     self.splice_constraints.check()
664     self.read_params.check()
665     self.system_params.check()
666 gpertea 135 if self.segment_length < 10:
667     die("Error: arg to --segment-length must at least 10")
668 gpertea 29 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
669     die("Error: arg to --segment-mismatches must in [0, 3]")
670 gpertea 154 if self.t_mismatches < 0 or self.t_mismatches > 3:
671     die("Error: arg to -n or --transcriptome-mismatches must in [0, 3]")
672 gpertea 29
673 gpertea 154 if self.read_params.color:
674     if self.bowtie2:
675 gpertea 164 th_log("Warning: bowtie2 in colorspace is not yet supported; --bowtie1 option assumed.")
676     self.bowtie2=False
677 gpertea 154 if self.fusion_search:
678     die("Error: fusion-search in colorspace is not yet supported")
679     if self.butterfly_search:
680     die("Error: butterfly-search in colorspace is not yet supported")
681 gpertea 29
682 gpertea 164 self.bowtie2_params.check()
683    
684 gpertea 29 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
685    
686     if self.read_params.library_type and self.read_params.library_type not in library_types:
687     die("Error: library-type should be one of: "+', '.join(library_types))
688    
689     self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
690     self.search_params.max_closure_intron_length)
691    
692     self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
693     self.search_params.max_segment_intron_length)
694    
695     self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
696     self.search_params.max_coverage_intron_length)
697    
698     if self.max_insertion_length >= self.segment_length:
699     die("Error: the max insertion length ("+self.max_insertion_length+") can not be equal to or greater than the segment length ("+self.segment_length+")")
700    
701     if self.max_insertion_length < 0:
702     die("Error: the max insertion length ("+self.max_insertion_length+") can not be less than 0")
703    
704     if self.max_deletion_length >= self.splice_constraints.min_intron_length:
705     die("Error: the max deletion length ("+self.max_deletion_length+") can not be equal to or greater than the min intron length ("+self.splice_constraints.min_intron_length+")")
706    
707     if self.max_deletion_length < 0:
708     die("Error: the max deletion length ("+self.max_deletion_length+") can not be less than 0")
709    
    def cmd(self):
        """Serialize the current parameter values into the argument list
        passed on to the downstream TopHat executables."""
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--max-seg-multihits", str(self.max_seg_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--max-mismatches", str(self.max_read_mismatches),
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        if not self.bowtie2:
            cmd.extend(["--bowtie1"])

        if self.fusion_search:
            cmd.extend(["--fusion-search",
                        "--fusion-anchor-length", str(self.fusion_anchor_length),
                        "--fusion-min-dist", str(self.fusion_min_dist),
                        "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                        "--fusion-multireads", str(self.fusion_multireads),
                        "--fusion-multipairs", str(self.fusion_multipairs)])

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if gtf_juncs:
            cmd.extend(["--gtf-juncs", gtf_juncs])
        # NOTE(review): '== False' rather than 'not ...' -- presumably so a
        # None ("auto") value does not emit --no-closure-search; TODO confirm
        if self.closure_search == False:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd
775    
    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        """Parse the full TopHat command line and populate this object.

        Option groups are first delegated to the member parameter objects
        (splice constraints, system, read, search, report, bowtie2); the
        remaining top-level options are handled here.  Also updates the
        module-level output/logging/tmp directory globals.

        Returns the positional arguments (bowtie index prefix plus read and
        quality files); raises Usage for -h/--help, unknown options, or when
        fewer than two positional arguments remain.
        """
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "bowtie1",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "transcriptome-only",
                                        "transcriptome-max-hits=",
                                        "transcriptome-index=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "allow-fusions",
                                        "fusion-search",
                                        "fusion-anchor-length=",
                                        "fusion-min-dist=",
                                        "fusion-read-mismatches=",
                                        "fusion-multireads=",
                                        "fusion-multipairs=",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "prefilter-multihits",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "genome-read-mismatches=",
                                        "read-mismatches=",
                                        "max-read-mismatches=",
                                        "transcriptome-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam",
                                        "report-secondary-alignments",
                                        "keep-fasta-order",
                                        "allow-partial-mapping",
                                        "b2-very-fast",
                                        "b2-fast",
                                        "b2-sensitive",
                                        "b2-very-sensitive",
                                        "b2-N=",
                                        "b2-L=",
                                        "b2-i=",
                                        "b2-n-ceil=",
                                        "b2-gbar=",
                                        "b2-ma=",
                                        "b2-mp=",
                                        "b2-np=",
                                        "b2-rdg=",
                                        "b2-rfg=",
                                        "b2-score-min=",
                                        "b2-D=",
                                        "b2-R="])
        except getopt.error, msg:
            raise Usage(msg)

        # let each parameter group pick out the options it recognizes
        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        self.bowtie2_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        # FIFO-based unmapped-read handling is only used for colorspace reads
        if not self.read_params.color:
            use_BWT_FIFO=False
            use_BAM_Unmapped=True
        global output_dir
        global logging_dir
        global tmp_dir

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option == "--bowtie1":
                self.bowtie2 = False
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
                # segment searches are allowed proportionally more hits
                self.max_seg_hits = max(10, self.max_hits * 2)
            if option in ("-x", "--transcriptome-max-hits"):
                self.t_max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-T", "--transcriptome"):
                self.transcriptome_only = True
            if option == "--transcriptome-index":
                self.transcriptome_index = value
            if option in("-M", "--prefilter-multihits"):
                self.prefilter_multi = True
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--fusion-search":
                self.fusion_search = True
            if option == "--fusion-anchor-length":
                self.fusion_anchor_length = int(value)
            if option == "--fusion-min-dist":
                self.fusion_min_dist = int(value)
            if option == "--fusion-read-mismatches":
                self.fusion_read_mismatches = int(value)
            if option == "--fusion-multireads":
                self.fusion_multireads = int(value)
            if option == "--fusion-multipairs":
                self.fusion_multipairs = int(value)
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--genome-read-mismatches":
                self.genome_read_mismatches = int(value)
            if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
                self.max_read_mismatches = int(value)
            if option in ("-n", "--transcriptome-mismatches"):
                self.t_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option == "--report-secondary-alignments":
                self.report_secondary_alignments = True
            if option == "--keep-fasta-order":
                self.keep_fasta_order = True
            if option == "--allow-partial-mapping":
                self.partial_mapping = True
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        # transcriptome-only alignment implies no novel junction/indel search
        if self.transcriptome_only:
            self.find_novel_juncs=False
            self.find_novel_indels=False
        if custom_out_dir:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            # NOTE(review): sam_header is a plain local here (no 'global
            # sam_header' declaration above), so these assignments have no
            # effect outside this method -- confirm whether it should be global
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        # at least <bowtie_index> and <reads1> must remain as positional args
        if len(args) < 2:
            raise Usage(use_message)
        return args
998    
999    
# check if a file exists and has non-zero (or minimum) size
def fileExists(filepath, minfsize=1):
    """Return True if filepath exists and is at least minfsize bytes long."""
    # single boolean expression instead of the redundant if/else-True/False;
    # short-circuit keeps getsize() from running on a missing path
    return os.path.exists(filepath) and os.path.getsize(filepath) >= minfsize
1006    
1007    
def getFileDir(filepath):
    """Return the directory portion of filepath with a trailing '/'.

    Returns '' when filepath has no directory component.
    """
    dirpart = os.path.dirname(filepath)
    if dirpart:
        dirpart += '/'
    return dirpart
1013    
def getFileBaseName(filepath):
    """Return the base name of a read file, stripping the directory, any
    compression extension (.z/.gz*/.bz*) and any known sequence-file
    extension (.fq, .txt, .seq, .bwtout, .fa*).

    An unrecognized extension is still dropped, unless doing so would leave
    an empty name.
    """
    known_exts = ['.fq', '.txt', '.seq', '.bwtout']

    def is_seq_ext(ext):
        # recognized read-file extension: the explicit list, or anything
        # starting with '.fa' (.fa, .fasta, .fastq, ...)
        return ext in known_exts or ext.startswith('.fa')

    fname = os.path.split(filepath)[1]
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if is_seq_ext(fx) and fbase:
        return fbase
    if fx == '.z' or fx.startswith('.gz') or fx.startswith('.bz'):
        # compressed file: check the extension underneath (e.g. reads.fq.gz)
        fb, fext2 = os.path.splitext(fbase)
        if is_seq_ext(fext2.lower()) and fb:
            return fb
        return fbase
    # unknown extension: drop it anyway, unless nothing would be left
    if fbase:
        return fbase
    return fname
1032    
# Returns the current time in a nice format
def right_now():
    """Return the current local time rendered with the '%c' locale format."""
    return datetime.now().strftime("%c")
1037    
# The TopHat logging formatter
def th_log(out_str):
    """Send an info-level message through the global TopHat logger, if set."""
    if tophat_logger:
        tophat_logger.info(out_str)
1043 gpertea 135
def th_logp(out_str=""):
    """Print a message to stderr and echo it to the global run log, if open."""
    sys.stderr.write("%s\n" % (out_str,))
    if tophat_log:
        tophat_log.write("%s\n" % (out_str,))
1048    
def die(msg=None):
    """Abort the program with exit status 1, reporting msg first if given."""
    if msg is not None:
        th_logp(msg)
    sys.exit(1)
1053    
1054 gpertea 29 # Ensures that the output, logging, and temp directories are present. If not,
1055     # they are created
1056     def prepare_output_dir():
1057    
1058 gpertea 164 #th_log("Preparing output location "+output_dir)
1059 gpertea 29 if os.path.exists(output_dir):
1060     pass
1061     else:
1062     os.mkdir(output_dir)
1063    
1064     if os.path.exists(logging_dir):
1065     pass
1066     else:
1067     os.mkdir(logging_dir)
1068    
1069     if os.path.exists(tmp_dir):
1070     pass
1071     else:
1072     try:
1073     os.makedirs(tmp_dir)
1074     except OSError, o:
1075     die("\nError creating directory %s (%s)" % (tmp_dir, o))
1076    
1077    
# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    """Restore default SIGPIPE handling in a child process.

    Python installs its own SIGPIPE handler, which makes gzip and other
    de/compression pipes complain about "stdout: Broken pipe".
    """
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
1084    
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix, is_bowtie2):
    """Verify that all four Bowtie (1 or 2) index files exist for idx_prefix.

    Looks next to idx_prefix first, then under $BOWTIE_INDEXES; dies when the
    index cannot be located.
    """
    th_log("Checking for Bowtie index files")
    if is_bowtie2:
        idxext = "bt2"
        bowtie_ver = "2 "
    else:
        idxext = "ebwt"
        bowtie_ver = ""

    # the four files produced by bowtie-build / bowtie2-build
    idx_files = [idx_prefix + sfx + idxext
                 for sfx in (".1.", ".2.", ".rev.1.", ".rev.2.")]

    missing = [f for f in idx_files if not os.path.exists(f)]
    if not missing:
        return

    bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
    bwtidx_env = os.environ.get("BOWTIE_INDEXES")

    if bwtidx_env == None:
        die(bwtidxerr)
    # retry with the BOWTIE_INDEXES directory prepended
    missing = [f for f in idx_files if not os.path.exists(bwtidx_env + f)]
    if not missing:
        return
    die(bwtidxerr)
1118 gpertea 29
# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    """Rebuild the reference FASTA from the Bowtie index via bowtie-inspect.

    Writes the reconstructed sequences to <tmp_dir>/<index_name>.fa and
    returns that path; dies if bowtie-inspect fails or is not installed.
    """
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
        # NOTE(review): other OSError values fall through and the function
        # still returns the (possibly empty) output file -- confirm intent

    return tmp_fasta_file_name
1149    
# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    """Return the path of the reference FASTA for idx_prefix.

    Looks for <idx_prefix>.fa next to the index, then under $BOWTIE_INDEXES;
    as a last resort, reconstructs the FASTA from the Bowtie index itself.
    """
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta

    # try the BOWTIE_INDEXES directory, if configured
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var:
        idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
        if os.path.exists(idx_fasta):
            return idx_fasta

    th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
    return bowtie_idx_to_fa(idx_prefix)
1167    
# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix, is_bowtie2):
    """Validate the Bowtie index and return (ref_fasta_path, None)."""
    check_bowtie_index(idx_prefix, is_bowtie2)
    return (check_fasta(idx_prefix), None)
1174    
# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version(is_bowtie2):
    """Return the installed Bowtie version as a list of four ints, or None.

    Runs `<bowtie_path> --version` and scrapes the version line; a bowtie2
    beta suffix ("2.0.0-beta5") becomes the fourth element, and bowtie1's
    three-part version is padded with a trailing 0.  Returns None when no
    version line is found; dies when the binary cannot be executed.
    """
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        # work on repr() of the output so newlines appear as literal "\n"
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]

            if is_bowtie2:
                # e.g. "2.0.0-beta5" -> [2, 0, 0, 5]
                # NOTE(review): a bowtie2 version string without a "-beta"
                # suffix (e.g. "2.1.0") would leave ver_numbers with only
                # three elements and make ver_numbers[3] raise -- confirm
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

            # pad bowtie1's [major, minor, patch] to four elements
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
1215    
def get_index_sam_header(params, idx_prefix):
    """Build a SAM header file for the given Bowtie index.

    Runs bowtie against /dev/null so it emits the @SQ sequence dictionary
    for the index, then writes a complete header (@HD, optional @RG built
    from the user's read-group options, @SQ lines, @PG) to
    <tmp_dir>/<index>.bwt.samheader.sam and returns that filename.
    Dies if bowtie cannot be executed.
    """
    try:
        temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
        temp_sam_header_file = open(temp_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path]

        read_params = params.read_params
        if not params.bowtie2:
            # bowtie1 only emits SAM when asked; bowtie2 always does
            bowtie_header_cmd += ["--sam"]

        if read_params.color:
            bowtie_header_cmd.append('-C')

        # aligning /dev/null yields no alignments but still prints the header
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=temp_sam_header_file,
                        stderr=open('/dev/null'))

        temp_sam_header_file.close()
        temp_sam_header_file = open(temp_sam_header_filename, "r")

        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        preamble = []
        sq_dict_lines = []

        # collect the @SQ records, keyed by sequence name (SN field)
        for line in temp_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name,line])
            elif line.find("CL"):
                # NOTE(review): str.find() returns -1 (truthy) when "CL" is
                # absent, so this branch skips every non-@SQ line and
                # preamble stays empty; preamble is never written out below,
                # so this is currently harmless -- confirm intent
                continue
            else:
                preamble.append(line)

        # emit our own @HD line (output will be coordinate-sorted later)
        print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
        # optional @RG line assembled from the user-supplied read-group options
        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> bowtie_sam_header_file, rg_str

        # sort @SQ lines by sequence name unless the user asked to keep the
        # original FASTA order
        if not params.keep_fasta_order:
            sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))

        for [name, line] in sq_dict_lines:
            print >> bowtie_sam_header_file, line
        print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        bowtie_sam_header_file.close()
        temp_sam_header_file.close()
        return bowtie_sam_header_filename

    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
1300    
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(params):
    """Locate the bowtie (or bowtie2) binary and verify its version.

    When bowtie2 is requested but its version cannot be determined, falls
    back to bowtie1.  Records the executable path in the global bowtie_path;
    dies when no sufficiently recent Bowtie is available.
    """
    bowtie_req=""
    if params.bowtie2:
        bowtie_req="2"
    log_msg = "Checking for Bowtie" + bowtie_req
    th_log(log_msg)

    bowtie_bin = "bowtie"+bowtie_req

    global bowtie_path
    bowtie_path=prog_path(bowtie_bin)
    bowtie_version = get_bowtie_version(params.bowtie2)
    if params.bowtie2 and bowtie_version == None:
        #try to fallback on bowtie 1
        params.bowtie2=False
        bowtie_version=get_bowtie_version(params.bowtie2)
    if bowtie_version == None:
        die("Error: Bowtie not found on this system.")
    # Version vectors are [major, minor, patch, beta]; compare them
    # lexicographically.  The previous per-field '<' conjunctions accepted
    # some too-old versions (e.g. bowtie 0.12.0 passed because its minor
    # field was not < 12).
    if params.bowtie2:
        if bowtie_version < [2, 0, 0, 5]:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    else:
        if bowtie_version < [0, 12, 3, 0]:
            die("Error: TopHat requires Bowtie 0.12.3 or later")
    th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))
1327 gpertea 29
1328    
# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    """Return (version_string, [major, minor, patch, letter_or_0]) for the
    samtools binary at samtools_path; dies when samtools cannot be run.

    The version is scraped from samtools' usage message on stderr.
    """
    try:
        # Launch samtools to capture its version info (printed on stderr)
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier, e.g. "Version: 0.1.18 (r982:295)"
        version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        # a trailing letter (e.g. "0.1.12a") is kept as the fourth element
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: samtools not found on this system"
        die(errmsg)
1351    
# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate samtools, record its path in the global samtools_path, and
    verify it is version 0.1.7 or newer; dies otherwise."""
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # compare [major, minor, patch] lexicographically; the previous
    # field-by-field test ("arr[1] < 1 or arr[2] < 7") wrongly rejected
    # newer versions such as 0.2.0 or 1.2.0 whose patch field is < 7
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))
1363 gpertea 29
1364    
1365 gpertea 164
class FastxReader:
    """Sequential reader for FASTQ/FASTA streams with one-line pushback.

    The format is auto-detected from the first record marker ('@' or '>');
    nextRecord() is bound to nextFastq or nextFasta accordingly and yields
    (seqid, sequence, seq_len, quality_string_or_None) tuples.  For
    colorspace input (is_color) the primer base is stripped from the
    sequence and a leading dummy quality value is removed.
    """
    def __init__(self, i_file, is_color=0, fname=''):
        # i_file: an open file-like object; fname is used in error messages only
        self.bufline=None     # one-line pushback buffer serviced by getLine()
        self.format=None      # 'fastq' or 'fasta' once detected
        self.ifile=i_file
        self.nextRecord=None  # bound below to nextFastq or nextFasta
        self.eof=None
        self.fname=fname
        self.lastline=None
        self.numrecords=0
        self.isColor=0        # 1 for colorspace (SOLiD) input
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        while hlines>0 and self.lastline[0] not in "@>" :
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        # push the first record line back so nextRecord() starts with it
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")

            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #   raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #      % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
                #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
                #loop until qv has the same length as seq

            if self.isColor:
                # and qstrlen==seq_len :
                if qstrlen==seq_len:
                    #qual string may have a dummy qv at the beginning, should be stripped
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen!=seq_len-1:
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen :
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        ##--discard the primer base [NO]
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len, None)
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            #more sequence lines, or the ">" quality marker line:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor : # -- discard primer base
            seq_len-=1
            seqstr=seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        # serve the pushed-back line first, if any; otherwise read the stream
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline
    def ungetLine(self):
        # push the line last returned by getLine() back onto the buffer
        if self.lastline is None:
            th_logp("Warning: FastxReader called ungetLine() with no prior line!")
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader
1518    
def fa_write(fhandle, seq_id, seq):
    """
    Write one sequence to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write
    """
    line_len = 60  # standard FASTA line width
    fhandle.write(">" + seq_id + "\n")
    # emit the sequence in fixed-width chunks; unlike the previous
    # xrange-based loop, this never writes a spurious blank line when
    # len(seq) is an exact multiple of line_len (and avoids the '/'
    # int-division that breaks under Python 3)
    for start in range(0, len(seq), line_len):
        fhandle.write(seq[start:start + line_len] + "\n")
1538    
class ZReader:
    """Transparent reader for possibly-compressed input files.

    BAM files are streamed through bam2fastx; gzip/bzip2 files (detected by
    extension when guess=True, or by the zpacker ".z" extension otherwise)
    are piped through the appropriate decompressor.  The readable
    decompressed stream is exposed as self.file.
    """
    def __init__(self, filename, sysparams, guess=True):
        self.fname=filename
        self.file=None
        self.fsrc=None   # raw (compressed) source file feeding the pipe
        self.popen=None  # decompressor subprocess, when one is used
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
            # daehwan - want to ouptut mapped reads as well (hence --all)
            pipecmd=[prog_path("bam2fastx"), "--all", "-"]
        else:
            if guess:
                # pick a decompressor from the file extension
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                # prefer the parallel zipper (pigz/pbzip2) when configured
                if len(pipecmd)>0:
                    if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but must still check if it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd=[sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)

            # decompressors read stdin and write the plain stream to stdout
            if pipecmd:
                pipecmd+=['-cd']
        if pipecmd:
            try:
                self.fsrc=open(self.fname, 'rb')
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=self.fsrc,
                                            stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdout
        else:
            # plain uncompressed file: read it directly
            self.file=open(filename)
    def close(self):
        # close the raw source first, then the read end; reap the child
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen=None
1591    
class ZWriter:
    """Write-side counterpart of ZReader.

    When use_zpacker is set, data written to self.file is piped through the
    configured compressor (sysparams.zipper) into the target file; otherwise
    the target file is written directly.
    """
    def __init__(self, filename, sysparams):
        self.fname=filename
        if use_zpacker:
            pipecmd=[sysparams.zipper,"-cf", "-"]
            self.ftarget=open(filename, "wb")  # compressed output lands here
            try:
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=subprocess.PIPE,
                                            stderr=tophat_log, stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdin # client writes to this end of the pipe
        else: #no compression
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None
    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen=None
1616    
1617     # check_reads_format() examines the first few records in the user files
1618     # to determines the file format
1619     def check_reads_format(params, reads_files):
1620 gpertea 135 #seed_len = params.read_params.seed_length
1621 gpertea 29 fileformat = params.read_params.reads_format
1622    
1623     observed_formats = set([])
1624     # observed_scales = set([])
1625     min_seed_len = 99999
1626     max_seed_len = 0
1627     files = reads_files.split(',')
1628    
1629     for f_name in files:
1630     #try:
1631     zf = ZReader(f_name, params.system_params)
1632     #except IOError:
1633     # die("Error: could not open file "+f_name)
1634     freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1635     toread=4 #just sample the first 4 reads
1636     while toread>0:
1637     seqid, seqstr, seq_len, qstr = freader.nextRecord()
1638     if not seqid: break
1639     toread-=1
1640     if seq_len < 20:
1641 gpertea 164 th_logp("Warning: found a read < 20bp in "+f_name)
1642 gpertea 29 else:
1643     min_seed_len = min(seq_len, min_seed_len)
1644     max_seed_len = max(seq_len, max_seed_len)
1645     zf.close()
1646     observed_formats.add(freader.format)
1647     if len(observed_formats) > 1:
1648     die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1649     fileformat=list(observed_formats)[0]
1650 gpertea 135 #if seed_len != None:
1651     # seed_len = max(seed_len, max_seed_len)
1652     #else:
1653     # seed_len = max_seed_len
1654 gpertea 29 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1655 gpertea 164 th_logp("\tformat:\t\t %s" % fileformat)
1656 gpertea 29 if fileformat == "fastq":
1657     quality_scale = "phred33 (default)"
1658     if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1659     quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1660     elif params.read_params.phred64_quals:
1661     quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1662 gpertea 164 th_logp("\tquality scale:\t %s" % quality_scale)
1663 gpertea 29 elif fileformat == "fasta":
1664     if params.read_params.color:
1665     params.read_params.integer_quals = True
1666    
1667     #print seed_len, format, solexa_scale
1668     #NOTE: seed_len will be re-evaluated later by prep_reads
1669     return TopHatParams.ReadParams(params.read_params.solexa_quals,
1670     params.read_params.phred64_quals,
1671     params.read_params.quals,
1672     params.read_params.integer_quals,
1673     params.read_params.color,
1674     params.read_params.library_type,
1675 gpertea 135 #seed_len,
1676     params.read_params.seed_length,
1677 gpertea 29 fileformat,
1678     params.read_params.mate_inner_dist,
1679     params.read_params.mate_inner_dist_std_dev,
1680     params.read_params.read_group_id,
1681     params.read_params.sample_id,
1682     params.read_params.library_id,
1683     params.read_params.description,
1684     params.read_params.seq_platform_unit,
1685     params.read_params.seq_center,
1686     params.read_params.seq_run_date,
1687     params.read_params.seq_platform)
1688    
1689     def log_tail(logfile, lines=1):
1690     f=open(logfile, "r")
1691     f.seek(0, 2)
1692     fbytes= f.tell()
1693     size=lines
1694     block=-1
1695     while size > 0 and fbytes+block*1024 > 0:
1696     if (fbytes+block*1024 > 0):
1697     ##Seek back once more, if possible
1698     f.seek( block*1024, 2 )
1699     else:
1700     #Seek to the beginning
1701     f.seek(0, 0)
1702     data= f.read( 1024 )
1703     linesFound= data.count('\n')
1704     size -= linesFound
1705     block -= 1
1706     if (fbytes + block*1024 > 0):
1707     f.seek(block*1024, 2)
1708     else:
1709     f.seek(0,0)
1710     f.readline() # find a newline
1711     lastBlocks= list( f.readlines() )
1712     f.close()
1713     return "\n".join(lastBlocks[-lines:])
1714    
1715     # Format a DateTime as a pretty string.
1716     # FIXME: Currently doesn't support days!
def formatTD(td):
    """Pretty-print a datetime.timedelta as 'HH:MM:SS', prefixed with the
    day count when the interval spans one day or more."""
    secs = td.seconds
    hh = secs // 3600
    mm = (secs % 3600) // 60
    ss = secs % 60
    if td.days > 0:
        return '%d days %02d:%02d:%02d' % (td.days, hh, mm, ss)
    return '%02d:%02d:%02d' % (hh, mm, ss)
1727    
class PrepReadsInfo:
    """Parses the small "key=value"-per-line info file that prep_reads
    writes (min/max read length, input and surviving read counts) and
    logs a one-line summary for the given side ("left"/"right")."""
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            # lines appear in this fixed order in the info file
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            # no surviving reads, or implausibly short reads -> treat as failure
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception:
            die(fail_str+"Error retrieving prep_reads info.")
        th_logp("\t%5s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count))
        if self.min_len < 20:
            th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places")
1743 gpertea 29
1744 gpertea 135
def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
    """Assemble the argument vector for launching the prep_reads binary.

    Each optional argument contributes the matching prep_reads flag;
    the read file list (and the quality file list, when present) are
    appended last as positional arguments.
    """
    cmd = [prog_path("prep_reads")]
    cmd.extend(params.cmd())
    fmt = params.read_params.reads_format
    if fmt == "fastq":
        cmd += ["--fastq"]
    elif fmt == "fasta":
        cmd += ["--fasta"]
    if hits_to_filter:
        cmd += ["--flt-hits="+hits_to_filter]
    if aux_file:
        cmd += ["--aux-outfile="+aux_file]
    if index_file:
        cmd += ["--index-outfile="+index_file]
    if filter_reads:
        cmd += ["--flt-reads="+filter_reads]
    cmd.append(reads_list)
    if quals_list:
        cmd.append(quals_list)
    return cmd
1766    
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
    """Run the prep_reads binary over the given read (and quality) files.

    Writes the filtered, renumbered read library to
    <tmp_dir>/<side>_kept_reads.fq (optionally zipper-compressed) plus an
    .info and .index file, and returns the tuple
    (kept_reads_filename, PrepReadsInfo).
    """
    reads_suffix = ".fq"

    # for parallelization, we don't compress the read files
    do_use_zpacker = use_zpacker
    if params.system_params.num_threads > 1:
        do_use_zpacker = False

    if do_use_zpacker: reads_suffix += ".z"

    output_name=side+"_kept_reads"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    # start from a clean output file
    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    # NOTE(review): this log name has no <side> component, so a second call
    # (right reads) overwrites the left-side log -- confirm if intended
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    info_file=output_dir+output_name+".info"
    index_file=kept_reads_filename+".index"

    filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if do_use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        # log the equivalent shell pipeline for reproducibility
        print >> run_log, shell_cmd
        if do_use_zpacker:
            # prep_reads | zipper > kept_reads, built as two Popen objects
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stderr=tophat_log, stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # a prep_reads failure takes precedence over a zipper failure
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            # uncompressed: prep_reads writes straight into the output file
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))

    kept_reads.close()
    # PrepReadsInfo parses the .info file and dies if no reads survived
    return kept_reads_filename, PrepReadsInfo(info_file, side)
1833    
1834 gpertea 135
1835 gpertea 29 # Call bowtie
1836     def bowtie(params,
1837     bwt_idx_prefix,
1838     reads_list,
1839     reads_format,
1840     num_mismatches,
1841     mapped_reads,
1842 gpertea 135 unmapped_reads,
1843     extra_output = "",
1844 gpertea 167 mapping_type = _reads_vs_G,
1845 gpertea 135 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1846 gpertea 29 start_time = datetime.now()
1847     bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1848 gpertea 135 reads_file=reads_list[0]
1849     readfile_basename=getFileBaseName(reads_file)
1850 gpertea 154
1851     g_mapping, t_mapping, seg_mapping = False, False, False
1852 gpertea 167 if mapping_type == _reads_vs_T:
1853 gpertea 154 t_mapping = True
1854 gpertea 167 elif mapping_type >= _segs_vs_G:
1855 gpertea 154 seg_mapping = True
1856     else:
1857     g_mapping = True
1858    
1859     bowtie_str = "Bowtie"
1860     if params.bowtie2:
1861     bowtie_str += "2"
1862    
1863     if seg_mapping:
1864     if not params.bowtie2:
1865     backup_bowtie_alignment_option = params.bowtie_alignment_option
1866     params.bowtie_alignment_option = "-v"
1867    
1868 gpertea 135 if t_mapping:
1869 gpertea 164 th_log("Mapping %s against transcriptome %s with %s %s" % (readfile_basename,
1870     bwt_idx_name, bowtie_str, extra_output))
1871 gpertea 135 else:
1872 gpertea 164 th_log("Mapping %s against %s with %s %s" % (readfile_basename,
1873     bwt_idx_name, bowtie_str, extra_output))
1874 gpertea 29 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1875 gpertea 135 #bwt_mapped=mapped_reads
1876 gpertea 67 unmapped_reads_out=None
1877 gpertea 137 if unmapped_reads:
1878 gpertea 67 unmapped_reads_out=unmapped_reads+".fq"
1879 gpertea 29 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1880 gpertea 135 if use_FIFO:
1881 gpertea 29 global unmapped_reads_fifo
1882 gpertea 67 unmapped_reads_fifo=unmapped_reads+".fifo"
1883 gpertea 29 if os.path.exists(unmapped_reads_fifo):
1884     os.remove(unmapped_reads_fifo)
1885     try:
1886     os.mkfifo(unmapped_reads_fifo)
1887     except OSError, o:
1888     die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1889 gpertea 67 unmapped_reads_out+=".z"
1890 gpertea 29
1891     # Launch Bowtie
1892     try:
1893     bowtie_cmd = [bowtie_path]
1894     if reads_format == "fastq":
1895     bowtie_cmd += ["-q"]
1896     elif reads_format == "fasta":
1897     bowtie_cmd += ["-f"]
1898    
1899     if params.read_params.color:
1900     bowtie_cmd += ["-C", "--col-keepends"]
1901     if unmapped_reads:
1902 gpertea 135 bowtie_cmd += ["--un", unmapped_reads_out]
1903    
1904     unzip_cmd=None
1905     bam_input=False
1906 gpertea 154 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1907 gpertea 135 bam_input=True
1908 gpertea 164 unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
1909 gpertea 135 if reads_format:
1910 gpertea 154 unzip_cmd.append("--" + reads_format)
1911 gpertea 159 unzip_cmd+=[reads_list[0]]
1912 gpertea 137
1913     if use_zpacker and (unzip_cmd is None):
1914 gpertea 29 unzip_cmd=[ params.system_params.zipper ]
1915     unzip_cmd.extend(params.system_params.zipper_opts)
1916 gpertea 154 unzip_cmd+=['-cd']
1917 gpertea 29
1918     fifo_pid=None
1919     if use_FIFO:
1920 gpertea 135 unm_zipcmd=[ params.system_params.zipper ]
1921     unm_zipcmd.extend(params.system_params.zipper_opts)
1922     unm_zipcmd+=['-c']
1923     print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1924     fifo_pid=os.fork()
1925     if fifo_pid==0:
1926     def on_sig_exit(sig, func=None):
1927 gpertea 29 os._exit(os.EX_OK)
1928 gpertea 135 signal.signal(signal.SIGTERM, on_sig_exit)
1929     subprocess.call(unm_zipcmd,
1930     stdin=open(unmapped_reads_fifo, "r"),
1931 gpertea 167 stderr=tophat_log,
1932 gpertea 135 stdout=open(unmapped_reads_out, "wb"))
1933     os._exit(os.EX_OK)
1934 gpertea 29
1935     fix_map_cmd = [prog_path('fix_map_ordering')]
1936 gpertea 154
1937     if params.bowtie2:
1938     if t_mapping or g_mapping:
1939     max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1940     max_penalty, min_penalty = int(max_penalty), int(min_penalty)
1941     min_score = (max_penalty - 1) * num_mismatches
1942     fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1943 gpertea 163
1944 gpertea 137 samzip_cmd=None
1945 gpertea 154
1946     # daehwan - just use bam format for multi-threading.
1947     if unmapped_reads or params.system_params.num_threads >= 1:
1948 gpertea 29 #mapping on the genome
1949     #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1950     mapped_reads += ".bam"
1951 gpertea 154
1952     temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1953     fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1954     if not params.read_params.color and unmapped_reads:
1955     unmapped_reads_out = unmapped_reads+".bam"
1956     fix_map_cmd += [unmapped_reads_out]
1957 gpertea 29 else:
1958     #mapping on segment_juncs (spliced potential junctions)
1959 gpertea 135 # don't use SAM headers, could be slow -- better use compressed SAM directly
1960 gpertea 29 mapped_reads += ".sam"
1961     fix_map_cmd += ["-"]
1962     if use_zpacker:
1963     fix_map_cmd += ["-"]
1964 gpertea 137 samzip_cmd=[ params.system_params.zipper ]
1965     samzip_cmd.extend(params.system_params.zipper_opts)
1966     samzip_cmd += ["-c"]
1967 gpertea 29 mapped_reads += ".z"
1968     else:
1969     fix_map_cmd += [mapped_reads]
1970 gpertea 137
1971 gpertea 154 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1972    
1973 gpertea 135 if t_mapping:
1974 gpertea 159 max_hits = params.t_max_hits
1975     elif seg_mapping:
1976     max_hits = params.max_seg_hits
1977     else:
1978     max_hits = params.max_hits
1979 gpertea 163
1980 gpertea 154 if num_mismatches > 3:
1981     num_mismatches = 3
1982 gpertea 29
1983 gpertea 154 if params.bowtie2:
1984     if seg_mapping:
1985     # since bowtie2 does not suppress reads that map to many places,
1986     # we suppress those in segment_juncs and long_spanning_reads.
1987     bowtie_cmd += ["-k", str(max_hits + 1)]
1988     else:
1989     bowtie_cmd += ["-k", str(max_hits)]
1990    
1991     bowtie2_params = params.bowtie2_params
1992     if seg_mapping:
1993     # after intensive testing,
1994     # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
1995     # but a chance for further improvements remains.
1996     bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1997     bowtie_cmd += ["-i", "C,2,0"]
1998     bowtie_cmd += ["-L", "20"]
1999     # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
2000     else:
2001     bowtie2_preset = ""
2002     if bowtie2_params.very_fast:
2003     bowtie2_preset = "--very-fast"
2004     elif bowtie2_params.fast:
2005     bowtie2_preset = "--fast"
2006     elif bowtie2_params.sensitive:
2007     bowtie2_preset = "--sensitive"
2008     elif bowtie2_params.very_sensitive:
2009     bowtie2_preset = "--very-sensitive"
2010    
2011     if bowtie2_preset != "":
2012     bowtie_cmd += [bowtie2_preset]
2013     else:
2014     bowtie_cmd += ["-D", str(bowtie2_params.D),
2015     "-R", str(bowtie2_params.R),
2016     "-N", str(bowtie2_params.N),
2017     "-L", str(bowtie2_params.L),
2018     "-i", bowtie2_params.i]
2019    
2020     # "--n-ceil" is not correctly parsed in Bowtie2,
2021     # I (daehwan) already talked to Ben who will fix the problem.
2022     bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
2023     "--gbar", str(bowtie2_params.gbar),
2024     "--mp", bowtie2_params.mp,
2025     "--np", str(bowtie2_params.np),
2026     "--rdg", bowtie2_params.rdg,
2027     "--rfg", bowtie2_params.rfg,
2028     "--score-min", bowtie2_params.score_min]
2029    
2030 gpertea 135 else:
2031 gpertea 154 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
2032     "-k", str(max_hits),
2033     "-m", str(max_hits),
2034     "-S"]
2035    
2036     bowtie_cmd += ["-p", str(params.system_params.num_threads)]
2037    
2038     if params.bowtie2: #always use headerless SAM file
2039     bowtie_cmd += ["--sam-no-hd"]
2040     else:
2041     bowtie_cmd += ["--sam-nohead"]
2042    
2043     if not params.bowtie2:
2044     if multihits_out:
2045     bowtie_cmd += ["--max", multihits_out]
2046     else:
2047     bowtie_cmd += ["--max", "/dev/null"]
2048    
2049     if params.bowtie2:
2050     bowtie_cmd += ["-x"]
2051    
2052 gpertea 135 bowtie_cmd += [ bwt_idx_prefix ]
2053 gpertea 29 bowtie_proc=None
2054     shellcmd=""
2055     unzip_proc=None
2056 gpertea 135 samzip_proc=None
2057 gpertea 159
2058 gpertea 135 if multihits_out:
2059     #special prefilter bowtie run: we use prep_reads on the fly
2060     #in order to get multi-mapped reads to exclude later
2061     prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2062     preplog_fname=logging_dir + "prep_reads_prefilter.log"
2063     prepfilter_log = open(preplog_fname,"w")
2064     unzip_proc = subprocess.Popen(prep_cmd,
2065     stdout=subprocess.PIPE,
2066     stderr=prepfilter_log)
2067     shellcmd=' '.join(prep_cmd) + "|"
2068     else:
2069     z_input=use_zpacker and reads_file.endswith(".z")
2070     if z_input:
2071     unzip_proc = subprocess.Popen(unzip_cmd,
2072     stdin=open(reads_file, "rb"),
2073 gpertea 167 stderr=tophat_log, stdout=subprocess.PIPE)
2074 gpertea 135 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2075 gpertea 29 else:
2076 gpertea 135 #must be uncompressed fastq input (unmapped reads from a previous run)
2077     #or a BAM file with unmapped reads
2078     if bam_input:
2079 gpertea 167 unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE)
2080 gpertea 135 shellcmd=' '.join(unzip_cmd) + "|"
2081 gpertea 159 else:
2082     bowtie_cmd += [reads_file]
2083 gpertea 135 bowtie_proc = subprocess.Popen(bowtie_cmd,
2084     stdout=subprocess.PIPE,
2085     stderr=bwt_log)
2086     if unzip_proc:
2087     #input is compressed OR prep_reads is used as a filter
2088     bowtie_cmd += ['-']
2089     bowtie_proc = subprocess.Popen(bowtie_cmd,
2090     stdin=unzip_proc.stdout,
2091     stdout=subprocess.PIPE,
2092     stderr=bwt_log)
2093     unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2094    
2095     shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2096 gpertea 137 if samzip_cmd:
2097 gpertea 135 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2098     fix_order_proc = subprocess.Popen(fix_map_cmd,
2099 gpertea 154 stdin=bowtie_proc.stdout,
2100 gpertea 167 stderr=tophat_log,
2101 gpertea 154 stdout=subprocess.PIPE)
2102 gpertea 135 bowtie_proc.stdout.close() # needed?
2103     samzip_proc = subprocess.Popen(samzip_cmd,
2104     preexec_fn=subprocess_setup,
2105     stdin=fix_order_proc.stdout,
2106 gpertea 167 stderr=tophat_log,
2107 gpertea 135 stdout=open(mapped_reads, "wb"))
2108     fix_order_proc.stdout.close()
2109 gpertea 137 else:
2110 gpertea 135 #write BAM output directly
2111     fix_order_proc = subprocess.Popen(fix_map_cmd,
2112 gpertea 167 stdin=bowtie_proc.stdout,
2113     stderr=tophat_log)
2114 gpertea 135 # stdout=open(mapped_reads, "w"))
2115     bowtie_proc.stdout.close()
2116 gpertea 137
2117 gpertea 135 # shellcmd += " > " + mapped_reads
2118 gpertea 29 print >> run_log, shellcmd
2119 gpertea 135 if samzip_proc:
2120     samzip_proc.communicate()
2121 gpertea 29 else:
2122     fix_order_proc.communicate()
2123     if use_FIFO:
2124 gpertea 67 if fifo_pid and not os.path.exists(unmapped_reads_out):
2125 gpertea 29 try:
2126     os.kill(fifo_pid, signal.SIGTERM)
2127     except:
2128     pass
2129 gpertea 154
2130 gpertea 29 except OSError, o:
2131     die(fail_str+"Error: "+str(o))
2132    
2133     # Success
2134     #finish_time = datetime.now()
2135     #duration = finish_time - start_time
2136     #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2137     if use_FIFO:
2138     try:
2139     os.remove(unmapped_reads_fifo)
2140     except:
2141     pass
2142 gpertea 135 if multihits_out and not os.path.exists(multihits_out):
2143     open(multihits_out, "w").close()
2144 gpertea 29
2145 gpertea 154 if seg_mapping:
2146     if not params.bowtie2:
2147     params.bowtie_alignment_option = backup_bowtie_alignment_option
2148 gpertea 29
2149 gpertea 154 return (mapped_reads, unmapped_reads_out)
2150 gpertea 29
2151    
2152     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2153     def get_gtf_juncs(gff_annotation):
2154 gpertea 135 th_log("Reading known junctions from GTF file")
2155 gpertea 29 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2156    
2157     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2158    
2159     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2160     gtf_juncs_out = open(gtf_juncs_out_name, "w")
2161    
2162     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2163     try:
2164 gpertea 135 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2165 gpertea 29 retcode = subprocess.call(gtf_juncs_cmd,
2166     stderr=gtf_juncs_log,
2167     stdout=gtf_juncs_out)
2168     # cvg_islands returned an error
2169     if retcode == 1:
2170 gpertea 164 th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2171 gpertea 29 return (False, gtf_juncs_out_name)
2172     elif retcode != 0:
2173     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2174 gpertea 154
2175 gpertea 29 # cvg_islands not found
2176     except OSError, o:
2177     errmsg=fail_str+str(o)+"\n"
2178     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2179     errmsg+="Error: gtf_juncs not found on this system"
2180     die(errmsg)
2181     return (True, gtf_juncs_out_name)
2182    
# Call bowtie-build on the FASTA file of synthetic splice junction sequences
2184 gpertea 154 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2185 gpertea 135 th_log("Indexing splices")
2186 gpertea 29 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2187    
2188     #user_splices_out_prefix = output_dir + "user_splices_idx"
2189    
2190 gpertea 154 if is_bowtie2:
2191     bowtie_build_cmd = [prog_path("bowtie2-build")]
2192     else:
2193     bowtie_build_cmd = [prog_path("bowtie-build")]
2194    
2195 gpertea 29 if color:
2196     bowtie_build_cmd += ["-C"]
2197    
2198     bowtie_build_cmd += [external_splice_prefix + ".fa",
2199     external_splice_prefix]
2200     try:
2201     print >> run_log, " ".join(bowtie_build_cmd)
2202     retcode = subprocess.call(bowtie_build_cmd,
2203     stdout=bowtie_build_log)
2204    
2205     if retcode != 0:
2206     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2207     except OSError, o:
2208     errmsg=fail_str+str(o)+"\n"
2209     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2210     errmsg+="Error: bowtie-build not found on this system"
2211     die(errmsg)
2212     return external_splice_prefix
2213    
2214     # Build a splice index from a .juncs file, suitable for use with specified read
2215     # (or read segment) lengths
2216 gpertea 154 def build_juncs_index(is_bowtie2,
2217     min_anchor_length,
2218 gpertea 135 max_seg_len,
2219 gpertea 29 juncs_prefix,
2220     external_juncs,
2221     external_insertions,
2222     external_deletions,
2223 gpertea 154 external_fusions,
2224 gpertea 29 reference_fasta,
2225     color):
2226 gpertea 135 th_log("Retrieving sequences for splices")
2227 gpertea 29
2228     juncs_file_list = ",".join(external_juncs)
2229     insertions_file_list = ",".join(external_insertions)
2230     deletions_file_list = ",".join(external_deletions)
2231 gpertea 154 fusions_file_list = ",".join(external_fusions)
2232 gpertea 29
2233 gpertea 159 # do not use insertions and deletions in case of Bowtie2
2234     if is_bowtie2:
2235     insertions_file_list = "/dev/null"
2236     deletions_file_list = "/dev/null"
2237    
2238 gpertea 29 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2239    
2240     external_splices_out_prefix = tmp_dir + juncs_prefix
2241     external_splices_out_name = external_splices_out_prefix + ".fa"
2242    
2243     external_splices_out = open(external_splices_out_name, "w")
2244     # juncs_db_cmd = [bin_dir + "juncs_db",
2245     juncs_db_cmd = [prog_path("juncs_db"),
2246     str(min_anchor_length),
2247 gpertea 135 str(max_seg_len),
2248 gpertea 29 juncs_file_list,
2249     insertions_file_list,
2250     deletions_file_list,
2251 gpertea 154 fusions_file_list,
2252 gpertea 29 reference_fasta]
2253     try:
2254 gpertea 167 print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name
2255 gpertea 29 retcode = subprocess.call(juncs_db_cmd,
2256     stderr=juncs_db_log,
2257     stdout=external_splices_out)
2258    
2259     if retcode != 0:
2260     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2261     # juncs_db not found
2262     except OSError, o:
2263     errmsg=fail_str+str(o)+"\n"
2264     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2265     errmsg+="Error: juncs_db not found on this system"
2266     die(errmsg)
2267    
2268 gpertea 154 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2269 gpertea 29 return external_splices_out_prefix
2270    
2271 gpertea 154 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2272 gpertea 135 """ Build a bowtie index from a FASTA file.
2273    
2274     Arguments:
2275     - `fasta_fname`: File path to FASTA file.
2276     - `out_dir`: Output directory to place index in. (includes os.sep)
2277    
2278     Returns:
2279     - The path to the Bowtie index.
2280     """
2281     bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2282 gpertea 154
2283     if is_bowtie2:
2284     bowtie_idx_cmd = [prog_path("bowtie2-build")]
2285     else:
2286     bowtie_idx_cmd = [prog_path("bowtie-build")]
2287    
2288     if color:
2289     bowtie_idx_cmd += ["-C"]
2290    
2291     bowtie_idx_cmd += [fasta_fname,
2292     bwt_idx_path]
2293 gpertea 135 try:
2294     th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2295     print >> run_log, " ".join(bowtie_idx_cmd)
2296     retcode = subprocess.call(bowtie_idx_cmd,
2297     stdout=open(os.devnull, "w"),
2298     stderr=open(os.devnull, "w"))
2299     if retcode != 0:
2300     die(fail_str + "Error: Couldn't build bowtie index with err = "
2301     + str(retcode))
2302     except OSError, o:
2303     errmsg=fail_str+str(o)+"\n"
2304     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2305     errmsg+="Error: bowtie-build not found on this system"
2306     die(errmsg)
2307    
2308     return bwt_idx_path
2309    
2310 gpertea 29 # Print out the sam header, embedding the user's specified library properties.
2311     # FIXME: also needs SQ dictionary lines
2312     def write_sam_header(read_params, sam_file):
2313     print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2314     if read_params.read_group_id and read_params.sample_id:
2315     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2316     read_params.sample_id)
2317     if read_params.library_id:
2318     rg_str += "\tLB:%s" % read_params.library_id
2319     if read_params.description:
2320     rg_str += "\tDS:%s" % read_params.description
2321     if read_params.seq_platform_unit:
2322     rg_str += "\tPU:%s" % read_params.seq_platform_unit
2323     if read_params.seq_center:
2324     rg_str += "\tCN:%s" % read_params.seq_center
2325     if read_params.mate_inner_dist:
2326     rg_str += "\tPI:%s" % read_params.mate_inner_dist
2327     if read_params.seq_run_date:
2328     rg_str += "\tDT:%s" % read_params.seq_run_date
2329     if read_params.seq_platform:
2330     rg_str += "\tPL:%s" % read_params.seq_platform
2331    
2332     print >> sam_file, rg_str
2333     print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
2334    
2335     # Write final TopHat output, via tophat_reports and wiggles
2336 gpertea 154 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2337 gpertea 135 th_log("Reporting output tracks")
2338     left_maps, right_maps = mappings
2339     left_reads, right_reads = readfiles
2340 gpertea 29 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2341     left_maps = ','.join(left_maps)
2342    
2343     if len(right_maps) > 0:
2344     right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2345     right_maps = ','.join(right_maps)
2346     log_fname = logging_dir + "reports.log"
2347     report_log = open(log_fname, "w")
2348     junctions = output_dir + "junctions.bed"
2349     insertions = output_dir + "insertions.bed"
2350     deletions = output_dir + "deletions.bed"
2351     accepted_hits = output_dir + "accepted_hits"
2352     report_cmdpath = prog_path("tophat_reports")
2353 gpertea 154 fusions = output_dir + "fusions.out"
2354 gpertea 29 report_cmd = [report_cmdpath]
2355 gpertea 154
2356     alignments_output_filename = tmp_dir + "accepted_hits"
2357    
2358 gpertea 29 report_cmd.extend(params.cmd())
2359 gpertea 159 report_cmd += ["--sam-header", sam_header_filename]
2360     if params.report_secondary_alignments:
2361     report_cmd += ["--report-secondary-alignments"]
2362 gpertea 163
2363 gpertea 29 report_cmd.extend(["--samtools="+samtools_path])
2364 gpertea 154 report_cmd.extend([ref_fasta,
2365     junctions,
2366 gpertea 29 insertions,
2367     deletions,
2368 gpertea 154 fusions,
2369     alignments_output_filename,
2370 gpertea 29 left_maps,
2371     left_reads])
2372 gpertea 154
2373 gpertea 135 if len(right_maps) > 0 and right_reads:
2374 gpertea 29 report_cmd.append(right_maps)
2375     report_cmd.append(right_reads)
2376 gpertea 154
2377 gpertea 29 # -- tophat_reports now produces (uncompressed) BAM stream,
2378     try:
2379     if params.report_params.convert_bam:
2380     if params.report_params.sort_bam:
2381 gpertea 154 print >> run_log, " ".join(report_cmd)
2382     report_proc=subprocess.call(report_cmd,
2383     preexec_fn=subprocess_setup,
2384     stderr=report_log)
2385 gpertea 29
2386 gpertea 154 bam_parts = []
2387     for i in range(params.system_params.num_threads):
2388     bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2389     if os.path.exists(bam_part_filename):
2390     bam_parts.append(bam_part_filename)
2391     else:
2392     break
2393    
2394     num_bam_parts = len(bam_parts)
2395     pids = [0 for i in range(num_bam_parts)]
2396     sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2397     #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2398     #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2399     for i in range(num_bam_parts):
2400     bamsort_cmd = [samtools_path,
2401     "sort",
2402     bam_parts[i],
2403     sorted_bam_parts[i]]
2404    
2405     sorted_bam_parts[i] += ".bam"
2406     print >> run_log, " ".join(bamsort_cmd)
2407     pid = os.fork()
2408     if pid == 0:
2409     subprocess.call(bamsort_cmd,
2410     stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2411     os._exit(os.EX_OK)
2412     else:
2413     pids[i] = pid
2414    
2415     for i in range(len(pids)):
2416     if pids[i] > 0:
2417     os.waitpid(pids[i], 0)
2418     pids[i] = 0
2419    
2420     # for bam_part in bam_parts:
2421     # os.remove(bam_part)
2422    
2423     if num_bam_parts > 1:
2424     bammerge_cmd = [samtools_path,
2425     "merge",
2426     "-h",
2427     sam_header_filename,
2428     "%s.bam" % accepted_hits]
2429     bammerge_cmd += sorted_bam_parts
2430    
2431     print >> run_log, " ".join(bammerge_cmd)
2432     subprocess.call(bammerge_cmd,
2433     stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2434    
2435     for sorted_bam_part in sorted_bam_parts:
2436     os.remove(sorted_bam_part)
2437     else:
2438     os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2439    
2440 gpertea 29 else:
2441     print >> run_log, " ".join(report_cmd)
2442     report_proc=subprocess.call(report_cmd,
2443     preexec_fn=subprocess_setup,
2444     stdout=open(accepted_hits,"wb"),
2445     stderr=report_log)
2446     os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2447     else:
2448     print >> run_log, " ".join(report_cmd)
2449     report_proc=subprocess.call(report_cmd,
2450     preexec_fn=subprocess_setup,
2451     stdout=open(accepted_hits,"wb"),
2452     stderr=report_log)
2453     tmp_sam = output_dir + "accepted_hits.sam"
2454    
2455 gpertea 135 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2456 gpertea 29 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2457     bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2458     tmp_sam_file = open(tmp_sam, "w")
2459     ret = subprocess.call(bam_to_sam_cmd,
2460     stdout=tmp_sam_file,
2461     stderr=bam_to_sam_log)
2462     tmp_sam_file.close()
2463     os.remove(accepted_hits)
2464    
2465     except OSError, o:
2466     die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2467    
2468 gpertea 154 return junctions
2469 gpertea 29
2470    
2471     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2472     # for each segment This function needs to be fixed to support mixed read length
2473     # inputs
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    # Extend out_segf (in place) with ZWriter objects for the segment files
    # numbered num_files_prev+1 .. num_files, named "<prefix>_seg<i><extension>".
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        seg_fname = prefix + ("_seg%d" % seg_idx) + extension
        out_segf.append(ZWriter(seg_fname, params.system_params))
2480    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split each read in reads_filename into segment_length-sized pieces.

    One output file is written per segment index (see open_output_files);
    returns the list of segment file names. Handles FASTA or FASTQ input
    and SOLiD color-space reads.
    NOTE (from file header comment): needs fixing to support mixed read
    length inputs.
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    # pick output extension by input format; add ".z" when piping through
    # the external compressor
    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"

    def convert_color_to_bp(color_seq):
        # Decode a color-space sequence (leading base + color digits) into
        # nucleotides; '.'/'N' transitions decode to 'N'.
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # Encode a nucleotide sequence into color space; any transition
        # involving 'N' or '.' encodes as '4'.
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # Write one record's segments to the per-segment output files.
        # offsets holds the cut positions, so segment k spans
        # offsets[k]..offsets[k+1].
        if color:
            # color reads keep one extra leading char (primer base + first color)
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # skip the whole read if any segment boundary falls on an
            # invalid color call
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            # header encodes: original name | segment start : segment index : total segments
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # prepend the decoded base at the segment start as the new primer base
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # simple state machine over the input lines:
    #   FASTQ cycles 0=name, 1=sequence, 2='+' separator, 3=qualities;
    #   FASTA cycles 0=name, 1=sequence
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # fold the short remainder into the last segment
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # lazily open additional segment files if this read needs more
            # segments than any read seen so far
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                # FASTA records are complete at this point (no qualities)
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            # FASTQ '+' separator line: nothing to keep
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
2613    
2614     # Find possible splice junctions using the "closure search" strategy, and report
2615     # them in closures.juncs. Calls the executable closure_juncs
2616     def junctions_from_closures(params,
2617 gpertea 159 sam_header_filename,
2618 gpertea 29 left_maps,
2619     right_maps,
2620     ref_fasta):
2621 gpertea 135 th_log("Searching for junctions via mate-pair closures")
2622 gpertea 29
2623 gpertea 154
2624 gpertea 29 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2625     #if len(maps) == 0:
2626     # return None
2627     slash = left_maps[0].rfind('/')
2628     juncs_out = ""
2629     if slash != -1:
2630     juncs_out += left_maps[0][:slash+1]
2631 gpertea 154 fusions_out = juncs_out
2632    
2633 gpertea 29 juncs_out += "closure.juncs"
2634 gpertea 154 fusions_out += "closure.fusions"
2635 gpertea 29
2636     juncs_log = open(logging_dir + "closure.log", "w")
2637     juncs_cmdpath=prog_path("closure_juncs")
2638     juncs_cmd = [juncs_cmdpath]
2639    
2640     left_maps = ','.join(left_maps)
2641     right_maps = ','.join(right_maps)
2642    
2643     juncs_cmd.extend(params.cmd())
2644 gpertea 159 juncs_cmd.extend(["--sam-header", sam_header_filename,
2645     juncs_out,
2646 gpertea 154 fusions_out,
2647 gpertea 29 ref_fasta,
2648     left_maps,
2649     right_maps])
2650     try:
2651     print >> run_log, ' '.join(juncs_cmd)
2652     retcode = subprocess.call(juncs_cmd,
2653     stderr=juncs_log)
2654    
2655     # spanning_reads returned an error
2656     if retcode != 0:
2657     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2658     # cvg_islands not found
2659     except OSError, o:
2660     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2661 gpertea 164 th_logp(fail_str + "Error: closure_juncs not found on this system")
2662 gpertea 29 die(str(o))
2663     return [juncs_out]
2664    
2665     # Find possible junctions by examining coverage and split segments in the initial
2666     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2667     # segment.insertions, and segment.deletions. Calls the executable
2668     # segment_juncs
2669     def junctions_from_segments(params,
2670 gpertea 159 sam_header_filename,
2671 gpertea 29 left_reads,
2672     left_reads_map,
2673     left_seg_maps,
2674     right_reads,
2675     right_reads_map,
2676     right_seg_maps,
2677     unmapped_reads,
2678     reads_format,
2679     ref_fasta):
2680     if left_reads_map != left_seg_maps[0]:
2681 gpertea 135 th_log("Searching for junctions via segment mapping")
2682 gpertea 29 out_path=getFileDir(left_seg_maps[0])
2683     juncs_out=out_path+"segment.juncs"
2684     insertions_out=out_path+"segment.insertions"
2685     deletions_out =out_path+"segment.deletions"
2686 gpertea 154 fusions_out = out_path+"segment.fusions"
2687 gpertea 29
2688     left_maps = ','.join(left_seg_maps)
2689 gpertea 135 log_fname = logging_dir + "segment_juncs.log"
2690     segj_log = open(log_fname, "w")
2691     segj_cmd = [prog_path("segment_juncs")]
2692 gpertea 29
2693 gpertea 135 segj_cmd.extend(params.cmd())
2694 gpertea 159 segj_cmd.extend(["--sam-header", sam_header_filename,
2695     "--ium-reads", ",".join(unmapped_reads),
2696     ref_fasta,
2697     juncs_out,
2698     insertions_out,
2699     deletions_out,
2700     fusions_out,
2701     left_reads,
2702     left_reads_map,
2703     left_maps])
2704 gpertea 135 if right_seg_maps:
2705 gpertea 29 right_maps = ','.join(right_seg_maps)
2706 gpertea 135 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2707 gpertea 29 try:
2708 gpertea 135 print >> run_log, " ".join(segj_cmd)
2709     retcode = subprocess.call(segj_cmd,
2710 gpertea 29 preexec_fn=subprocess_setup,
2711 gpertea 135 stderr=segj_log)
2712 gpertea 29
2713     # spanning_reads returned an error
2714     if retcode != 0:
2715 gpertea 135 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2716 gpertea 154
2717 gpertea 29 # cvg_islands not found
2718     except OSError, o:
2719     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2720 gpertea 164 th_logp(fail_str + "Error: segment_juncs not found on this system")
2721 gpertea 29 die(str(o))
2722    
2723 gpertea 154 return [juncs_out, insertions_out, deletions_out, fusions_out]
2724 gpertea 29
2725     # Joins mapped segments into full-length read alignments via the executable
2726     # long_spanning_reads
2727     def join_mapped_segments(params,
2728     sam_header_filename,
2729     reads,
2730     ref_fasta,
2731     possible_juncs,
2732     possible_insertions,
2733     possible_deletions,
2734 gpertea 154 possible_fusions,
2735 gpertea 29 contig_seg_maps,
2736     spliced_seg_maps,
2737     alignments_out_name):
2738 gpertea 135 rn=""
2739 gpertea 29 if len(contig_seg_maps)>1:
2740 gpertea 135 th_log("Joining segment hits")
2741     rn=".segs"
2742 gpertea 29 else:
2743 gpertea 135 th_log("Processing bowtie hits")
2744 gpertea 29 contig_seg_maps = ','.join(contig_seg_maps)
2745    
2746     possible_juncs = ','.join(possible_juncs)
2747     possible_insertions = ",".join(possible_insertions)
2748     possible_deletions = ",".join(possible_deletions)
2749 gpertea 154 possible_fusions = ",".join(possible_fusions)
2750    
2751 gpertea 135 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2752     align_log = open(log_fname, "w")
2753 gpertea 29 align_cmd = [prog_path("long_spanning_reads")]
2754    
2755     align_cmd.extend(params.cmd())
2756 gpertea 159 align_cmd += ["--sam-header", sam_header_filename]
2757 gpertea 29
2758 gpertea 159 b2_params = params.bowtie2_params
2759     max_penalty, min_penalty = b2_params.mp.split(',')
2760     align_cmd += ["--bowtie2-max-penalty", max_penalty,
2761     "--bowtie2-min-penalty", min_penalty]
2762    
2763     align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2764    
2765     read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2766     align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2767     "--bowtie2-read-gap-cont", read_gap_cont]
2768    
2769     ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2770     align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2771     "--bowtie2-ref-gap-cont", ref_gap_cont]
2772    
2773 gpertea 29 align_cmd.append(ref_fasta)
2774 gpertea 154 align_cmd.extend([reads,
2775     possible_juncs,
2776     possible_insertions,
2777     possible_deletions,
2778     possible_fusions,
2779     alignments_out_name,
2780     contig_seg_maps])
2781 gpertea 29
2782 gpertea 135 if spliced_seg_maps:
2783 gpertea 29 spliced_seg_maps = ','.join(spliced_seg_maps)
2784     align_cmd.append(spliced_seg_maps)
2785    
2786     try:
2787 gpertea 154 print >> run_log, " ".join(align_cmd)
2788     join_proc=subprocess.call(align_cmd,
2789     stderr=align_log)
2790    
2791 gpertea 29 except OSError, o:
2792     die(fail_str+"Error: "+str(o))
2793    
2794     # This class collects spliced and unspliced alignments for each of the
2795     # left and right read files provided by the user.
class Maps:
    """Bundle of mapping results for one side (left or right) of the reads:
    the unspliced alignments, the per-segment alignments, the per-segment
    unmapped reads, and the segment read files themselves."""
    def __init__(self, unspliced_sam, seg_maps, unmapped_segs, segs):
        self.unspliced_sam = unspliced_sam    # alignments of full-length reads
        self.seg_maps = seg_maps              # list of segment alignment files
        self.unmapped_segs = unmapped_segs    # list of unmapped-segment read files
        self.segs = segs                      # list of segment read files
2806    
2807 gpertea 135 # Map2GTF stuff
2808 gpertea 159 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2809 gpertea 135 """ajjkljlks
2810    
2811     Arguments:
2812     - `params`: TopHat parameters
2813     - `gtf_fname`: File name pointing to the annotation.
2814     - `reads`: The reads to convert coords (in Bowtie format).
2815     - `out_fname`: The file name pointing to the output.
2816     """
2817     m2g_cmd = [prog_path("map2gtf")]
2818     m2g_cmd.extend(params.cmd())
2819 gpertea 159 m2g_cmd += ["--sam-header", sam_header_filename]
2820 gpertea 135 m2g_cmd.append(gtf_fname)
2821     m2g_cmd.append(reads) #could be BAM file
2822     m2g_cmd.append(out_fname)
2823     fbasename = getFileBaseName(reads)
2824     m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2825     m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2826    
2827     try:
2828     th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2829     print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2830     ret = subprocess.call(m2g_cmd,
2831     stdout=open(m2g_log, "w"),
2832     stderr=open(m2g_err, "w"))
2833     if ret != 0:
2834     die(fail_str + " Error: map2gtf returned an error")
2835     except OSError, o:
2836     err_msg = fail_str + str(o)
2837     die(err_msg + "\n")
2838    
2839    
2840     def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2841     """ Make the transcriptome from a GTF.
2842    
2843     Arguments:
2844     - `trans_gtf`:
2845     - `genome`:
2846     - `out_fname`:
2847     """
2848     # TODO: Call gtf_to_fasta
2849     g2f_cmd = [prog_path("gtf_to_fasta")]
2850     g2f_cmd.extend(params.cmd())
2851     g2f_cmd.append(trans_gtf)
2852     g2f_cmd.append(genome)
2853     g2f_cmd.append(out_fname)
2854    
2855     g2f_log = logging_dir + "g2f.out"
2856     g2f_err = logging_dir + "g2f.err"
2857    
2858     try:
2859     print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2860     ret = subprocess.call(g2f_cmd,
2861     stdout = open(g2f_log, "w"),
2862     stderr = open(g2f_err, "w"))
2863     if ret != 0:
2864     die(fail_str + " Error: gtf_to_fasta returned an error.")
2865     except OSError, o:
2866     err_msg = fail_str + str(o)
2867     die(err_msg + "\n")
2868    
def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
    """Main GTF (transcriptome) mapping function.

    Builds (or reuses) the transcriptome bowtie index, maps each read file
    against it, then lifts the hits back to genomic coordinates via map2gtf.

    Arguments:
    - `params`: The TopHat parameters.
    - `genome_sam_header_filename`: SAM header of the genome index.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).

    Returns (sam_gtf_list, unmapped_gtf_list); both are padded with None
    when there is no right-side data.
    """
    test_input_file(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    t_out_dir = tmp_dir
    if params.transcriptome_index and not params.transcriptome_outdir:
        # the user supplied an already-built transcriptome index
        m2g_bwt_idx = params.transcriptome_index
        th_log("Using pre-built transcriptome index..")
    else:
        th_log("Creating transcriptome data files..")
        if params.transcriptome_outdir:
            t_out_dir = params.transcriptome_outdir + "/"
        m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)

    get_index_sam_header(params, m2g_bwt_idx)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF (transcriptome) coordinates
    for reads in [left_reads, right_reads]:
        if reads == None or os.path.getsize(reads) < 25:
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", _reads_vs_T)
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # convert each transcriptome-space map back to genomic coordinates
    sam_gtf_list = []
    for m2g_map in mapped_gtf_list:
        fbasename = getFileBaseName(m2g_map)
        sam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           genome_sam_header_filename,
                           params.gff_annotation,
                           m2g_map,
                           sam_out_fname)
        sam_gtf_list.append(sam_out_fname)

    # pad single-end results so callers can always unpack a pair
    if len(sam_gtf_list) < 2:
        sam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (sam_gtf_list, unmapped_gtf_list)
2942     # end Map2GTF
2943    
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    """Extract mappings and unmapped reads from the prefilter results.

    ri selects the read side (0=left, 1=right) in params.preflt_data.
    This is accomplished by a special prep_reads usage (triggered by
    --flt-hits): mappings go to out_mappings, unmapped reads are streamed
    (optionally through the external compressor) into out_unmapped.
    Returns (out_mappings, out_unmapped).
    """
    prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                            out_mappings,
                            target_reads,
                            params.preflt_data[ri].mappings)
    um_reads = open(out_unmapped, "wb")
    log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
    filter_log = open(log_fname,"w")

    # shell_cmd is only for logging; the actual pipe is built with Popen below
    shell_cmd = " ".join(prep_cmd)
    #add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + out_unmapped
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            # prep_reads | zipper > out_unmapped
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stderr=tophat_log, stdout=um_reads)
            prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # check the producer first, then the compressor
            retcode=prep_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            # no compressor: write prep_reads output directly
            retcode = subprocess.call(prep_cmd,
                                      stdout=um_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    um_reads.close()

    return (out_mappings, out_unmapped)
2995    
2996    
2997 gpertea 29 # The main aligment routine of TopHat. This function executes most of the
2998     # workflow producing a set of candidate alignments for each cDNA fragment in a
2999     # pair of SAM alignment files (for paired end reads).
3000     def spliced_alignment(params,
3001     bwt_idx_prefix,
3002     sam_header_filename,
3003     ref_fasta,
3004     read_len,
3005     segment_len,
3006 gpertea 135 prepared_reads,
3007 gpertea 29 user_supplied_junctions,
3008     user_supplied_insertions,
3009     user_supplied_deletions):
3010    
3011     possible_juncs = []
3012     possible_juncs.extend(user_supplied_junctions)
3013    
3014     possible_insertions = []
3015     possible_insertions.extend(user_supplied_insertions)
3016     possible_deletions = []
3017     possible_deletions.extend(user_supplied_deletions)
3018 gpertea 154 possible_fusions = []
3019 gpertea 29
3020 gpertea 135 left_reads, right_reads = prepared_reads
3021    
3022     maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data
3023 gpertea 154
3024 gpertea 29 #single_segments = False
3025    
3026 gpertea 135 # Before anything, map the reads using Map2GTF (if using annotation)
3027     m2g_maps = [ None, None ] # left, right
3028     initial_reads = [ left_reads, right_reads ]
3029    
3030     if params.gff_annotation:
3031     (mapped_gtf_list, unmapped_gtf_list) = \
3032 gpertea 159 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
3033 gpertea 135
3034     m2g_left_maps, m2g_right_maps = mapped_gtf_list
3035     m2g_maps = [m2g_left_maps, m2g_right_maps]
3036 gpertea 154 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
3037 gpertea 135 # The case where the user doesn't want to map to anything other
3038     # than the transcriptome OR we have no unmapped reads
3039     maps[0] = [m2g_left_maps]
3040     if right_reads:
3041 gpertea 154 maps[1] = [m2g_right_maps]
3042 gpertea 135 return maps
3043     # Feed the unmapped reads into spliced_alignment()
3044     initial_reads = unmapped_gtf_list[:]
3045     th_log("Resuming TopHat pipeline with unmapped reads")
3046    
3047     max_seg_len = segment_len #this is the ref seq span on either side of the junctions
3048     #to be extracted into segment_juncs.fa
3049    
3050     num_segs = int(read_len / segment_len)
3051     if (read_len % segment_len) >= min(segment_len-2, 20):
3052     #remainder is shorter but long enough to become a new segment
3053     num_segs += 1
3054     else:
3055     # the last segment is longer
3056     if num_segs>1: max_seg_len += (read_len % segment_len)
3057    
3058     if num_segs <= 1:
3059 gpertea 164 th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3060 gpertea 135
3061     # Using the num_segs value returned by check_reads(),
3062     # decide which junction discovery strategy to use
3063 gpertea 154 if num_segs < 3:
3064 gpertea 135 #if params.butterfly_search != False:
3065     # params.butterfly_search = True
3066     if params.coverage_search != False:
3067 gpertea 154 params.coverage_search = True
3068 gpertea 135 if num_segs == 1:
3069     segment_len = read_len
3070     else: #num_segs >= 3:
3071     # if we have at least three segments, just use split segment search,
3072     # which is the most sensitive and specific, fastest, and lightest-weight.
3073     # so unless specifically requested, disable the other junction searches
3074     if params.closure_search != True:
3075     params.closure_search = False
3076     if params.coverage_search != True:
3077     params.coverage_search = False
3078     if params.butterfly_search != True:
3079     params.butterfly_search = False
3080 gpertea 154
3081 gpertea 29 # Perform the first part of the TopHat work flow on the left and right
3082     # reads of paired ends separately - we'll use the pairing information later
3083 gpertea 154 have_left_IUM=False
3084 gpertea 135 for ri in (0,1):
3085     reads=initial_reads[ri]
3086 gpertea 154 if reads == None or not fileExists(reads,25):
3087 gpertea 29 continue
3088     fbasename=getFileBaseName(reads)
3089     unspliced_out = tmp_dir + fbasename + ".mapped"
3090 gpertea 135 unspliced_sam = None
3091     unmapped_reads = None
3092     #if use_zpacker: unspliced_out+=".z"
3093 gpertea 29 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3094 gpertea 135 if params.prefilter_multi:
3095     #unmapped_unspliced += ".z"
3096     (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3097     else:
3098 gpertea 29 # Perform the initial Bowtie mapping of the full length reads
3099 gpertea 135 (unspliced_sam, unmapped_reads) = bowtie(params,
3100 gpertea 154 bwt_idx_prefix,
3101     [reads],
3102     "fastq",
3103     params.genome_read_mismatches,
3104     unspliced_out,
3105     unmapped_unspliced,
3106     "",
3107 gpertea 167 _reads_vs_G)
3108 gpertea 29
3109 gpertea 154 # daehwan - check this out
3110     # params.bowtie2 = False
3111     # check_bowtie(params.bowtie2)
3112    
3113 gpertea 29 seg_maps = []
3114     unmapped_segs = []
3115     segs = []
3116 gpertea 154 have_IUM = fileExists(unmapped_reads,25)
3117     if ri==0 and have_IUM:
3118     have_left_IUM = True
3119     if num_segs > 1 and have_IUM:
3120 gpertea 29 # split up the IUM reads into segments
3121 gpertea 31 # unmapped_reads can be in BAM format
3122     read_segments = split_reads(unmapped_reads,
3123 gpertea 29 tmp_dir + fbasename,
3124     False,
3125     params,
3126     segment_len)
3127    
3128     # Map each segment file independently with Bowtie
3129     for i in range(len(read_segments)):
3130     seg = read_segments[i]
3131     fbasename=getFileBaseName(seg)
3132 gpertea 167 seg_out = tmp_dir + fbasename
3133     #if use_zpacker: seg_out += ".z"
3134 gpertea 135 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3135     if use_BWT_FIFO:
3136     unmapped_seg += ".z"
3137 gpertea 29 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3138 gpertea 154 (seg_map, unmapped) = bowtie(params,
3139     bwt_idx_prefix,
3140     [seg],
3141     "fastq",
3142     params.segment_mismatches,
3143     seg_out,
3144     unmapped_seg,
3145     extra_output,
3146 gpertea 167 _segs_vs_G)
3147 gpertea 29 seg_maps.append(seg_map)
3148     unmapped_segs.append(unmapped)
3149     segs.append(seg)
3150    
3151     # Collect the segment maps for left and right reads together
3152 gpertea 135 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3153 gpertea 29 else:
3154     # if there's only one segment, just collect the initial map as the only
3155     # map to be used downstream for coverage-based junction discovery
3156     read_segments = [reads]
3157 gpertea 135 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3158 gpertea 29
3159 gpertea 135 # XXX: At this point if using M2G, have three sets of reads:
3160     # mapped to transcriptome, mapped to genome, and unmapped (potentially
3161     # spliced or poly-A tails) - hp
3162    
3163 gpertea 29 # Unless the user asked not to discover new junctions, start that process
3164     # here
3165 gpertea 135 left_reads_map = maps[0].unspliced_sam
3166     left_seg_maps = maps[0].seg_maps
3167     unmapped_reads = maps[0].unmapped_segs
3168     if right_reads:
3169     right_reads_map = maps[1].unspliced_sam
3170     right_seg_maps = maps[1].seg_maps
3171     unmapped_reads.extend(maps[1].unmapped_segs)
3172     else:
3173     right_reads_map = None
3174     right_seg_maps = None
3175 gpertea 29
3176 gpertea 154 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3177 gpertea 29 # Call segment_juncs to infer a list of possible splice junctions from
3178     # the regions of the genome covered in the initial and segment maps
3179 gpertea 135 #if params.find_novel_juncs:
3180     #TODO: in m2g case, we might want to pass the m2g mappings as well,
3181     # or perhaps the GTF file directly
3182     # -> this could improve alternative junction detection?
3183 gpertea 29 juncs = junctions_from_segments(params,
3184 gpertea 159 sam_header_filename,
3185 gpertea 29 left_reads,
3186     left_reads_map,
3187     left_seg_maps,
3188     right_reads,
3189     right_reads_map,
3190     right_seg_maps,
3191     unmapped_reads,
3192     "fastq",
3193     ref_fasta)
3194 gpertea 135 if os.path.getsize(juncs[0]) != 0:
3195 gpertea 29 possible_juncs.append(juncs[0])
3196     if params.find_novel_indels:
3197 gpertea 154 if os.path.getsize(juncs[1]) != 0:
3198     possible_insertions.append(juncs[1])
3199     if os.path.getsize(juncs[2]) != 0:
3200     possible_deletions.append(juncs[2])
3201     if params.find_novel_fusions:
3202     if os.path.getsize(juncs[3]) != 0:
3203     possible_fusions.append(juncs[3])
3204 gpertea 29 # Optionally, and for paired reads only, use a closure search to
3205     # discover addtional junctions
3206 gpertea 135 if params.closure_search and left_reads and right_reads:
3207 gpertea 29 juncs = junctions_from_closures(params,
3208 gpertea 159 sam_header_filename,
3209 gpertea 135 [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
3210     [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
3211 gpertea 29 ref_fasta)
3212     if os.path.getsize(juncs[0]) != 0:
3213     possible_juncs.extend(juncs)
3214    
3215 gpertea 154 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3216 gpertea 29 spliced_seg_maps = None
3217     junc_idx_prefix = None
3218     else:
3219     junc_idx_prefix = "segment_juncs"
3220     if len(possible_insertions) == 0:
3221     possible_insertions.append(os.devnull)
3222     # print >> sys.stderr, "Warning: insertions database is empty!"
3223     if len(possible_deletions) == 0:
3224     possible_deletions.append(os.devnull)
3225     # print >> sys.stderr, "Warning: deletions database is empty!"
3226     if len(possible_juncs) == 0:
3227     possible_juncs.append(os.devnull)
3228 gpertea 164 th_logp("Warning: junction database is empty!")
3229 gpertea 154 if len(possible_fusions) == 0:
3230     possible_fusions.append(os.devnull)
3231 gpertea 135 if junc_idx_prefix:
3232 gpertea 154 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3233     3,
3234     max_seg_len,
3235     junc_idx_prefix,
3236     possible_juncs,
3237     possible_insertions,
3238     possible_deletions,
3239     possible_fusions,
3240     ref_fasta,
3241     params.read_params.color)
3242 gpertea 29
3243 gpertea 159 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3244 gpertea 154
3245 gpertea 29 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3246     # index with Bowtie
3247 gpertea 135 # for reads in [left_reads, right_reads]:
3248     for ri in (0,1):
3249     reads = initial_reads[ri]
3250 gpertea 29 spliced_seg_maps = []
3251 gpertea 135
3252     if reads == None or os.path.getsize(reads)<25:
3253 gpertea 29 continue
3254    
3255 gpertea 135 m2g_map = m2g_maps[ri]
3256    
3257     if junc_idx_prefix :
3258 gpertea 29 i = 0
3259 gpertea 135 for seg in maps[ri].segs:
3260     #search each segment
3261 gpertea 29 fsegname = getFileBaseName(seg)
3262 gpertea 167 seg_out = tmp_dir + fsegname + ".to_spliced"
3263     #if use_zpacker: seg_out += ".z"
3264 gpertea 135 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3265 gpertea 154 (seg_map, unmapped) = bowtie(params,
3266     tmp_dir + junc_idx_prefix,
3267     [seg],
3268     "fastq",
3269     params.segment_mismatches,
3270     seg_out,
3271     None,
3272     extra_output,
3273 gpertea 167 _segs_vs_J)
3274 gpertea 29 spliced_seg_maps.append(seg_map)
3275     i += 1
3276    
3277 gpertea 135 # Join the contigous and spliced segment hits into full length
3278     # read alignments
3279     rfname=getFileBaseName(reads)
3280     rfdir=getFileDir(reads)
3281 gpertea 29 mapped_reads = rfdir + rfname + ".candidates.bam"
3282 gpertea 135 # -- spliced mappings built from all segment mappings vs genome and junc_db
3283 gpertea 29 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3284 gpertea 135 #unspl_bwtfile = maps[ri].unspliced_bwt
3285     unspl_samfile = maps[ri].unspliced_sam
3286 gpertea 29 join_mapped_segments(params,
3287     sam_header_filename,
3288     reads,
3289     ref_fasta,
3290     possible_juncs,
3291     possible_insertions,
3292     possible_deletions,
3293 gpertea 154 possible_fusions,
3294 gpertea 135 maps[ri].seg_maps,
3295 gpertea 29 spliced_seg_maps,
3296     mapped_reads)
3297    
3298 gpertea 135 if num_segs > 1 or m2g_map:
3299     # Merge the spliced and unspliced full length alignments into
3300     # a single SAM file.
3301 gpertea 29 # The individual SAM files are all already sorted in
3302     # increasing read ID order.
3303     # NOTE: We also should be able to address bug #134 here, by replacing
3304     # contiguous alignments that poke into an intron by a small amount by
3305     # the correct spliced alignment.
3306 gpertea 154
3307 gpertea 29 try:
3308 gpertea 159 merge_cmd = [prog_path("bam_merge"),
3309     "--index-outfile", merged_map + ".index",
3310     "--sam-header", sam_header_filename,
3311     merged_map]
3312 gpertea 154
3313 gpertea 135 if num_segs > 1:
3314     merge_cmd += [ maps[ri].unspliced_sam ]
3315     if m2g_map:
3316     merge_cmd += [ m2g_map ]
3317 gpertea 154
3318     if os.path.exists(mapped_reads):
3319     merge_cmd += [ mapped_reads ]
3320     else:
3321     for bam_i in range(0, params.system_params.num_threads):
3322     temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3323     if os.path.exists(temp_bam):
3324     merge_cmd += [ temp_bam ]
3325     else:
3326     break
3327    
3328 gpertea 29 print >> run_log, " ".join(merge_cmd)
3329     ret = subprocess.call( merge_cmd,
3330 gpertea 167 stderr=open(logging_dir + "bam_merge.log", "w") )
3331 gpertea 29 if ret != 0:
3332     die(fail_str+"Error executing: "+" ".join(merge_cmd))
3333     except OSError, o:
3334     die(fail_str+"Error: "+str(o))
3335 gpertea 135 maps[ri] = [merged_map]
3336 gpertea 29
3337     if not params.system_params.keep_tmp:
3338     os.remove(mapped_reads)
3339     else:
3340 gpertea 135 # no segments or transcriptome mappings, so no merge is needed
3341     # because join_mapped_segments() produced the final BAM already
3342 gpertea 29 os.rename(mapped_reads, merged_map)
3343 gpertea 135 maps[ri] = [merged_map]
3344     if not params.system_params.keep_tmp:
3345 gpertea 29 os.remove(unspl_samfile)
3346 gpertea 154
3347 gpertea 29 return maps
3348    
# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    """Locate an executable.

    If *program* carries a path component, accept it only if that exact
    path is runnable.  Otherwise look next to this script (bin_dir)
    first, then scan every directory on $PATH.  Returns the resolved
    path, or None when nothing runnable was found.
    """
    def is_executable(candidate):
        # must be a regular file with the execute bit set for us
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    head, _tail = os.path.split(program)
    if head:
        # caller supplied an explicit path; use it if directly runnable
        if is_executable(program):
            return program
    else:
        # bare name: prefer a copy living alongside this script
        local = os.path.join(bin_dir, program)
        if is_executable(local):
            return local
    # last resort: search the PATH environment variable
    for directory in os.environ["PATH"].split(os.pathsep):
        candidate = os.path.join(directory, program)
        if is_executable(candidate):
            return candidate
    return None
3367    
def prog_path(program):
    """Resolve *program* with which(); abort the whole run if missing."""
    found = which(program)
    if found == None:
        die("Error locating program: "+program)
    return found
3373    
# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    """Return the TopHat release string (hard-coded for now)."""
    version = "2.0.0"
    return version
3377 gpertea 29
def mlog(msg):
    """Emit a debug message on stderr, tagged so it is easy to grep for."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
3380    
3381     def test_input_file(filename):
3382     try:
3383     test_file = open(filename, "r")
3384     except IOError, o:
3385     die("Error: Opening file %s" % filename)
3386     return
3387    
def main(argv=None):
    """Top-level driver for a TopHat run.

    Parses the command line, validates inputs and external tools,
    prepares the reads, runs the spliced alignment and compiles the
    final reports.  Returns 2 on a usage error (caught below); a normal
    run falls off the end and exits with status 0.
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # Positional args: <bowtie_index> <reads1> [<reads2>] [quals...]
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            # paired-end run: a second read list was supplied
            if params.read_params.mate_inner_dist == None:
                # default the mate inner distance instead of requiring -r
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        start_time = datetime.now()
        prepare_output_dir()
        init_logger(logging_dir + "tophat.log")

        th_logp()
        th_log("Beginning TopHat run (v"+get_version()+")")
        th_logp("-----------------------------------------------")

        # run.log records every external command issued (opened unbuffered)
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        check_bowtie(params)
        check_samtools()

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
            if not os.path.exists(params.gff_annotation):
                die("Error: cannot find transcript file %s" % params.gff_annotation)
            if os.path.getsize(params.gff_annotation)<10:
                die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
            if params.gff_annotation:
                #gff file given, so transcriptome data will be written there
                gff_basename = getFileBaseName(params.gff_annotation)
                #just in case, check if it's not already there (-G/--GTF given again by mistake)
                tpath, tname = os.path.split(params.transcriptome_index)
                new_subdir=False
                if tpath in (".", "./") or not tpath :
                    if not os.path.exists(params.transcriptome_index):
                        os.makedirs(params.transcriptome_index)
                        new_subdir=True
                if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                    params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
                gff_out=params.transcriptome_index+".gff"
                # only (re)generate transcriptome files if the copied GFF is
                # absent or differs in size from the one the user supplied
                if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
                    #generate the transcriptome data files
                    tpath, tname = os.path.split(params.transcriptome_index)
                    params.transcriptome_outdir=tpath
            t_gff=params.transcriptome_index+".gff"
            if params.transcriptome_outdir:
                #will create the transcriptome data files
                if not os.path.exists(params.transcriptome_outdir):
                    os.makedirs(params.transcriptome_outdir)
                copy(params.gff_annotation, t_gff)
            else:
                #try to use existing transcriptome data files

                if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
                check_bowtie_index(params.transcriptome_index, params.bowtie2)
            params.gff_annotation=t_gff
        #end @ transcriptome_index given

        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)

        th_log("Generating SAM header for "+bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        # Collect any user-provided junction/indel/fusion files
        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            if found_juncs:
                ## and not params.gff_annotation:
                user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            # optional -M pre-filtering pass: map the raw reads against the
            # genome once, setting aside multi-mapped reads for later
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            for ri in (0,1):
                reads_list=read_lists[ri]
                if not reads_list:
                    continue
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
                side_imap = tmp_dir + sides[ri]+"_im"
                #if use_zpacker: side_imap+=".z"
                side_ium = tmp_dir + sides[ri]+"_ium.fq"
                if use_BWT_FIFO:
                    side_ium += ".z"
                th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
                rdlist=reads_list.split(',')
                bwt=bowtie(params, bwt_idx_prefix, rdlist,
                           params.read_params.reads_format,
                           params.genome_read_mismatches,
                           side_imap, side_ium,
                           "", _reads_vs_G, params.preflt_data[ri].multihit_reads)
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left",
                                                      params.preflt_data[0].multihit_reads)
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right",
                                                            params.preflt_data[1].multihit_reads)
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        # The core pipeline: spliced alignment, then report generation
        input_reads = [left_kept_reads, right_kept_reads]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        if not params.system_params.keep_tmp:
            # remove intermediate mappings, prepped reads and the tmp dir
            for m in mappings[0]:
                os.remove(m)
            if left_kept_reads:
                os.remove(left_kept_reads)
            for m in mappings[1]:
                os.remove(m)
            if right_kept_reads:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        th_logp("-----------------------------------------------")
        th_log("Run complete: %s elapsed" % formatTD(duration))

    except Usage, err:
        th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
        th_logp("    for detailed help see http://tophat.cbcb.umd.edu/manual.html")
        return 2
3609    
3610    
# Script entry point: exit status is main()'s return value (2 on usage error,
# 0 on a normal run).
if __name__ == "__main__":
    sys.exit(main())

Properties

Name Value
svn:executable *