ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 164
Committed: Thu Feb 9 05:30:41 2012 UTC (8 years, 3 months ago) by gpertea
File size: 149805 byte(s)
Log Message:
wip

Line User Rev File contents
#!/usr/bin/env python

# encoding: utf-8
# NOTE(review): per PEP 263 an encoding declaration only takes effect on line 1
# or 2 of the file; on line 3 (after the blank line) it is ignored by CPython.
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
"""
import sys
try:
    # psyco is an optional JIT accelerator (Python 2 only); silently skipped
    # when not installed.
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
# NOTE: this binds the *datetime.time* type, not the `time` module.
from datetime import datetime, date, time
from shutil import copy
import logging
# Usage/help text printed for -v/--version errors, -h/--help and bad invocations.
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>    [ default: ./tophat_out ]
    --bowtie1                                  [ default: bowtie2      ]
    -a/--min-anchor                <int>       [ default: 8            ]
    -m/--splice-mismatches         <0-2>       [ default: 0            ]
    -i/--min-intron-length         <int>       [ default: 50           ]
    -I/--max-intron-length         <int>       [ default: 500000       ]
    -g/--max-multihits             <int>       [ default: 20           ]
    -x/--transcriptome-max-hits    <int>       [ default: 60           ]
    -n/--transcriptome-mismatches  <int>       [ default: 1            ]
    -M/--prefilter-multihits                   ( for -G/--GTF option, enable
                                                 an initial bowtie search
                                                 against the genome )
    --max-insertion-length         <int>       [ default: 3            ]
    --max-deletion-length          <int>       [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals                          (same as phred64-quals)
    --phred64-quals                            (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                                 (Solid - color space)
    --color-out
    --library-type                 <string>    (fr-unstranded, fr-firststrand,
                                                fr-secondstrand)
    -p/--num-threads               <int>       [ default: 1            ]
    -G/--GTF                       <filename>  (GTF/GFF with known transcripts)
    --transcriptome-index          <bwtidx>    (transcriptome bowtie index)
    -T/--transcriptome-only                    (map only to the transcriptome)
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>       [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --fusion-search
    --fusion-anchor-length         <int>       [ default: 20           ]
    --fusion-min-dist              <int>       [ default: 10000000     ]
    --fusion-read-mismatches       <int>       [ default: 2            ]
    --fusion-multireads            <int>       [ default: 2            ]
    --fusion-multipairs            <int>       [ default: 2            ]
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>   [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>   [ default: gzip         ]
    -X/--unmapped-fifo                         [use mkfifo to compress more temporary
                                                files for color space reads]

Advanced Options:
    --report-secondary-alignments
    --genome-read-mismatches       <int>       [ default: 2            ]
    -N/--read-mismatches           <int>       [ default: 2            ]
    --segment-mismatches           <int>       [ default: 2            ]
    --segment-length               <int>       [ default: 25           ]
    --bowtie-n                                 [ default: bowtie -v    ]
    --min-closure-exon             <int>       [ default: 100          ]
    --min-closure-intron           <int>       [ default: 50           ]
    --max-closure-intron           <int>       [ default: 5000         ]
    --min-coverage-intron          <int>       [ default: 50           ]
    --max-coverage-intron          <int>       [ default: 20000        ]
    --min-segment-intron           <int>       [ default: 50           ]
    --max-segment-intron           <int>       [ default: 500000       ]
    --no-sort-bam                              (Output BAM is not coordinate-sorted)
    --no-convert-bam                           (Do not convert to bam format.
                                                Output is <output_dir>accepted_hit.sam.
                                                Implies --no-sort-bam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N                         <int>       [ default: 0            ]
    --b2-L                         <int>       [ default: 20           ]
    --b2-i                         <func>      [ default: S,1,1.25     ]
    --b2-n-ceil                    <func>      [ default: L,0,0.15     ]
    --b2-gbar                      <int>       [ default: 4            ]

  Scoring options
    --b2-mp                        <int>,<int> [ default: 6,2          ]
    --b2-np                        <int>       [ default: 1            ]
    --b2-rdg                       <int>,<int> [ default: 5,3          ]
    --b2-rfg                       <int>,<int> [ default: 5,3          ]
    --b2-score-min                 <func>      [ default: L,-0.6,-0.6  ]

  Effort options
    --b2-D                         <int>       [ default: 15           ]
    --b2-R                         <int>       [ default: 2            ]


SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>    (read group ID)
    --rg-sample                    <string>    (sample ID)
    --rg-library                   <string>    (library ID)
    --rg-description               <string>    (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>    (e.g Illumina lane ID)
    --rg-center                    <string>    (sequencing center name)
    --rg-date                      <string>    (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>    (Sequencing platform descriptor)
'''

# -F/--min-isoform-fraction <float> [ default: 0.15 ]
#    (obsolete)
class Usage(Exception):
    """Raised to abort option parsing with a usage/error message in `msg`."""
    def __init__(self, msg):
        # Also initialize the Exception base so str(exc) carries the message
        # (previously the base __init__ was skipped and str(exc) was empty).
        Exception.__init__(self, msg)
        self.msg = msg
# --- module-level state shared across the whole pipeline run ---
output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
tophat_log = None     # main log file handle (set by init_logger)
tophat_logger = None  # main logging object (set by init_logger)
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False  # set by -z/--zpacker option (-z0 leaves it False)

# Automatically set to True for non-Solid reads; handles unmapped reads in BAM format.
use_BAM_Unmapped = False

# Can only be set to True if use_zpacker is True and only with -C/--color;
# enabled by -X/--unmapped-fifo option (unless -z0).
use_BWT_FIFO = False
# If use_BWT_FIFO is True, this tricks bowtie into writing the
# unmapped reads into a compressed file.
unmapped_reads_fifo = None

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"
gtf_juncs = None  # file name with junctions extracted from the given GFF file
def init_logger(log_fname):
    """Configure the global 'project' logger to write timestamped messages
    to both stderr and the file `log_fname`, and set the module globals
    tophat_logger / tophat_log.

    NOTE(review): calling this more than once adds duplicate handlers;
    the pipeline appears to call it a single time at startup.
    """
    global tophat_logger, tophat_log
    tophat_logger = logging.getLogger('project')
    formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
    tophat_logger.setLevel(logging.DEBUG)

    hstream = logging.StreamHandler(sys.stderr)
    hstream.setFormatter(formatter)
    tophat_logger.addHandler(hstream)

    # Output logging information to file, replacing any previous run's log.
    if os.path.isfile(log_fname):
        os.remove(log_fname)
    logfh = logging.FileHandler(log_fname)
    logfh.setFormatter(formatter)
    tophat_logger.addHandler(logfh)
    tophat_log = logfh.stream
201    
202 gpertea 29 # TopHatParams captures all of the runtime paramaters used by TopHat, and many
203     # of these are passed as command line options to exectubles run by the pipeline
204    
205     # This class and its nested classes also do options parsing through parse_options()
206     # and option validation via the member function check()
207    
class BowtieOutputFiles:
    """Bundles the three outputs of one bowtie run: the mappings plus the
    optional unmapped-read and multihit-read files."""
    def __init__(self,
                 mappings=None,
                 unmapped_reads=None,
                 multihit_reads=None):
        self.mappings = mappings
        self.unmapped_reads = unmapped_reads
        self.multihit_reads = multihit_reads
216    
217 gpertea 29 class TopHatParams:
218    
219     # SpliceConstraints is a group of runtime parameters that specify what
220     # constraints to put on junctions discovered by the program. These constraints
221     # are used to filter out spurious/false positive junctions.
222    
class SpliceConstraints:
    """Runtime parameters constraining junctions discovered by the program;
    used to filter out spurious/false positive junctions."""
    def __init__(self,
                 min_anchor_length,
                 min_intron_length,
                 max_intron_length,
                 splice_mismatches,
                 min_isoform_fraction):
        self.min_anchor_length = min_anchor_length
        self.min_intron_length = min_intron_length
        self.max_intron_length = max_intron_length
        self.splice_mismatches = splice_mismatches
        self.min_isoform_fraction = min_isoform_fraction

    def parse_options(self, opts):
        """Pick up the splice-constraint settings from (option, value) pairs."""
        for option, value in opts:
            if option in ("-m", "--splice-mismatches"):
                self.splice_mismatches = int(value)
            elif option in ("-a", "--min-anchor"):
                self.min_anchor_length = int(value)
            elif option in ("-F", "--min-isoform-fraction"):
                self.min_isoform_fraction = float(value)
            elif option in ("-i", "--min-intron-length"):
                self.min_intron_length = int(value)
            elif option in ("-I", "--max-intron-length"):
                self.max_intron_length = int(value)

    def check(self):
        """Validate the parsed values; die()s with a message on bad input."""
        if self.splice_mismatches not in [0, 1, 2]:
            die("Error: arg to --splice-mismatches must be 0, 1, or 2")
        if self.min_anchor_length < 4:
            # message fixed: the condition enforces a minimum of 4, not "> 4"
            die("Error: arg to --min-anchor-len must be at least 4")
        if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
            die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
        if self.min_intron_length <= 0:
            die("Error: arg to --min-intron-length must be greater than 0")
        if self.max_intron_length <= 0:
            die("Error: arg to --max-intron-length must be greater than 0")
260    
261     # SystemParams is a group of runtime parameters that determine how to handle
262     # temporary files produced during a run and how many threads to use for threaded
263     # stages of the pipeline (e.g. Bowtie)
264    
265     class SystemParams:
266     def __init__(self,
267 gpertea 154 num_threads,
268 gpertea 29 keep_tmp):
269 gpertea 154 self.num_threads = num_threads
270 gpertea 29 self.keep_tmp = keep_tmp
271     self.zipper = "gzip"
272     self.zipper_opts= []
273    
274     def parse_options(self, opts):
275     global use_zpacker
276     global use_BWT_FIFO
277     for option, value in opts:
278     if option in ("-p", "--num-threads"):
279 gpertea 154 self.num_threads = int(value)
280 gpertea 29 elif option == "--keep-tmp":
281     self.keep_tmp = True
282     elif option in ("-z","--zpacker"):
283     if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
284     value=""
285     self.zipper = value
286     #if not self.zipper:
287     # self.zipper='gzip'
288     elif option in ("-X", "--unmapped-fifo"):
289     use_BWT_FIFO=True
290     if self.zipper:
291     use_zpacker=True
292 gpertea 154 if self.num_threads>1 and not self.zipper_opts:
293 gpertea 29 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
294 gpertea 154 self.zipper_opts.append('-p'+str(self.num_threads))
295 gpertea 29 else:
296     use_zpacker=False
297     if use_BWT_FIFO: use_BWT_FIFO=False
298     def cmd(self):
299     cmdline=[]
300     if self.zipper:
301     cmdline.extend(['-z',self.zipper])
302 gpertea 154 if self.num_threads>1:
303     cmdline.extend(['-p'+str(self.num_threads)])
304 gpertea 29 return cmdline
305    
306     def check(self):
307 gpertea 154 if self.num_threads<1 :
308 gpertea 29 die("Error: arg to --num-threads must be greater than 0")
309     if self.zipper:
310     xzip=which(self.zipper)
311     if not xzip:
312     die("Error: cannot find compression program "+self.zipper)
313    
314     # ReadParams is a group of runtime parameters that specify various properties
315     # of the user's reads (e.g. which quality scale their are on, how long the
316     # fragments are, etc).
class ReadParams:
    """Properties of the user's reads: quality-score encoding, colorspace,
    library type, mate-pair geometry, and SAM read-group (RG) metadata."""
    def __init__(self,
                 solexa_quals,
                 phred64_quals,
                 quals,
                 integer_quals,
                 color,
                 library_type,
                 seed_length,
                 reads_format,
                 mate_inner_dist,
                 mate_inner_dist_std_dev,
                 read_group_id,
                 sample_id,
                 library_id,
                 description,
                 seq_platform_unit,
                 seq_center,
                 seq_run_date,
                 seq_platform):
        self.solexa_quals = solexa_quals
        self.phred64_quals = phred64_quals
        self.quals = quals
        self.integer_quals = integer_quals
        self.color = color
        self.library_type = library_type
        self.seed_length = seed_length
        self.reads_format = reads_format
        self.mate_inner_dist = mate_inner_dist
        self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
        self.read_group_id = read_group_id
        self.sample_id = sample_id
        self.library_id = library_id
        self.description = description
        self.seq_platform_unit = seq_platform_unit
        self.seq_center = seq_center
        self.seq_run_date = seq_run_date
        self.seq_platform = seq_platform

    def parse_options(self, opts):
        """Pick up read-related settings from (option, value) pairs."""
        for option, value in opts:
            if option == "--solexa-quals":
                self.solexa_quals = True
            elif option in ("--solexa1.3-quals", "--phred64-quals"):
                self.phred64_quals = True
            elif option in ("-Q", "--quals"):
                self.quals = True
            elif option == "--integer-quals":
                self.integer_quals = True
            elif option in ("-C", "--color"):
                self.color = True
            elif option == "--library-type":
                self.library_type = value
            elif option in ("-s", "--seed-length"):
                self.seed_length = int(value)
            elif option in ("-r", "--mate-inner-dist"):
                self.mate_inner_dist = int(value)
            elif option == "--mate-std-dev":
                self.mate_inner_dist_std_dev = int(value)
            elif option == "--rg-id":
                self.read_group_id = value
            elif option == "--rg-sample":
                self.sample_id = value
            elif option == "--rg-library":
                self.library_id = value
            elif option == "--rg-description":
                self.description = value
            elif option == "--rg-platform-unit":
                self.seq_platform_unit = value
            elif option == "--rg-center":
                self.seq_center = value
            elif option == "--rg-date":
                self.seq_run_date = value
            elif option == "--rg-platform":
                self.seq_platform = value

    def check(self):
        """Validate the parsed values; die()s with a message on bad input."""
        if self.seed_length and self.seed_length < 20:
            die("Error: arg to --seed-length must be at least 20")
        if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
            die("Error: arg to --mate-std-dev must at least 0")
        # RG id and RG sample must be given together (both or neither)
        if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
            die("Error: --rg-id and --rg-sample must be specified or omitted together")
401    
402     # SearchParams is a group of runtime parameters that specify how TopHat will
403     # search for splice junctions
404    
class SearchParams:
    """Runtime parameters that specify how TopHat will search for splice
    junctions: exon/intron length bounds for closure, coverage and segment
    search."""
    def __init__(self,
                 min_closure_exon,
                 min_closure_intron,
                 max_closure_intron,
                 min_coverage_intron,
                 max_coverage_intron,
                 min_segment_intron,
                 max_segment_intron):
        self.min_closure_exon_length = min_closure_exon
        self.min_closure_intron_length = min_closure_intron
        self.max_closure_intron_length = max_closure_intron
        self.min_coverage_intron_length = min_coverage_intron
        self.max_coverage_intron_length = max_coverage_intron
        self.min_segment_intron_length = min_segment_intron
        self.max_segment_intron_length = max_segment_intron

    def parse_options(self, opts):
        """Pick up the search-bound settings from (option, value) pairs."""
        for option, value in opts:
            if option == "--min-closure-exon":
                self.min_closure_exon_length = int(value)
            elif option == "--min-closure-intron":
                self.min_closure_intron_length = int(value)
            elif option == "--max-closure-intron":
                self.max_closure_intron_length = int(value)
            elif option == "--min-coverage-intron":
                self.min_coverage_intron_length = int(value)
            elif option == "--max-coverage-intron":
                self.max_coverage_intron_length = int(value)
            elif option == "--min-segment-intron":
                self.min_segment_intron_length = int(value)
            elif option == "--max-segment-intron":
                self.max_segment_intron_length = int(value)

    def check(self):
        """Validate the parsed values; die()s with a message on bad input.

        NOTE(review): every condition rejects only negative values while the
        messages claim a minimum of 20 -- behavior preserved as-is; confirm
        the intended lower bound upstream.
        """
        if self.min_closure_exon_length < 0:
            die("Error: arg to --min-closure-exon must be at least 20")
        if self.min_closure_intron_length < 0:
            die("Error: arg to --min-closure-intron must be at least 20")
        if self.max_closure_intron_length < 0:
            die("Error: arg to --max-closure-intron must be at least 20")
        if self.min_coverage_intron_length < 0:
            die("Error: arg to --min-coverage-intron must be at least 20")
        if self.max_coverage_intron_length < 0:
            die("Error: arg to --max-coverage-intron must be at least 20")
        if self.min_segment_intron_length < 0:
            die("Error: arg to --min-segment-intron must be at least 20")
        if self.max_segment_intron_length < 0:
            die("Error: arg to --max-segment-intron must be at least 20")
455    
class ReportParams:
    """Output-format flags: whether to emit coordinate-sorted BAM output."""
    def __init__(self):
        self.sort_bam = True
        self.convert_bam = True

    def parse_options(self, opts):
        """Pick up --no-sort-bam / --no-convert-bam; the latter implies the former."""
        for option, value in opts:
            if option == "--no-sort-bam":
                self.sort_bam = False
            elif option == "--no-convert-bam":
                self.convert_bam = False
                self.sort_bam = False
468    
class Bowtie2Params:
    """Options forwarded to bowtie2: end-to-end presets plus the alignment,
    scoring and effort knobs (--b2-* family)."""
    def __init__(self):
        # Preset flags; at most one may be selected (enforced by check()).
        self.very_fast = False
        self.fast = False
        self.sensitive = False
        self.very_sensitive = False

        # Alignment options.
        self.N = 0
        self.L = 20
        self.i = "S,1,1.25"
        self.n_ceil = "L,0,0.15"
        self.gbar = 4

        # Scoring options.
        self.mp = "6,2"
        self.np = 1
        self.rdg = "5,3"
        self.rfg = "5,3"
        self.score_min = "L,-0.6,-0.6"

        # Effort options.
        self.D = 15
        self.R = 2

    def parse_options(self, opts):
        """Pick up all --b2-* settings from (option, value) pairs."""
        for option, value in opts:
            if option == "--b2-very-fast":
                self.very_fast = True
            elif option == "--b2-fast":
                self.fast = True
            elif option == "--b2-sensitive":
                self.sensitive = True
            elif option == "--b2-very-sensitive":
                self.very_sensitive = True
            elif option == "--b2-N":
                self.N = int(value)
            elif option == "--b2-L":
                # fixed: previously hard-coded to 20, silently ignoring the user's value
                self.L = int(value)
            elif option == "--b2-i":
                self.i = value
            elif option == "--b2-n-ceil":
                self.n_ceil = value
            elif option == "--b2-gbar":
                # fixed: previously hard-coded to 4, silently ignoring the user's value
                self.gbar = int(value)
            elif option == "--b2-mp":
                self.mp = value
            elif option == "--b2-np":
                self.np = int(value)
            elif option == "--b2-rdg":
                self.rdg = value
            elif option == "--b2-rfg":
                self.rfg = value
            elif option == "--b2-score-min":
                self.score_min = value
            elif option == "--b2-D":
                self.D = int(value)
            elif option == "--b2-R":
                self.R = int(value)

    def check(self):
        """Validate preset exclusivity and the syntax of <func> and
        <int>,<int> arguments; die()s with a message on bad input."""
        presets = (self.very_fast, self.fast, self.sensitive, self.very_sensitive)
        if sum(1 for p in presets if p) > 1:
            die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

        if not self.N in [0, 1]:
            die("Error: arg to --b2-N must be either 0 or 1")

        # bowtie2 "function" syntax: C/L/S/G, then two (possibly negative) numbers
        function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
        if not re.search(function_re, self.i):
            die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
        if not re.search(function_re, self.n_ceil):
            die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
        if not re.search(function_re, self.score_min):
            die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

        pair_re = r'^\d+,\d+$'
        if not re.search(pair_re, self.mp):
            die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
        if not re.search(pair_re, self.rdg):
            # example fixed: previously showed the wrong option name (--b2-mp)
            die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-rdg 5,3)")
        if not re.search(pair_re, self.rfg):
            # example fixed: previously showed the wrong option name (--b2-mp)
            die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-rfg 5,3)")
575    
def __init__(self):
    """Build the default parameter set; parse_options()/check() refine it."""
    self.splice_constraints = self.SpliceConstraints(8,       # min_anchor
                                                     50,      # min_intron
                                                     500000,  # max_intron
                                                     0,       # splice_mismatches
                                                     0.15)    # min_isoform_frac

    # Bowtie output file sets for the optional -M/--prefilter-multihits pass.
    self.preflt_data = [BowtieOutputFiles(), BowtieOutputFiles()]

    self.read_params = self.ReadParams(False,    # solexa_quals
                                       False,    # phred64_quals
                                       False,    # quals
                                       None,     # integer_quals
                                       False,    # color (SOLiD color space)
                                       "",       # library_type
                                       None,     # seed_length
                                       "fastq",  # reads_format
                                       None,     # mate_inner_dist
                                       20,       # mate_inner_dist_std_dev
                                       None,     # read_group_id
                                       None,     # sample_id
                                       None,     # library_id
                                       None,     # description
                                       None,     # seq_platform_unit (i.e. lane)
                                       None,     # seq_center
                                       None,     # seq_run_date
                                       None)     # seq_platform

    self.system_params = self.SystemParams(1,      # num_threads
                                           False)  # keep_tmp

    self.search_params = self.SearchParams(100,     # min_closure_exon_length
                                           50,      # min_closure_intron_length
                                           5000,    # max_closure_intron_length
                                           50,      # min_coverage_intron_length
                                           20000,   # max_coverage_intron_length
                                           50,      # min_segment_intron_length
                                           500000)  # max_segment_intron_length

    self.report_params = self.ReportParams()
    self.bowtie2_params = self.Bowtie2Params()

    self.bowtie2 = True  # bowtie2 is the default aligner (--bowtie1 disables)
    self.gff_annotation = None
    self.transcriptome_only = False
    self.transcriptome_index = None
    self.transcriptome_outdir = None
    self.raw_junctions = None
    self.find_novel_juncs = True
    self.find_novel_indels = True
    self.find_novel_fusions = True
    self.find_GFF_juncs = True
    self.max_hits = 20
    self.t_max_hits = 60
    self.max_seg_hits = 40
    self.prefilter_multi = False
    self.genome_read_mismatches = 2
    self.max_read_mismatches = 2
    self.t_mismatches = 1
    self.segment_length = 25
    self.segment_mismatches = 2
    self.bowtie_alignment_option = "-v"
    self.max_insertion_length = 3
    self.max_deletion_length = 3
    self.raw_insertions = None
    self.raw_deletions = None
    self.coverage_search = None   # None = undecided; set by options/heuristics
    self.closure_search = False
    self.microexon_search = False
    self.butterfly_search = None  # None = undecided
    self.report_secondary_alignments = False

    self.keep_fasta_order = False
    self.partial_mapping = False

    self.fusion_search = False
    self.fusion_anchor_length = 20
    self.fusion_min_dist = 10000000
    self.fusion_read_mismatches = 2
    self.fusion_multireads = 2
    self.fusion_multipairs = 2
658 gpertea 29
def check(self):
    """Validate the fully parsed parameter set; die()s on invalid values and
    reconciles dependent settings (colorspace, intron length caps)."""
    self.splice_constraints.check()
    self.read_params.check()
    self.system_params.check()
    if self.segment_length < 10:
        die("Error: arg to --segment-length must at least 10")
    if self.segment_mismatches < 0 or self.segment_mismatches > 3:
        die("Error: arg to --segment-mismatches must in [0, 3]")
    if self.t_mismatches < 0 or self.t_mismatches > 3:
        die("Error: arg to -n or --transcriptome-mismatches must in [0, 3]")

    if self.read_params.color:
        # Colorspace (SOLiD) forces bowtie1 and excludes fusion/butterfly search.
        if self.bowtie2:
            th_log("Warning: bowtie2 in colorspace is not yet supported; --bowtie1 option assumed.")
            self.bowtie2 = False
        if self.fusion_search:
            die("Error: fusion-search in colorspace is not yet supported")
        if self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

    self.bowtie2_params.check()

    library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
    if self.read_params.library_type and self.read_params.library_type not in library_types:
        die("Error: library-type should be one of: " + ', '.join(library_types))

    # No per-search intron cap may exceed the global maximum intron length.
    self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_closure_intron_length)
    self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                       self.search_params.max_segment_intron_length)
    self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                        self.search_params.max_coverage_intron_length)

    # str() wrappers below fix a TypeError: these limits are ints and were
    # concatenated directly into the error strings.
    if self.max_insertion_length >= self.segment_length:
        die("Error: the max insertion length (" + str(self.max_insertion_length) +
            ") can not be equal to or greater than the segment length (" +
            str(self.segment_length) + ")")

    if self.max_insertion_length < 0:
        die("Error: the max insertion length (" + str(self.max_insertion_length) +
            ") can not be less than 0")

    if self.max_deletion_length >= self.splice_constraints.min_intron_length:
        die("Error: the max deletion length (" + str(self.max_deletion_length) +
            ") can not be equal to or greater than the min intron length (" +
            str(self.splice_constraints.min_intron_length) + ")")

    if self.max_deletion_length < 0:
        die("Error: the max deletion length (" + str(self.max_deletion_length) +
            ") can not be less than 0")
706    
def cmd(self):
    """Serialize the parameter set back into a command-line argument list for
    the downstream pipeline executables (reads the output_dir and gtf_juncs
    module globals)."""
    # renamed local from `cmd` (which shadowed this method's name) to `args`
    args = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
            "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
            "--min-report-intron", str(self.splice_constraints.min_intron_length),
            "--max-report-intron", str(self.splice_constraints.max_intron_length),
            "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
            "--output-dir", output_dir,
            "--max-multihits", str(self.max_hits),
            "--max-seg-multihits", str(self.max_seg_hits),
            "--segment-length", str(self.segment_length),
            "--segment-mismatches", str(self.segment_mismatches),
            "--min-closure-exon", str(self.search_params.min_closure_exon_length),
            "--min-closure-intron", str(self.search_params.min_closure_intron_length),
            "--max-closure-intron", str(self.search_params.max_closure_intron_length),
            "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
            "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
            "--min-segment-intron", str(self.search_params.min_segment_intron_length),
            "--max-segment-intron", str(self.search_params.max_segment_intron_length),
            "--max-mismatches", str(self.max_read_mismatches),
            "--max-insertion-length", str(self.max_insertion_length),
            "--max-deletion-length", str(self.max_deletion_length)]

    if not self.bowtie2:
        args.extend(["--bowtie1"])

    if self.fusion_search:
        args.extend(["--fusion-search",
                     "--fusion-anchor-length", str(self.fusion_anchor_length),
                     "--fusion-min-dist", str(self.fusion_min_dist),
                     "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                     "--fusion-multireads", str(self.fusion_multireads),
                     "--fusion-multipairs", str(self.fusion_multipairs)])

    args.extend(self.system_params.cmd())

    if self.read_params.mate_inner_dist != None:
        args.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                     "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
    if self.gff_annotation != None:
        args.extend(["--gtf-annotations", str(self.gff_annotation)])
    if gtf_juncs:
        args.extend(["--gtf-juncs", gtf_juncs])
    # NOTE(review): `== False` (not `not ...`) is preserved -- it treats a
    # None value differently from an explicit False; confirm intent upstream.
    if self.closure_search == False:
        args.append("--no-closure-search")
    if not self.coverage_search:
        args.append("--no-coverage-search")
    if not self.microexon_search:
        args.append("--no-microexon-search")
    if self.butterfly_search:
        args.append("--butterfly-search")
    if self.read_params.solexa_quals:
        args.append("--solexa-quals")
    if self.read_params.quals:
        args.append("--quals")
    if self.read_params.integer_quals:
        args.append("--integer-quals")
    if self.read_params.color:
        args.append("--color")
    if self.read_params.library_type:
        args.extend(["--library-type", self.read_params.library_type])
    if self.read_params.read_group_id:
        args.extend(["--rg-id", self.read_params.read_group_id])
    if self.read_params.phred64_quals:
        args.append("--phred64-quals")
    return args
772    
773     # This is the master options parsing routine, which calls parse_options for
774     # the delegate classes (e.g. SpliceConstraints) that handle certain groups
775     # of options.
776     def parse_options(self, argv):
777     try:
778 gpertea 135 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
779 gpertea 29 ["version",
780     "help",
781     "output-dir=",
782 gpertea 154 "bowtie1",
783 gpertea 29 "solexa-quals",
784     "solexa1.3-quals",
785     "phred64-quals",
786     "quals",
787     "integer-quals",
788     "color",
789     "library-type=",
790     "num-threads=",
791     "splice-mismatches=",
792     "max-multihits=",
793     "min-isoform-fraction=",
794     "min-anchor-length=",
795     "min-intron-length=",
796     "max-intron-length=",
797     "GTF=",
798 gpertea 135 "transcriptome-only",
799     "transcriptome-max-hits=",
800     "transcriptome-index=",
801 gpertea 29 "raw-juncs=",
802     "no-novel-juncs",
803 gpertea 154 "allow-fusions",
804     "fusion-search",
805     "fusion-anchor-length=",
806     "fusion-min-dist=",
807     "fusion-read-mismatches=",
808     "fusion-multireads=",
809     "fusion-multipairs=",
810 gpertea 29 "no-novel-indels",
811     "no-gtf-juncs",
812     "mate-inner-dist=",
813     "mate-std-dev=",
814     "no-closure-search",
815     "no-coverage-search",
816     "closure-search",
817     "coverage-search",
818 gpertea 135 "prefilter-multihits",
819 gpertea 29 "microexon-search",
820     "min-closure-exon=",
821     "min-closure-intron=",
822     "max-closure-intron=",
823     "min-coverage-intron=",
824     "max-coverage-intron=",
825     "min-segment-intron=",
826     "max-segment-intron=",
827     "seed-length=",
828 gpertea 154 "genome-read-mismatches=",
829 gpertea 135 "read-mismatches=",
830 gpertea 154 "max-read-mismatches=",
831 gpertea 135 "transcriptome-mismatches=",
832 gpertea 29 "segment-length=",
833     "segment-mismatches=",
834     "bowtie-n",
835     "butterfly-search",
836     "no-butterfly-search",
837     "keep-tmp",
838     "rg-id=",
839     "rg-sample=",
840     "rg-library=",
841     "rg-description=",
842     "rg-platform-unit=",
843     "rg-center=",
844     "rg-date=",
845     "rg-platform=",
846     "tmp-dir=",
847     "zpacker=",
848     "unmapped-fifo",
849     "max-insertion-length=",
850     "max-deletion-length=",
851     "insertions=",
852     "deletions=",
853     "no-sort-bam",
854 gpertea 154 "no-convert-bam",
855 gpertea 159 "report-secondary-alignments",
856     "keep-fasta-order",
857     "allow-partial-mapping",
858 gpertea 154 "b2-very-fast",
859     "b2-fast",
860     "b2-sensitive",
861     "b2-very-sensitive",
862     "b2-N=",
863     "b2-L=",
864     "b2-i=",
865     "b2-n-ceil=",
866     "b2-gbar=",
867     "b2-ma=",
868     "b2-mp=",
869     "b2-np=",
870     "b2-rdg=",
871     "b2-rfg=",
872     "b2-score-min=",
873     "b2-D=",
874     "b2-R="])
875 gpertea 29 except getopt.error, msg:
876     raise Usage(msg)
877    
878     self.splice_constraints.parse_options(opts)
879     self.system_params.parse_options(opts)
880     self.read_params.parse_options(opts)
881     self.search_params.parse_options(opts)
882     self.report_params.parse_options(opts)
883 gpertea 154 self.bowtie2_params.parse_options(opts)
884 gpertea 29 global use_BWT_FIFO
885     global use_BAM_Unmapped
886     if not self.read_params.color:
887     use_BWT_FIFO=False
888     use_BAM_Unmapped=True
889     global output_dir
890     global logging_dir
891     global tmp_dir
892    
893     custom_tmp_dir = None
894     custom_out_dir = None
895     # option processing
896     for option, value in opts:
897     if option in ("-v", "--version"):
898     print "TopHat v%s" % (get_version())
899     sys.exit(0)
900     if option in ("-h", "--help"):
901     raise Usage(use_message)
902 gpertea 154 if option == "--bowtie1":
903     self.bowtie2 = False
904 gpertea 29 if option in ("-g", "--max-multihits"):
905     self.max_hits = int(value)
906 gpertea 159 self.max_seg_hits = max(10, self.max_hits * 2)
907 gpertea 135 if option in ("-x", "--transcriptome-max-hits"):
908     self.t_max_hits = int(value)
909 gpertea 29 if option in ("-G", "--GTF"):
910     self.gff_annotation = value
911 gpertea 135 if option in ("-T", "--transcriptome"):
912     self.transcriptome_only = True
913 gpertea 154 if option == "--transcriptome-index":
914 gpertea 135 self.transcriptome_index = value
915     if option in("-M", "--prefilter-multihits"):
916     self.prefilter_multi = True
917 gpertea 29 if option in ("-j", "--raw-juncs"):
918     self.raw_junctions = value
919     if option == "--no-novel-juncs":
920     self.find_novel_juncs = False
921     if option == "--no-novel-indels":
922     self.find_novel_indels = False
923 gpertea 154 if option == "--fusion-search":
924     self.fusion_search = True
925     if option == "--fusion-anchor-length":
926     self.fusion_anchor_length = int(value)
927     if option == "--fusion-min-dist":
928     self.fusion_min_dist = int(value)
929     if option == "--fusion-read-mismatches":
930     self.fusion_read_mismatches = int(value)
931     if option == "--fusion-multireads":
932     self.fusion_multireads = int(value)
933     if option == "--fusion-multipairs":
934     self.fusion_multipairs = int(value)
935 gpertea 29 if option == "--no-gtf-juncs":
936     self.find_GFF_juncs = False
937     if option == "--no-coverage-search":
938     self.coverage_search = False
939 gpertea 135 if option == "--coverage-search":
940     self.coverage_search = True
941 gpertea 29 if option == "--no-closure-search":
942     self.closure_search = False
943     if option == "--closure-search":
944     self.closure_search = True
945     if option == "--microexon-search":
946     self.microexon_search = True
947     if option == "--butterfly-search":
948     self.butterfly_search = True
949     if option == "--no-butterfly-search":
950     self.butterfly_search = False
951 gpertea 154 if option == "--genome-read-mismatches":
952     self.genome_read_mismatches = int(value)
953     if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
954     self.max_read_mismatches = int(value)
955 gpertea 135 if option in ("-n", "--transcriptome-mismatches"):
956     self.t_mismatches = int(value)
957 gpertea 29 if option == "--segment-length":
958     self.segment_length = int(value)
959     if option == "--segment-mismatches":
960     self.segment_mismatches = int(value)
961     if option == "--bowtie-n":
962     self.bowtie_alignment_option = "-n"
963     if option == "--max-insertion-length":
964     self.max_insertion_length = int(value)
965     if option == "--max-deletion-length":
966     self.max_deletion_length = int(value)
967     if option == "--insertions":
968     self.raw_insertions = value
969     if option == "--deletions":
970     self.raw_deletions = value
971 gpertea 159 if option == "--report-secondary-alignments":
972     self.report_secondary_alignments = True
973     if option == "--keep-fasta-order":
974     self.keep_fasta_order = True
975     if option == "--allow-partial-mapping":
976     self.partial_mapping = True
977 gpertea 29 if option in ("-o", "--output-dir"):
978     custom_out_dir = value + "/"
979     if option == "--tmp-dir":
980     custom_tmp_dir = value + "/"
981 gpertea 135 if self.transcriptome_only:
982     self.find_novel_juncs=False
983     self.find_novel_indels=False
984     if custom_out_dir:
985 gpertea 29 output_dir = custom_out_dir
986     logging_dir = output_dir + "logs/"
987     tmp_dir = output_dir + "tmp/"
988     sam_header = tmp_dir + "stub_header.sam"
989 gpertea 135 if custom_tmp_dir:
990 gpertea 29 tmp_dir = custom_tmp_dir
991     sam_header = tmp_dir + "stub_header.sam"
992     if len(args) < 2:
993     raise Usage(use_message)
994     return args
995    
996    
997 gpertea 154 # check if a file exists and has non-zero (or minimum) size
def fileExists(filepath, minfsize=1):
    """True if filepath names an existing file of at least minfsize bytes."""
    return os.path.exists(filepath) and os.path.getsize(filepath) >= minfsize
1003    
1004    
1005 gpertea 29 def getFileDir(filepath):
1006 gpertea 154 #if fullpath given, returns path including the ending /
1007 gpertea 29 fpath, fname=os.path.split(filepath)
1008     if fpath: fpath+='/'
1009     return fpath
1010    
def getFileBaseName(filepath):
    """Strip directory, one known read-file extension (.fq/.fa*/.txt/.seq/
    .bwtout) and, when present, a compression suffix (.z/.gz*/.bz*) from
    filepath, returning the remaining base name."""
    known_exts = ['.fq', '.txt', '.seq', '.bwtout']
    fname = os.path.split(filepath)[1]
    base, ext = os.path.splitext(fname)
    lext = ext.lower()
    if (lext in known_exts or lext.startswith('.fa')) and base:
        # plain read file with a recognized extension
        return base
    if lext == '.z' or lext.startswith('.gz') or lext.startswith('.bz'):
        # compressed: peel one more extension off the inner name
        inner, iext = os.path.splitext(base)
        lin = iext.lower()
        if (lin in known_exts or lin.startswith('.fa')) and inner:
            return inner
        return base
    # unknown extension: keep the split base unless it came out empty
    return base if base else fname
1029    
1030     # Returns the current time in a nice format
def right_now():
    """Current local time rendered with the locale's '%c' representation."""
    return datetime.now().strftime("%c")
1034    
1035 gpertea 135 # The TopHat logging formatter
def th_log(out_str):
    """Record a progress message via the TopHat logger; silently a no-op
    until the module-level tophat_logger has been configured."""
    if not tophat_logger:
        return
    tophat_logger.info(out_str)
1040 gpertea 135
1041 gpertea 164 def th_logp(out_str=""):
1042     print >> sys.stderr, out_str
1043     if tophat_log:
1044     print >> tophat_log, out_str
1045    
def die(msg=None):
    """Report a fatal message (when one is given) and abort with exit status 1."""
    if msg is not None:
        th_logp(msg)
    sys.exit(1)
1050    
1051 gpertea 29 # Ensures that the output, logging, and temp directories are present. If not,
1052     # they are created
1053     def prepare_output_dir():
1054    
1055 gpertea 164 #th_log("Preparing output location "+output_dir)
1056 gpertea 29 if os.path.exists(output_dir):
1057     pass
1058     else:
1059     os.mkdir(output_dir)
1060    
1061     if os.path.exists(logging_dir):
1062     pass
1063     else:
1064     os.mkdir(logging_dir)
1065    
1066     if os.path.exists(tmp_dir):
1067     pass
1068     else:
1069     try:
1070     os.makedirs(tmp_dir)
1071     except OSError, o:
1072     die("\nError creating directory %s (%s)" % (tmp_dir, o))
1073    
1074    
1075     # to be added as preexec_fn for every subprocess.Popen() call:
1076     # see http://bugs.python.org/issue1652
def subprocess_setup():
    """Restore default SIGPIPE handling in a child process.

    Python installs a SIGPIPE handler by default, which causes
    gzip or other de/compression pipes to complain about "stdout: Broken pipe".
    Passed as preexec_fn to every subprocess.Popen() call
    (see http://bugs.python.org/issue1652).
    """
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
1081    
1082     # Check that the Bowtie index specified by the user is present and all files
1083     # are there.
1084 gpertea 164 def check_bowtie_index(idx_prefix, is_bowtie2):
1085 gpertea 135 th_log("Checking for Bowtie index files")
1086 gpertea 164 idxext="ebwt"
1087     bowtie_ver=""
1088     if is_bowtie2:
1089     idxext="bt2"
1090     bowtie_ver="2 "
1091 gpertea 29
1092 gpertea 164 idx_fwd_1 = idx_prefix + ".1."+idxext
1093     idx_fwd_2 = idx_prefix + ".2."+idxext
1094     idx_rev_1 = idx_prefix + ".rev.1."+idxext
1095     idx_rev_2 = idx_prefix + ".rev.2."+idxext
1096 gpertea 29
1097     if os.path.exists(idx_fwd_1) and \
1098     os.path.exists(idx_fwd_2) and \
1099     os.path.exists(idx_rev_1) and \
1100     os.path.exists(idx_rev_2):
1101     return
1102     else:
1103 gpertea 164 bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
1104     bwtidx_env = os.environ.get("BOWTIE_INDEXES")
1105 gpertea 154
1106 gpertea 164 if bwtidx_env == None:
1107     die(bwtidxerr)
1108     if os.path.exists(bwtidx_env+idx_fwd_1) and \
1109     os.path.exists(bwtidx_env+idx_fwd_2) and \
1110     os.path.exists(bwtidx_env+idx_rev_1) and \
1111     os.path.exists(bwtidx_env+idx_rev_2):
1112 gpertea 29 return
1113     else:
1114 gpertea 164 die(bwtidxerr)
1115 gpertea 29
1116     # Reconstructs the multifasta file from which the Bowtie index was created, if
1117     # it's not already there.
1118     def bowtie_idx_to_fa(idx_prefix):
1119     idx_name = idx_prefix.split('/')[-1]
1120 gpertea 135 th_log("Reconstituting reference FASTA file from Bowtie index")
1121 gpertea 29
1122     try:
1123     tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1124     tmp_fasta_file = open(tmp_fasta_file_name, "w")
1125    
1126     inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1127    
1128     inspect_cmd = [prog_path("bowtie-inspect"),
1129     idx_prefix]
1130    
1131 gpertea 164 th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
1132 gpertea 29 ret = subprocess.call(inspect_cmd,
1133     stdout=tmp_fasta_file,
1134     stderr=inspect_log)
1135    
1136     # Bowtie reported an error
1137     if ret != 0:
1138     die(fail_str+"Error: bowtie-inspect returned an error")
1139    
1140     # Bowtie not found
1141     except OSError, o:
1142     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1143     die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1144    
1145     return tmp_fasta_file_name
1146    
1147     # Checks whether the multifasta file for the genome is present alongside the
1148     # Bowtie index files for it.
def check_fasta(idx_prefix):
    """Locate the reference FASTA that accompanies a Bowtie index.

    Checks <idx_prefix>.fa beside the index, then under $BOWTIE_INDEXES;
    when neither exists, rebuilds the FASTA from the index itself via
    bowtie_idx_to_fa().  Returns the FASTA file path.
    """
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta

    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var:
        # fix: os.path.join handles a BOWTIE_INDEXES value lacking the
        # trailing slash, which the old string concatenation silently broke
        idx_fasta = os.path.join(bowtie_idx_env_var, idx_prefix + ".fa")
        if os.path.exists(idx_fasta):
            return idx_fasta

    th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
    idx_fa = bowtie_idx_to_fa(idx_prefix)
    return idx_fa
1164    
1165     # Check that both the Bowtie index and the genome's fasta file are present
1166 gpertea 164 def check_index(idx_prefix, is_bowtie2):
1167     check_bowtie_index(idx_prefix, is_bowtie2)
1168 gpertea 29 ref_fasta_file = check_fasta(idx_prefix)
1169    
1170     return (ref_fasta_file, None)
1171    
1172     # Retrive a tuple containing the system's version of Bowtie. Parsed from
1173     # `bowtie --version`
1174 gpertea 154 def get_bowtie_version(is_bowtie2):
1175 gpertea 29 try:
1176     # Launch Bowtie to capture its version info
1177     proc = subprocess.Popen([bowtie_path, "--version"],
1178     stdout=subprocess.PIPE)
1179 gpertea 154
1180 gpertea 29 stdout_value = proc.communicate()[0]
1181 gpertea 154
1182 gpertea 29 bowtie_version = None
1183     bowtie_out = repr(stdout_value)
1184    
1185     # Find the version identifier
1186 gpertea 154 if is_bowtie2:
1187     version_str = "bowtie2-align version "
1188     else:
1189     version_str = "bowtie version "
1190    
1191 gpertea 29 ver_str_idx = bowtie_out.find(version_str)
1192     if ver_str_idx != -1:
1193     nl = bowtie_out.find("\\n", ver_str_idx)
1194     version_val = bowtie_out[ver_str_idx + len(version_str):nl]
1195 gpertea 154
1196     if is_bowtie2:
1197     ver_numbers = version_val.split('.')
1198     ver_numbers[2:] = ver_numbers[2].split('-')
1199     bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
1200     else:
1201     bowtie_version = [int(x) for x in version_val.split('.')]
1202    
1203 gpertea 29 if len(bowtie_version) == 3:
1204     bowtie_version.append(0)
1205    
1206     return bowtie_version
1207     except OSError, o:
1208     errmsg=fail_str+str(o)+"\n"
1209     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1210     errmsg+="Error: bowtie not found on this system"
1211     die(errmsg)
1212    
1213 gpertea 159 def get_index_sam_header(params, idx_prefix):
1214 gpertea 29 try:
1215 gpertea 159 temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1216     temp_sam_header_file = open(temp_sam_header_filename, "w")
1217 gpertea 163
1218 gpertea 159 bowtie_header_cmd = [bowtie_path]
1219 gpertea 29
1220 gpertea 159 read_params = params.read_params
1221     if not params.bowtie2:
1222 gpertea 154 bowtie_header_cmd += ["--sam"]
1223    
1224 gpertea 29 if read_params.color:
1225     bowtie_header_cmd.append('-C')
1226 gpertea 154
1227 gpertea 29 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1228     subprocess.call(bowtie_header_cmd,
1229 gpertea 159 stdout=temp_sam_header_file,
1230 gpertea 29 stderr=open('/dev/null'))
1231    
1232 gpertea 159 temp_sam_header_file.close()
1233     temp_sam_header_file = open(temp_sam_header_filename, "r")
1234 gpertea 29
1235 gpertea 159 bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1236     bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1237 gpertea 154
1238 gpertea 29 preamble = []
1239     sq_dict_lines = []
1240    
1241 gpertea 159 for line in temp_sam_header_file.readlines():
1242 gpertea 29 line = line.strip()
1243     if line.find("@SQ") != -1:
1244     # Sequence dictionary record
1245     cols = line.split('\t')
1246     seq_name = None
1247     for col in cols:
1248     fields = col.split(':')
1249     #print fields
1250     if len(fields) > 0 and fields[0] == "SN":
1251     seq_name = fields[1]
1252     if seq_name == None:
1253     die("Error: malformed sequence dictionary in sam header")
1254     sq_dict_lines.append([seq_name,line])
1255     elif line.find("CL"):
1256     continue
1257     else:
1258     preamble.append(line)
1259 gpertea 159
1260     print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1261 gpertea 29 if read_params.read_group_id and read_params.sample_id:
1262     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1263     read_params.sample_id)
1264     if read_params.library_id:
1265     rg_str += "\tLB:%s" % read_params.library_id
1266     if read_params.description:
1267     rg_str += "\tDS:%s" % read_params.description
1268     if read_params.seq_platform_unit:
1269     rg_str += "\tPU:%s" % read_params.seq_platform_unit
1270     if read_params.seq_center:
1271     rg_str += "\tCN:%s" % read_params.seq_center
1272     if read_params.mate_inner_dist:
1273     rg_str += "\tPI:%s" % read_params.mate_inner_dist
1274     if read_params.seq_run_date:
1275     rg_str += "\tDT:%s" % read_params.seq_run_date
1276     if read_params.seq_platform:
1277     rg_str += "\tPL:%s" % read_params.seq_platform
1278    
1279 gpertea 159 print >> bowtie_sam_header_file, rg_str
1280 gpertea 29
1281 gpertea 159 if not params.keep_fasta_order:
1282     sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1283 gpertea 163
1284 gpertea 29 for [name, line] in sq_dict_lines:
1285 gpertea 159 print >> bowtie_sam_header_file, line
1286     print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1287 gpertea 29
1288 gpertea 159 bowtie_sam_header_file.close()
1289     temp_sam_header_file.close()
1290     return bowtie_sam_header_filename
1291 gpertea 29
1292     except OSError, o:
1293     errmsg=fail_str+str(o)+"\n"
1294     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1295     errmsg+="Error: bowtie not found on this system"
1296     die(errmsg)
1297    
1298     # Make sure Bowtie is installed and is recent enough to be useful
1299 gpertea 164 def check_bowtie(params):
1300     bowtie_req=""
1301     if params.bowtie2:
1302     bowtie_req="2"
1303     log_msg = "Checking for Bowtie" + bowtie_req
1304 gpertea 154 th_log(log_msg)
1305    
1306 gpertea 164 bowtie_bin = "bowtie"+bowtie_req
1307 gpertea 154
1308 gpertea 29 global bowtie_path
1309 gpertea 154 bowtie_path=prog_path(bowtie_bin)
1310 gpertea 164 bowtie_version = get_bowtie_version(params.bowtie2)
1311     if params.bowtie2 and bowtie_version == None:
1312     #try to fallback on bowtie 1
1313     params.bowtie2=False
1314     bowtie_version=get_bowtie_version(params.bowtie2)
1315 gpertea 29 if bowtie_version == None:
1316 gpertea 164 die("Error: Bowtie not found on this system.")
1317     if params.bowtie2:
1318 gpertea 154 if bowtie_version[0] < 3 and bowtie_version[1] < 1 and bowtie_version[2] < 1 and bowtie_version[3] < 5:
1319     die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1320 gpertea 164 else:
1321 gpertea 154 if bowtie_version[0] < 1 and bowtie_version[1] < 12 and bowtie_version[2] < 3:
1322     die("Error: TopHat requires Bowtie 0.12.3 or later")
1323 gpertea 164 th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))
1324 gpertea 29
1325    
1326     # Retrive a tuple containing the system's version of samtools. Parsed from
1327     # `samtools`
1328     def get_samtools_version():
1329     try:
1330     # Launch Bowtie to capture its version info
1331     proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1332     samtools_out = proc.communicate()[1]
1333    
1334     # Find the version identifier
1335     version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
1336     samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1337     if version_match.group(4):
1338     samtools_version_arr.append(version_match.group(4))
1339     else:
1340     samtools_version_arr.append(0)
1341    
1342     return version_match.group(), samtools_version_arr
1343     except OSError, o:
1344     errmsg=fail_str+str(o)+"\n"
1345     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1346     errmsg+="Error: samtools not found on this system"
1347     die(errmsg)
1348    
1349     # Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate samtools, set the global samtools_path, and require >= 0.1.7.

    die()s when samtools is missing or too old.
    """
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # fix: compare [major, minor, patch] lexicographically; the old
    # "arr[1] < 1 or arr[2] < 7" wrongly rejected valid versions such as
    # 0.2.5 (minor bumped, patch below 7)
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))
1360 gpertea 29
1361    
1362 gpertea 164
1363 gpertea 29 class FastxReader:
1364     def __init__(self, i_file, is_color=0, fname=''):
1365     self.bufline=None
1366     self.format=None
1367     self.ifile=i_file
1368     self.nextRecord=None
1369     self.eof=None
1370     self.fname=fname
1371     self.lastline=None
1372     self.numrecords=0
1373     self.isColor=0
1374     if is_color : self.isColor=1
1375     # determine file type
1376     #no records processed yet, skip custom header lines if any
1377     hlines=10 # allow maximum 10 header lines
1378     self.lastline=" "
1379     while hlines>0 and self.lastline[0] not in "@>" :
1380     self.lastline=self.ifile.readline()
1381     hlines-=1
1382     if self.lastline[0] == '@':
1383     self.format='fastq'
1384     self.nextRecord=self.nextFastq
1385     elif self.lastline[0] == '>':
1386     self.format='fasta'
1387     self.nextRecord=self.nextFasta
1388     else:
1389     die("Error: cannot determine record type in input file %s" % fname)
1390     self.bufline=self.lastline
1391     self.lastline=None
1392    
1393     def nextFastq(self):
1394     # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1395     seqid,seqstr,qstr,seq_len='','','',0
1396     if self.eof: return (seqid, seqstr, seq_len, qstr)
1397     fline=self.getLine #shortcut to save a bit of time
1398     line=fline()
1399    
1400     if not line : return (seqid, seqstr, seq_len, qstr)
1401     while len(line.rstrip())==0: # skip empty lines
1402     line=fline()
1403     if not line : return (seqid, seqstr,seq_len, qstr)
1404     try:
1405     if line[0] != "@":
1406     raise ValueError("Records in Fastq files should start with '@' character")
1407 gpertea 154
1408 gpertea 29 seqid = line[1:].rstrip()
1409     seqstr = fline().rstrip()
1410    
1411     #There may now be more sequence lines, or the "+" quality marker line:
1412     while True:
1413     line = fline()
1414     if not line:
1415     raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1416     if line[0] == "+":
1417     # -- sequence string ended
1418     #qtitle = line[1:].rstrip()
1419     #if qtitle and qtitle != seqid:
1420     # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1421     # % (seqid, qtitle))
1422     break
1423     seqstr += line.rstrip() #removes trailing newlines
1424     #loop until + found
1425     seq_len = len(seqstr)
1426     #at least one line of quality data should follow
1427     qstrlen=0
1428     #now read next lines as quality values until seq_len is reached
1429     while True:
1430     line=fline()
1431     if not line : break #end of file
1432     qstr += line.rstrip()
1433     qstrlen=len(qstr)
1434     if qstrlen + self.isColor >= seq_len :
1435     break # qv string has reached the length of seq string
1436     #loop until qv has the same length as seq
1437 gpertea 154
1438 gpertea 135 if self.isColor:
1439     # and qstrlen==seq_len :
1440     if qstrlen==seq_len:
1441     #qual string may have a dummy qv at the beginning, should be stripped
1442     qstr = qstr[1:]
1443     qstrlen -= 1
1444     if qstrlen!=seq_len-1:
1445     raise ValueError("Length mismatch between sequence and quality strings "+ \
1446     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1447     else:
1448     if seq_len != qstrlen :
1449     raise ValueError("Length mismatch between sequence and quality strings "+ \
1450     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1451 gpertea 29 except ValueError, err:
1452     die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1453     #return the record
1454     self.numrecords+=1
1455 gpertea 135 ##--discard the primer base [NO]
1456 gpertea 29 if self.isColor :
1457     seq_len-=1
1458     seqstr = seqstr[1:]
1459     return (seqid, seqstr, seq_len, qstr)
1460    
1461     def nextFasta(self):
1462     # returning tuple: (seqID, sequence_string, seq_len)
1463     seqid,seqstr,seq_len='','',0
1464     fline=self.getLine # shortcut to readline function of f
1465     line=fline() # this will use the buffer line if it's there
1466     if not line : return (seqid, seqstr, seq_len, None)
1467     while len(line.rstrip())==0: # skip empty lines
1468     line=fline()
1469     if not line : return (seqid, seqstr, seq_len, None)
1470     try:
1471     if line[0] != ">":
1472     raise ValueError("Records in Fasta files must start with '>' character")
1473     seqid = line[1:].split()[0]
1474     #more sequence lines, or the ">" quality marker line:
1475     while True:
1476     line = fline()
1477     if not line: break
1478     if line[0] == '>':
1479     #next sequence starts here
1480     self.ungetLine()
1481     break
1482     seqstr += line.rstrip()
1483     #loop until '>' found
1484     seq_len = len(seqstr)
1485     if seq_len < 3:
1486     raise ValueError("Read %s too short (%i)." \
1487     % (seqid, seq_len))
1488     except ValueError, err:
1489     die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1490     #return the record and continue
1491     self.numrecords+=1
1492 gpertea 135 if self.isColor : # -- discard primer base
1493 gpertea 29 seq_len-=1
1494 gpertea 135 seqstr=seqstr[1:]
1495 gpertea 29 return (seqid, seqstr, seq_len, None)
1496    
1497     def getLine(self):
1498     if self.bufline: #return previously buffered line
1499     r=self.bufline
1500     self.bufline=None
1501     return r
1502     else: #read a new line from stream and return it
1503     if self.eof: return None
1504     self.lastline=self.ifile.readline()
1505     if not self.lastline:
1506     self.eof=1
1507     return None
1508     return self.lastline
1509     def ungetLine(self):
1510     if self.lastline is None:
1511 gpertea 164 th_logp("Warning: FastxReader called ungetLine() with no prior line!")
1512 gpertea 29 self.bufline=self.lastline
1513     self.lastline=None
1514     #< class FastxReader
1515    
1516 gpertea 135 def fa_write(fhandle, seq_id, seq):
1517     """
1518     Write to a file in the FASTA format.
1519    
1520     Arguments:
1521     - `fhandle`: A file handle open for writing
1522     - `seq_id`: The sequence id string for this sequence
1523     - `seq`: An unformatted string of the sequence to write
1524     """
1525     line_len = 60
1526     fhandle.write(">" + seq_id + "\n")
1527     for i in xrange(len(seq) / line_len + 1):
1528     start = i * line_len
1529 gpertea 154 #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq)
1530     if (i+1) * line_len < len(seq):
1531     end = (i+1) * line_len
1532     else:
1533     end = len(seq)
1534 gpertea 135 fhandle.write( seq[ start:end ] + "\n")
1535    
1536 gpertea 29 class ZReader:
1537     def __init__(self, filename, sysparams, guess=True):
1538     self.fname=filename
1539     self.file=None
1540     self.fsrc=None
1541     self.popen=None
1542     pipecmd=[]
1543 gpertea 31 s=filename.lower()
1544     if s.endswith(".bam"):
1545 gpertea 154 # daehwan - want to ouptut mapped reads as well
1546     pipecmd=[prog_path("bam2fastx"), "--all", "-"]
1547 gpertea 31 else:
1548 gpertea 135 if guess:
1549     if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1550 gpertea 31 pipecmd=['gzip']
1551 gpertea 135 else:
1552 gpertea 31 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1553     pipecmd=['bzip2']
1554 gpertea 135 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1555 gpertea 31 die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
1556 gpertea 135 if len(pipecmd)>0:
1557     if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1558 gpertea 31 pipecmd[0]=sysparams.zipper
1559     pipecmd.extend(sysparams.zipper_opts)
1560 gpertea 135 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1561 gpertea 31 pipecmd[0]=sysparams.zipper
1562     pipecmd.extend(sysparams.zipper_opts)
1563 gpertea 135 else: #not guessing, but must still check if it's a compressed file
1564     if use_zpacker and filename.endswith(".z"):
1565 gpertea 31 pipecmd=[sysparams.zipper]
1566     pipecmd.extend(sysparams.zipper_opts)
1567 gpertea 154
1568 gpertea 135 if pipecmd:
1569     pipecmd+=['-cd']
1570 gpertea 29 if pipecmd:
1571     try:
1572     self.fsrc=open(self.fname, 'rb')
1573     self.popen=subprocess.Popen(pipecmd,
1574     preexec_fn=subprocess_setup,
1575     stdin=self.fsrc,
1576     stdout=subprocess.PIPE, close_fds=True)
1577     except Exception:
1578     die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1579     self.file=self.popen.stdout
1580     else:
1581     self.file=open(filename)
1582     def close(self):
1583     if self.fsrc: self.fsrc.close()
1584     self.file.close()
1585     if self.popen:
1586     self.popen.wait()
1587     self.popen=None
1588    
class ZWriter:
    """Transparent writer producing (optionally) compressed output.

    When the global use_zpacker is set, data written to self.file is piped
    through the configured zipper into the target file; otherwise self.file
    is the target file itself.  Callers must invoke close().
    """
    def __init__(self, filename, sysparams):
        self.fname=filename
        if not use_zpacker:
            # no compression requested: write the file directly
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None
            return
        zcmd=[sysparams.zipper,"-cf", "-"]
        self.ftarget=open(filename, "wb")
        try:
            self.popen=subprocess.Popen(zcmd,
                                  preexec_fn=subprocess_setup,
                                  stdin=subprocess.PIPE,
                                  stdout=self.ftarget, close_fds=True)
        except Exception:
            die("Error: could not open writer pipe "+' '.join(zcmd)+' < '+ self.fname)
        self.file=self.popen.stdin # client writes to this end of the pipe
    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen=None
1613    
# check_reads_format() samples the first few records of each user file
# to determine the file format (FASTQ vs FASTA) and reports the quality scale.
def check_reads_format(params, reads_files):
    fileformat = params.read_params.reads_format
    observed_formats = set()
    min_seed_len, max_seed_len = 99999, 0

    for f_name in reads_files.split(','):
        zf = ZReader(f_name, params.system_params)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        # only the first 4 reads of each file are sampled
        for _ in range(4):
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid:
                break
            if seq_len < 20:
                th_logp("Warning: found a read < 20bp in "+f_name)
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)

    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    th_logp("\tformat:\t\t %s" % fileformat)
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        th_logp("\tquality scale:\t %s" % quality_scale)
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    # NOTE: the effective read length is re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.library_type,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)
1685    
# Return the last `lines` lines of a (log) text file as a single string.
# Reads the file backwards in growing chunks so large logs are never fully
# loaded into memory.
# Fixes in this version:
#  - the old implementation joined readlines() output (which keeps each
#    trailing '\n') with '\n', producing doubled blank lines in the tail;
#  - the file handle is now closed even if an exception occurs.
def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    try:
        f.seek(0, 2)            # jump to end of file
        fsize = f.tell()
        block = 1024
        data = ""
        # enlarge the tail window until it contains enough line breaks
        # or the whole file has been read
        while data.count('\n') <= lines and block < fsize + 1024:
            f.seek(max(fsize - block, 0), 0)
            data = f.read()
            block *= 2
        # splitlines() drops the per-line '\n', so joining cannot double them
        return "\n".join(data.splitlines()[-lines:])
    finally:
        f.close()
1711    
# Format a timedelta as a pretty HH:MM:SS string, prefixed with the day
# count when the interval spans one day or more.
def formatTD(td):
    hours, rem = divmod(td.seconds, 3600)
    minutes, seconds = divmod(rem, 60)
    if td.days > 0:
        return '%d days %02d:%02d:%02d' % (td.days, hours, minutes, seconds)
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
1724    
1725 gpertea 29 class PrepReadsInfo:
1726     def __init__(self, fname, side):
1727     try:
1728     f=open(fname,"r")
1729     self.min_len=int(f.readline().split("=")[-1])
1730     self.max_len=int(f.readline().split("=")[-1])
1731     self.in_count=int(f.readline().split("=")[-1])
1732     self.out_count=int(f.readline().split("=")[-1])
1733     if (self.out_count==0) or (self.max_len<16):
1734     raise Exception()
1735     except Exception, e:
1736     die(fail_str+"Error retrieving prep_reads info.")
1737 gpertea 164 th_logp("\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count))
1738 gpertea 29 if self.min_len < 20:
1739 gpertea 164 th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places")
1740 gpertea 29
1741 gpertea 135
def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
    """Assemble the argument list for a prep_reads invocation."""
    cmd = [prog_path("prep_reads")]
    cmd.extend(params.cmd())
    fmt = params.read_params.reads_format
    if fmt == "fastq":
        cmd.append("--fastq")
    elif fmt == "fasta":
        cmd.append("--fasta")
    # optional filters and side outputs
    if hits_to_filter:
        cmd.append("--flt-hits="+hits_to_filter)
    if aux_file:
        cmd.append("--aux-outfile="+aux_file)
    if index_file:
        cmd.append("--index-outfile="+index_file)
    if filter_reads:
        cmd.append("--flt-reads="+filter_reads)
    cmd.append(reads_list)
    if quals_list:
        cmd.append(quals_list)
    return cmd
1763    
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexy or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
    reads_suffix = ".fq"

    # for parallelization, we don't compress the read files
    do_use_zpacker = use_zpacker
    if params.system_params.num_threads > 1:
        do_use_zpacker = False

    if do_use_zpacker: reads_suffix += ".z"

    output_name=side+"_kept_reads"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    # start from a clean slate so a stale file never survives a re-run
    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    # info_file collects the min/max length + counts parsed by PrepReadsInfo;
    # index_file is the read index written alongside the kept reads
    info_file=output_dir+output_name+".info"
    index_file=kept_reads_filename+".index"

    filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if do_use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    # shell_cmd is only for the run log; actual I/O goes through Popen below
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        print >> run_log, shell_cmd
        if do_use_zpacker:
            # prep_reads | zipper > kept_reads, built as two chained processes
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # check the producer first; only report the zipper's status
            # if prep_reads itself succeeded
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))

    kept_reads.close()
    return kept_reads_filename, PrepReadsInfo(info_file, side)
1830    
1831 gpertea 135
# Call bowtie
#
# Runs one Bowtie/Bowtie2 alignment pass over reads_list and pipes the SAM
# output through fix_map_ordering, producing a BAM (or compressed SAM) file
# of mapped reads and, when requested, a file of unmapped reads.
# Returns the tuple (mapped_reads, unmapped_reads_out), where the file names
# have the chosen suffix (.bam / .sam / .sam.z) appended.
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads,
           extra_output = "",
           mapping_type = "read_against_genome",
           multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    reads_file=reads_list[0]
    readfile_basename=getFileBaseName(reads_file)

    # classify this run: transcriptome mapping, segment mapping, or plain
    # genome mapping (the default)
    g_mapping, t_mapping, seg_mapping = False, False, False
    if mapping_type == "read_against_transcriptome":
        t_mapping = True
    elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
        seg_mapping = True
    else:
        g_mapping = True

    bowtie_str = "Bowtie"
    if params.bowtie2:
        bowtie_str += "2"

    # Bowtie1 segment mapping temporarily switches to -v alignment mode;
    # the original option is restored just before returning
    if seg_mapping:
        if not params.bowtie2:
            backup_bowtie_alignment_option = params.bowtie_alignment_option
            params.bowtie_alignment_option = "-v"

    if t_mapping:
        th_log("Mapping %s against transcriptome %s with %s %s" % (readfile_basename,
               bwt_idx_name, bowtie_str, extra_output))
    else:
        th_log("Mapping %s against %s with %s %s" % (readfile_basename,
               bwt_idx_name, bowtie_str, extra_output))
    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    #bwt_mapped=mapped_reads
    unmapped_reads_out=None
    if unmapped_reads:
        unmapped_reads_out=unmapped_reads+".fq"
    # colorspace + compression: unmapped reads are routed through a FIFO so
    # a forked child can compress them on the fly
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO:
        global unmapped_reads_fifo
        unmapped_reads_fifo=unmapped_reads+".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
        unmapped_reads_out+=".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            bowtie_cmd += ["--un", unmapped_reads_out]

        unzip_cmd=None
        bam_input=False
        # BAM input is converted to FASTA/FASTQ on the fly via bam2fastx
        if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
            bam_input=True
            unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
            if reads_format:
                unzip_cmd.append("--" + reads_format)
            unzip_cmd+=[reads_list[0]]

        # otherwise, compressed input is decompressed with the configured zipper
        if use_zpacker and (unzip_cmd is None):
            unzip_cmd=[ params.system_params.zipper ]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd+=['-cd']

        fifo_pid=None
        if use_FIFO:
            # fork a child that compresses whatever bowtie writes to the FIFO
            unm_zipcmd=[ params.system_params.zipper ]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd+=['-c']
            print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
            fifo_pid=os.fork()
            if fifo_pid==0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

        fix_map_cmd = [prog_path('fix_map_ordering')]

        if params.bowtie2:
            if t_mapping or g_mapping:
                # derive a minimum alignment score from the mismatch penalty
                # so fix_map_ordering can filter bowtie2 alignments
                max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
                max_penalty, min_penalty = int(max_penalty), int(min_penalty)
                min_score = (max_penalty - 1) * num_mismatches
                fix_map_cmd += ["--bowtie2-min-score", str(min_score)]

        samzip_cmd=None

        # daehwan - just use bam format for multi-threading.
        # NOTE(review): num_threads >= 1 is always true, so this branch is
        # always taken and the compressed-SAM path below looks dead -- confirm
        if unmapped_reads or params.system_params.num_threads >= 1:
            #mapping on the genome
            #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
            mapped_reads += ".bam"

            temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
            fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
            if not params.read_params.color and unmapped_reads:
                unmapped_reads_out = unmapped_reads+".bam"
                fix_map_cmd += [unmapped_reads_out]
        else:
            #mapping on segment_juncs (spliced potential junctions)
            # don't use SAM headers, could be slow -- better use compressed SAM directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                samzip_cmd=[ params.system_params.zipper ]
                samzip_cmd.extend(params.system_params.zipper_opts)
                samzip_cmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]

        # pick the multi-hit cap appropriate for this mapping type
        if t_mapping:
            max_hits = params.t_max_hits
        elif seg_mapping:
            max_hits = params.max_seg_hits
        else:
            max_hits = params.max_hits

        # bowtie accepts at most 3 mismatches
        if num_mismatches > 3:
            num_mismatches = 3

        if params.bowtie2:
            if seg_mapping:
                # since bowtie2 does not suppress reads that map to many places,
                # we suppress those in segment_juncs and long_spanning_reads.
                bowtie_cmd += ["-k", str(max_hits + 1)]
            else:
                bowtie_cmd += ["-k", str(max_hits)]

            bowtie2_params = params.bowtie2_params
            if seg_mapping:
                # after intensive testing,
                # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
                # but a chance for further improvements remains.
                bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
                bowtie_cmd += ["-i", "C,2,0"]
                bowtie_cmd += ["-L", "20"]
                # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
            else:
                # translate the user's preset choice into a bowtie2 flag
                bowtie2_preset = ""
                if bowtie2_params.very_fast:
                    bowtie2_preset = "--very-fast"
                elif bowtie2_params.fast:
                    bowtie2_preset = "--fast"
                elif bowtie2_params.sensitive:
                    bowtie2_preset = "--sensitive"
                elif bowtie2_params.very_sensitive:
                    bowtie2_preset = "--very-sensitive"

                if bowtie2_preset != "":
                    bowtie_cmd += [bowtie2_preset]
                else:
                    bowtie_cmd += ["-D", str(bowtie2_params.D),
                                   "-R", str(bowtie2_params.R),
                                   "-N", str(bowtie2_params.N),
                                   "-L", str(bowtie2_params.L),
                                   "-i", bowtie2_params.i]

                # "--n-ceil" is not correctly parsed in Bowtie2,
                # I (daehwan) already talked to Ben who will fix the problem.
                bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
                               "--gbar", str(bowtie2_params.gbar),
                               "--mp", bowtie2_params.mp,
                               "--np", str(bowtie2_params.np),
                               "--rdg", bowtie2_params.rdg,
                               "--rfg", bowtie2_params.rfg,
                               "--score-min", bowtie2_params.score_min]

        else:
            bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                           "-k", str(max_hits),
                           "-m", str(max_hits),
                           "-S"]

        bowtie_cmd += ["-p", str(params.system_params.num_threads)]

        if params.bowtie2: #always use headerless SAM file
            bowtie_cmd += ["--sam-no-hd"]
        else:
            bowtie_cmd += ["--sam-nohead"]

        if not params.bowtie2:
            if multihits_out:
                bowtie_cmd += ["--max", multihits_out]
            else:
                bowtie_cmd += ["--max", "/dev/null"]

        if params.bowtie2:
            bowtie_cmd += ["-x"]

        bowtie_cmd += [ bwt_idx_prefix ]
        bowtie_proc=None
        shellcmd=""
        unzip_proc=None
        samzip_proc=None

        if multihits_out:
            #special prefilter bowtie run: we use prep_reads on the fly
            #in order to get multi-mapped reads to exclude later
            prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
            preplog_fname=logging_dir + "prep_reads_prefilter.log"
            prepfilter_log = open(preplog_fname,"w")
            unzip_proc = subprocess.Popen(prep_cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=prepfilter_log)
            shellcmd=' '.join(prep_cmd) + "|"
        else:
            z_input=use_zpacker and reads_file.endswith(".z")
            if z_input:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_file, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
            else:
                #must be uncompressed fastq input (unmapped reads from a previous run)
                #or a BAM file with unmapped reads
                if bam_input:
                    unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                    shellcmd=' '.join(unzip_cmd) + "|"
                else:
                    bowtie_cmd += [reads_file]
                # NOTE(review): for BAM input this launches bowtie before the
                # '-' stdin argument is added; it is then re-launched in the
                # `if unzip_proc:` block below -- confirm this is intended
                bowtie_proc = subprocess.Popen(bowtie_cmd,
                                               stdout=subprocess.PIPE,
                                               stderr=bwt_log)
        if unzip_proc:
            #input is compressed OR prep_reads is used as a filter
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678

        shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
        if samzip_cmd:
            # bowtie | fix_map_ordering | zipper > mapped_reads
            shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close() # needed?
            samzip_proc = subprocess.Popen(samzip_cmd,
                                           preexec_fn=subprocess_setup,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            #write BAM output directly
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            # stdout=open(mapped_reads, "w"))
            bowtie_proc.stdout.close()

        # shellcmd += " > " + mapped_reads
        print >> run_log, shellcmd
        # wait for the last process in the pipeline to finish
        if samzip_proc:
            samzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            # if the compressor child never produced output, terminate it so
            # it does not block forever on the FIFO
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass

    except OSError, o:
        die(fail_str+"Error: "+str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    # downstream code expects the multihits file to exist even when empty
    if multihits_out and not os.path.exists(multihits_out):
        open(multihits_out, "w").close()

    # restore the Bowtie1 alignment option changed for segment mapping
    if seg_mapping:
        if not params.bowtie2:
            params.bowtie_alignment_option = backup_bowtie_alignment_option

    return (mapped_reads, unmapped_reads_out)
2143 gpertea 29
2144    
2145     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2146     def get_gtf_juncs(gff_annotation):
2147 gpertea 135 th_log("Reading known junctions from GTF file")
2148 gpertea 29 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2149    
2150     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2151    
2152     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2153     gtf_juncs_out = open(gtf_juncs_out_name, "w")
2154    
2155     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2156     try:
2157 gpertea 135 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2158 gpertea 29 retcode = subprocess.call(gtf_juncs_cmd,
2159     stderr=gtf_juncs_log,
2160     stdout=gtf_juncs_out)
2161     # cvg_islands returned an error
2162     if retcode == 1:
2163 gpertea 164 th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2164 gpertea 29 return (False, gtf_juncs_out_name)
2165     elif retcode != 0:
2166     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2167 gpertea 154
2168 gpertea 29 # cvg_islands not found
2169     except OSError, o:
2170     errmsg=fail_str+str(o)+"\n"
2171     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2172     errmsg+="Error: gtf_juncs not found on this system"
2173     die(errmsg)
2174     return (True, gtf_juncs_out_name)
2175    
2176     # Call bowtie-build on the FASTA file of sythetic splice junction sequences
2177 gpertea 154 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2178 gpertea 135 th_log("Indexing splices")
2179 gpertea 29 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2180    
2181     #user_splices_out_prefix = output_dir + "user_splices_idx"
2182    
2183 gpertea 154 if is_bowtie2:
2184     bowtie_build_cmd = [prog_path("bowtie2-build")]
2185     else:
2186     bowtie_build_cmd = [prog_path("bowtie-build")]
2187    
2188 gpertea 29 if color:
2189     bowtie_build_cmd += ["-C"]
2190    
2191     bowtie_build_cmd += [external_splice_prefix + ".fa",
2192     external_splice_prefix]
2193     try:
2194     print >> run_log, " ".join(bowtie_build_cmd)
2195     retcode = subprocess.call(bowtie_build_cmd,
2196     stdout=bowtie_build_log)
2197    
2198     if retcode != 0:
2199     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2200     except OSError, o:
2201     errmsg=fail_str+str(o)+"\n"
2202     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2203     errmsg+="Error: bowtie-build not found on this system"
2204     die(errmsg)
2205     return external_splice_prefix
2206    
2207     # Build a splice index from a .juncs file, suitable for use with specified read
2208     # (or read segment) lengths
2209 gpertea 154 def build_juncs_index(is_bowtie2,
2210     min_anchor_length,
2211 gpertea 135 max_seg_len,
2212 gpertea 29 juncs_prefix,
2213     external_juncs,
2214     external_insertions,
2215     external_deletions,
2216 gpertea 154 external_fusions,
2217 gpertea 29 reference_fasta,
2218     color):
2219 gpertea 135 th_log("Retrieving sequences for splices")
2220 gpertea 29
2221     juncs_file_list = ",".join(external_juncs)
2222     insertions_file_list = ",".join(external_insertions)
2223     deletions_file_list = ",".join(external_deletions)
2224 gpertea 154 fusions_file_list = ",".join(external_fusions)
2225 gpertea 29
2226 gpertea 159 # do not use insertions and deletions in case of Bowtie2
2227     if is_bowtie2:
2228     insertions_file_list = "/dev/null"
2229     deletions_file_list = "/dev/null"
2230    
2231 gpertea 29 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2232    
2233     external_splices_out_prefix = tmp_dir + juncs_prefix
2234     external_splices_out_name = external_splices_out_prefix + ".fa"
2235    
2236     external_splices_out = open(external_splices_out_name, "w")
2237     # juncs_db_cmd = [bin_dir + "juncs_db",
2238     juncs_db_cmd = [prog_path("juncs_db"),
2239     str(min_anchor_length),
2240 gpertea 135 str(max_seg_len),
2241 gpertea 29 juncs_file_list,
2242     insertions_file_list,
2243     deletions_file_list,
2244 gpertea 154 fusions_file_list,
2245 gpertea 29 reference_fasta]
2246     try:
2247     print >> run_log, " ".join(juncs_db_cmd)
2248     retcode = subprocess.call(juncs_db_cmd,
2249     stderr=juncs_db_log,
2250     stdout=external_splices_out)
2251    
2252     if retcode != 0:
2253     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2254     # juncs_db not found
2255     except OSError, o:
2256     errmsg=fail_str+str(o)+"\n"
2257     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2258     errmsg+="Error: juncs_db not found on this system"
2259     die(errmsg)
2260    
2261 gpertea 154 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2262 gpertea 29 return external_splices_out_prefix
2263    
2264 gpertea 154 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2265 gpertea 135 """ Build a bowtie index from a FASTA file.
2266    
2267     Arguments:
2268     - `fasta_fname`: File path to FASTA file.
2269     - `out_dir`: Output directory to place index in. (includes os.sep)
2270    
2271     Returns:
2272     - The path to the Bowtie index.
2273     """
2274     bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2275 gpertea 154
2276     if is_bowtie2:
2277     bowtie_idx_cmd = [prog_path("bowtie2-build")]
2278     else:
2279     bowtie_idx_cmd = [prog_path("bowtie-build")]
2280    
2281     if color:
2282     bowtie_idx_cmd += ["-C"]
2283    
2284     bowtie_idx_cmd += [fasta_fname,
2285     bwt_idx_path]
2286 gpertea 135 try:
2287     th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2288     print >> run_log, " ".join(bowtie_idx_cmd)
2289     retcode = subprocess.call(bowtie_idx_cmd,
2290     stdout=open(os.devnull, "w"),
2291     stderr=open(os.devnull, "w"))
2292     if retcode != 0:
2293     die(fail_str + "Error: Couldn't build bowtie index with err = "
2294     + str(retcode))
2295     except OSError, o:
2296     errmsg=fail_str+str(o)+"\n"
2297     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2298     errmsg+="Error: bowtie-build not found on this system"
2299     die(errmsg)
2300    
2301     return bwt_idx_path
2302    
# Print out the sam header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        # optional @RG attributes, emitted in this fixed order when set
        optional_tags = [("\tLB:%s", read_params.library_id),
                         ("\tDS:%s", read_params.description),
                         ("\tPU:%s", read_params.seq_platform_unit),
                         ("\tCN:%s", read_params.seq_center),
                         ("\tPI:%s", read_params.mate_inner_dist),
                         ("\tDT:%s", read_params.seq_run_date),
                         ("\tPL:%s", read_params.seq_platform)]
        for tag_fmt, tag_value in optional_tags:
            if tag_value:
                rg_str += tag_fmt % tag_value
        print >> sam_file, rg_str
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
2327    
2328     # Write final TopHat output, via tophat_reports and wiggles
2329 gpertea 154 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2330 gpertea 135 th_log("Reporting output tracks")
2331     left_maps, right_maps = mappings
2332     left_reads, right_reads = readfiles
2333 gpertea 29 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2334     left_maps = ','.join(left_maps)
2335    
2336     if len(right_maps) > 0:
2337     right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2338     right_maps = ','.join(right_maps)
2339     log_fname = logging_dir + "reports.log"
2340     report_log = open(log_fname, "w")
2341     junctions = output_dir + "junctions.bed"
2342     insertions = output_dir + "insertions.bed"
2343     deletions = output_dir + "deletions.bed"
2344     accepted_hits = output_dir + "accepted_hits"
2345     report_cmdpath = prog_path("tophat_reports")
2346 gpertea 154 fusions = output_dir + "fusions.out"
2347 gpertea 29 report_cmd = [report_cmdpath]
2348 gpertea 154
2349     alignments_output_filename = tmp_dir + "accepted_hits"
2350    
2351 gpertea 29 report_cmd.extend(params.cmd())
2352 gpertea 159 report_cmd += ["--sam-header", sam_header_filename]
2353     if params.report_secondary_alignments:
2354     report_cmd += ["--report-secondary-alignments"]
2355 gpertea 163
2356 gpertea 29 report_cmd.extend(["--samtools="+samtools_path])
2357 gpertea 154 report_cmd.extend([ref_fasta,
2358     junctions,
2359 gpertea 29 insertions,
2360     deletions,
2361 gpertea 154 fusions,
2362     alignments_output_filename,
2363 gpertea 29 left_maps,
2364     left_reads])
2365 gpertea 154
2366 gpertea 135 if len(right_maps) > 0 and right_reads:
2367 gpertea 29 report_cmd.append(right_maps)
2368     report_cmd.append(right_reads)
2369 gpertea 154
2370 gpertea 29 # -- tophat_reports now produces (uncompressed) BAM stream,
2371     try:
2372     if params.report_params.convert_bam:
2373     if params.report_params.sort_bam:
2374 gpertea 154 print >> run_log, " ".join(report_cmd)
2375     report_proc=subprocess.call(report_cmd,
2376     preexec_fn=subprocess_setup,
2377     stderr=report_log)
2378 gpertea 29
2379 gpertea 154 bam_parts = []
2380     for i in range(params.system_params.num_threads):
2381     bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2382     if os.path.exists(bam_part_filename):
2383     bam_parts.append(bam_part_filename)
2384     else:
2385     break
2386    
2387     num_bam_parts = len(bam_parts)
2388     pids = [0 for i in range(num_bam_parts)]
2389     sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2390     #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2391     #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2392     for i in range(num_bam_parts):
2393     bamsort_cmd = [samtools_path,
2394     "sort",
2395     bam_parts[i],
2396     sorted_bam_parts[i]]
2397    
2398     sorted_bam_parts[i] += ".bam"
2399     print >> run_log, " ".join(bamsort_cmd)
2400     pid = os.fork()
2401     if pid == 0:
2402     subprocess.call(bamsort_cmd,
2403     stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2404     os._exit(os.EX_OK)
2405     else:
2406     pids[i] = pid
2407    
2408     for i in range(len(pids)):
2409     if pids[i] > 0:
2410     os.waitpid(pids[i], 0)
2411     pids[i] = 0
2412    
2413     # for bam_part in bam_parts:
2414     # os.remove(bam_part)
2415    
2416     if num_bam_parts > 1:
2417     bammerge_cmd = [samtools_path,
2418     "merge",
2419     "-h",
2420     sam_header_filename,
2421     "%s.bam" % accepted_hits]
2422     bammerge_cmd += sorted_bam_parts
2423    
2424     print >> run_log, " ".join(bammerge_cmd)
2425     subprocess.call(bammerge_cmd,
2426     stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2427    
2428     for sorted_bam_part in sorted_bam_parts:
2429     os.remove(sorted_bam_part)
2430     else:
2431     os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2432    
2433 gpertea 29 else:
2434     print >> run_log, " ".join(report_cmd)
2435     report_proc=subprocess.call(report_cmd,
2436     preexec_fn=subprocess_setup,
2437     stdout=open(accepted_hits,"wb"),
2438     stderr=report_log)
2439     os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2440     else:
2441     print >> run_log, " ".join(report_cmd)
2442     report_proc=subprocess.call(report_cmd,
2443     preexec_fn=subprocess_setup,
2444     stdout=open(accepted_hits,"wb"),
2445     stderr=report_log)
2446     tmp_sam = output_dir + "accepted_hits.sam"
2447    
2448 gpertea 135 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2449 gpertea 29 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2450     bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2451     tmp_sam_file = open(tmp_sam, "w")
2452     ret = subprocess.call(bam_to_sam_cmd,
2453     stdout=tmp_sam_file,
2454     stderr=bam_to_sam_log)
2455     tmp_sam_file.close()
2456     os.remove(accepted_hits)
2457    
2458     except OSError, o:
2459     die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2460    
2461 gpertea 154 return junctions
2462 gpertea 29
2463    
2464     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2465     # for each segment This function needs to be fixed to support mixed read length
2466     # inputs
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    """Extend out_segf with ZWriter objects for segment indexes
    num_files_prev+1 .. num_files; each file is named <prefix>_seg<i><extension>."""
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        seg_fname = "%s_seg%d%s" % (prefix, seg_idx, extension)
        out_segf.append(ZWriter(seg_fname, params.system_params))
2473    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split each read into segments of ~segment_length bp for segment mapping.

    One output file per segment index is created lazily (via
    open_output_files(), named <prefix>_seg<i> plus extension) and the list
    of created file names is returned.  Input may be FASTA or FASTQ (fasta
    flag) and color-space reads are handled when params.read_params.color
    is set.  Mixed read-length input support is still incomplete.
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"
    def convert_color_to_bp(color_seq):
        # Translate a color-space string (primer base followed by color
        # digits) into the equivalent base-pair sequence.
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # Inverse of convert_color_to_bp(): encode a base-pair sequence as
        # color digits, keeping the first base unchanged.
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # Write one read, cut at the given offsets, into the per-segment files.
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # drop the whole read if any segment boundary falls on an
            # ambiguous (non 0-3) color value
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            # header encodes: name|segment_start:segment_index:total_segments
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # prepend the decoded base at the segment start (acts as the
                # color-space primer base for this segment — presumably;
                # confirm against downstream consumers)
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # Line-state machine over the input records:
    #   0 = name line, 1 = sequence, 2 = "+" separator (FASTQ), 3 = qualities
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            # NOTE: py2 integer division — number of whole segments
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # otherwise fold the remainder into the final segment
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # lazily create extra segment files when a longer read shows up
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
2606    
2607     # Find possible splice junctions using the "closure search" strategy, and report
2608     # them in closures.juncs. Calls the executable closure_juncs
2609     def junctions_from_closures(params,
2610 gpertea 159 sam_header_filename,
2611 gpertea 29 left_maps,
2612     right_maps,
2613     ref_fasta):
2614 gpertea 135 th_log("Searching for junctions via mate-pair closures")
2615 gpertea 29
2616 gpertea 154
2617 gpertea 29 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2618     #if len(maps) == 0:
2619     # return None
2620     slash = left_maps[0].rfind('/')
2621     juncs_out = ""
2622     if slash != -1:
2623     juncs_out += left_maps[0][:slash+1]
2624 gpertea 154 fusions_out = juncs_out
2625    
2626 gpertea 29 juncs_out += "closure.juncs"
2627 gpertea 154 fusions_out += "closure.fusions"
2628 gpertea 29
2629     juncs_log = open(logging_dir + "closure.log", "w")
2630     juncs_cmdpath=prog_path("closure_juncs")
2631     juncs_cmd = [juncs_cmdpath]
2632    
2633     left_maps = ','.join(left_maps)
2634     right_maps = ','.join(right_maps)
2635    
2636     juncs_cmd.extend(params.cmd())
2637 gpertea 159 juncs_cmd.extend(["--sam-header", sam_header_filename,
2638     juncs_out,
2639 gpertea 154 fusions_out,
2640 gpertea 29 ref_fasta,
2641     left_maps,
2642     right_maps])
2643     try:
2644     print >> run_log, ' '.join(juncs_cmd)
2645     retcode = subprocess.call(juncs_cmd,
2646     stderr=juncs_log)
2647    
2648     # spanning_reads returned an error
2649     if retcode != 0:
2650     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2651     # cvg_islands not found
2652     except OSError, o:
2653     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2654 gpertea 164 th_logp(fail_str + "Error: closure_juncs not found on this system")
2655 gpertea 29 die(str(o))
2656     return [juncs_out]
2657    
2658     # Find possible junctions by examining coverage and split segments in the initial
2659     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2660     # segment.insertions, and segment.deletions. Calls the executable
2661     # segment_juncs
2662     def junctions_from_segments(params,
2663 gpertea 159 sam_header_filename,
2664 gpertea 29 left_reads,
2665     left_reads_map,
2666     left_seg_maps,
2667     right_reads,
2668     right_reads_map,
2669     right_seg_maps,
2670     unmapped_reads,
2671     reads_format,
2672     ref_fasta):
2673     if left_reads_map != left_seg_maps[0]:
2674 gpertea 135 th_log("Searching for junctions via segment mapping")
2675 gpertea 29 out_path=getFileDir(left_seg_maps[0])
2676     juncs_out=out_path+"segment.juncs"
2677     insertions_out=out_path+"segment.insertions"
2678     deletions_out =out_path+"segment.deletions"
2679 gpertea 154 fusions_out = out_path+"segment.fusions"
2680 gpertea 29
2681     left_maps = ','.join(left_seg_maps)
2682 gpertea 135 log_fname = logging_dir + "segment_juncs.log"
2683     segj_log = open(log_fname, "w")
2684     segj_cmd = [prog_path("segment_juncs")]
2685 gpertea 29
2686 gpertea 135 segj_cmd.extend(params.cmd())
2687 gpertea 159 segj_cmd.extend(["--sam-header", sam_header_filename,
2688     "--ium-reads", ",".join(unmapped_reads),
2689     ref_fasta,
2690     juncs_out,
2691     insertions_out,
2692     deletions_out,
2693     fusions_out,
2694     left_reads,
2695     left_reads_map,
2696     left_maps])
2697 gpertea 135 if right_seg_maps:
2698 gpertea 29 right_maps = ','.join(right_seg_maps)
2699 gpertea 135 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2700 gpertea 29 try:
2701 gpertea 135 print >> run_log, " ".join(segj_cmd)
2702     retcode = subprocess.call(segj_cmd,
2703 gpertea 29 preexec_fn=subprocess_setup,
2704 gpertea 135 stderr=segj_log)
2705 gpertea 29
2706     # spanning_reads returned an error
2707     if retcode != 0:
2708 gpertea 135 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2709 gpertea 154
2710 gpertea 29 # cvg_islands not found
2711     except OSError, o:
2712     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2713 gpertea 164 th_logp(fail_str + "Error: segment_juncs not found on this system")
2714 gpertea 29 die(str(o))
2715    
2716 gpertea 154 return [juncs_out, insertions_out, deletions_out, fusions_out]
2717 gpertea 29
2718     # Joins mapped segments into full-length read alignments via the executable
2719     # long_spanning_reads
2720     def join_mapped_segments(params,
2721     sam_header_filename,
2722     reads,
2723     ref_fasta,
2724     possible_juncs,
2725     possible_insertions,
2726     possible_deletions,
2727 gpertea 154 possible_fusions,
2728 gpertea 29 contig_seg_maps,
2729     spliced_seg_maps,
2730     alignments_out_name):
2731 gpertea 135 rn=""
2732 gpertea 29 if len(contig_seg_maps)>1:
2733 gpertea 135 th_log("Joining segment hits")
2734     rn=".segs"
2735 gpertea 29 else:
2736 gpertea 135 th_log("Processing bowtie hits")
2737 gpertea 29 contig_seg_maps = ','.join(contig_seg_maps)
2738    
2739     possible_juncs = ','.join(possible_juncs)
2740     possible_insertions = ",".join(possible_insertions)
2741     possible_deletions = ",".join(possible_deletions)
2742 gpertea 154 possible_fusions = ",".join(possible_fusions)
2743    
2744 gpertea 135 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2745     align_log = open(log_fname, "w")
2746 gpertea 29 align_cmd = [prog_path("long_spanning_reads")]
2747    
2748     align_cmd.extend(params.cmd())
2749 gpertea 159 align_cmd += ["--sam-header", sam_header_filename]
2750 gpertea 29
2751 gpertea 159 b2_params = params.bowtie2_params
2752     max_penalty, min_penalty = b2_params.mp.split(',')
2753     align_cmd += ["--bowtie2-max-penalty", max_penalty,
2754     "--bowtie2-min-penalty", min_penalty]
2755    
2756     align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2757    
2758     read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2759     align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2760     "--bowtie2-read-gap-cont", read_gap_cont]
2761    
2762     ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2763     align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2764     "--bowtie2-ref-gap-cont", ref_gap_cont]
2765    
2766 gpertea 29 align_cmd.append(ref_fasta)
2767 gpertea 154 align_cmd.extend([reads,
2768     possible_juncs,
2769     possible_insertions,
2770     possible_deletions,
2771     possible_fusions,
2772     alignments_out_name,
2773     contig_seg_maps])
2774 gpertea 29
2775 gpertea 135 if spliced_seg_maps:
2776 gpertea 29 spliced_seg_maps = ','.join(spliced_seg_maps)
2777     align_cmd.append(spliced_seg_maps)
2778    
2779     try:
2780 gpertea 154 print >> run_log, " ".join(align_cmd)
2781     join_proc=subprocess.call(align_cmd,
2782     stderr=align_log)
2783    
2784 gpertea 29 except OSError, o:
2785     die(fail_str+"Error: "+str(o))
2786    
2787     # This class collects spliced and unspliced alignments for each of the
2788     # left and right read files provided by the user.
class Maps:
    """Bundle of the alignment artifacts for one read file: the unspliced
    alignments, the per-segment maps, the unmapped segments, and the
    segment read files themselves."""
    def __init__(self, unspliced_sam, seg_maps, unmapped_segs, segs):
        self.unspliced_sam = unspliced_sam  # alignments of the full-length reads
        self.seg_maps = seg_maps            # one alignment file per read segment
        self.unmapped_segs = unmapped_segs  # segment reads that did not map
        self.segs = segs                    # the segment read files
2799    
2800 gpertea 135 # Map2GTF stuff
2801 gpertea 159 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2802 gpertea 135 """ajjkljlks
2803    
2804     Arguments:
2805     - `params`: TopHat parameters
2806     - `gtf_fname`: File name pointing to the annotation.
2807     - `reads`: The reads to convert coords (in Bowtie format).
2808     - `out_fname`: The file name pointing to the output.
2809     """
2810     m2g_cmd = [prog_path("map2gtf")]
2811     m2g_cmd.extend(params.cmd())
2812 gpertea 159 m2g_cmd += ["--sam-header", sam_header_filename]
2813 gpertea 135 m2g_cmd.append(gtf_fname)
2814     m2g_cmd.append(reads) #could be BAM file
2815     m2g_cmd.append(out_fname)
2816     fbasename = getFileBaseName(reads)
2817     m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2818     m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2819    
2820     try:
2821     th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2822     print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2823     ret = subprocess.call(m2g_cmd,
2824     stdout=open(m2g_log, "w"),
2825     stderr=open(m2g_err, "w"))
2826     if ret != 0:
2827     die(fail_str + " Error: map2gtf returned an error")
2828     except OSError, o:
2829     err_msg = fail_str + str(o)
2830     die(err_msg + "\n")
2831    
2832    
2833     def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2834     """ Make the transcriptome from a GTF.
2835    
2836     Arguments:
2837     - `trans_gtf`:
2838     - `genome`:
2839     - `out_fname`:
2840     """
2841     # TODO: Call gtf_to_fasta
2842     g2f_cmd = [prog_path("gtf_to_fasta")]
2843     g2f_cmd.extend(params.cmd())
2844     g2f_cmd.append(trans_gtf)
2845     g2f_cmd.append(genome)
2846     g2f_cmd.append(out_fname)
2847    
2848     g2f_log = logging_dir + "g2f.out"
2849     g2f_err = logging_dir + "g2f.err"
2850    
2851     try:
2852     print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2853     ret = subprocess.call(g2f_cmd,
2854     stdout = open(g2f_log, "w"),
2855     stderr = open(g2f_err, "w"))
2856     if ret != 0:
2857     die(fail_str + " Error: gtf_to_fasta returned an error.")
2858     except OSError, o:
2859     err_msg = fail_str + str(o)
2860     die(err_msg + "\n")
2861    
def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function

    Maps the reads against the transcriptome (building or reusing a
    transcriptome bowtie index) and converts the resulting hits back to
    genomic coordinates via map2gtf.

    Arguments:
    - `params`: The TopHat parameters.
    - `genome_sam_header_filename`: SAM header of the genome index, used
      when converting coordinates back to the genome.
    - `ref_fasta`: The reference genome.
    - `left_reads`: left reads file name (or None).
    - `right_reads`: right reads file name (None/empty if single-end).

    Returns (sam_gtf_list, unmapped_gtf_list); both are padded with None
    so the right-hand entry always exists, even for single-end data.
    """
    test_input_file(params.gff_annotation)

    # th_log("Reading in GTF file: " + params.gff_annotation)
    # transcripts = gtf_to_transcripts(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    m2g_bwt_idx = None
    t_out_dir = tmp_dir
    if params.transcriptome_index and not params.transcriptome_outdir:
        # reuse a user-provided pre-built transcriptome index
        m2g_bwt_idx = params.transcriptome_index
        th_log("Using pre-built transcriptome index..")
    else:
        # build the transcriptome FASTA and its bowtie index from the GTF
        th_log("Creating transcriptome data files..")
        if params.transcriptome_outdir:
            t_out_dir=params.transcriptome_outdir+"/"
        m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)

    get_index_sam_header(params, m2g_bwt_idx)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF coordinates
    for reads in [left_reads, right_reads]:
        # skip missing or (nearly) empty read files
        if reads == None or os.path.getsize(reads) < 25 :
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        #if use_zpacker:
        #    mapped_gtf_out+=".z"

        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
        #if use_BWT_FIFO:
        #    unmapped_gtf += ".z"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", "read_against_transcriptome")
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # translate the transcriptome-space hits into genomic-coordinate BAMs
    sam_gtf_list = []
    for reads in mapped_gtf_list:
        fbasename = getFileBaseName(reads)
        sam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           genome_sam_header_filename,
                           params.gff_annotation,
                           reads,
                           sam_out_fname)
        sam_gtf_list.append(sam_out_fname)

    if len(sam_gtf_list) < 2:
        # keep the (left, right) shape even for single-end input
        sam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (sam_gtf_list, unmapped_gtf_list)
2935     # end Map2GTF
2936    
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
    ##
    #this is accomplished by a special prep_reads usage (triggered by --flt-hits)
    # ri selects the read side in params.preflt_data (0 = left, 1 = right);
    # returns the (out_mappings, out_unmapped) file names it was given.

    prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                            out_mappings,
                            target_reads,
                            params.preflt_data[ri].mappings)
    um_reads = open(out_unmapped, "wb")
    log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
    filter_log = open(log_fname,"w")

    shell_cmd = " ".join(prep_cmd)
    #add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + out_unmapped
    retcode=0
    try:
        # shell_cmd is only written to the run log for the record; the real
        # pipeline is wired explicitly with Popen/call below
        print >> run_log, shell_cmd
        if use_zpacker:
            # prep_reads | zipper > out_unmapped
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stdout=um_reads)
            prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # report the first failing stage's exit code (prep first)
            retcode=prep_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(prep_cmd,
                                      stdout=um_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    um_reads.close()

    return (out_mappings, out_unmapped)
2988    
2989    
2990 gpertea 29 # The main aligment routine of TopHat. This function executes most of the
2991     # workflow producing a set of candidate alignments for each cDNA fragment in a
2992     # pair of SAM alignment files (for paired end reads).
2993     def spliced_alignment(params,
2994     bwt_idx_prefix,
2995     sam_header_filename,
2996     ref_fasta,
2997     read_len,
2998     segment_len,
2999 gpertea 135 prepared_reads,
3000 gpertea 29 user_supplied_junctions,
3001     user_supplied_insertions,
3002     user_supplied_deletions):
3003    
3004     possible_juncs = []
3005     possible_juncs.extend(user_supplied_junctions)
3006    
3007     possible_insertions = []
3008     possible_insertions.extend(user_supplied_insertions)
3009     possible_deletions = []
3010     possible_deletions.extend(user_supplied_deletions)
3011 gpertea 154 possible_fusions = []
3012 gpertea 29
3013 gpertea 135 left_reads, right_reads = prepared_reads
3014    
3015     maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data
3016 gpertea 154
3017 gpertea 29 #single_segments = False
3018    
3019 gpertea 135 # Before anything, map the reads using Map2GTF (if using annotation)
3020     m2g_maps = [ None, None ] # left, right
3021     initial_reads = [ left_reads, right_reads ]
3022    
3023     if params.gff_annotation:
3024     (mapped_gtf_list, unmapped_gtf_list) = \
3025 gpertea 159 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
3026 gpertea 135
3027     m2g_left_maps, m2g_right_maps = mapped_gtf_list
3028     m2g_maps = [m2g_left_maps, m2g_right_maps]
3029 gpertea 154 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
3030 gpertea 135 # The case where the user doesn't want to map to anything other
3031     # than the transcriptome OR we have no unmapped reads
3032     maps[0] = [m2g_left_maps]
3033     if right_reads:
3034 gpertea 154 maps[1] = [m2g_right_maps]
3035 gpertea 135 return maps
3036     # Feed the unmapped reads into spliced_alignment()
3037     initial_reads = unmapped_gtf_list[:]
3038     th_log("Resuming TopHat pipeline with unmapped reads")
3039    
3040     max_seg_len = segment_len #this is the ref seq span on either side of the junctions
3041     #to be extracted into segment_juncs.fa
3042    
3043     num_segs = int(read_len / segment_len)
3044     if (read_len % segment_len) >= min(segment_len-2, 20):
3045     #remainder is shorter but long enough to become a new segment
3046     num_segs += 1
3047     else:
3048     # the last segment is longer
3049     if num_segs>1: max_seg_len += (read_len % segment_len)
3050    
3051     if num_segs <= 1:
3052 gpertea 164 th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3053 gpertea 135
3054     # Using the num_segs value returned by check_reads(),
3055     # decide which junction discovery strategy to use
3056 gpertea 154 if num_segs < 3:
3057 gpertea 135 #if params.butterfly_search != False:
3058     # params.butterfly_search = True
3059     if params.coverage_search != False:
3060 gpertea 154 params.coverage_search = True
3061 gpertea 135 if num_segs == 1:
3062     segment_len = read_len
3063     else: #num_segs >= 3:
3064     # if we have at least three segments, just use split segment search,
3065     # which is the most sensitive and specific, fastest, and lightest-weight.
3066     # so unless specifically requested, disable the other junction searches
3067     if params.closure_search != True:
3068     params.closure_search = False
3069     if params.coverage_search != True:
3070     params.coverage_search = False
3071     if params.butterfly_search != True:
3072     params.butterfly_search = False
3073 gpertea 154
3074 gpertea 29 # Perform the first part of the TopHat work flow on the left and right
3075     # reads of paired ends separately - we'll use the pairing information later
3076 gpertea 154 have_left_IUM=False
3077 gpertea 135 for ri in (0,1):
3078     reads=initial_reads[ri]
3079 gpertea 154 if reads == None or not fileExists(reads,25):
3080 gpertea 29 continue
3081     fbasename=getFileBaseName(reads)
3082     unspliced_out = tmp_dir + fbasename + ".mapped"
3083 gpertea 135 unspliced_sam = None
3084     unmapped_reads = None
3085     #if use_zpacker: unspliced_out+=".z"
3086 gpertea 29 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3087 gpertea 135 if params.prefilter_multi:
3088     #unmapped_unspliced += ".z"
3089     (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3090     else:
3091 gpertea 29 # Perform the initial Bowtie mapping of the full length reads
3092 gpertea 135 (unspliced_sam, unmapped_reads) = bowtie(params,
3093 gpertea 154 bwt_idx_prefix,
3094     [reads],
3095     "fastq",
3096     params.genome_read_mismatches,
3097     unspliced_out,
3098     unmapped_unspliced,
3099     "",
3100     "read_against_genome")
3101 gpertea 29
3102 gpertea 154 # daehwan - check this out
3103     # params.bowtie2 = False
3104     # check_bowtie(params.bowtie2)
3105    
3106 gpertea 29 seg_maps = []
3107     unmapped_segs = []
3108     segs = []
3109 gpertea 154 have_IUM = fileExists(unmapped_reads,25)
3110     if ri==0 and have_IUM:
3111     have_left_IUM = True
3112     if num_segs > 1 and have_IUM:
3113 gpertea 29 # split up the IUM reads into segments
3114 gpertea 31 # unmapped_reads can be in BAM format
3115     read_segments = split_reads(unmapped_reads,
3116 gpertea 29 tmp_dir + fbasename,
3117     False,
3118     params,
3119     segment_len)
3120    
3121     # Map each segment file independently with Bowtie
3122     for i in range(len(read_segments)):
3123     seg = read_segments[i]
3124     fbasename=getFileBaseName(seg)
3125 gpertea 135 seg_out = tmp_dir + fbasename + ".bwtout"
3126     if use_zpacker: seg_out += ".z"
3127     unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3128     if use_BWT_FIFO:
3129     unmapped_seg += ".z"
3130 gpertea 29 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3131 gpertea 154 (seg_map, unmapped) = bowtie(params,
3132     bwt_idx_prefix,
3133     [seg],
3134     "fastq",
3135     params.segment_mismatches,
3136     seg_out,
3137     unmapped_seg,
3138     extra_output,
3139     "seg_against_genome")
3140 gpertea 29 seg_maps.append(seg_map)
3141     unmapped_segs.append(unmapped)
3142     segs.append(seg)
3143    
3144     # Collect the segment maps for left and right reads together
3145 gpertea 135 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3146 gpertea 29 else:
3147     # if there's only one segment, just collect the initial map as the only
3148     # map to be used downstream for coverage-based junction discovery
3149     read_segments = [reads]
3150 gpertea 135 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3151 gpertea 29
3152 gpertea 135 # XXX: At this point if using M2G, have three sets of reads:
3153     # mapped to transcriptome, mapped to genome, and unmapped (potentially
3154     # spliced or poly-A tails) - hp
3155    
3156 gpertea 29 # Unless the user asked not to discover new junctions, start that process
3157     # here
3158 gpertea 135 left_reads_map = maps[0].unspliced_sam
3159     left_seg_maps = maps[0].seg_maps
3160     unmapped_reads = maps[0].unmapped_segs
3161     if right_reads:
3162     right_reads_map = maps[1].unspliced_sam
3163     right_seg_maps = maps[1].seg_maps
3164     unmapped_reads.extend(maps[1].unmapped_segs)
3165     else:
3166     right_reads_map = None
3167     right_seg_maps = None
3168 gpertea 29
3169 gpertea 154 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3170 gpertea 29 # Call segment_juncs to infer a list of possible splice junctions from
3171     # the regions of the genome covered in the initial and segment maps
3172 gpertea 135 #if params.find_novel_juncs:
3173     #TODO: in m2g case, we might want to pass the m2g mappings as well,
3174     # or perhaps the GTF file directly
3175     # -> this could improve alternative junction detection?
3176 gpertea 29 juncs = junctions_from_segments(params,
3177 gpertea 159 sam_header_filename,
3178 gpertea 29 left_reads,
3179     left_reads_map,
3180     left_seg_maps,
3181     right_reads,
3182     right_reads_map,
3183     right_seg_maps,
3184     unmapped_reads,
3185     "fastq",
3186     ref_fasta)
3187 gpertea 135 if os.path.getsize(juncs[0]) != 0:
3188 gpertea 29 possible_juncs.append(juncs[0])
3189     if params.find_novel_indels:
3190 gpertea 154 if os.path.getsize(juncs[1]) != 0:
3191     possible_insertions.append(juncs[1])
3192     if os.path.getsize(juncs[2]) != 0:
3193     possible_deletions.append(juncs[2])
3194     if params.find_novel_fusions:
3195     if os.path.getsize(juncs[3]) != 0:
3196     possible_fusions.append(juncs[3])
3197 gpertea 29 # Optionally, and for paired reads only, use a closure search to
3198     # discover addtional junctions
3199 gpertea 135 if params.closure_search and left_reads and right_reads:
3200 gpertea 29 juncs = junctions_from_closures(params,
3201 gpertea 159 sam_header_filename,
3202 gpertea 135 [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
3203     [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
3204 gpertea 29 ref_fasta)
3205     if os.path.getsize(juncs[0]) != 0:
3206     possible_juncs.extend(juncs)
3207    
3208 gpertea 154 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3209 gpertea 29 spliced_seg_maps = None
3210     junc_idx_prefix = None
3211     else:
3212     junc_idx_prefix = "segment_juncs"
3213     if len(possible_insertions) == 0:
3214     possible_insertions.append(os.devnull)
3215     # print >> sys.stderr, "Warning: insertions database is empty!"
3216     if len(possible_deletions) == 0:
3217     possible_deletions.append(os.devnull)
3218     # print >> sys.stderr, "Warning: deletions database is empty!"
3219     if len(possible_juncs) == 0:
3220     possible_juncs.append(os.devnull)
3221 gpertea 164 th_logp("Warning: junction database is empty!")
3222 gpertea 154 if len(possible_fusions) == 0:
3223     possible_fusions.append(os.devnull)
3224 gpertea 135 if junc_idx_prefix:
3225 gpertea 154 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3226     3,
3227     max_seg_len,
3228     junc_idx_prefix,
3229     possible_juncs,
3230     possible_insertions,
3231     possible_deletions,
3232     possible_fusions,
3233     ref_fasta,
3234     params.read_params.color)
3235 gpertea 29
3236 gpertea 159 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3237 gpertea 154
3238 gpertea 29 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3239     # index with Bowtie
3240 gpertea 135 # for reads in [left_reads, right_reads]:
3241     for ri in (0,1):
3242     reads = initial_reads[ri]
3243 gpertea 29 spliced_seg_maps = []
3244 gpertea 135
3245     if reads == None or os.path.getsize(reads)<25:
3246 gpertea 29 continue
3247    
3248 gpertea 135 m2g_map = m2g_maps[ri]
3249    
3250     if junc_idx_prefix :
3251 gpertea 29 i = 0
3252 gpertea 135 for seg in maps[ri].segs:
3253     #search each segment
3254 gpertea 29 fsegname = getFileBaseName(seg)
3255 gpertea 135 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
3256     if use_zpacker: seg_out += ".z"
3257     extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3258 gpertea 154 (seg_map, unmapped) = bowtie(params,
3259     tmp_dir + junc_idx_prefix,
3260     [seg],
3261     "fastq",
3262     params.segment_mismatches,
3263     seg_out,
3264     None,
3265     extra_output,
3266     "seg_against_junctions")
3267 gpertea 29 spliced_seg_maps.append(seg_map)
3268     i += 1
3269    
3270 gpertea 135 # Join the contigous and spliced segment hits into full length
3271     # read alignments
3272     rfname=getFileBaseName(reads)
3273     rfdir=getFileDir(reads)
3274 gpertea 29 mapped_reads = rfdir + rfname + ".candidates.bam"
3275 gpertea 135 # -- spliced mappings built from all segment mappings vs genome and junc_db
3276 gpertea 29 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3277 gpertea 135 #unspl_bwtfile = maps[ri].unspliced_bwt
3278     unspl_samfile = maps[ri].unspliced_sam
3279 gpertea 29 join_mapped_segments(params,
3280     sam_header_filename,
3281     reads,
3282     ref_fasta,
3283     possible_juncs,
3284     possible_insertions,
3285     possible_deletions,
3286 gpertea 154 possible_fusions,
3287 gpertea 135 maps[ri].seg_maps,
3288 gpertea 29 spliced_seg_maps,
3289     mapped_reads)
3290    
3291 gpertea 135 if num_segs > 1 or m2g_map:
3292     # Merge the spliced and unspliced full length alignments into
3293     # a single SAM file.
3294 gpertea 29 # The individual SAM files are all already sorted in
3295     # increasing read ID order.
3296     # NOTE: We also should be able to address bug #134 here, by replacing
3297     # contiguous alignments that poke into an intron by a small amount by
3298     # the correct spliced alignment.
3299 gpertea 154
3300 gpertea 29 try:
3301 gpertea 159 merge_cmd = [prog_path("bam_merge"),
3302     "--index-outfile", merged_map + ".index",
3303     "--sam-header", sam_header_filename,
3304     merged_map]
3305 gpertea 154
3306 gpertea 135 if num_segs > 1:
3307     merge_cmd += [ maps[ri].unspliced_sam ]
3308     if m2g_map:
3309     merge_cmd += [ m2g_map ]
3310 gpertea 154
3311     if os.path.exists(mapped_reads):
3312     merge_cmd += [ mapped_reads ]
3313     else:
3314     for bam_i in range(0, params.system_params.num_threads):
3315     temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3316     if os.path.exists(temp_bam):
3317     merge_cmd += [ temp_bam ]
3318     else:
3319     break
3320    
3321 gpertea 29 print >> run_log, " ".join(merge_cmd)
3322     ret = subprocess.call( merge_cmd,
3323     stderr=open(logging_dir + "sam_merge.log", "w") )
3324     if ret != 0:
3325     die(fail_str+"Error executing: "+" ".join(merge_cmd))
3326     except OSError, o:
3327     die(fail_str+"Error: "+str(o))
3328 gpertea 135 maps[ri] = [merged_map]
3329 gpertea 29
3330     if not params.system_params.keep_tmp:
3331     os.remove(mapped_reads)
3332     else:
3333 gpertea 135 # no segments or transcriptome mappings, so no merge is needed
3334     # because join_mapped_segments() produced the final BAM already
3335 gpertea 29 os.rename(mapped_reads, merged_map)
3336 gpertea 135 maps[ri] = [merged_map]
3337     if not params.system_params.keep_tmp:
3338 gpertea 29 os.remove(unspl_samfile)
3339 gpertea 154
3340 gpertea 29 return maps
3341    
# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    """Resolve *program* to an executable path, or return None.

    Lookup order: if *program* carries a directory component, that exact
    path is tried first; otherwise a copy living in the script's own
    bin_dir is preferred.  In either case every directory on the PATH
    environment variable is scanned as a fallback.
    """
    def runnable(candidate):
        # a hit must be a regular file with the execute bit set
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    dir_part, base_name = os.path.split(program)
    if dir_part:
        # explicit path given: accept it as-is when executable
        if runnable(program):
            return program
    else:
        # bare name: prefer the copy shipped next to this script
        local_candidate = os.path.join(bin_dir, program)
        if runnable(local_candidate):
            return local_candidate
    # fall back to scanning the user's PATH
    for search_dir in os.environ["PATH"].split(os.pathsep):
        candidate = os.path.join(search_dir, program)
        if runnable(candidate):
            return candidate
    return None
3360    
def prog_path(program):
    """Return the resolved path of a required external *program*.

    Aborts the whole run via die() when the program cannot be located;
    see which() for the lookup order.
    """
    progpath = which(program)
    if progpath is None:  # identity test per PEP 8, not '== None'
        die("Error locating program: " + program)
    return progpath
3366    
# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    """Return the TopHat release string shown in logs and --version output."""
    version = "2.0.0"
    return version
3370 gpertea 29
def mlog(msg):
    """Emit a debug message on stderr, prefixed with a grep-able tag."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
3373    
3374     def test_input_file(filename):
3375     try:
3376     test_file = open(filename, "r")
3377     except IOError, o:
3378     die("Error: Opening file %s" % filename)
3379     return
3380    
def main(argv=None):
    """Top-level driver for a TopHat run.

    Parses the command line, validates inputs and external tools
    (bowtie, samtools), prepares the reads, runs the spliced alignment
    and finally compiles the reports.  Returns 2 on a usage error;
    otherwise returns None after a successful run.
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # Positional args: index prefix, then comma-separated read lists
        # and (when --quals-style input is used) quality-file lists.
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            # paired-end run: default the mate inner distance instead of
            # forcing the user to pass -r
            if params.read_params.mate_inner_dist == None:
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        start_time = datetime.now()
        prepare_output_dir()
        init_logger(logging_dir + "tophat.log")

        th_logp()
        th_log("Beginning TopHat run (v"+get_version()+")")
        th_logp("-----------------------------------------------")

        # run.log records every external command executed; opened
        # unbuffered (third arg 0) so a crash still leaves a usable trace
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        check_bowtie(params)
        check_samtools()

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
            if not os.path.exists(params.gff_annotation):
                die("Error: cannot find transcript file %s" % params.gff_annotation)
            if os.path.getsize(params.gff_annotation)<10:
                die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
            if params.gff_annotation:
                #gff file given, so transcriptome data will be written there
                gff_basename = getFileBaseName(params.gff_annotation)
                #just in case, check if it's not already there (-G/--GTF given again by mistake)
                tpath, tname = os.path.split(params.transcriptome_index)
                new_subdir=False
                if tpath in (".", "./") or not tpath :
                    if not os.path.exists(params.transcriptome_index):
                        os.makedirs(params.transcriptome_index)
                        new_subdir=True
                if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                    # transcriptome_index was given as a directory: store
                    # the data files as <dir>/<gff_basename>.*
                    params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
                gff_out=params.transcriptome_index+".gff"
                # NOTE(review): a same-size comparison is used here as a
                # cheap "already built from this GFF" test -- confirm
                if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
                    #generate the transcriptome data files
                    tpath, tname = os.path.split(params.transcriptome_index)
                    params.transcriptome_outdir=tpath
            t_gff=params.transcriptome_index+".gff"
            if params.transcriptome_outdir:
                #will create the transcriptome data files
                if not os.path.exists(params.transcriptome_outdir):
                    os.makedirs(params.transcriptome_outdir)
                copy(params.gff_annotation, t_gff)
            else:
                #try to use existing transcriptome data files
                if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
                check_bowtie_index(params.transcriptome_index, params.bowtie2)
            params.gff_annotation=t_gff
        #end @ transcriptome_index given

        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)

        th_log("Generating SAM header for "+bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        # Collect any user-supplied splice-site / indel / fusion databases
        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            if found_juncs:
                ## and not params.gff_annotation:
                user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        # FIFO used to stream unmapped reads from bowtie; name includes the
        # PID so concurrent runs sharing tmp_dir cannot collide
        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            # -M option: an initial genome-wide bowtie pass separates
            # multi-mapped reads from the rest before the main pipeline
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            for ri in (0,1):
                reads_list=read_lists[ri]
                if not reads_list:
                    continue
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
                side_imap = tmp_dir + sides[ri]+"_im.bwtout"
                if use_zpacker:
                    side_imap+=".z"
                side_ium = tmp_dir + sides[ri]+"_ium.fq"
                if use_BWT_FIFO:
                    side_ium += ".z"
                th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
                rdlist=reads_list.split(',')
                bwt=bowtie(params, bwt_idx_prefix, rdlist,
                           params.read_params.reads_format,
                           params.genome_read_mismatches,
                           side_imap, side_ium,
                           "", "read_against_genome", params.preflt_data[ri].multihit_reads)
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left",
                                                      params.preflt_data[0].multihit_reads)
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right",
                                                            params.preflt_data[1].multihit_reads)
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        input_reads = [left_kept_reads, right_kept_reads]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        # Clean up intermediate files unless --keep-tmp was requested
        if not params.system_params.keep_tmp:
            for m in mappings[0]:
                os.remove(m)
            if left_kept_reads:
                os.remove(left_kept_reads)
            for m in mappings[1]:
                os.remove(m)
            if right_kept_reads:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        th_logp("-----------------------------------------------")
        th_log("Run complete: %s elapsed" % formatTD(duration))

    except Usage, err:
        th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
        th_logp("    for detailed help see http://tophat.cbcb.umd.edu/manual.html")
        return 2
3603    
3604    
# Script entry point: propagate main()'s return value as the process
# exit status (2 signals a usage error).
if __name__ == "__main__":
    sys.exit(main())

Properties

Name Value
svn:executable *