ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 5 months ago) by gpertea
File size: 127439 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line User Rev File contents
1 gpertea 29 #!/usr/bin/env python
2    
3     # encoding: utf-8
4     """
5     tophat.py
6    
7     Created by Cole Trapnell on 2008-12-25.
8     Copyright (c) 2008 Cole Trapnell. All rights reserved.
9     """
10     import sys
11     try:
12     import psyco
13     psyco.full()
14     except ImportError:
15     pass
16    
17     import getopt
18     import subprocess
19     import errno
20     import os
21     import warnings
22     import re
23     import signal
24     from datetime import datetime, date, time
25    
# Usage/help text shown for -h/--help and whenever the command line is
# rejected (raised wrapped in a Usage exception).
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                     [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>   [ default: ./tophat_out ]
    -a/--min-anchor                <int>      [ default: 8            ]
    -m/--splice-mismatches         <0-2>      [ default: 0            ]
    -i/--min-intron-length         <int>      [ default: 50           ]
    -I/--max-intron-length         <int>      [ default: 500000       ]
    -g/--max-multihits             <int>      [ default: 20           ]
    -x/--transcriptome-max-hits    <int>      [ default: 60           ]
    -n/--transcriptome-mismatches  <int>      [ default: 1            ]
    -M/--prefilter-multihits                  ( for -G/--GTF option, enable
                                                an initial bowtie search
                                                against the genome )
    -F/--min-isoform-fraction      <float>    [ default: 0.15         ]
    --max-insertion-length         <int>      [ default: 3            ]
    --max-deletion-length          <int>      [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals                         (same as phred64-quals)
    --phred64-quals                           (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                                (Solid - color space)
    --color-out
    --library-type                 <string>   (fr-unstranded, fr-firststrand,
                                               fr-secondstrand)
    -p/--num-threads               <int>      [ default: 1            ]
    -G/--GTF                       <filename>
    --transcriptome-index          <bwtidx>   [pre-built bowtie index for the
                                               transcripts in the -G/--GTF file]
    -T/--transcriptome-only                   [map only to the transcriptome]
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>      [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>  [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>  [ default: gzip             ]
    -X/--unmapped-fifo                        [use mkfifo to compress more temporary
                                               files for color space reads]

Advanced Options:
    -N/--initial-read-mismatches   <int>      [ default: 2            ]
    --segment-mismatches           <int>      [ default: 2            ]
    --segment-length               <int>      [ default: 25           ]
    --bowtie-n                                [ default: bowtie -v    ]
    --min-closure-exon             <int>      [ default: 100          ]
    --min-closure-intron           <int>      [ default: 50           ]
    --max-closure-intron           <int>      [ default: 5000         ]
    --min-coverage-intron          <int>      [ default: 50           ]
    --max-coverage-intron          <int>      [ default: 20000        ]
    --min-segment-intron           <int>      [ default: 50           ]
    --max-segment-intron           <int>      [ default: 500000       ]
    --no-sort-bam                             [Output BAM is not coordinate-sorted]
    --no-convert-bam                          [Do not convert to bam format.
                                               Output is <output_dir>accepted_hit.sam.
                                               Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>   (read group ID)
    --rg-sample                    <string>   (sample ID)
    --rg-library                   <string>   (library ID)
    --rg-description               <string>   (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>   (e.g Illumina lane ID)
    --rg-center                    <string>   (sequencing center name)
    --rg-date                      <string>   (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>   (Sequencing platform descriptor)
'''
111    
class Usage(Exception):
    """Raised to abort option parsing; *msg* is the text to show the user
    (either the use_message help text or a getopt error message)."""
    def __init__(self, msg):
        self.msg = msg
115    
# --- Module-level state shared across the pipeline stages. ---
# Default locations; may be overridden by -o/--output-dir and --tmp-dir
# in TopHatParams.parse_options().
output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None   # file handle of the run log (opened later)
run_cmd = None   # full command line recorded for the run log
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"   # directory this script was launched from
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
                     # enabled by -X/-unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None   # resolved path of the samtools binary
bowtie_path = None     # resolved path of the bowtie binary
fail_str = "\t[FAILED]\n"

# stub SAM header file written into tmp_dir; path is recomputed when the
# output/tmp directories are overridden on the command line
sam_header = tmp_dir + "stub_header.sam"

gtf_juncs = None #file name with junctions extracted from given GFF file
139    
140 gpertea 29 # TopHatParams captures all of the runtime paramaters used by TopHat, and many
141     # of these are passed as command line options to exectubles run by the pipeline
142    
143     # This class and its nested classes also do options parsing through parse_options()
144     # and option validation via the member function check()
145    
146 gpertea 135
class BowtieOutputFiles():
    """Record of the (up to) three output files produced by one Bowtie run:
    the mapped hits, the unmapped reads, and the reads with too many hits."""
    def __init__(self, mappings=None, unmapped_reads=None, multihit_reads=None):
        # Keep a handle on each output so later pipeline stages can locate it.
        self.multihit_reads = multihit_reads
        self.unmapped_reads = unmapped_reads
        self.mappings = mappings
155    
156 gpertea 29 class TopHatParams:
157    
158     # SpliceConstraints is a group of runtime parameters that specify what
159     # constraints to put on junctions discovered by the program. These constraints
160     # are used to filter out spurious/false positive junctions.
161    
162     class SpliceConstraints:
163     def __init__(self,
164     min_anchor_length,
165     min_intron_length,
166     max_intron_length,
167     splice_mismatches,
168     min_isoform_fraction):
169     self.min_anchor_length = min_anchor_length
170     self.min_intron_length = min_intron_length
171     self.max_intron_length = max_intron_length
172     self.splice_mismatches = splice_mismatches
173     self.min_isoform_fraction = min_isoform_fraction
174    
175     def parse_options(self, opts):
176     for option, value in opts:
177     if option in ("-m", "--splice-mismatches"):
178     self.splice_mismatches = int(value)
179     elif option in ("-a", "--min-anchor"):
180     self.min_anchor_length = int(value)
181     elif option in ("-F", "--min-isoform-fraction"):
182     self.min_isoform_fraction = float(value)
183     elif option in ("-i", "--min-intron-length"):
184     self.min_intron_length = int(value)
185     elif option in ("-I", "--max-intron-length"):
186     self.max_intron_length = int(value)
187    
188     def check(self):
189     if self.splice_mismatches not in [0,1,2]:
190     die("Error: arg to --splice-mismatches must be 0, 1, or 2")
191     if self.min_anchor_length < 4:
192     die("Error: arg to --min-anchor-len must be greater than 4")
193     if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
194     die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
195     if self.min_intron_length <= 0:
196     die("Error: arg to --min-intron-length must be greater than 0")
197     if self.max_intron_length <= 0:
198     die("Error: arg to --max-intron-length must be greater than 0")
199    
200     # SystemParams is a group of runtime parameters that determine how to handle
201     # temporary files produced during a run and how many threads to use for threaded
202     # stages of the pipeline (e.g. Bowtie)
203    
204     class SystemParams:
205     def __init__(self,
206     num_cpus,
207     keep_tmp):
208     self.num_cpus = num_cpus
209     self.keep_tmp = keep_tmp
210     self.zipper = "gzip"
211     self.zipper_opts= []
212    
213     def parse_options(self, opts):
214     global use_zpacker
215     global use_BWT_FIFO
216     for option, value in opts:
217     if option in ("-p", "--num-threads"):
218     self.num_cpus = int(value)
219     elif option == "--keep-tmp":
220     self.keep_tmp = True
221     elif option in ("-z","--zpacker"):
222     if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
223     value=""
224     self.zipper = value
225     #if not self.zipper:
226     # self.zipper='gzip'
227     elif option in ("-X", "--unmapped-fifo"):
228     use_BWT_FIFO=True
229     if self.zipper:
230     use_zpacker=True
231     if self.num_cpus>1 and not self.zipper_opts:
232     if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
233     self.zipper_opts.append('-p'+str(self.num_cpus))
234     else:
235     use_zpacker=False
236     if use_BWT_FIFO: use_BWT_FIFO=False
237     def cmd(self):
238     cmdline=[]
239     if self.zipper:
240     cmdline.extend(['-z',self.zipper])
241     if self.num_cpus>1:
242     cmdline.extend(['-p'+str(self.num_cpus)])
243     return cmdline
244    
245     def check(self):
246     if self.num_cpus<1 :
247     die("Error: arg to --num-threads must be greater than 0")
248     if self.zipper:
249     xzip=which(self.zipper)
250     if not xzip:
251     die("Error: cannot find compression program "+self.zipper)
252    
253     # ReadParams is a group of runtime parameters that specify various properties
254     # of the user's reads (e.g. which quality scale their are on, how long the
255     # fragments are, etc).
256     class ReadParams:
257     def __init__(self,
258     solexa_quals,
259     phred64_quals,
260     quals,
261     integer_quals,
262     color,
263     color_out,
264     library_type,
265     seed_length,
266     reads_format,
267     mate_inner_dist,
268     mate_inner_dist_std_dev,
269     read_group_id,
270     sample_id,
271     library_id,
272     description,
273     seq_platform_unit,
274     seq_center,
275     seq_run_date,
276     seq_platform):
277     self.solexa_quals = solexa_quals
278     self.phred64_quals = phred64_quals
279     self.quals = quals
280     self.integer_quals = integer_quals
281     self.color = color
282     self.color_out = color_out
283     self.library_type = library_type
284     self.seed_length = seed_length
285     self.reads_format = reads_format
286     self.mate_inner_dist = mate_inner_dist
287     self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
288     self.read_group_id = read_group_id
289     self.sample_id = sample_id
290     self.library_id = library_id
291     self.description = description
292     self.seq_platform_unit = seq_platform_unit
293     self.seq_center = seq_center
294     self.seq_run_date = seq_run_date
295     self.seq_platform = seq_platform
296    
297     def parse_options(self, opts):
298     for option, value in opts:
299     if option == "--solexa-quals":
300     self.solexa_quals = True
301     elif option in ("--solexa1.3-quals", "--phred64-quals"):
302     self.phred64_quals = True
303     elif option in ("-Q", "--quals"):
304     self.quals = True
305     elif option == "--integer-quals":
306     self.integer_quals = True
307     elif option in ("-C", "--color"):
308     self.color = True
309     elif option == "--color-out":
310     self.color_out = True
311     elif option == "--library-type":
312     self.library_type = value
313     elif option in ("-s", "--seed-length"):
314     self.seed_length = int(value)
315     elif option in ("-r", "--mate-inner-dist"):
316     self.mate_inner_dist = int(value)
317     elif option == "--mate-std-dev":
318     self.mate_inner_dist_std_dev = int(value)
319     elif option == "--rg-id":
320     self.read_group_id = value
321     elif option == "--rg-sample":
322     self.sample_id = value
323     elif option == "--rg-library":
324     self.library_id = value
325     elif option == "--rg-description":
326     self.description = value
327     elif option == "--rg-platform-unit":
328     self.seq_platform_unit = value
329     elif option == "--rg-center":
330     self.seq_center = value
331     elif option == "--rg-date":
332     self.seq_run_date = value
333     elif option == "--rg-platform":
334     self.seq_platform = value
335    
336     def check(self):
337 gpertea 135 if self.seed_length and self.seed_length < 20:
338 gpertea 29 die("Error: arg to --seed-length must be at least 20")
339     if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
340     die("Error: arg to --mate-std-dev must at least 0")
341     if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
342     die("Error: --rg-id and --rg-sample must be specified or omitted together")
343    
344     # SearchParams is a group of runtime parameters that specify how TopHat will
345     # search for splice junctions
346    
347     class SearchParams:
348     def __init__(self,
349     min_closure_exon,
350     min_closure_intron,
351     max_closure_intron,
352     min_coverage_intron,
353     max_coverage_intron,
354     min_segment_intron,
355     max_segment_intron):
356    
357     self.min_closure_exon_length = min_closure_exon
358     self.min_closure_intron_length = min_closure_intron
359     self.max_closure_intron_length = max_closure_intron
360     self.min_coverage_intron_length = min_coverage_intron
361     self.max_coverage_intron_length = max_coverage_intron
362     self.min_segment_intron_length = min_segment_intron
363     self.max_segment_intron_length = max_segment_intron
364    
365     def parse_options(self, opts):
366     for option, value in opts:
367     if option == "--min-closure-exon":
368     self.min_closure_exon_length = int(value)
369     if option == "--min-closure-intron":
370     self.min_closure_intron_length = int(value)
371     if option == "--max-closure-intron":
372     self.max_closure_intron_length = int(value)
373     if option == "--min-coverage-intron":
374     self.min_coverage_intron_length = int(value)
375     if option == "--max-coverage-intron":
376     self.max_coverage_intron_length = int(value)
377     if option == "--min-segment-intron":
378     self.min_segment_intron_length = int(value)
379     if option == "--max-segment-intron":
380     self.max_segment_intron_length = int(value)
381    
382     def check(self):
383     if self.min_closure_exon_length < 0:
384     die("Error: arg to --min-closure-exon must be at least 20")
385     if self.min_closure_intron_length < 0:
386     die("Error: arg to --min-closure-intron must be at least 20")
387     if self.max_closure_intron_length < 0:
388     die("Error: arg to --max-closure-intron must be at least 20")
389     if self.min_coverage_intron_length < 0:
390     die("Error: arg to --min-coverage-intron must be at least 20")
391     if self.max_coverage_intron_length < 0:
392     die("Error: arg to --max-coverage-intron must be at least 20")
393     if self.min_segment_intron_length < 0:
394     die("Error: arg to --min-segment-intron must be at least 20")
395     if self.max_segment_intron_length < 0:
396     die("Error: arg to --max-segment-intron must be at least 20")
397    
398     class ReportParams:
399     def __init__(self):
400     self.sort_bam = True
401     self.convert_bam = True
402    
403     def parse_options(self, opts):
404     for option, value in opts:
405     if option == "--no-sort-bam":
406     self.sort_bam = False
407     if option == "--no-convert-bam":
408     self.convert_bam = False
409     self.sort_bam = False
410    
411    
412     def __init__(self):
413     self.splice_constraints = self.SpliceConstraints(8, # min_anchor
414     50, # min_intron
415     500000, # max_intron
416     0, # splice_mismatches
417     0.15) # min_isoform_frac
418    
419 gpertea 135 self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]
420    
421 gpertea 29 self.read_params = self.ReadParams(False, # solexa_scale
422     False,
423     False, # quals
424     None, # integer quals
425     False, # SOLiD - color space
426     False, # SOLiD - color out instead of base pair,
427     "", # library type (e.g. "illumina-stranded-pair-end")
428     None, # seed_length
429     "fastq", # quality_format
430     None, # mate inner distance
431     20, # mate inner dist std dev
432     None, # read group id
433     None, # sample id
434     None, # library id
435     None, # description
436     None, # platform unit (i.e. lane)
437     None, # sequencing center
438     None, # run date
439     None) # sequencing platform
440    
441     self.system_params = self.SystemParams(1, # bowtie_threads (num_cpus)
442     False) # keep_tmp
443    
444     self.search_params = self.SearchParams(100, # min_closure_exon_length
445     50, # min_closure_intron_length
446     5000, # max_closure_intron_length
447     50, # min_coverage_intron_length
448     20000, # max_coverage_intron_length
449     50, # min_segment_intron_length
450     500000) # max_segment_intron_length
451    
452     self.report_params = self.ReportParams()
453     self.gff_annotation = None
454 gpertea 135 self.transcriptome_only = False
455     self.transcriptome_index = None
456 gpertea 29 self.raw_junctions = None
457     self.find_novel_juncs = True
458     self.find_novel_indels = True
459     self.find_GFF_juncs = True
460 gpertea 135 #self.skip_check_reads = False
461 gpertea 29 self.max_hits = 20
462 gpertea 135 self.t_max_hits = 60
463     self.prefilter_multi = False
464 gpertea 29 self.initial_read_mismatches = 2
465 gpertea 135 self.t_mismatches = 1
466 gpertea 29 self.segment_length = 25
467     self.segment_mismatches = 2
468     self.bowtie_alignment_option = "-v"
469     self.max_insertion_length = 3
470     self.max_deletion_length = 3
471     self.raw_insertions = None
472     self.raw_deletions = None
473     self.coverage_search = None
474 gpertea 135 self.closure_search = False
475 gpertea 29 self.microexon_search = False
476     self.butterfly_search = None
477    
478     def check(self):
479     self.splice_constraints.check()
480     self.read_params.check()
481     self.system_params.check()
482    
483 gpertea 135 if self.segment_length < 10:
484     die("Error: arg to --segment-length must at least 10")
485 gpertea 29 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
486     die("Error: arg to --segment-mismatches must in [0, 3]")
487    
488     if self.read_params.color and self.butterfly_search:
489     die("Error: butterfly-search in colorspace is not yet supported")
490    
491     library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
492    
493     if self.read_params.library_type and self.read_params.library_type not in library_types:
494     die("Error: library-type should be one of: "+', '.join(library_types))
495    
496     self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
497     self.search_params.max_closure_intron_length)
498    
499     self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
500     self.search_params.max_segment_intron_length)
501    
502     self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
503     self.search_params.max_coverage_intron_length)
504    
505     if self.max_insertion_length >= self.segment_length:
506     die("Error: the max insertion length ("+self.max_insertion_length+") can not be equal to or greater than the segment length ("+self.segment_length+")")
507    
508     if self.max_insertion_length < 0:
509     die("Error: the max insertion length ("+self.max_insertion_length+") can not be less than 0")
510    
511     if self.max_deletion_length >= self.splice_constraints.min_intron_length:
512     die("Error: the max deletion length ("+self.max_deletion_length+") can not be equal to or greater than the min intron length ("+self.splice_constraints.min_intron_length+")")
513    
514     if self.max_deletion_length < 0:
515     die("Error: the max deletion length ("+self.max_deletion_length+") can not be less than 0")
516    
517     def cmd(self):
518     cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
519     "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
520     "--min-report-intron", str(self.splice_constraints.min_intron_length),
521     "--max-report-intron", str(self.splice_constraints.max_intron_length),
522     "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
523     "--output-dir", output_dir,
524     "--max-multihits", str(self.max_hits),
525     "--segment-length", str(self.segment_length),
526     "--segment-mismatches", str(self.segment_mismatches),
527     "--min-closure-exon", str(self.search_params.min_closure_exon_length),
528     "--min-closure-intron", str(self.search_params.min_closure_intron_length),
529     "--max-closure-intron", str(self.search_params.max_closure_intron_length),
530     "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
531     "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
532     "--min-segment-intron", str(self.search_params.min_segment_intron_length),
533     "--max-segment-intron", str(self.search_params.max_segment_intron_length),
534 gpertea 135 "--max-mismatches", str(self.initial_read_mismatches),
535 gpertea 29 "--sam-header", sam_header,
536     "--max-insertion-length", str(self.max_insertion_length),
537     "--max-deletion-length", str(self.max_deletion_length)]
538     cmd.extend(self.system_params.cmd())
539    
540     if self.read_params.mate_inner_dist != None:
541     cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
542     "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
543     if self.gff_annotation != None:
544     cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
545 gpertea 135 if gtf_juncs:
546     cmd.extend(["--gtf-juncs", gtf_juncs])
547     if self.closure_search == False:
548 gpertea 29 cmd.append("--no-closure-search")
549 gpertea 135 if self.coverage_search == False:
550 gpertea 29 cmd.append("--no-coverage-search")
551 gpertea 135 if self.microexon_search == False:
552 gpertea 29 cmd.append("--no-microexon-search")
553     if self.butterfly_search:
554     cmd.append("--butterfly-search")
555     if self.read_params.solexa_quals:
556     cmd.append("--solexa-quals")
557     if self.read_params.quals:
558     cmd.append("--quals")
559     if self.read_params.integer_quals:
560     cmd.append("--integer-quals")
561     if self.read_params.color:
562     cmd.append("--color")
563     if self.read_params.color_out:
564     cmd.append("--color-out")
565     if self.read_params.library_type:
566     cmd.extend(["--library-type", self.read_params.library_type])
567     if self.read_params.read_group_id:
568     cmd.extend(["--rg-id", self.read_params.read_group_id])
569     if self.read_params.phred64_quals:
570     cmd.append("--phred64-quals")
571     return cmd
572    
573     # This is the master options parsing routine, which calls parse_options for
574     # the delegate classes (e.g. SpliceConstraints) that handle certain groups
575     # of options.
576     def parse_options(self, argv):
577     try:
578 gpertea 135 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
579 gpertea 29 ["version",
580     "help",
581     "output-dir=",
582     "solexa-quals",
583     "solexa1.3-quals",
584     "phred64-quals",
585     "quals",
586     "integer-quals",
587     "color",
588     "color-out",
589     "library-type=",
590     "num-threads=",
591     "splice-mismatches=",
592     "max-multihits=",
593     "min-isoform-fraction=",
594     "min-anchor-length=",
595     "min-intron-length=",
596     "max-intron-length=",
597     "GTF=",
598 gpertea 135 "transcriptome-only",
599     "transcriptome-max-hits=",
600     "transcriptome-index=",
601 gpertea 29 "raw-juncs=",
602     "no-novel-juncs",
603     "no-novel-indels",
604     "no-gtf-juncs",
605 gpertea 135 # "skip-check-reads",
606 gpertea 29 "mate-inner-dist=",
607     "mate-std-dev=",
608     "no-closure-search",
609     "no-coverage-search",
610     "closure-search",
611     "coverage-search",
612 gpertea 135 "prefilter-multihits",
613 gpertea 29 "microexon-search",
614     "min-closure-exon=",
615     "min-closure-intron=",
616     "max-closure-intron=",
617     "min-coverage-intron=",
618     "max-coverage-intron=",
619     "min-segment-intron=",
620     "max-segment-intron=",
621     "seed-length=",
622     "initial-read-mismatches=",
623 gpertea 135 "read-mismatches=",
624     "max-mismatches=",
625     "transcriptome-mismatches=",
626 gpertea 29 "segment-length=",
627     "segment-mismatches=",
628     "bowtie-n",
629     "butterfly-search",
630     "no-butterfly-search",
631     "keep-tmp",
632     "rg-id=",
633     "rg-sample=",
634     "rg-library=",
635     "rg-description=",
636     "rg-platform-unit=",
637     "rg-center=",
638     "rg-date=",
639     "rg-platform=",
640     "tmp-dir=",
641     "zpacker=",
642     "unmapped-fifo",
643     "max-insertion-length=",
644     "max-deletion-length=",
645     "insertions=",
646     "deletions=",
647     "no-sort-bam",
648     "no-convert-bam"])
649     except getopt.error, msg:
650     raise Usage(msg)
651    
652     self.splice_constraints.parse_options(opts)
653     self.system_params.parse_options(opts)
654     self.read_params.parse_options(opts)
655     self.search_params.parse_options(opts)
656     self.report_params.parse_options(opts)
657     global use_BWT_FIFO
658     global use_BAM_Unmapped
659     if not self.read_params.color:
660     use_BWT_FIFO=False
661     use_BAM_Unmapped=True
662     global output_dir
663     global logging_dir
664     global tmp_dir
665     global sam_header
666    
667     custom_tmp_dir = None
668     custom_out_dir = None
669     # option processing
670     for option, value in opts:
671     if option in ("-v", "--version"):
672     print "TopHat v%s" % (get_version())
673     sys.exit(0)
674     if option in ("-h", "--help"):
675     raise Usage(use_message)
676     if option in ("-g", "--max-multihits"):
677     self.max_hits = int(value)
678 gpertea 135 if option in ("-x", "--transcriptome-max-hits"):
679     self.t_max_hits = int(value)
680 gpertea 29 if option in ("-G", "--GTF"):
681     self.gff_annotation = value
682 gpertea 135 if option in ("-T", "--transcriptome"):
683     self.transcriptome_only = True
684     if option in ("--transcriptome-index"):
685     self.transcriptome_index = value
686     if option in("-M", "--prefilter-multihits"):
687     self.prefilter_multi = True
688 gpertea 29 if option in ("-j", "--raw-juncs"):
689     self.raw_junctions = value
690     if option == "--no-novel-juncs":
691     self.find_novel_juncs = False
692     if option == "--no-novel-indels":
693     self.find_novel_indels = False
694     if option == "--no-gtf-juncs":
695     self.find_GFF_juncs = False
696 gpertea 135 #if option == "--skip-check-reads":
697     # self.skip_check_reads = True
698 gpertea 29 if option == "--no-coverage-search":
699     self.coverage_search = False
700 gpertea 135 if option == "--coverage-search":
701     self.coverage_search = True
702 gpertea 29 if option == "--no-closure-search":
703     self.closure_search = False
704     if option == "--closure-search":
705     self.closure_search = True
706     if option == "--microexon-search":
707     self.microexon_search = True
708     if option == "--butterfly-search":
709     self.butterfly_search = True
710     if option == "--no-butterfly-search":
711     self.butterfly_search = False
712 gpertea 135 if option in ("-N", "--initial-read-mismatches", "--read-mismatches", "--max-mismatches"):
713 gpertea 29 self.initial_read_mismatches = int(value)
714 gpertea 135 if option in ("-n", "--transcriptome-mismatches"):
715     self.t_mismatches = int(value)
716 gpertea 29 if option == "--segment-length":
717     self.segment_length = int(value)
718     if option == "--segment-mismatches":
719     self.segment_mismatches = int(value)
720     if option == "--bowtie-n":
721     self.bowtie_alignment_option = "-n"
722     if option == "--max-insertion-length":
723     self.max_insertion_length = int(value)
724     if option == "--max-deletion-length":
725     self.max_deletion_length = int(value)
726     if option == "--insertions":
727     self.raw_insertions = value
728     if option == "--deletions":
729     self.raw_deletions = value
730     if option in ("-o", "--output-dir"):
731     custom_out_dir = value + "/"
732     if option == "--tmp-dir":
733     custom_tmp_dir = value + "/"
734 gpertea 135 if self.transcriptome_only:
735     self.find_novel_juncs=False
736     self.find_novel_indels=False
737     if custom_out_dir:
738 gpertea 29 output_dir = custom_out_dir
739     logging_dir = output_dir + "logs/"
740     tmp_dir = output_dir + "tmp/"
741     sam_header = tmp_dir + "stub_header.sam"
742 gpertea 135 if custom_tmp_dir:
743 gpertea 29 tmp_dir = custom_tmp_dir
744     sam_header = tmp_dir + "stub_header.sam"
745     if len(args) < 2:
746     raise Usage(use_message)
747     return args
748    
749    
def getFileDir(filepath):
    """Return the directory portion of *filepath* including a trailing '/',
    or '' when the path has no directory component."""
    dirpart = os.path.split(filepath)[0]
    return dirpart + '/' if dirpart else dirpart
755    
def getFileBaseName(filepath):
    """Strip the directory and any recognized read-file extension
    (.fq/.txt/.seq/.bwtout/.fa*), also unwrapping one layer of
    compression suffix (.z/.gz*/.bz*); otherwise keep a sensible name."""
    fname = os.path.split(filepath)[1]
    fbase, fext = os.path.splitext(fname)

    def known_ext(ext):
        # extensions we treat as "read data" and therefore strip
        return ext in ['.fq', '.txt', '.seq', '.bwtout'] or ext.startswith('.fa')

    lx = fext.lower()
    if known_ext(lx) and fbase:
        return fbase
    if lx == '.z' or lx.startswith('.gz') or lx.startswith('.bz'):
        # compressed: peel the compression suffix, then retry the data suffix
        inner, inner_ext = os.path.splitext(fbase)
        if known_ext(inner_ext.lower()) and inner:
            return inner
        return fbase
    # unknown extension: keep the base only when it dominates the name
    return fbase if len(fbase) > len(fext) else fname
774    
775     # Returns the current time in a nice format
# Returns the current time in a nice format
def right_now():
    """Current local time rendered with the locale's date/time format (%c)."""
    return datetime.now().strftime("%c")
779    
# The TopHat logging formatter
def th_log(out_str):
    """Write a timestamped log line to stderr."""
    sys.stderr.write("[%s] %s\n" % (right_now(), out_str))
783    
784 gpertea 29 # Ensures that the output, logging, and temp directories are present. If not,
785     # they are created
786     def prepare_output_dir():
787    
788 gpertea 135 th_log("Preparing output location "+output_dir)
789 gpertea 29 if os.path.exists(output_dir):
790     pass
791     else:
792     os.mkdir(output_dir)
793    
794     if os.path.exists(logging_dir):
795     pass
796     else:
797     os.mkdir(logging_dir)
798    
799     if os.path.exists(tmp_dir):
800     pass
801     else:
802     try:
803     os.makedirs(tmp_dir)
804     except OSError, o:
805     die("\nError creating directory %s (%s)" % (tmp_dir, o))
806    
807    
# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    """Restore default SIGPIPE handling in the child process."""
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
814    
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    """Verify that all four Bowtie .ebwt index files exist for idx_prefix.

    Looks first at idx_prefix as given, then falls back to prepending the
    BOWTIE_INDEXES environment variable (Bowtie's own index search
    convention).  Calls die() if the index cannot be located either way.
    """
    th_log("Checking for Bowtie index files")

    def idx_files_exist(prefix):
        # a complete index consists of two forward and two reverse files;
        # this check was previously duplicated verbatim for both locations
        for sfx in (".1.ebwt", ".2.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"):
            if not os.path.exists(prefix + sfx):
                return False
        return True

    if idx_files_exist(idx_prefix):
        return
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var == None:
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
    # retry with the environment-supplied index directory prepended
    idx_prefix = bowtie_idx_env_var + idx_prefix
    if idx_files_exist(idx_prefix):
        return
    die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
847    
# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    """Rebuild the reference FASTA from the Bowtie index via bowtie-inspect.

    Writes the reconstituted FASTA to tmp_dir/<idx_name>.fa and returns
    that path.  Dies if bowtie-inspect fails or cannot be found.
    """
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        # bowtie-inspect prints the FASTA sequences to stdout
        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

        # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name
878    
# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    """Locate the reference FASTA for the index, rebuilding it if missing.

    Checks idx_prefix + '.fa' directly, then under BOWTIE_INDEXES; as a
    last resort reconstitutes the FASTA from the index and returns the
    resulting path.
    """
    th_log("Checking for reference FASTA file")
    fasta_path = idx_prefix + ".fa"
    if os.path.exists(fasta_path):
        return fasta_path
    env_idx_dir = os.environ.get("BOWTIE_INDEXES")
    if env_idx_dir:
        fasta_path = env_idx_dir + idx_prefix + ".fa"
        if os.path.exists(fasta_path):
            return fasta_path
    sys.stderr.write("\tWarning: Could not find FASTA file " + fasta_path + "\n")
    return bowtie_idx_to_fa(idx_prefix)
896    
# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    """Validate the Bowtie index and return (ref_fasta_file, None)."""
    check_bowtie_index(idx_prefix)
    return (check_fasta(idx_prefix), None)
903    
904     # Retrive a tuple containing the system's version of Bowtie. Parsed from
905     # `bowtie --version`
906     def get_bowtie_version():
907     try:
908     # Launch Bowtie to capture its version info
909     proc = subprocess.Popen([bowtie_path, "--version"],
910     stdout=subprocess.PIPE)
911     stdout_value = proc.communicate()[0]
912     bowtie_version = None
913     bowtie_out = repr(stdout_value)
914    
915     # Find the version identifier
916     version_str = "bowtie version "
917     ver_str_idx = bowtie_out.find(version_str)
918     if ver_str_idx != -1:
919     nl = bowtie_out.find("\\n", ver_str_idx)
920     version_val = bowtie_out[ver_str_idx + len(version_str):nl]
921     bowtie_version = [int(x) for x in version_val.split('.')]
922     if len(bowtie_version) == 3:
923     bowtie_version.append(0)
924    
925     return bowtie_version
926     except OSError, o:
927     errmsg=fail_str+str(o)+"\n"
928     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
929     errmsg+="Error: bowtie not found on this system"
930     die(errmsg)
931    
932     def get_index_sam_header(read_params, idx_prefix):
933     try:
934     bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
935     bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")
936    
937     bowtie_header_cmd = [bowtie_path, "--sam"]
938     if read_params.color:
939     bowtie_header_cmd.append('-C')
940     bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
941     subprocess.call(bowtie_header_cmd,
942     stdout=bowtie_sam_header_file,
943     stderr=open('/dev/null'))
944    
945     bowtie_sam_header_file.close()
946     bowtie_sam_header_file = open(bowtie_sam_header_filename,"r")
947    
948     sam_header_file = open(sam_header, "w")
949    
950     preamble = []
951     sq_dict_lines = []
952    
953     for line in bowtie_sam_header_file.readlines():
954     line = line.strip()
955     if line.find("@SQ") != -1:
956     # Sequence dictionary record
957     cols = line.split('\t')
958     seq_name = None
959     for col in cols:
960     fields = col.split(':')
961     #print fields
962     if len(fields) > 0 and fields[0] == "SN":
963     seq_name = fields[1]
964     if seq_name == None:
965     die("Error: malformed sequence dictionary in sam header")
966     sq_dict_lines.append([seq_name,line])
967     elif line.find("CL"):
968     continue
969     else:
970     preamble.append(line)
971     #for line in preamble:
972     # print >> sam_header_file, line
973     print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
974    
975     if read_params.read_group_id and read_params.sample_id:
976     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
977     read_params.sample_id)
978     if read_params.library_id:
979     rg_str += "\tLB:%s" % read_params.library_id
980     if read_params.description:
981     rg_str += "\tDS:%s" % read_params.description
982     if read_params.seq_platform_unit:
983     rg_str += "\tPU:%s" % read_params.seq_platform_unit
984     if read_params.seq_center:
985     rg_str += "\tCN:%s" % read_params.seq_center
986     if read_params.mate_inner_dist:
987     rg_str += "\tPI:%s" % read_params.mate_inner_dist
988     if read_params.seq_run_date:
989     rg_str += "\tDT:%s" % read_params.seq_run_date
990     if read_params.seq_platform:
991     rg_str += "\tPL:%s" % read_params.seq_platform
992    
993     print >> sam_header_file, rg_str
994    
995     sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
996     for [name, line] in sq_dict_lines:
997     print >> sam_header_file, line
998     print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
999    
1000     sam_header_file.close()
1001    
1002     return sam_header
1003    
1004     except OSError, o:
1005     errmsg=fail_str+str(o)+"\n"
1006     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1007     errmsg+="Error: bowtie not found on this system"
1008     die(errmsg)
1009    
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    """Locate the bowtie binary, require version >= 0.12.3, and report it.

    Sets the global bowtie_path; dies if bowtie is missing or too old.
    """
    th_log("Checking for Bowtie")
    global bowtie_path
    bowtie_path=prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    # BUGFIX: compare the version lexicographically; the old per-component
    # test (v[0]<1 and v[1]<12 and v[2]<3) accepted too-old releases such
    # as 0.11.5 and 0.12.2
    elif bowtie_version[:3] < [0, 12, 3]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1021    
1022    
# Retrive a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    """Return (version_string, [major, minor, patch, letter_or_0]) for the
    installed samtools, parsed from its usage banner.

    Dies if the samtools binary cannot be executed.
    NOTE(review): the second '.' in the version regex is unescaped, and if
    no "Version:" line is present the re.search() result is None and the
    group() call raises an uncaught AttributeError -- both only matter for
    malformed samtools builds; confirm before relying on this elsewhere.
    """
    try:
        # samtools prints its usage/version banner on stderr when run bare
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier, e.g. "Version: 0.1.18 (r982)"
        version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            # optional trailing letter (e.g. "0.1.5c") kept as a string
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: samtools not found on this system"
        die(errmsg)
1045    
# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate the samtools binary, require version >= 0.1.7, and report it.

    Sets the global samtools_path; dies if samtools is missing or too old.
    """
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # BUGFIX: compare the version lexicographically; the old test
    # (arr[1] < 1 or arr[2] < 7) wrongly rejected newer releases such as
    # 0.2.x or 1.0.x
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1057    
1058    
1059    
class FastxReader:
    """Sequential reader for FASTA/FASTQ records with SOLiD color support.

    The format is auto-detected from the first record marker ('@' => FASTQ,
    '>' => FASTA) after skipping up to 10 custom header lines.  After
    construction, self.nextRecord is bound to the matching parser and each
    call returns a tuple (seq_id, sequence, seq_len, quality_or_None).
    """
    def __init__(self, i_file, is_color=0, fname=''):
        # i_file   : an open, readable file object (possibly a pipe)
        # is_color : non-zero for SOLiD colorspace reads (primer handling)
        # fname    : original file name, used only in error messages
        self.bufline=None        # one-line pushback buffer (see ungetLine)
        self.format=None         # 'fasta' or 'fastq' once detected
        self.ifile=i_file
        self.nextRecord=None     # bound to nextFastq or nextFasta below
        self.eof=None
        self.fname=fname
        self.lastline=None
        self.numrecords=0        # records returned so far
        self.isColor=0
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        while hlines>0 and self.lastline[0] not in "@>" :
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        # push the record-start line back so the first nextRecord() sees it
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #  raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                   % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
                #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
                #loop until qv has the same length as seq
            if self.isColor:
                # and qstrlen==seq_len :
                if qstrlen==seq_len:
                    #qual string may have a dummy qv at the beginning, should be stripped
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen!=seq_len-1:
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen :
                    raise ValueError("Length mismatch between sequence and quality strings "+ \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        ##--discard the primer base [NO]
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            #more sequence lines, or the ">" quality marker line:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor : # -- discard primer base
            seq_len-=1
            seqstr=seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        # Return the pushed-back line first, if any; otherwise read from
        # the stream, recording EOF and remembering the line for ungetLine.
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline
    def ungetLine(self):
        # Push the most recently read line back so the next getLine()
        # returns it again (single-line lookahead used by nextFasta).
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader
1210    
def fa_write(fhandle, seq_id, seq):
    """
    Write one sequence to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write

    The sequence is wrapped at 60 characters per line.  (BUGFIX: the old
    index arithmetic emitted a spurious empty line whenever len(seq) was
    an exact multiple of the line width, including empty sequences.)
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    for start in range(0, len(seq), line_len):
        fhandle.write(seq[start:start + line_len] + "\n")
1226    
class ZReader:
    """Open a (possibly compressed) reads file for reading.

    BAM input is converted through bam2fastx; .z/.gz/.bz2-style files are
    piped through the appropriate decompressor (optionally the configured
    parallel zipper).  self.file is the readable stream for clients.
    """
    def __init__(self, filename, sysparams, guess=True):
        # sysparams : system parameters object providing .zipper and
        #             .zipper_opts (the configured [parallel] compressor)
        # guess     : when True, pick the decompressor from the file
        #             extension; when False, only honor the zipper's own
        #             '.z' suffix (if use_zpacker is enabled)
        self.fname=filename
        self.file=None       # the stream clients read from
        self.fsrc=None       # underlying raw file when a pipe is used
        self.popen=None      # decompressor subprocess, if any
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
            # BAM input: convert to FASTA/FASTQ on the fly
            pipecmd=["bam2fastx", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                if len(pipecmd)>0:
                    # substitute the parallel zipper when it matches the format
                    if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but must still check if it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd=[sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                # '-cd' = decompress to stdout (bam2fastx does not take it)
                pipecmd+=['-cd']
        if pipecmd:
            try:
                self.fsrc=open(self.fname, 'rb')
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=self.fsrc,
                                            stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdout
        else:
            # plain text file, no decompression needed
            self.file=open(filename)
    def close(self):
        # Close both ends and reap the decompressor process, if any.
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen=None
1277    
class ZWriter:
    """Writer that transparently pipes output through the configured
    compressor when use_zpacker is set; otherwise writes plain text.
    Clients write to self.file and must call close() when done.
    """
    def __init__(self, filename, sysparams):
        # sysparams provides .zipper (the compressor executable path)
        self.fname = filename
        if not use_zpacker:
            # no compression: plain text file, no child process involved
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None
            return
        zip_cmd = [sysparams.zipper, "-cf", "-"]
        self.ftarget = open(filename, "wb")
        try:
            self.popen = subprocess.Popen(zip_cmd,
                                          preexec_fn=subprocess_setup,
                                          stdin=subprocess.PIPE,
                                          stdout=self.ftarget, close_fds=True)
        except Exception:
            die("Error: could not open writer pipe "+' '.join(zip_cmd)+' < '+ self.fname)
        # clients write to the compressor's stdin
        self.file = self.popen.stdin
    def close(self):
        # Close the client stream, then the target file, then reap the
        # compressor -- wait() is required to actually flush the pipes.
        self.file.close()
        if self.ftarget:
            self.ftarget.close()
        if self.popen:
            self.popen.wait()
            self.popen = None
1302    
# check_reads_format() examines the first few records in the user files
# to determines the file format
def check_reads_format(params, reads_files):
    """Sample the first reads of each input file to determine the format.

    Verifies all files share one format (FASTA or FASTQ), warns about very
    short reads, reports the detected quality scale, and returns a new
    TopHatParams.ReadParams with reads_format filled in (all other fields
    copied unchanged from params.read_params).
    """
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #   die("Error: could not open file "+f_name)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    #if seed_len != None:
    #   seed_len = max(seed_len, max_seed_len)
    #else:
    #   seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        # report which quality encoding the user options imply
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            # colorspace FASTA carries integer quality values
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   #seed_len,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)
1375    
def log_tail(logfile, lines=1):
    """Return (roughly) the last `lines` lines of `logfile`.

    Scans backwards from EOF in 1KB blocks, counting newlines, so large
    log files are never fully loaded; used to show the tail of a tool's
    log in error messages.
    NOTE(review): readlines() keeps each line's trailing newline, so the
    final '\\n'.join() yields blank separator lines -- callers only use
    this for human-readable error output.
    """
    f=open(logfile, "r")
    f.seek(0, 2)           # jump to end of file
    fbytes= f.tell()       # total file size in bytes
    size=lines             # newlines still to be found
    block=-1               # current block offset, in KB back from EOF
    while size > 0 and fbytes+block*1024 > 0:
        if (fbytes+block*1024 > 0):
            ##Seek back once more, if possible
            f.seek( block*1024, 2 )
        else:
            #Seek to the beginning
            f.seek(0, 0)
        data= f.read( 1024 )
        linesFound= data.count('\n')
        size -= linesFound
        block -= 1
    # position at the block that contains all the requested lines
    if (fbytes + block*1024 > 0):
        f.seek(block*1024, 2)
    else:
        f.seek(0,0)
    f.readline() # find a newline (skip the partial first line)
    lastBlocks= list( f.readlines() )
    f.close()
    return "\n".join(lastBlocks[-lines:])
1401    
# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    """Render a timedelta's seconds component as HH:MM:SS (days ignored)."""
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
1409    
1410     class PrepReadsInfo:
1411     def __init__(self, fname, side):
1412     try:
1413     f=open(fname,"r")
1414     self.min_len=int(f.readline().split("=")[-1])
1415     self.max_len=int(f.readline().split("=")[-1])
1416     self.in_count=int(f.readline().split("=")[-1])
1417     self.out_count=int(f.readline().split("=")[-1])
1418     if (self.out_count==0) or (self.max_len<16):
1419     raise Exception()
1420     except Exception, e:
1421     die(fail_str+"Error retrieving prep_reads info.")
1422     print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1423     if self.min_len < 20:
1424     print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1425    
1426 gpertea 135
def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, filter_reads=None, hits_to_filter=None):
    """Assemble the prep_reads command line as an argument list.

    Starts from the prep_reads executable plus the global params.cmd()
    options, adds format and optional filter/aux flags, then the reads
    list and (optionally) the quality-file list.
    """
    cmd = [prog_path("prep_reads")]
    cmd.extend(params.cmd())
    fmt = params.read_params.reads_format
    if fmt == "fastq":
        cmd.append("--fastq")
    elif fmt == "fasta":
        cmd.append("--fasta")
    if hits_to_filter:
        cmd.append("--flt-hits="+hits_to_filter)
    if aux_file:
        cmd.append("--aux-outfile="+aux_file)
    if filter_reads:
        cmd.append("--flt-reads="+filter_reads)
    cmd.append(reads_list)
    if quals_list:
        cmd.append(quals_list)
    return cmd
1446    
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexy or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
    """Run prep_reads on reads_list/quals_list and collect the kept reads.

    Writes the filtered reads to tmp_dir/<side>_kept_reads.fq (with a
    '.z' suffix when the external zipper pipeline is enabled) and a
    summary to output_dir/<side>_kept_reads.info.  Returns the tuple
    (kept_reads_filename, PrepReadsInfo).  Dies on any subprocess error,
    appending the tail of the prep_reads log.
    """
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    output_name=side+"_kept_reads"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    info_file=output_dir+output_name+".info"
    filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, prefilter_reads)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        # the shell-equivalent command is only logged; the real pipeline
        # is built from subprocess objects below
        print >> run_log, shell_cmd
        if use_zpacker:
            # prep_reads | zipper > kept_reads
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)
1504    
1505 gpertea 135
1506 gpertea 29 # Call bowtie
1507     def bowtie(params,
1508     bwt_idx_prefix,
1509     reads_list,
1510     reads_format,
1511     num_mismatches,
1512     mapped_reads,
1513 gpertea 135 unmapped_reads,
1514     extra_output = "",
1515     t_mapping = False,
1516     multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1517 gpertea 29 start_time = datetime.now()
1518     bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1519 gpertea 135 reads_file=reads_list[0]
1520     readfile_basename=getFileBaseName(reads_file)
1521     if t_mapping:
1522     print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with Bowtie %s" % (start_time.strftime("%c"),
1523 gpertea 29 readfile_basename, bwt_idx_name, extra_output)
1524 gpertea 135 else:
1525     print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
1526     readfile_basename, bwt_idx_name, extra_output)
1527 gpertea 29 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1528 gpertea 135 #bwt_mapped=mapped_reads
1529 gpertea 67 unmapped_reads_out=None
1530     if unmapped_reads:
1531     unmapped_reads_out=unmapped_reads+".fq"
1532 gpertea 29 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1533 gpertea 135 if use_FIFO:
1534 gpertea 29 global unmapped_reads_fifo
1535 gpertea 67 unmapped_reads_fifo=unmapped_reads+".fifo"
1536 gpertea 29 if os.path.exists(unmapped_reads_fifo):
1537     os.remove(unmapped_reads_fifo)
1538     try:
1539     os.mkfifo(unmapped_reads_fifo)
1540     except OSError, o:
1541     die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1542 gpertea 67 unmapped_reads_out+=".z"
1543 gpertea 29
1544     # Launch Bowtie
1545     try:
1546     bowtie_cmd = [bowtie_path]
1547    
1548     if reads_format == "fastq":
1549     bowtie_cmd += ["-q"]
1550     elif reads_format == "fasta":
1551     bowtie_cmd += ["-f"]
1552    
1553     if params.read_params.color:
1554     bowtie_cmd += ["-C", "--col-keepends"]
1555     if unmapped_reads:
1556 gpertea 135 bowtie_cmd += ["--un", unmapped_reads_out]
1557    
1558     unzip_cmd=None
1559     bam_input=False
1560     if reads_list.endswith('.bam'):
1561     bam_input=True
1562     unzip_cmd=[ prog_path('bam2fastx') ]
1563     if reads_format:
1564     unzip_cmd.append("--"+reads_format)
1565     unzip_cmd+=[reads_list]
1566    
1567     if use_zpacker and (unzip_cmd is None):
1568 gpertea 29 unzip_cmd=[ params.system_params.zipper ]
1569     unzip_cmd.extend(params.system_params.zipper_opts)
1570    
1571     fifo_pid=None
1572     if use_FIFO:
1573 gpertea 135 unm_zipcmd=[ params.system_params.zipper ]
1574     unm_zipcmd.extend(params.system_params.zipper_opts)
1575     unm_zipcmd+=['-c']
1576     print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1577     fifo_pid=os.fork()
1578     if fifo_pid==0:
1579     def on_sig_exit(sig, func=None):
1580 gpertea 29 os._exit(os.EX_OK)
1581 gpertea 135 signal.signal(signal.SIGTERM, on_sig_exit)
1582     subprocess.call(unm_zipcmd,
1583     stdin=open(unmapped_reads_fifo, "r"),
1584     stdout=open(unmapped_reads_out, "wb"))
1585     os._exit(os.EX_OK)
1586 gpertea 29
1587     fix_map_cmd = [prog_path('fix_map_ordering')]
1588     sam_zipcmd=None
1589     if unmapped_reads:
1590     #mapping on the genome
1591     #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1592     mapped_reads += ".bam"
1593     if params.read_params.color:
1594     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
1595     else:
1596 gpertea 67 unmapped_reads_out = unmapped_reads+".bam"
1597     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads_out]
1598 gpertea 29 else:
1599     #mapping on segment_juncs (spliced potential junctions)
1600 gpertea 135 # don't use SAM headers, could be slow -- better use compressed SAM directly
1601 gpertea 29 mapped_reads += ".sam"
1602     fix_map_cmd += ["-"]
1603     if use_zpacker:
1604     fix_map_cmd += ["-"]
1605     sam_zipcmd=[ params.system_params.zipper ]
1606     sam_zipcmd.extend(params.system_params.zipper_opts)
1607     sam_zipcmd += ["-c"]
1608     mapped_reads += ".z"
1609     else:
1610     fix_map_cmd += [mapped_reads]
1611 gpertea 135
1612     max_hits = params.max_hits
1613     if t_mapping:
1614     max_hits = params.t_max_hits
1615     if params.bowtie_alignment_option == "-n" and num_mismatches>3:
1616     num_mismatches=3
1617     bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1618     "-p", str(params.system_params.num_cpus),
1619     "-k", str(max_hits),
1620     "-m", str(max_hits),
1621     "-S", "--sam-nohead"] #always use headerless SAM file
1622 gpertea 29
1623 gpertea 135 if multihits_out:
1624     bowtie_cmd += ["--max", multihits_out]
1625     else:
1626     bowtie_cmd += ["--max", "/dev/null"]
1627     bowtie_cmd += [ bwt_idx_prefix ]
1628 gpertea 29 bowtie_proc=None
1629     shellcmd=""
1630     unzip_proc=None
1631 gpertea 135 samzip_proc=None
1632     if multihits_out:
1633     #special prefilter bowtie run: we use prep_reads on the fly
1634     #in order to get multi-mapped reads to exclude later
1635     prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
1636     preplog_fname=logging_dir + "prep_reads_prefilter.log"
1637     prepfilter_log = open(preplog_fname,"w")
1638     unzip_proc = subprocess.Popen(prep_cmd,
1639     stdout=subprocess.PIPE,
1640     stderr=prepfilter_log)
1641     shellcmd=' '.join(prep_cmd) + "|"
1642     else:
1643     z_input=use_zpacker and reads_file.endswith(".z")
1644     if z_input:
1645     unzip_proc = subprocess.Popen(unzip_cmd,
1646     stdin=open(reads_file, "rb"),
1647     stdout=subprocess.PIPE)
1648     shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
1649 gpertea 29 else:
1650 gpertea 135 #must be uncompressed fastq input (unmapped reads from a previous run)
1651     #or a BAM file with unmapped reads
1652     if bam_input:
1653     unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
1654     shellcmd=' '.join(unzip_cmd) + "|"
1655     bowtie_cmd += [reads_file]
1656     bowtie_proc = subprocess.Popen(bowtie_cmd,
1657     stdout=subprocess.PIPE,
1658     stderr=bwt_log)
1659     if unzip_proc:
1660     #input is compressed OR prep_reads is used as a filter
1661     bowtie_cmd += ['-']
1662     bowtie_proc = subprocess.Popen(bowtie_cmd,
1663     stdin=unzip_proc.stdout,
1664     stdout=subprocess.PIPE,
1665     stderr=bwt_log)
1666     unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
1667    
1668     shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
1669 gpertea 29 if sam_zipcmd:
1670 gpertea 135 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
1671     fix_order_proc = subprocess.Popen(fix_map_cmd,
1672     stdin=bowtie_proc.stdout,
1673     stdout=subprocess.PIPE)
1674     bowtie_proc.stdout.close() # needed?
1675     samzip_proc = subprocess.Popen(samzip_cmd,
1676     preexec_fn=subprocess_setup,
1677     stdin=fix_order_proc.stdout,
1678     stdout=open(mapped_reads, "wb"))
1679     fix_order_proc.stdout.close()
1680     else:
1681     #write BAM output directly
1682     fix_order_proc = subprocess.Popen(fix_map_cmd,
1683     stdin=bowtie_proc.stdout)
1684     # stdout=open(mapped_reads, "w"))
1685     bowtie_proc.stdout.close()
1686    
1687     # shellcmd += " > " + mapped_reads
1688 gpertea 29 print >> run_log, shellcmd
1689 gpertea 135 if samzip_proc:
1690     samzip_proc.communicate()
1691 gpertea 29 else:
1692     fix_order_proc.communicate()
1693     if use_FIFO:
1694 gpertea 67 if fifo_pid and not os.path.exists(unmapped_reads_out):
1695 gpertea 29 try:
1696     os.kill(fifo_pid, signal.SIGTERM)
1697     except:
1698     pass
1699     except OSError, o:
1700     die(fail_str+"Error: "+str(o))
1701    
1702     # Success
1703     #finish_time = datetime.now()
1704     #duration = finish_time - start_time
1705     #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
1706     if use_FIFO:
1707     try:
1708     os.remove(unmapped_reads_fifo)
1709     except:
1710     pass
1711 gpertea 135 if multihits_out and not os.path.exists(multihits_out):
1712     open(multihits_out, "w").close()
1713 gpertea 67 return (mapped_reads, unmapped_reads_out)
1714 gpertea 29
def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_file,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):
    """Map one segment-read file with Bowtie in end-to-end (-v) mode.

    Temporarily forces the -v alignment option and doubles max_hits,
    delegates the actual mapping to bowtie(), then restores both
    settings before returning bowtie()'s result.
    """
    saved_align_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    mapping_result = bowtie(params,
                            bwt_idx_prefix,
                            [reads_file],
                            reads_format,
                            params.segment_mismatches,
                            mapped_reads,
                            unmapped_reads,
                            extra_output)

    # undo the temporary parameter tweaks
    params.max_hits /= 2
    params.bowtie_alignment_option = saved_align_option
    return mapping_result
1739    
1740     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1741     def get_gtf_juncs(gff_annotation):
1742 gpertea 135 th_log("Reading known junctions from GTF file")
1743 gpertea 29 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1744    
1745     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1746    
1747     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1748     gtf_juncs_out = open(gtf_juncs_out_name, "w")
1749    
1750     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1751     try:
1752 gpertea 135 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
1753 gpertea 29 retcode = subprocess.call(gtf_juncs_cmd,
1754     stderr=gtf_juncs_log,
1755     stdout=gtf_juncs_out)
1756     # cvg_islands returned an error
1757     if retcode == 1:
1758     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1759     return (False, gtf_juncs_out_name)
1760     elif retcode != 0:
1761     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1762     # cvg_islands not found
1763     except OSError, o:
1764     errmsg=fail_str+str(o)+"\n"
1765     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1766     errmsg+="Error: gtf_juncs not found on this system"
1767     die(errmsg)
1768     return (True, gtf_juncs_out_name)
1769    
1770     # Call bowtie-build on the FASTA file of sythetic splice junction sequences
1771     def build_juncs_bwt_index(external_splice_prefix, color):
1772 gpertea 135 th_log("Indexing splices")
1773 gpertea 29 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1774    
1775     #user_splices_out_prefix = output_dir + "user_splices_idx"
1776    
1777     bowtie_build_cmd = [prog_path("bowtie-build")]
1778     if color:
1779     bowtie_build_cmd += ["-C"]
1780    
1781     bowtie_build_cmd += [external_splice_prefix + ".fa",
1782     external_splice_prefix]
1783     try:
1784     print >> run_log, " ".join(bowtie_build_cmd)
1785     retcode = subprocess.call(bowtie_build_cmd,
1786     stdout=bowtie_build_log)
1787    
1788     if retcode != 0:
1789     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1790     except OSError, o:
1791     errmsg=fail_str+str(o)+"\n"
1792     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1793     errmsg+="Error: bowtie-build not found on this system"
1794     die(errmsg)
1795     return external_splice_prefix
1796    
1797     # Build a splice index from a .juncs file, suitable for use with specified read
1798     # (or read segment) lengths
1799     def build_juncs_index(min_anchor_length,
1800 gpertea 135 max_seg_len,
1801 gpertea 29 juncs_prefix,
1802     external_juncs,
1803     external_insertions,
1804     external_deletions,
1805     reference_fasta,
1806     color):
1807 gpertea 135 th_log("Retrieving sequences for splices")
1808 gpertea 29
1809     juncs_file_list = ",".join(external_juncs)
1810     insertions_file_list = ",".join(external_insertions)
1811     deletions_file_list = ",".join(external_deletions)
1812    
1813    
1814     juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1815    
1816     external_splices_out_prefix = tmp_dir + juncs_prefix
1817     external_splices_out_name = external_splices_out_prefix + ".fa"
1818    
1819     external_splices_out = open(external_splices_out_name, "w")
1820     # juncs_db_cmd = [bin_dir + "juncs_db",
1821     juncs_db_cmd = [prog_path("juncs_db"),
1822     str(min_anchor_length),
1823 gpertea 135 str(max_seg_len),
1824 gpertea 29 juncs_file_list,
1825     insertions_file_list,
1826     deletions_file_list,
1827     reference_fasta]
1828     try:
1829     print >> run_log, " ".join(juncs_db_cmd)
1830     retcode = subprocess.call(juncs_db_cmd,
1831     stderr=juncs_db_log,
1832     stdout=external_splices_out)
1833    
1834     if retcode != 0:
1835     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1836     # juncs_db not found
1837     except OSError, o:
1838     errmsg=fail_str+str(o)+"\n"
1839     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1840     errmsg+="Error: juncs_db not found on this system"
1841     die(errmsg)
1842    
1843     external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1844     return external_splices_out_prefix
1845    
1846 gpertea 135 def build_idx_from_fa(fasta_fname, out_dir):
1847     """ Build a bowtie index from a FASTA file.
1848    
1849     Arguments:
1850     - `fasta_fname`: File path to FASTA file.
1851     - `out_dir`: Output directory to place index in. (includes os.sep)
1852    
1853     Returns:
1854     - The path to the Bowtie index.
1855     """
1856     bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
1857     bowtie_idx_cmd = [prog_path("bowtie-build"),
1858     "-f",
1859     fasta_fname,
1860     bwt_idx_path]
1861     try:
1862     th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
1863     print >> run_log, " ".join(bowtie_idx_cmd)
1864     retcode = subprocess.call(bowtie_idx_cmd,
1865     stdout=open(os.devnull, "w"),
1866     stderr=open(os.devnull, "w"))
1867     if retcode != 0:
1868     die(fail_str + "Error: Couldn't build bowtie index with err = "
1869     + str(retcode))
1870     except OSError, o:
1871     errmsg=fail_str+str(o)+"\n"
1872     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1873     errmsg+="Error: bowtie-build not found on this system"
1874     die(errmsg)
1875    
1876     return bwt_idx_path
1877    
1878 gpertea 29 # Print out the sam header, embedding the user's specified library properties.
1879     # FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    """Write @HD, optional @RG and @PG header lines to sam_file,
    embedding the user's library properties from read_params."""
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    # the read-group line is emitted only when both ID and SM are present
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        # optional RG tags, appended only when the user supplied a value
        optional_tags = [("\tLB:%s", read_params.library_id),
                         ("\tDS:%s", read_params.description),
                         ("\tPU:%s", read_params.seq_platform_unit),
                         ("\tCN:%s", read_params.seq_center),
                         ("\tPI:%s", read_params.mate_inner_dist),
                         ("\tDT:%s", read_params.seq_run_date),
                         ("\tPL:%s", read_params.seq_platform)]
        for tag_fmt, tag_value in optional_tags:
            if tag_value:
                rg_str += tag_fmt % tag_value
        print >> sam_file, rg_str
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1902    
1903     # Write final TopHat output, via tophat_reports and wiggles
1904 gpertea 135 def compile_reports(params, sam_header_filename, mappings, readfiles, gff_annotation):
1905     th_log("Reporting output tracks")
1906     left_maps, right_maps = mappings
1907     left_reads, right_reads = readfiles
1908 gpertea 29 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1909     left_maps = ','.join(left_maps)
1910    
1911     if len(right_maps) > 0:
1912     right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1913     right_maps = ','.join(right_maps)
1914     log_fname = logging_dir + "reports.log"
1915     report_log = open(log_fname, "w")
1916     junctions = output_dir + "junctions.bed"
1917     insertions = output_dir + "insertions.bed"
1918     deletions = output_dir + "deletions.bed"
1919     coverage = "coverage.wig"
1920     accepted_hits = output_dir + "accepted_hits"
1921     report_cmdpath = prog_path("tophat_reports")
1922     report_cmd = [report_cmdpath]
1923     report_cmd.extend(params.cmd())
1924     report_cmd.extend(["--samtools="+samtools_path])
1925     report_cmd.extend([junctions,
1926     insertions,
1927     deletions,
1928     "-",
1929     left_maps,
1930     left_reads])
1931 gpertea 135 if len(right_maps) > 0 and right_reads:
1932 gpertea 29 report_cmd.append(right_maps)
1933     report_cmd.append(right_reads)
1934     # -- tophat_reports now produces (uncompressed) BAM stream,
1935     # directly piped into samtools sort
1936     try:
1937     if params.report_params.convert_bam:
1938     if params.report_params.sort_bam:
1939     report_proc=subprocess.Popen(report_cmd,
1940     preexec_fn=subprocess_setup,
1941     stdout=subprocess.PIPE,
1942     stderr=report_log)
1943    
1944 gpertea 135 bamsort_cmd = [samtools_path, "sort", "-", accepted_hits]
1945 gpertea 29 bamsort_proc= subprocess.Popen(bamsort_cmd,
1946     preexec_fn=subprocess_setup,
1947     stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1948     stdin=report_proc.stdout)
1949     report_proc.stdout.close()
1950     print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1951     bamsort_proc.communicate()
1952     retcode = report_proc.poll()
1953     if retcode:
1954     die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1955     else:
1956     print >> run_log, " ".join(report_cmd)
1957     report_proc=subprocess.call(report_cmd,
1958     preexec_fn=subprocess_setup,
1959     stdout=open(accepted_hits,"wb"),
1960     stderr=report_log)
1961     os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1962     else:
1963     print >> run_log, " ".join(report_cmd)
1964     report_proc=subprocess.call(report_cmd,
1965     preexec_fn=subprocess_setup,
1966     stdout=open(accepted_hits,"wb"),
1967     stderr=report_log)
1968     tmp_sam = output_dir + "accepted_hits.sam"
1969    
1970 gpertea 135 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
1971 gpertea 29 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1972     bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1973     tmp_sam_file = open(tmp_sam, "w")
1974     ret = subprocess.call(bam_to_sam_cmd,
1975     stdout=tmp_sam_file,
1976     stderr=bam_to_sam_log)
1977     tmp_sam_file.close()
1978     os.remove(accepted_hits)
1979     #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1980     #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1981    
1982     except OSError, o:
1983     die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1984    
1985     # FIXME: put wiggles back!
1986     # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1987     # print >> run_log, " ".join(wig_cmd)
1988     # subprocess.call(wig_cmd,
1989     # stderr=open("/dev/null"))
1990     return (coverage, junctions)
1991    
1992    
1993     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1994     # for each segment This function needs to be fixed to support mixed read length
1995     # inputs
1996    
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    """Append ZWriters for segment files numbered num_files_prev+1 .. num_files
    (named <prefix>_seg<i><extension>) to the out_segf list."""
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        segfname = prefix + ("_seg%d" % seg_idx) + extension
        out_segf.append(ZWriter(segfname, params.system_params))
2003    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split each read in a FASTA/FASTQ file into multiple segment files.

    Segment i of every read is written to <prefix>_seg<i> (.fa or .fq,
    ".z"-compressed when use_zpacker is set).  Segment output files are
    opened lazily as longer reads are encountered.  Returns the list of
    segment file names.

    NOTE(review): as flagged in the file header, mixed read lengths are
    not fully supported by this splitting scheme.
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"
    def convert_color_to_bp(color_seq):
        # decode a colorspace string (primer base + color digits) to basepairs
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # encode a basepair string as colorspace digits (inverse of the above)
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # write one read's segments; offsets[i]..offsets[i+1] delimit segment i
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # skip reads with an ambiguous color at any segment boundary
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            # name carries "|offset:segment_index:total_segments" for re-joining later
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # prepend the decoded primer base for this segment
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # simple state machine over the 2-line (FASTA) / 4-line (FASTQ) records
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            # Python 2 integer division: number of full segments
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= 20:
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # fold the remainder into the final segment
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # lazily open output files for newly seen segment counts
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            # the FASTQ "+" separator line; nothing to keep
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
2136    
2137     # Find possible splice junctions using the "closure search" strategy, and report
2138     # them in closures.juncs. Calls the executable closure_juncs
2139     def junctions_from_closures(params,
2140     left_maps,
2141     right_maps,
2142     ref_fasta):
2143 gpertea 135 th_log("Searching for junctions via mate-pair closures")
2144 gpertea 29
2145     #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2146     #if len(maps) == 0:
2147     # return None
2148     #print >> sys.stderr, left_maps
2149     slash = left_maps[0].rfind('/')
2150     juncs_out = ""
2151     if slash != -1:
2152     juncs_out += left_maps[0][:slash+1]
2153     juncs_out += "closure.juncs"
2154    
2155     juncs_log = open(logging_dir + "closure.log", "w")
2156     juncs_cmdpath=prog_path("closure_juncs")
2157     juncs_cmd = [juncs_cmdpath]
2158    
2159     left_maps = ','.join(left_maps)
2160     right_maps = ','.join(right_maps)
2161    
2162     juncs_cmd.extend(params.cmd())
2163     juncs_cmd.extend([juncs_out,
2164     ref_fasta,
2165     left_maps,
2166     right_maps])
2167     try:
2168     print >> run_log, ' '.join(juncs_cmd)
2169     retcode = subprocess.call(juncs_cmd,
2170     stderr=juncs_log)
2171    
2172     # spanning_reads returned an error
2173     if retcode != 0:
2174     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2175     # cvg_islands not found
2176     except OSError, o:
2177     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2178     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2179     die(str(o))
2180     return [juncs_out]
2181    
2182     # Find possible junctions by examining coverage and split segments in the initial
2183     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2184     # segment.insertions, and segment.deletions. Calls the executable
2185     # segment_juncs
2186     def junctions_from_segments(params,
2187     left_reads,
2188     left_reads_map,
2189     left_seg_maps,
2190     right_reads,
2191     right_reads_map,
2192     right_seg_maps,
2193     unmapped_reads,
2194     reads_format,
2195     ref_fasta):
2196     if left_reads_map != left_seg_maps[0]:
2197 gpertea 135 th_log("Searching for junctions via segment mapping")
2198 gpertea 29 out_path=getFileDir(left_seg_maps[0])
2199     juncs_out=out_path+"segment.juncs"
2200     insertions_out=out_path+"segment.insertions"
2201     deletions_out =out_path+"segment.deletions"
2202    
2203     left_maps = ','.join(left_seg_maps)
2204 gpertea 135 log_fname = logging_dir + "segment_juncs.log"
2205     segj_log = open(log_fname, "w")
2206     segj_cmd = [prog_path("segment_juncs")]
2207 gpertea 29
2208 gpertea 135 segj_cmd.extend(params.cmd())
2209 gpertea 29
2210 gpertea 135 segj_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2211 gpertea 29 ref_fasta,
2212     juncs_out,
2213     insertions_out,
2214     deletions_out,
2215     left_reads,
2216     left_reads_map,
2217     left_maps])
2218 gpertea 135 if right_seg_maps:
2219 gpertea 29 right_maps = ','.join(right_seg_maps)
2220 gpertea 135 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2221 gpertea 29 try:
2222 gpertea 135 print >> run_log, " ".join(segj_cmd)
2223     retcode = subprocess.call(segj_cmd,
2224 gpertea 29 preexec_fn=subprocess_setup,
2225 gpertea 135 stderr=segj_log)
2226 gpertea 29
2227     # spanning_reads returned an error
2228     if retcode != 0:
2229 gpertea 135 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2230 gpertea 29 # cvg_islands not found
2231     except OSError, o:
2232     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2233     print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2234     die(str(o))
2235    
2236     return [juncs_out, insertions_out, deletions_out]
2237    
2238     # Joins mapped segments into full-length read alignments via the executable
2239     # long_spanning_reads
2240     def join_mapped_segments(params,
2241     sam_header_filename,
2242     reads,
2243     ref_fasta,
2244     possible_juncs,
2245     possible_insertions,
2246     possible_deletions,
2247     contig_seg_maps,
2248     spliced_seg_maps,
2249     alignments_out_name):
2250 gpertea 135 rn=""
2251 gpertea 29 if len(contig_seg_maps)>1:
2252 gpertea 135 th_log("Joining segment hits")
2253     rn=".segs"
2254 gpertea 29 else:
2255 gpertea 135 th_log("Processing bowtie hits")
2256 gpertea 29 contig_seg_maps = ','.join(contig_seg_maps)
2257    
2258     possible_juncs = ','.join(possible_juncs)
2259     possible_insertions = ",".join(possible_insertions)
2260     possible_deletions = ",".join(possible_deletions)
2261 gpertea 135 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2262     align_log = open(log_fname, "w")
2263 gpertea 29 align_cmd = [prog_path("long_spanning_reads")]
2264    
2265     align_cmd.extend(params.cmd())
2266    
2267     align_cmd.append(ref_fasta)
2268     align_cmd.extend([ reads,
2269     possible_juncs,
2270     possible_insertions,
2271     possible_deletions,
2272     contig_seg_maps])
2273    
2274 gpertea 135 if spliced_seg_maps:
2275 gpertea 29 spliced_seg_maps = ','.join(spliced_seg_maps)
2276     align_cmd.append(spliced_seg_maps)
2277    
2278     try:
2279     print >> run_log, " ".join(align_cmd),">",alignments_out_name
2280     join_proc=subprocess.Popen(align_cmd,
2281     preexec_fn=subprocess_setup,
2282     stdout=open(alignments_out_name, "wb"),
2283     stderr=align_log)
2284     join_proc.communicate()
2285 gpertea 135 retcode=join_proc.poll()
2286     if retcode:
2287     die(fail_str+"Error at 'long_spanning_reads'\n"+log_tail(log_fname))
2288 gpertea 29 except OSError, o:
2289     die(fail_str+"Error: "+str(o))
2290    
2291    
2292     # This class collects spliced and unspliced alignments for each of the
2293     # left and right read files provided by the user.
class Maps:
    """Bundle of alignment products for one read side (left or right):
    unspliced alignments, per-segment map files, unmapped-segment files,
    and the segment read files themselves."""
    def __init__(self,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        # store everything exactly as handed in; this is a plain record type
        self.unspliced_sam = unspliced_sam
        self.seg_maps = seg_maps
        self.unmapped_segs = unmapped_segs
        self.segs = segs
2304    
2305 gpertea 135 # Map2GTF stuff
2306     def m2g_convert_coords(params, gtf_fname, reads, out_fname):
2307     """ajjkljlks
2308    
2309     Arguments:
2310     - `params`: TopHat parameters
2311     - `gtf_fname`: File name pointing to the annotation.
2312     - `reads`: The reads to convert coords (in Bowtie format).
2313     - `out_fname`: The file name pointing to the output.
2314     """
2315     m2g_cmd = [prog_path("map2gtf")]
2316     m2g_cmd.extend(params.cmd())
2317     m2g_cmd.append(gtf_fname)
2318     m2g_cmd.append(reads) #could be BAM file
2319     m2g_cmd.append(out_fname)
2320     fbasename = getFileBaseName(reads)
2321     m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2322     m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2323    
2324     try:
2325     th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2326     print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2327     ret = subprocess.call(m2g_cmd,
2328     stdout=open(m2g_log, "w"),
2329     stderr=open(m2g_err, "w"))
2330     if ret != 0:
2331     die(fail_str + " Error: map2gtf returned an error")
2332     except OSError, o:
2333     err_msg = fail_str + str(o)
2334     die(err_msg + "\n")
2335    
2336    
2337     def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2338     """ Make the transcriptome from a GTF.
2339    
2340     Arguments:
2341     - `trans_gtf`:
2342     - `genome`:
2343     - `out_fname`:
2344     """
2345     # TODO: Call gtf_to_fasta
2346     g2f_cmd = [prog_path("gtf_to_fasta")]
2347     g2f_cmd.extend(params.cmd())
2348     g2f_cmd.append(trans_gtf)
2349     g2f_cmd.append(genome)
2350     g2f_cmd.append(out_fname)
2351    
2352     g2f_log = logging_dir + "g2f.out"
2353     g2f_err = logging_dir + "g2f.err"
2354    
2355     try:
2356     print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2357     ret = subprocess.call(g2f_cmd,
2358     stdout = open(g2f_log, "w"),
2359     stderr = open(g2f_err, "w"))
2360     if ret != 0:
2361     die(fail_str + " Error: gtf_to_fasta returned an error.")
2362     except OSError, o:
2363     err_msg = fail_str + str(o)
2364     die(err_msg + "\n")
2365    
def map2gtf(params, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function: align reads to the transcriptome, then
    convert the hits back to genomic coordinates.

    Arguments:
    - `params`: The TopHat parameters.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).

    Returns (converted-BAM list, unmapped-reads list); each list is padded
    with None when there is no right-hand side.
    """
    test_input_file(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    if params.transcriptome_index:
        # reuse a pre-built transcriptome index
        m2g_bwt_idx = params.transcriptome_index
    else:
        th_log("Creating multi-FASTA from GTF and genome")
        m2g_ref_fasta = tmp_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(m2g_ref_fasta, tmp_dir)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # initial mapping, in transcript (GTF) coordinates
    for reads in [left_reads, right_reads]:
        # skip a missing side or a file too small to hold a real read
        if reads == None or os.path.getsize(reads) < 25 :
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", True)
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # convert each transcriptome map back to genomic coordinates
    sam_gtf_list = []
    for reads in mapped_gtf_list:
        fbasename = getFileBaseName(reads)
        sam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           params.gff_annotation,
                           reads,
                           sam_out_fname)
        sam_gtf_list.append(sam_out_fname)

    # pad the single-end case so callers can always unpack a pair
    if len(sam_gtf_list) < 2:
        sam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (sam_gtf_list, unmapped_gtf_list)
2437     # end Map2GTF
2438    
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    """Recover the pre-filter results for read side *ri* (0=left, 1=right).

    Extracts mappings and unmapped reads from the prefilter mappings and the
    prefilter IUM reads.  This is accomplished by a special prep_reads usage
    (triggered by --flt-hits, set up by prep_reads_cmd): prep_reads writes the
    mappings that pertain to *target_reads* into *out_mappings*, while its
    stdout (the unmapped reads) is captured into *out_unmapped* -- piped
    through the external compressor first when use_zpacker is enabled.

    Returns the tuple (out_mappings, out_unmapped).
    """
    prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                            out_mappings,
                            target_reads,
                            params.preflt_data[ri].mappings)
    um_reads = open(out_unmapped, "wb")
    log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
    filter_log = open(log_fname,"w")

    shell_cmd = " ".join(prep_cmd)
    # add the compression pipe (recorded in run.log and used for the real run)
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + out_unmapped
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            # pipeline: prep_reads | zipper > out_unmapped
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stdout=um_reads)
            prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # report the first failing stage, checking the producer first
            retcode=prep_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(prep_cmd,
                                      stdout=um_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    um_reads.close()

    return (out_mappings, out_unmapped)
2490    
2491    
2492 gpertea 29 # The main aligment routine of TopHat. This function executes most of the
2493     # workflow producing a set of candidate alignments for each cDNA fragment in a
2494     # pair of SAM alignment files (for paired end reads).
2495     def spliced_alignment(params,
2496     bwt_idx_prefix,
2497     sam_header_filename,
2498     ref_fasta,
2499     read_len,
2500     segment_len,
2501 gpertea 135 prepared_reads,
2502 gpertea 29 user_supplied_junctions,
2503     user_supplied_insertions,
2504     user_supplied_deletions):
2505    
2506     possible_juncs = []
2507     possible_juncs.extend(user_supplied_junctions)
2508    
2509     possible_insertions = []
2510     possible_insertions.extend(user_supplied_insertions)
2511     possible_deletions = []
2512     possible_deletions.extend(user_supplied_deletions)
2513    
2514 gpertea 135 left_reads, right_reads = prepared_reads
2515    
2516     maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data
2517 gpertea 29 #single_segments = False
2518    
2519 gpertea 135 # Before anything, map the reads using Map2GTF (if using annotation)
2520     m2g_maps = [ None, None ] # left, right
2521     initial_reads = [ left_reads, right_reads ]
2522    
2523     if params.gff_annotation:
2524     (mapped_gtf_list, unmapped_gtf_list) = \
2525     map2gtf(params, ref_fasta, left_reads, right_reads)
2526    
2527     m2g_left_maps, m2g_right_maps = mapped_gtf_list
2528     m2g_maps = [m2g_left_maps, m2g_right_maps]
2529     if params.transcriptome_only or not os.path.exists(unmapped_gtf_list[0]):
2530     # The case where the user doesn't want to map to anything other
2531     # than the transcriptome OR we have no unmapped reads
2532     maps[0] = [m2g_left_maps]
2533     if right_reads:
2534     maps[0] = [m2g_right_maps]
2535     return maps
2536     # Feed the unmapped reads into spliced_alignment()
2537     initial_reads = unmapped_gtf_list[:]
2538    
2539     th_log("Resuming TopHat pipeline with unmapped reads")
2540    
2541     max_seg_len = segment_len #this is the ref seq span on either side of the junctions
2542     #to be extracted into segment_juncs.fa
2543    
2544     num_segs = int(read_len / segment_len)
2545     if (read_len % segment_len) >= min(segment_len-2, 20):
2546     #remainder is shorter but long enough to become a new segment
2547     num_segs += 1
2548     else:
2549     # the last segment is longer
2550     if num_segs>1: max_seg_len += (read_len % segment_len)
2551    
2552     # daehwan - check this out
2553     if num_segs <= 1:
2554     print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2555    
2556     # Using the num_segs value returned by check_reads(),
2557     # decide which junction discovery strategy to use
2558     if num_segs<3:
2559     #if params.butterfly_search != False:
2560     # params.butterfly_search = True
2561     if params.coverage_search != False:
2562     params.coverage_search = True
2563     if num_segs == 1:
2564     segment_len = read_len
2565     else: #num_segs >= 3:
2566     # if we have at least three segments, just use split segment search,
2567     # which is the most sensitive and specific, fastest, and lightest-weight.
2568     # so unless specifically requested, disable the other junction searches
2569     if params.closure_search != True:
2570     params.closure_search = False
2571     if params.coverage_search != True:
2572     params.coverage_search = False
2573     if params.butterfly_search != True:
2574     params.butterfly_search = False
2575 gpertea 29 # Perform the first part of the TopHat work flow on the left and right
2576     # reads of paired ends separately - we'll use the pairing information later
2577 gpertea 135 for ri in (0,1):
2578     reads=initial_reads[ri]
2579 gpertea 29 if reads == None or os.path.getsize(reads)<25 :
2580     continue
2581     fbasename=getFileBaseName(reads)
2582     unspliced_out = tmp_dir + fbasename + ".mapped"
2583 gpertea 135 unspliced_sam = None
2584     unmapped_reads = None
2585     #if use_zpacker: unspliced_out+=".z"
2586 gpertea 29 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2587 gpertea 135 if params.prefilter_multi:
2588     #unmapped_unspliced += ".z"
2589     (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
2590     else:
2591 gpertea 29 # Perform the initial Bowtie mapping of the full length reads
2592 gpertea 135 (unspliced_sam, unmapped_reads) = bowtie(params,
2593     bwt_idx_prefix,
2594     [reads],
2595     "fastq",
2596     params.initial_read_mismatches,
2597     unspliced_out,
2598     unmapped_unspliced)
2599 gpertea 29
2600 gpertea 135 # unspliced_sam = tmp_dir+fbasename+'.unspl.bam'
2601     # join_mapped_segments(params,
2602     # sam_header_filename,
2603     # reads,
2604     # ref_fasta,
2605     # ["/dev/null"],
2606     # ["/dev/null"],
2607     # ["/dev/null"],
2608     # [unspliced_map],
2609     # [],
2610     # unspliced_sam)
2611 gpertea 29 seg_maps = []
2612     unmapped_segs = []
2613     segs = []
2614     if num_segs > 1:
2615     # split up the IUM reads into segments
2616 gpertea 31 # unmapped_reads can be in BAM format
2617     read_segments = split_reads(unmapped_reads,
2618 gpertea 29 tmp_dir + fbasename,
2619     False,
2620     params,
2621     segment_len)
2622    
2623     # Map each segment file independently with Bowtie
2624     for i in range(len(read_segments)):
2625     seg = read_segments[i]
2626     fbasename=getFileBaseName(seg)
2627 gpertea 135 seg_out = tmp_dir + fbasename + ".bwtout"
2628     if use_zpacker: seg_out += ".z"
2629     unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
2630     if use_BWT_FIFO:
2631     unmapped_seg += ".z"
2632 gpertea 29 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2633     (seg_map, unmapped) = bowtie_segment(params,
2634     bwt_idx_prefix,
2635     seg,
2636     "fastq",
2637     seg_out,
2638     unmapped_seg,
2639     extra_output)
2640     seg_maps.append(seg_map)
2641     unmapped_segs.append(unmapped)
2642     segs.append(seg)
2643    
2644     # Collect the segment maps for left and right reads together
2645 gpertea 135 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
2646 gpertea 29 else:
2647     # if there's only one segment, just collect the initial map as the only
2648     # map to be used downstream for coverage-based junction discovery
2649     read_segments = [reads]
2650 gpertea 135 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
2651 gpertea 29
2652 gpertea 135 # XXX: At this point if using M2G, have three sets of reads:
2653     # mapped to transcriptome, mapped to genome, and unmapped (potentially
2654     # spliced or poly-A tails) - hp
2655    
2656 gpertea 29 # Unless the user asked not to discover new junctions, start that process
2657     # here
2658    
2659 gpertea 135 left_reads_map = maps[0].unspliced_sam
2660     left_seg_maps = maps[0].seg_maps
2661     unmapped_reads = maps[0].unmapped_segs
2662     if right_reads:
2663     right_reads_map = maps[1].unspliced_sam
2664     right_seg_maps = maps[1].seg_maps
2665     unmapped_reads.extend(maps[1].unmapped_segs)
2666     else:
2667     right_reads_map = None
2668     right_seg_maps = None
2669 gpertea 29
2670 gpertea 135 if params.find_novel_juncs: # or params.find_novel_indels:
2671    
2672 gpertea 29 # Call segment_juncs to infer a list of possible splice junctions from
2673     # the regions of the genome covered in the initial and segment maps
2674 gpertea 135 #if params.find_novel_juncs:
2675     #TODO: in m2g case, we might want to pass the m2g mappings as well,
2676     # or perhaps the GTF file directly
2677     # -> this could improve alternative junction detection?
2678 gpertea 29 juncs = junctions_from_segments(params,
2679     left_reads,
2680     left_reads_map,
2681     left_seg_maps,
2682     right_reads,
2683     right_reads_map,
2684     right_seg_maps,
2685     unmapped_reads,
2686     "fastq",
2687     ref_fasta)
2688 gpertea 135 if os.path.getsize(juncs[0]) != 0:
2689 gpertea 29 possible_juncs.append(juncs[0])
2690     if params.find_novel_indels:
2691     if os.path.getsize(juncs[1]) != 0:
2692     possible_insertions.append(juncs[1])
2693     if os.path.getsize(juncs[2]) != 0:
2694     possible_deletions.append(juncs[2])
2695     # Optionally, and for paired reads only, use a closure search to
2696     # discover addtional junctions
2697 gpertea 135 if params.closure_search and left_reads and right_reads:
2698 gpertea 29 juncs = junctions_from_closures(params,
2699 gpertea 135 [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]],
2700     [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]],
2701 gpertea 29 ref_fasta)
2702     if os.path.getsize(juncs[0]) != 0:
2703     possible_juncs.extend(juncs)
2704    
2705     if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2706     spliced_seg_maps = None
2707     junc_idx_prefix = None
2708     else:
2709     junc_idx_prefix = "segment_juncs"
2710     if len(possible_insertions) == 0:
2711     possible_insertions.append(os.devnull)
2712     # print >> sys.stderr, "Warning: insertions database is empty!"
2713     if len(possible_deletions) == 0:
2714     possible_deletions.append(os.devnull)
2715     # print >> sys.stderr, "Warning: deletions database is empty!"
2716     if len(possible_juncs) == 0:
2717     possible_juncs.append(os.devnull)
2718     print >> sys.stderr, "Warning: junction database is empty!"
2719 gpertea 135 if junc_idx_prefix:
2720 gpertea 29 build_juncs_index(3,
2721 gpertea 135 #segment_len,
2722     max_seg_len,
2723 gpertea 29 junc_idx_prefix,
2724     possible_juncs,
2725     possible_insertions,
2726     possible_deletions,
2727     ref_fasta,
2728     params.read_params.color)
2729    
2730     # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2731     # index with Bowtie
2732 gpertea 135 # for reads in [left_reads, right_reads]:
2733     for ri in (0,1):
2734     reads = initial_reads[ri]
2735 gpertea 29 spliced_seg_maps = []
2736 gpertea 135
2737     if reads == None or os.path.getsize(reads)<25:
2738 gpertea 29 continue
2739    
2740 gpertea 135 m2g_map = m2g_maps[ri]
2741    
2742     if junc_idx_prefix :
2743 gpertea 29 i = 0
2744 gpertea 135 for seg in maps[ri].segs:
2745     #search each segment
2746 gpertea 29 fsegname = getFileBaseName(seg)
2747 gpertea 135 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
2748     if use_zpacker: seg_out += ".z"
2749     extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
2750 gpertea 29 (seg_map, unmapped) = bowtie_segment(params,
2751     tmp_dir + junc_idx_prefix,
2752     seg,
2753     "fastq",
2754     seg_out,
2755     None,
2756     extra_output)
2757     spliced_seg_maps.append(seg_map)
2758     i += 1
2759    
2760 gpertea 135 # Join the contigous and spliced segment hits into full length
2761     # read alignments
2762     rfname=getFileBaseName(reads)
2763     rfdir=getFileDir(reads)
2764 gpertea 29 mapped_reads = rfdir + rfname + ".candidates.bam"
2765 gpertea 135 # -- spliced mappings built from all segment mappings vs genome and junc_db
2766 gpertea 29 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2767 gpertea 135 #unspl_bwtfile = maps[ri].unspliced_bwt
2768     unspl_samfile = maps[ri].unspliced_sam
2769 gpertea 29 join_mapped_segments(params,
2770     sam_header_filename,
2771     reads,
2772     ref_fasta,
2773     possible_juncs,
2774     possible_insertions,
2775     possible_deletions,
2776 gpertea 135 maps[ri].seg_maps,
2777 gpertea 29 spliced_seg_maps,
2778     mapped_reads)
2779    
2780 gpertea 135 if num_segs > 1 or m2g_map:
2781     # Merge the spliced and unspliced full length alignments into
2782     # a single SAM file.
2783 gpertea 29 # The individual SAM files are all already sorted in
2784     # increasing read ID order.
2785     # NOTE: We also should be able to address bug #134 here, by replacing
2786     # contiguous alignments that poke into an intron by a small amount by
2787     # the correct spliced alignment.
2788     try:
2789 gpertea 135 merge_cmd = [ prog_path("bam_merge"), merged_map, mapped_reads ]
2790     if num_segs > 1:
2791     merge_cmd += [ maps[ri].unspliced_sam ]
2792     if m2g_map:
2793     merge_cmd += [ m2g_map ]
2794 gpertea 29 print >> run_log, " ".join(merge_cmd)
2795     ret = subprocess.call( merge_cmd,
2796     stderr=open(logging_dir + "sam_merge.log", "w") )
2797     if ret != 0:
2798     die(fail_str+"Error executing: "+" ".join(merge_cmd))
2799     except OSError, o:
2800     die(fail_str+"Error: "+str(o))
2801 gpertea 135 maps[ri] = [merged_map]
2802 gpertea 29
2803     if not params.system_params.keep_tmp:
2804     os.remove(mapped_reads)
2805     else:
2806 gpertea 135 # no segments or transcriptome mappings, so no merge is needed
2807     # because join_mapped_segments() produced the final BAM already
2808 gpertea 29 os.rename(mapped_reads, merged_map)
2809 gpertea 135 maps[ri] = [merged_map]
2810     if not params.system_params.keep_tmp:
2811 gpertea 29 os.remove(unspl_samfile)
2812 gpertea 135 os.remove(unspl_bwtfile)
2813     #for ri
2814 gpertea 29 return maps
2815    
def die(msg=None):
    """Report *msg* (if any) on stderr, then abort the program with exit code 1."""
    if msg is not None:
        sys.stderr.write(str(msg) + "\n")
    sys.exit(1)
2820    
# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    """Return a runnable path for *program*, or None if it cannot be found."""
    def can_run(path):
        # must be a regular file carrying the execute permission bit
        return os.path.isfile(path) and os.access(path, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        # the caller supplied an explicit path; accept it only if runnable
        if can_run(program):
            return program
    else:
        # bare name: try the TopHat bin directory first...
        candidate = os.path.join(bin_dir, program)
        if can_run(candidate):
            return candidate
        # ...then each entry of $PATH, in order
        for entry in os.environ["PATH"].split(os.pathsep):
            candidate = os.path.join(entry, program)
            if can_run(candidate):
                return candidate
    return None
2839    
def prog_path(program):
    """Locate *program* via which(); abort the whole run if it is not found."""
    located = which(program)
    if located is None:
        die("Error locating program: " + program)
    return located
2845    
# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    """Return the TopHat release string (hard-coded until the build injects it)."""
    version = "1.4.0"
    return version
2849 gpertea 29
def mlog(msg):
    """Write a debug-tagged message (plus newline) to stderr."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
2852    
2853     def test_input_file(filename):
2854     try:
2855     test_file = open(filename, "r")
2856     # Bowtie not found
2857     except IOError, o:
2858     die("Error: Opening file %s" % filename)
2859     return
2860    
2861    
def main(argv=None):
    """Top-level driver for a TopHat run.

    Parses command-line options, validates inputs and external tools
    (bowtie, samtools), prepares the reads, runs the spliced-alignment
    workflow and compiles the final reports.  Returns 2 on a usage error,
    otherwise falls through (None) on success.
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()

    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # positional arguments: index prefix, then comma-separated read lists
        # (and, when --quals is in effect, the matching quality-file lists)
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            # paired-end case
            if params.read_params.mate_inner_dist == None:
                # default mate inner distance when -r was not given
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        print >> sys.stderr
        th_log("Beginning TopHat run (v"+get_version()+")")
        print >> sys.stderr, "-----------------------------------------------"

        start_time = datetime.now()
        prepare_output_dir()

        # run.log records every external command line; opened unbuffered so
        # the log is complete even if the run dies mid-way
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.transcriptome_index:
            check_bowtie_index(params.transcriptome_index)
        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)

        check_bowtie()
        check_samtools()
        th_log("Generating SAM header for "+bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)

        #if not params.skip_check_reads:
        # probe the input reads to detect their format/quality encoding
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we don't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            #if found_juncs and not params.gff_annotation:
            #    user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        # pid-stamped FIFO used for streaming unmapped reads out of bowtie
        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            # -M/--prefilter-multihits: run an initial genome bowtie search on
            # each side to set aside multi-mapped reads before prep_reads
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            for ri in (0,1):
                reads_list=read_lists[ri]
                if not reads_list:
                    continue
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
                side_imap = tmp_dir + sides[ri]+"_im.bwtout"
                if use_zpacker:
                    side_imap+=".z"
                side_ium = tmp_dir + sides[ri]+"_ium.fq"
                if use_BWT_FIFO:
                    side_ium += ".z"
                th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
                rdlist=reads_list.split(',')
                bwt=bowtie(params, bwt_idx_prefix, rdlist,
                           params.read_params.reads_format,
                           params.initial_read_mismatches,
                           side_imap, side_ium,
                           "", False, params.preflt_data[ri].multihit_reads)
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left",
                                                      params.preflt_data[0].multihit_reads)
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right",
                                                            params.preflt_data[1].multihit_reads)
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False
        input_reads = [left_kept_reads, right_kept_reads]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        # clean up intermediate files unless --keep-tmp was requested
        if not params.system_params.keep_tmp:
            for m in mappings[0]:
                os.remove(m)
            if left_kept_reads:
                os.remove(left_kept_reads)
            for m in mappings[1]:
                os.remove(m)
            if right_kept_reads:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        print >> sys.stderr,"-----------------------------------------------"
        print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)

    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
        return 2
3043    
3044    
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the process exit
    # status (None -> 0 on success, 2 on a usage error).
    sys.exit(main())

Properties

Name Value
svn:executable *