File: root/gclib/tophat_cpp/tophat
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC by gpertea
File size: 110617 byte(s)
Log Message: adding tophat source work

#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>    [ default: ./tophat_out ]
    -a/--min-anchor                <int>       [ default: 8            ]
    -m/--splice-mismatches         <0-2>       [ default: 0            ]
    -i/--min-intron-length         <int>       [ default: 50           ]
    -I/--max-intron-length         <int>       [ default: 500000       ]
    -g/--max-multihits             <int>       [ default: 20           ]
    -F/--min-isoform-fraction      <float>     [ default: 0.15         ]
    --max-insertion-length         <int>       [ default: 3            ]
    --max-deletion-length          <int>       [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals              (same as phred64-quals)
    --phred64-quals                (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                     (SOLiD - color space)
    --color-out
    --library-type                 <string>    (fr-unstranded, fr-firststrand,
                                                fr-secondstrand)
    -p/--num-threads               <int>       [ default: 1            ]
    -G/--GTF                       <filename>
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>       [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>   [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>   [ default: gzip         ]
    -X/--unmapped-fifo             [use mkfifo to compress more temporary
                                    files for color space reads]

Advanced Options:
    --initial-read-mismatches      <int>       [ default: 2            ]
    --segment-mismatches           <int>       [ default: 2            ]
    --segment-length               <int>       [ default: 25           ]
    --bowtie-n                     [ default: bowtie -v ]
    --min-closure-exon             <int>       [ default: 100          ]
    --min-closure-intron           <int>       [ default: 50           ]
    --max-closure-intron           <int>       [ default: 5000         ]
    --min-coverage-intron          <int>       [ default: 50           ]
    --max-coverage-intron          <int>       [ default: 20000        ]
    --min-segment-intron           <int>       [ default: 50           ]
    --max-segment-intron           <int>       [ default: 500000       ]
    --no-sort-bam                  [Output BAM is not coordinate-sorted]
    --no-convert-bam               [Do not convert to bam format.
                                    Output is <output_dir>/accepted_hits.sam.
                                    Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>    (read group ID)
    --rg-sample                    <string>    (sample ID)
    --rg-library                   <string>    (library ID)
    --rg-description               <string>    (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>    (e.g. Illumina lane ID)
    --rg-center                    <string>    (sequencing center name)
    --rg-date                      <string>    (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>    (Sequencing platform descriptor)
'''

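# A typical invocation might look like this (hypothetical index and read file
# names, shown only as a usage sketch):
#
#   tophat -p 4 -o my_out -G genes.gtf hg19_index reads_1.fq reads_2.fq
#
# which maps the paired-end reads in reads_1.fq/reads_2.fq against the Bowtie
# index with prefix "hg19_index", using 4 threads and the given GTF annotation.
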
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is used to trick bowtie into writing
                           # the unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0, 1, 2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

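    # A minimal usage sketch (hypothetical values; the pipeline itself fills
    # these in from the command line via parse_options):
    #
    #   sc = TopHatParams.SpliceConstraints(8, 50, 500000, 0, 0.15)
    #   sc.parse_options([("-i", "70"), ("-I", "100000")])
    #   sc.check()   # calls die() with a message if any constraint is invalid
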
    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper = 'gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_cpus > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p' + str(self.num_cpus))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_cpus > 1:
                cmdline.extend(['-p' + str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program " + self.zipper)

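    # For example (a sketch, not executed here): with "-p 4 -z pigz" the
    # parallel-capable zipper gets a thread option appended automatically,
    # so cmd() would return ['-z', 'pigz', '-p4'] for re-invoking helpers.
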
    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_quals
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           False,   # SOLiD - color out instead of base pair,
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_cpus)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: " + ', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be equal to or greater than the segment length (" + str(self.segment_length) + ")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be equal to or greater than the min intron length (" + str(self.splice_constraints.min_intron_length) + ")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

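    # For the default parameter set, cmd() yields a flag list shaped like
    # ['--min-anchor', '8', '--splice-mismatches', '0', ...,
    #  '--no-closure-search', '--no-coverage-search', '--no-microexon-search'],
    # which is passed on to the pipeline executables (a sketch of the shape,
    # not an exhaustive listing).
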
    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


def getFileDir(filepath):
    # if fullpath, returns path including the ending /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > len(fext):
            return fbase
        else:
            return fname

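# A few illustrative cases (a sketch of the intended behavior):
#   getFileBaseName("reads_1.fq")    -> "reads_1"
#   getFileBaseName("reads_1.fq.gz") -> "reads_1"  (compression suffix is peeled first)
#   getFileBaseName("sample.fasta")  -> "sample"   ('.fa*' extensions are recognized)
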
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not os.path.exists(logging_dir):
        os.mkdir(logging_dir)

    if not os.path.exists(tmp_dir):
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str + "Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str + "Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null'))

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x, y: cmp(x[0], y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path = prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    elif bowtie_version < [0, 12, 3, 0]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1, 2, 3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str


class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine file type
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow at most 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for " + seqid + ")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
                # loop until + found
            seq_len = len(seqstr)
            # at least one line of quality data should follow
            qstrlen = 0
            # now read next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of seq string
                # loop until qv has the same length as seq
            if self.isColor and qstr[0] == "!":
                # some color quality strings have a '!' character at the beginning; it should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen + self.isColor:
                raise ValueError("Length mismatch between sequence and quality strings " + \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file " + self.fname + ":\n " + str(err))
        # return the record
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to readline function of f
        line = fline() # this will use the buffer line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # more sequence lines may follow, until the '>' marker of the next record:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                # loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file " + self.fname + "\n " + str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader

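# Usage sketch (hypothetical file name): records can be pulled one at a time
# until the sequence ID comes back empty, regardless of FASTA/FASTQ input:
#
#   reader = FastxReader(open("reads.fq"))
#   while True:
#       seqid, seqstr, seq_len, qstr = reader.nextRecord()
#       if not seqid: break
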
class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        if guess:
            s = filename.lower()
            if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                pipecmd = ['gzip']
            else:
                if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                    pipecmd = ['bzip2']
            if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
            if len(pipecmd) > 0:
                if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                    pipecmd[0] = sysparams.zipper
                    pipecmd.extend(sysparams.zipper_opts)
                elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                    pipecmd[0] = sysparams.zipper
                    pipecmd.extend(sysparams.zipper_opts)
        else: # not guessing, but we must still be sure it's a compressed file
            if use_zpacker and filename.endswith(".z"):
                pipecmd = [sysparams.zipper]
                pipecmd.extend(sysparams.zipper_opts)

        if pipecmd:
            pipecmd += ['-cd']
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe " + ' '.join(pipecmd) + ' > ' + self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None

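# A sketch of the intended round trip (hypothetical file name; with use_zpacker
# enabled the bytes on disk pass through the configured zipper transparently):
#
#   zw = ZWriter("tmp/reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()
#   zr = ZReader("tmp/reads.fq.z", params.system_params, guess=False)
#   data = zr.file.read()
#   zr.close()
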
# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads to be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    # NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    while size > 0 and fbytes + block * 1024 > 0:
        if (fbytes + block * 1024 > 0):
            # Seek back once more, if possible
            f.seek(block * 1024, 2)
        else:
            # Seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block * 1024 > 0):
        f.seek(block * 1024, 2)
    else:
        f.seek(0, 0)
    f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "\n".join(lastBlocks[-lines:])

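# For instance (a sketch), log_tail("run.log", 5) returns roughly the last
# five lines of run.log; it is used below to surface the tail of a failed
# subprocess's log inside the error message passed to die().
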
# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)

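# Example (a sketch): for a datetime.timedelta of 3725 seconds, formatTD
# returns '01:02:05'; note that timedelta.seconds wraps at one day, which is
# what the FIXME above refers to.
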
class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception, e:
            die(fail_str + "Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places"

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
# --> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file = output_dir + output_name + ".info"
    filter_cmd += ["--aux-outfile=" + info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    # finally, add the compression pipe
    zip_cmd = []
    if use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    shell_cmd += ' >' + kept_reads_filename
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stdout=kept_reads)
            filter_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str + "Error running 'prep_reads'\n" + log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg + "\n" + log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)

1417     # Call bowtie
1418     def bowtie(params,
1419     bwt_idx_prefix,
1420     reads_list,
1421     reads_format,
1422     num_mismatches,
1423     mapped_reads,
1424     unmapped_reads = None, #not needed vs segment_juncs
1425     extra_output = ""):
1426     start_time = datetime.now()
1427     bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1428     readfile_basename=getFileBaseName(reads_list)
1429     print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
1430     readfile_basename, bwt_idx_name, extra_output)
1431    
1432     bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1433     unmapped_reads_bwt=unmapped_reads+".fq"
1434     use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1435     if use_FIFO: #this should always be False unless color reads are used with -X option
1436     global unmapped_reads_fifo
1437     if os.path.exists(unmapped_reads_fifo):
1438     os.remove(unmapped_reads_fifo)
1439     try:
1440     os.mkfifo(unmapped_reads_fifo)
1441     except OSError, o:
1442     die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1443     unmapped_reads_bwt=unmapped_reads_fifo
1444    
1445     # Launch Bowtie
1446     try:
1447     bowtie_cmd = [bowtie_path]
1448     bam_input=0
1449     unzip_cmd=None
1450     if reads_list.endswith('.bam'):
1451     bam_input=1
1452     unzip_cmd=[ prog_path('bam2fastx') ]
1453     if reads_format:
1454     unzip_cmd.append("--"+reads_format)
1455     unzip_cmd+=[reads_list]
1456    
1457     if reads_format == "fastq":
1458     bowtie_cmd += ["-q"]
1459     elif reads_format == "fasta":
1460     bowtie_cmd += ["-f"]
1461    
1462     if params.read_params.color:
1463     bowtie_cmd += ["-C", "--col-keepends"]
1464     if unmapped_reads:
1465     bowtie_cmd += ["--un", unmapped_reads_bwt]
1466     # "--max", "/dev/null"]
1467     if use_zpacker and (unzip_cmd is None):
1468     unzip_cmd=[ params.system_params.zipper ]
1469     unzip_cmd.extend(params.system_params.zipper_opts)
1470     unzip_cmd+=['-cd']
1471    
1472     fifo_pid=None
1473     if use_FIFO:
1474     unm_zipcmd=[ params.system_params.zipper ]
1475     unm_zipcmd.extend(params.system_params.zipper_opts)
1476     unm_zipcmd+=['-c']
1477     unmapped_reads+=".fq.z"
1478     print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads + ' & '
1479     fifo_pid=os.fork()
1480     if fifo_pid==0:
1481     def on_sig_exit(sig, func=None):
1482     os._exit(os.EX_OK)
1483     signal.signal(signal.SIGTERM, on_sig_exit)
1484     subprocess.call(unm_zipcmd,
1485     stdin=open(unmapped_reads_fifo, "r"),
1486     stdout=open(unmapped_reads, "wb"))
1487     os._exit(os.EX_OK)
1488    
1489     bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1490     "-p", str(params.system_params.num_cpus),
1491     "-k", str(params.max_hits),
1492     "-m", str(params.max_hits),
1493     "--max", "/dev/null",
1494     "--sam-nohead"]
1495     fix_map_cmd = [prog_path('fix_map_ordering')]
1496     sam_zipcmd=None
1497     if unmapped_reads:
1498     # mapping on the genome
1499     # fix_map_ordering writes the mapped reads directly to a BAM file, and the unmapped reads to a BAM file as well
1500     mapped_reads += ".bam"
1501     if params.read_params.color:
1502     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
1503     else:
1504     unmapped_reads += ".bam"
1505     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads]
1506     else:
1507     # mapping on segment_juncs (potential spliced junctions)
1508     # don't use SAM headers here -- that could be slow; better to compress the SAM output directly
1509     mapped_reads += ".sam"
1510     fix_map_cmd += ["-"]
1511     if use_zpacker:
1512     fix_map_cmd += ["-"]
1513     sam_zipcmd=[ params.system_params.zipper ]
1514     sam_zipcmd.extend(params.system_params.zipper_opts)
1515     sam_zipcmd += ["-c"]
1516     mapped_reads += ".z"
1517     else:
1518     fix_map_cmd += [mapped_reads]
1519    
1520     bowtie_cmd+=[bwt_idx_prefix]
1521     bowtie_proc=None
1522     shellcmd=""
1523     unzip_proc=None
1524     mapzip_proc=None
1525     if unzip_cmd:
1526     # could be bam2fastx
1527     if bam_input:
1528     unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
1529     shellcmd=' '.join(unzip_cmd) + "|"
1530     else:
1531     unzip_proc = subprocess.Popen(unzip_cmd,
1532     stdin=open(reads_list, "rb"),
1533     stdout=subprocess.PIPE)
1534     shellcmd=' '.join(unzip_cmd) + "< " +reads_list +"|"
1535     bowtie_cmd += ['-']
1536     bowtie_proc = subprocess.Popen(bowtie_cmd,
1537     stdin=unzip_proc.stdout,
1538     stdout=subprocess.PIPE,
1539     stderr=bwt_log)
1540     unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
1541     else:
1542     bowtie_cmd += [reads_list]
1543     bowtie_proc = subprocess.Popen(bowtie_cmd,
1544     stdout=subprocess.PIPE,
1545     stderr=bwt_log)
1546     shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
1547     if sam_zipcmd:
1548     shellcmd += "|" + " ".join(sam_zipcmd)+" > " + mapped_reads
1549     fix_order_proc = subprocess.Popen(fix_map_cmd,
1550     stdin=bowtie_proc.stdout,
1551     stdout=subprocess.PIPE)
1552     bowtie_proc.stdout.close()
1553     mapzip_proc = subprocess.Popen(sam_zipcmd,
1554     stdin=fix_order_proc.stdout,
1555     stdout=open(mapped_reads, "wb"))
1556     fix_order_proc.stdout.close()
1557     else:
1558     fix_order_proc = subprocess.Popen(fix_map_cmd,
1559     stdin=bowtie_proc.stdout)
1560     bowtie_proc.stdout.close()
1561    
1562     print >> run_log, shellcmd
1563     if mapzip_proc:
1564     mapzip_proc.communicate()
1565     else:
1566     fix_order_proc.communicate()
1567     if use_FIFO:
1568     if fifo_pid and not os.path.exists(unmapped_reads):
1569     try:
1570     os.kill(fifo_pid, signal.SIGTERM)
1571     except:
1572     pass
1573     except OSError, o:
1574     die(fail_str+"Error: "+str(o))
1575    
1576     # Success
1577     #finish_time = datetime.now()
1578     #duration = finish_time - start_time
1579     #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
1580     if use_FIFO:
1581     try:
1582     os.remove(unmapped_reads_fifo)
1583     except:
1584     pass
1585     return (mapped_reads, unmapped_reads)
1586    
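         # Illustrative shell equivalent of a genome-mapping pipeline built by
         # bowtie() above, assuming BAM input, default settings and example
         # paths (the exact flags depend on params):
         #   bam2fastx --fastq reads.bam \
         #     | bowtie -q --un unmapped.fq -v 2 -p 1 -k 20 -m 20 \
         #       --max /dev/null --sam-nohead genome_idx - \
         #     | fix_map_ordering --sam-header header.sam - mapped.bam unmapped.bam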
1587     def bowtie_segment(params,
1588     bwt_idx_prefix,
1589     reads_list,
1590     reads_format,
1591     mapped_reads,
1592     unmapped_reads = None,
1593     extra_output = ""):
1594    
1595     backup_bowtie_alignment_option = params.bowtie_alignment_option
1596     params.bowtie_alignment_option = "-v"
1597     params.max_hits *= 2
1598    
1599     result = bowtie(params,
1600     bwt_idx_prefix,
1601     reads_list,
1602     reads_format,
1603     params.segment_mismatches,
1604     mapped_reads,
1605     unmapped_reads,
1606     extra_output)
1607    
1608     params.bowtie_alignment_option = backup_bowtie_alignment_option
1609     params.max_hits /= 2
1610     return result
1611    
1612     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
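         # (Each line of the resulting .juncs file is tab-delimited, in TopHat's
         # raw junctions format: <chrom> <left> <right> <+/->, where <left> and
         # <right> are zero-based coordinates of the last base of the left exon
         # and the first base of the right exon.)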
1613     def get_gtf_juncs(gff_annotation):
1614     print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1615     gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1616    
1617     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1618    
1619     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1620     gtf_juncs_out = open(gtf_juncs_out_name, "w")
1621    
1622     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1623     try:
1624     print >> run_log, " ".join(gtf_juncs_cmd)
1625     retcode = subprocess.call(gtf_juncs_cmd,
1626     stderr=gtf_juncs_log,
1627     stdout=gtf_juncs_out)
1628     # gtf_juncs returned an error
1629     if retcode == 1:
1630     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1631     return (False, gtf_juncs_out_name)
1632     elif retcode != 0:
1633     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1634     # gtf_juncs not found
1635     except OSError, o:
1636     errmsg=fail_str+str(o)+"\n"
1637     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1638     errmsg+="Error: gtf_juncs not found on this system"
1639     die(errmsg)
1640     return (True, gtf_juncs_out_name)
1641    
1642     # Call bowtie-build on the FASTA file of synthetic splice junction sequences
1643     def build_juncs_bwt_index(external_splice_prefix, color):
1644     print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1645     bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1646    
1647     #user_splices_out_prefix = output_dir + "user_splices_idx"
1648    
1649     bowtie_build_cmd = [prog_path("bowtie-build")]
1650     if color:
1651     bowtie_build_cmd += ["-C"]
1652    
1653     bowtie_build_cmd += [external_splice_prefix + ".fa",
1654     external_splice_prefix]
1655     try:
1656     print >> run_log, " ".join(bowtie_build_cmd)
1657     retcode = subprocess.call(bowtie_build_cmd,
1658     stdout=bowtie_build_log)
1659    
1660     if retcode != 0:
1661     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1662     except OSError, o:
1663     errmsg=fail_str+str(o)+"\n"
1664     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1665     errmsg+="Error: bowtie-build not found on this system"
1666     die(errmsg)
1667     return external_splice_prefix
1668    
1669     # Build a splice index from a .juncs file, suitable for use with specified read
1670     # (or read segment) lengths
1671     def build_juncs_index(min_anchor_length,
1672     read_length,
1673     juncs_prefix,
1674     external_juncs,
1675     external_insertions,
1676     external_deletions,
1677     reference_fasta,
1678     color):
1679     print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1680    
1681     juncs_file_list = ",".join(external_juncs)
1682     insertions_file_list = ",".join(external_insertions)
1683     deletions_file_list = ",".join(external_deletions)
1684    
1685    
1686     juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1687    
1688     external_splices_out_prefix = tmp_dir + juncs_prefix
1689     external_splices_out_name = external_splices_out_prefix + ".fa"
1690    
1691     external_splices_out = open(external_splices_out_name, "w")
1692     # juncs_db_cmd = [bin_dir + "juncs_db",
1693     juncs_db_cmd = [prog_path("juncs_db"),
1694     str(min_anchor_length),
1695     str(read_length),
1696     juncs_file_list,
1697     insertions_file_list,
1698     deletions_file_list,
1699     reference_fasta]
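         # Illustrative command line, using the values spliced_alignment()
         # passes in below (min_anchor_length=3, read_length=segment length;
         # file names are examples):
         #   juncs_db 3 25 segment.juncs segment.insertions segment.deletions \
         #     genome.fa > tmp/segment_juncs.fa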
1700     try:
1701     print >> run_log, " ".join(juncs_db_cmd)
1702     retcode = subprocess.call(juncs_db_cmd,
1703     stderr=juncs_db_log,
1704     stdout=external_splices_out)
1705    
1706     if retcode != 0:
1707     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1708     # juncs_db not found
1709     except OSError, o:
1710     errmsg=fail_str+str(o)+"\n"
1711     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1712     errmsg+="Error: juncs_db not found on this system"
1713     die(errmsg)
1714    
1715     external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1716     return external_splices_out_prefix
1717    
1718     # Print out the sam header, embedding the user's specified library properties.
1719     # FIXME: also needs SQ dictionary lines
1720     def write_sam_header(read_params, sam_file):
1721     print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1722     if read_params.read_group_id and read_params.sample_id:
1723     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1724     read_params.sample_id)
1725     if read_params.library_id:
1726     rg_str += "\tLB:%s" % read_params.library_id
1727     if read_params.description:
1728     rg_str += "\tDS:%s" % read_params.description
1729     if read_params.seq_platform_unit:
1730     rg_str += "\tPU:%s" % read_params.seq_platform_unit
1731     if read_params.seq_center:
1732     rg_str += "\tCN:%s" % read_params.seq_center
1733     if read_params.mate_inner_dist:
1734     rg_str += "\tPI:%s" % read_params.mate_inner_dist
1735     if read_params.seq_run_date:
1736     rg_str += "\tDT:%s" % read_params.seq_run_date
1737     if read_params.seq_platform:
1738     rg_str += "\tPL:%s" % read_params.seq_platform
1739    
1740     print >> sam_file, rg_str
1741     print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1742    
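         # Example of the header emitted above (illustrative values; fields
         # are tab-separated):
         #   @HD  VN:1.0  SO:coordinate
         #   @RG  ID:run1  SM:sampleA  LB:lib1  PL:ILLUMINA
         #   @PG  ID:TopHat  VN:1.3.2  CL:tophat -o tophat_out genome_idx reads_1.fq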
1743     # Write final TopHat output, via tophat_reports and wiggles
1744     def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
1745     print >> sys.stderr, "[%s] Reporting output tracks" % right_now()
1746    
1747     left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1748     left_maps = ','.join(left_maps)
1749    
1750     if len(right_maps) > 0:
1751     right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1752     right_maps = ','.join(right_maps)
1753     log_fname = logging_dir + "reports.log"
1754     report_log = open(log_fname, "w")
1755     junctions = output_dir + "junctions.bed"
1756     insertions = output_dir + "insertions.bed"
1757     deletions = output_dir + "deletions.bed"
1758     coverage = "coverage.wig"
1759     accepted_hits = output_dir + "accepted_hits"
1760     report_cmdpath = prog_path("tophat_reports")
1761     report_cmd = [report_cmdpath]
1762     report_cmd.extend(params.cmd())
1763     report_cmd.extend(["--samtools="+samtools_path])
1764     report_cmd.extend([junctions,
1765     insertions,
1766     deletions,
1767     "-",
1768     left_maps,
1769     left_reads])
1770     if len(right_maps) > 0 and right_reads != None:
1771     report_cmd.append(right_maps)
1772     report_cmd.append(right_reads)
1773     # -- tophat_reports now produces (uncompressed) BAM stream,
1774     # directly piped into samtools sort
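         # Illustrative pipeline for the default (convert + sort) case below,
         # with example file names (report_cmd also carries params.cmd() flags):
         #   tophat_reports --samtools=samtools junctions.bed insertions.bed \
         #     deletions.bed - left_maps.bam left_kept_reads.fq \
         #     | samtools sort - ./tophat_out/accepted_hits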
1775     try:
1776     if params.report_params.convert_bam:
1777     if params.report_params.sort_bam:
1778     report_proc=subprocess.Popen(report_cmd,
1779     preexec_fn=subprocess_setup,
1780     stdout=subprocess.PIPE,
1781     stderr=report_log)
1782    
1783     bamsort_cmd = ["samtools", "sort", "-", accepted_hits]
1784     bamsort_proc= subprocess.Popen(bamsort_cmd,
1785     preexec_fn=subprocess_setup,
1786     stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1787     stdin=report_proc.stdout)
1788     report_proc.stdout.close()
1789     print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1790     bamsort_proc.communicate()
1791     retcode = report_proc.poll()
1792     if retcode:
1793     die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1794     else:
1795     print >> run_log, " ".join(report_cmd)
1796     report_proc=subprocess.call(report_cmd,
1797     preexec_fn=subprocess_setup,
1798     stdout=open(accepted_hits,"wb"),
1799     stderr=report_log)
1800     os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1801     else:
1802     print >> run_log, " ".join(report_cmd)
1803     report_proc=subprocess.call(report_cmd,
1804     preexec_fn=subprocess_setup,
1805     stdout=open(accepted_hits,"wb"),
1806     stderr=report_log)
1807     tmp_sam = output_dir + "accepted_hits.sam"
1808    
1809     bam_to_sam_cmd = ["samtools", "view", "-h", accepted_hits]
1810     print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1811     bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1812     tmp_sam_file = open(tmp_sam, "w")
1813     ret = subprocess.call(bam_to_sam_cmd,
1814     stdout=tmp_sam_file,
1815     stderr=bam_to_sam_log)
1816     tmp_sam_file.close()
1817     os.remove(accepted_hits)
1818     #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1819     #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1820    
1821     except OSError, o:
1822     die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1823    
1824     # FIXME: put wiggles back!
1825     # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1826     # print >> run_log, " ".join(wig_cmd)
1827     # subprocess.call(wig_cmd,
1828     # stderr=open("/dev/null"))
1829     return (coverage, junctions)
1830    
1831    
1832     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1833     # for each segment. This function needs to be fixed to support mixed read length
1834     # inputs.
1835    
1836     def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1837     i = num_files_prev + 1
1838     while i <= num_files:
1839     segfname=prefix+("_seg%d" % i)+extension
1840     out_segf.append(ZWriter(segfname,params.system_params))
1841     i += 1
1842    
1843     def split_reads(reads_filename,
1844     prefix,
1845     fasta,
1846     params,
1847     segment_length):
1848     #reads_file = open(reads_filename)
1849     zreads = ZReader(reads_filename, params.system_params, False)
1850     out_segfiles = []
1851    
1852     if fasta:
1853     extension = ".fa"
1854     else:
1855     extension = ".fq"
1856     if use_zpacker: extension += ".z"
1857     def convert_color_to_bp(color_seq):
1858     decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
1859     'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
1860     'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
1861     'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
1862     'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
1863     '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
1864    
1865     base = color_seq[0]
1866     bp_seq = base
1867     for ch in color_seq[1:]:
1868     base = decode_dic[base+ch]
1869     bp_seq += base
1870     return bp_seq
1871    
1872     def convert_bp_to_color(bp_seq):
1873     encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
1874     'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
1875     'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
1876     'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
1877     'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
1878     '.A':'4', '.C':'4', '.G':'4', '.T':'4',
1879     '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
1880     'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
1881     'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
1882    
1883     base = bp_seq[0]
1884     color_seq = base
1885     for ch in bp_seq[1:]:
1886     color_seq += encode_dic[base + ch]
1887     base = ch
1888    
1889     return color_seq
1890    
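         # Round-trip example for the two converters above (the leading primer
         # base is preserved; ambiguous calls decode to 'N'):
         #   convert_bp_to_color("ATGC") -> "A313"
         #   convert_color_to_bp("A313") -> "ATGC"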
1891     def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
1892     if color:
1893     color_offset = 1
1894     read_seq_temp = convert_color_to_bp(read_seq)
1895    
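         # a color read is dropped entirely (no segment is written) if the
         # color call at any segment boundary is not one of '0'-'3', since
         # the base at that boundary cannot be decoded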
1896     seg_num = 1
1897     while seg_num + 1 < len(offsets):
1898     if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
1899     return
1900     seg_num += 1
1901     else:
1902     color_offset = 0
1903    
1904     seg_num = 0
1905     last_seq_offset = 0
1906     while seg_num + 1 < len(offsets):
1907     f = out_segf[seg_num].file
1908     seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
1909     print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
1910     if color:
1911     print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
1912     else:
1913     print >> f, seg_seq
1914     if not fasta:
1915     seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
1916     print >> f, "+"
1917     print >> f, seg_qual
1918     seg_num += 1
1919     last_seq_offset = offsets[seg_num]
1920    
1921     line_state = 0
1922     read_name = ""
1923     read_seq = ""
1924     read_qual = ""
1925     num_segments = 0
1926     offsets = []
1927     for line in zreads.file:
1928     if line.strip() == "":
1929     continue
1930     if line_state == 0:
1931     read_name = line.strip()
1932     elif line_state == 1:
1933     read_seq = line.strip()
1934    
1935     read_length = len(read_seq)
1936     tmp_num_segments = read_length / segment_length
1937     offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
1938    
1939     # Bowtie's minimum read length here is 20bp, so if the last segment
1940     # is between 20 and segment_length bp long, go ahead and write it out
1941     if read_length % segment_length >= 20:
1942     offsets.append(read_length)
1943     tmp_num_segments += 1
1944     else:
1945     offsets[-1] = read_length
1946    
1947     if tmp_num_segments == 1:
1948     offsets = [0, read_length]
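         # Worked example: a 70bp read with segment_length=25 starts with
         # offsets=[0,25,50]; the 20bp remainder meets Bowtie's 20bp minimum,
         # so 70 is appended and the read is split into 25+25+20bp segments.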
1949    
1950     if tmp_num_segments > num_segments:
1951     open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
1952     num_segments = tmp_num_segments
1953    
1954     if fasta:
1955     split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
1956     elif line_state == 2:
1957     line = line.strip()
1958     else:
1959     read_quals = line.strip()
1960     if not fasta:
1961     split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
1962    
1963     line_state += 1
1964     if fasta:
1965     line_state %= 2
1966     else:
1967     line_state %= 4
1968     zreads.close()
1969     out_fnames=[]
1970     for zf in out_segfiles:
1971     zf.close()
1972     out_fnames.append(zf.fname)
1973     #return [o.fname for o in out_segfiles]
1974     return out_fnames
1975    
1976     # Find possible splice junctions using the "closure search" strategy, and report
1977     # them in closures.juncs. Calls the executable closure_juncs
1978     def junctions_from_closures(params,
1979     left_maps,
1980     right_maps,
1981     ref_fasta):
1982     #print >> sys.stderr, "[%s] " % right_now()
1983     print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1984    
1985     #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1986     #if len(maps) == 0:
1987     # return None
1988     #print >> sys.stderr, left_maps
1989     slash = left_maps[0].rfind('/')
1990     juncs_out = ""
1991     if slash != -1:
1992     juncs_out += left_maps[0][:slash+1]
1993     juncs_out += "closure.juncs"
1994    
1995     juncs_log = open(logging_dir + "closure.log", "w")
1996     juncs_cmdpath=prog_path("closure_juncs")
1997     juncs_cmd = [juncs_cmdpath]
1998    
1999     left_maps = ','.join(left_maps)
2000     right_maps = ','.join(right_maps)
2001    
2002     juncs_cmd.extend(params.cmd())
2003     juncs_cmd.extend([juncs_out,
2004     ref_fasta,
2005     left_maps,
2006     right_maps])
2007     try:
2008     print >> run_log, ' '.join(juncs_cmd)
2009     retcode = subprocess.call(juncs_cmd,
2010     stderr=juncs_log)
2011    
2012     # closure_juncs returned an error
2013     if retcode != 0:
2014     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2015     # closure_juncs not found
2016     except OSError, o:
2017     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2018     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2019     die(str(o))
2020     return [juncs_out]
2021    
2022     # Find possible junctions by examining coverage and split segments in the initial
2023     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2024     # segment.insertions, and segment.deletions. Calls the executable
2025     # segment_juncs
2026     def junctions_from_segments(params,
2027     left_reads,
2028     left_reads_map,
2029     left_seg_maps,
2030     right_reads,
2031     right_reads_map,
2032     right_seg_maps,
2033     unmapped_reads,
2034     reads_format,
2035     ref_fasta):
2036     if left_reads_map != left_seg_maps[0]:
2037     print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2038     out_path=getFileDir(left_seg_maps[0])
2039     juncs_out=out_path+"segment.juncs"
2040     insertions_out=out_path+"segment.insertions"
2041     deletions_out =out_path+"segment.deletions"
2042    
2043     left_maps = ','.join(left_seg_maps)
2044     align_log = open(logging_dir + "segment_juncs.log", "w")
2045     align_cmd = [prog_path("segment_juncs")]
2046    
2047     align_cmd.extend(params.cmd())
2048    
2049     align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2050     ref_fasta,
2051     juncs_out,
2052     insertions_out,
2053     deletions_out,
2054     left_reads,
2055     left_reads_map,
2056     left_maps])
2057     if right_seg_maps != None:
2058     right_maps = ','.join(right_seg_maps)
2059     align_cmd.extend([right_reads, right_reads_map, right_maps])
2060     try:
2061     print >> run_log, " ".join(align_cmd)
2062     retcode = subprocess.call(align_cmd,
2063     preexec_fn=subprocess_setup,
2064     stderr=align_log)
2065    
2066     # segment_juncs returned an error
2067     if retcode != 0:
2068     die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2069     # segment_juncs not found
2070     except OSError, o:
2071     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2072     print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2073     die(str(o))
2074    
2075     return [juncs_out, insertions_out, deletions_out]
2076    
2077     # Joins mapped segments into full-length read alignments via the executable
2078     # long_spanning_reads
2079     def join_mapped_segments(params,
2080     sam_header_filename,
2081     reads,
2082     ref_fasta,
2083     possible_juncs,
2084     possible_insertions,
2085     possible_deletions,
2086     contig_seg_maps,
2087     spliced_seg_maps,
2088     alignments_out_name):
2089     if len(contig_seg_maps)>1:
2090     print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2091     else:
2092     print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2093     contig_seg_maps = ','.join(contig_seg_maps)
2094    
2095     possible_juncs = ','.join(possible_juncs)
2096     possible_insertions = ",".join(possible_insertions)
2097     possible_deletions = ",".join(possible_deletions)
2098    
2099     align_log = open(logging_dir + "long_spanning_reads.log", "w")
2100     align_cmd = [prog_path("long_spanning_reads")]
2101    
2102     align_cmd.extend(params.cmd())
2103    
2104     align_cmd.append(ref_fasta)
2105     align_cmd.extend([ reads,
2106     possible_juncs,
2107     possible_insertions,
2108     possible_deletions,
2109     contig_seg_maps])
2110    
2111     if spliced_seg_maps != None:
2112     spliced_seg_maps = ','.join(spliced_seg_maps)
2113     align_cmd.append(spliced_seg_maps)
2114    
2115     try:
2116     print >> run_log, " ".join(align_cmd),">",alignments_out_name
2117     join_proc=subprocess.Popen(align_cmd,
2118     preexec_fn=subprocess_setup,
2119     stdout=open(alignments_out_name, "wb"),
2120     stderr=align_log)
2121     join_proc.communicate()
2122     except OSError, o:
2123     die(fail_str+"Error: "+str(o))
2124    
2125    
2126     # This class collects spliced and unspliced alignments for each of the
2127     # left and right read files provided by the user.
2128     class Maps:
2129     def __init__(self,
2130     unspliced_bwt,
2131     unspliced_sam,
2132     seg_maps,
2133     unmapped_segs,
2134     segs):
2135     self.unspliced_bwt = unspliced_bwt
2136     self.unspliced_sam = unspliced_sam
2137     self.seg_maps = seg_maps
2138     self.unmapped_segs = unmapped_segs
2139     self.segs = segs
2140    
2141     # The main alignment routine of TopHat. This function executes most of the
2142     # workflow, producing a set of candidate alignments for each cDNA fragment in a
2143     # pair of SAM alignment files (for paired-end reads).
2144     def spliced_alignment(params,
2145     bwt_idx_prefix,
2146     sam_header_filename,
2147     ref_fasta,
2148     read_len,
2149     segment_len,
2150     left_reads,
2151     right_reads,
2152     user_supplied_junctions,
2153     user_supplied_insertions,
2154     user_supplied_deletions):
2155    
2156     possible_juncs = []
2157     possible_juncs.extend(user_supplied_junctions)
2158    
2159     possible_insertions = []
2160     possible_insertions.extend(user_supplied_insertions)
2161    
2162     possible_deletions = []
2163     possible_deletions.extend(user_supplied_deletions)
2164    
2165     maps = { left_reads : [], right_reads : [] }
2166     #single_segments = False
2167    
2168     # Perform the first part of the TopHat workflow on the left and right
2169     # reads of paired ends separately - we'll use the pairing information later
2170     for reads in [left_reads, right_reads]:
2171     if reads == None or os.path.getsize(reads) < 25:
2172     continue
2173     fbasename=getFileBaseName(reads)
2174     unspliced_out = tmp_dir + fbasename + ".mapped"
2175     #if use_zpacker: unspliced_out+=".z"
2176     unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2177    
2178     num_segs = read_len / segment_len
2179     if read_len % segment_len >= min(segment_len -2, 20):
2180     num_segs += 1
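         # e.g. with segment_len=25: read_len=75 gives num_segs=3 (no
         # remainder); read_len=70 gives 70/25=2 plus a 20bp remainder
         # >= min(23,20), so num_segs=3 as well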
2181    
2182     # daehwan - check this out
2183     if num_segs <= 1:
2184     print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2185    
2186     # Perform the initial Bowtie mapping of the full length reads
2187     (unspliced_map, unmapped) = bowtie(params,
2188     bwt_idx_prefix,
2189     reads,
2190     "fastq",
2191     params.initial_read_mismatches,
2192     unspliced_out,
2193     unmapped_unspliced)
2194    
2195     unspliced_sam = tmp_dir+fbasename+'.unspl.bam'
2196    
2197     # Convert the initial Bowtie maps into BAM format.
2198     # TODO: this step should be replaced by a more direct conversion
2199     # because Bowtie can actually output SAM format now
2200     join_mapped_segments(params,
2201     sam_header_filename,
2202     reads,
2203     ref_fasta,
2204     ["/dev/null"],
2205     ["/dev/null"],
2206     ["/dev/null"],
2207     [unspliced_map],
2208     [],
2209     unspliced_sam)
2210     # Using the num_segs value computed above, decide which
2211     # junction discovery strategy to use
2212     if num_segs == 1:
2213     segment_len = read_len
2214     elif num_segs >= 3:
2215     # if we have at least three segments, just use split segment search,
2216     # which is the most sensitive and specific, fastest, and lightest-weight.
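         # (these search flags are three-valued: None means the user left the
         # decision to TopHat, so the assignments below only pin still-unset
         # flags to False)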
2217     if not params.closure_search:
2218     params.closure_search = False
2219     if not params.coverage_search:
2220     params.coverage_search = False
2221     if not params.butterfly_search:
2222     params.butterfly_search = False
2223     if not params.closure_search:
2224     params.closure_search = False
2225     seg_maps = []
2226     unmapped_segs = []
2227     segs = []
2228     if num_segs > 1:
2229     # split up the IUM reads into segments
2230     read_segments = split_reads(unmapped_unspliced,
2231     tmp_dir + fbasename,
2232     False,
2233     params,
2234     segment_len)
2235    
2236     # Map each segment file independently with Bowtie
2237     for i in range(len(read_segments)):
2238     seg = read_segments[i]
2239     fbasename=getFileBaseName(seg)
2240     seg_out = tmp_dir + fbasename + ".bwtout"
2241     if use_zpacker: seg_out += ".z"
2242     unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
2243     if use_BWT_FIFO:
2244     unmapped_seg += ".z"
2245     extra_output = "(%d/%d)" % (i+1, len(read_segments))
2246     (seg_map, unmapped) = bowtie_segment(params,
2247     bwt_idx_prefix,
2248     seg,
2249     "fastq",
2250     seg_out,
2251     unmapped_seg,
2252     extra_output)
2253     seg_maps.append(seg_map)
2254     unmapped_segs.append(unmapped)
2255     segs.append(seg)
2256    
2257     # Collect the segment maps for left and right reads together
2258     maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
2259     else:
2260     # if there's only one segment, just collect the initial map as the only
2261     # map to be used downstream for coverage-based junction discovery
2262     read_segments = [reads]
2263     maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_unspliced], [unmapped_unspliced])
2264    
2265     # Unless the user asked not to discover new junctions, start that process
2266     # here
2267    
2268     if params.find_novel_juncs or params.find_novel_indels:
2269     left_reads_map = maps[left_reads].unspliced_bwt
2270     left_seg_maps = maps[left_reads].seg_maps
2271     unmapped_reads = maps[left_reads].unmapped_segs
2272     if right_reads != None:
2273     right_reads_map = maps[right_reads].unspliced_bwt
2274     right_seg_maps = maps[right_reads].seg_maps
2275     unmapped_reads.extend(maps[right_reads].unmapped_segs)
2276     else:
2277     right_reads_map = None
2278     right_seg_maps = None
2279    
2280     # Call segment_juncs to infer a list of possible splice junctions from
2281     # the regions of the genome covered in the initial and segment maps
2282     juncs = junctions_from_segments(params,
2283     left_reads,
2284     left_reads_map,
2285     left_seg_maps,
2286     right_reads,
2287     right_reads_map,
2288     right_seg_maps,
2289     unmapped_reads,
2290     "fastq",
2291     ref_fasta)
2292     if params.find_novel_juncs:
2293     if os.path.getsize(juncs[0]) != 0:
2294     possible_juncs.append(juncs[0])
2295     if params.find_novel_indels:
2296     if os.path.getsize(juncs[1]) != 0:
2297     possible_insertions.append(juncs[1])
2298     if os.path.getsize(juncs[2]) != 0:
2299     possible_deletions.append(juncs[2])
2300     # Optionally, and for paired reads only, use a closure search to
2301     # discover additional junctions
2302     if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
2303     juncs = junctions_from_closures(params,
2304     [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
2305     [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
2306     ref_fasta)
2307     if os.path.getsize(juncs[0]) != 0:
2308     possible_juncs.extend(juncs)
2309    
2310     if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2311     spliced_seg_maps = None
2312     junc_idx_prefix = None
2313     else:
2314     junc_idx_prefix = "segment_juncs"
2315     if len(possible_insertions) == 0:
2316     possible_insertions.append(os.devnull)
2317     # print >> sys.stderr, "Warning: insertions database is empty!"
2318     if len(possible_deletions) == 0:
2319     possible_deletions.append(os.devnull)
2320     # print >> sys.stderr, "Warning: deletions database is empty!"
2321     if len(possible_juncs) == 0:
2322     possible_juncs.append(os.devnull)
2323     print >> sys.stderr, "Warning: junction database is empty!"
2324     if junc_idx_prefix != None:
2325     build_juncs_index(3,
2326     segment_len,
2327     junc_idx_prefix,
2328     possible_juncs,
2329     possible_insertions,
2330     possible_deletions,
2331     ref_fasta,
2332     params.read_params.color)
2333    
2334     # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2335     # index with Bowtie
2336     for reads in [left_reads, right_reads]:
2337     spliced_seg_maps = []
2338     if reads == None or reads not in maps:
2339     continue
2340    
2341     if junc_idx_prefix != None:
2342     i = 0
2343     for seg in maps[reads].segs:
2344     fsegname = getFileBaseName(seg)
2345     seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
2346     if use_zpacker: seg_out += ".z"
2347     extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
2348     (seg_map, unmapped) = bowtie_segment(params,
2349     tmp_dir + junc_idx_prefix,
2350     seg,
2351     "fastq",
2352     seg_out,
2353     None,
2354     extra_output)
2355     spliced_seg_maps.append(seg_map)
2356     i += 1
2357    
2358     # Join the contiguous and spliced segment hits into full-length read
2359     # alignments or convert a spliced segment hit into a genomic-coordinate hit
2360     rfname = getFileBaseName(reads)
2361     rfdir = getFileDir(reads)
2362     mapped_reads = rfdir + rfname + ".candidates.bam"
2363     merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2364     unspl_bwtfile = maps[reads].unspliced_bwt
2365     unspl_samfile = maps[reads].unspliced_sam
2366    
2367     join_mapped_segments(params,
2368     sam_header_filename,
2369     reads,
2370     ref_fasta,
2371     possible_juncs,
2372     possible_insertions,
2373     possible_deletions,
2374     maps[reads].seg_maps,
2375     spliced_seg_maps,
2376     mapped_reads)
2377    
2378     if num_segs > 1:
2379     # Merge the spliced and unspliced full length alignments into a
2380     # single SAM file.
2381     # The individual SAM files are all already sorted in
2382     # increasing read ID order.
2383     # NOTE: We should also be able to address bug #134 here, by replacing
2384     # contiguous alignments that poke a small distance into an intron with
2385     # the correct spliced alignment.
2386    
2387     try:
2388     merge_cmd = [ prog_path("bam_merge"), merged_map,
2389     maps[reads].unspliced_sam,
2390     mapped_reads ]
2391     print >> run_log, " ".join(merge_cmd)
2392     ret = subprocess.call( merge_cmd,
2393     stderr=open(logging_dir + "sam_merge.log", "w") )
2394     if ret != 0:
2395     die(fail_str+"Error executing: "+" ".join(merge_cmd))
2396     except OSError, o:
2397     die(fail_str+"Error: "+str(o))
2398     maps[reads] = [merged_map]
2399    
2400     if not params.system_params.keep_tmp:
2401     os.remove(mapped_reads)
2402     os.remove(unspl_samfile)
2403     os.remove(unspl_bwtfile)
2404     else:
2405     os.rename(mapped_reads, merged_map)
2406     maps[reads] = [merged_map]
2407     if not params.system_params.keep_tmp:
2408     os.remove(unspl_samfile)
2409     os.remove(unspl_bwtfile)
2410    
2411     return maps
2412    
2413     def die(msg=None):
2414     if msg is not None:
2415     print >> sys.stderr, msg
2416     sys.exit(1)
2417    
2418     # rough equivalent of the 'which' command to find external programs
2419     # (current script path is tested first, then PATH envvar)
2420     def which(program):
2421     def is_executable(fpath):
2422     return os.path.exists(fpath) and os.access(fpath, os.X_OK)
2423     fpath, fname = os.path.split(program)
2424     if fpath:
2425     if is_executable(program):
2426     return program
2427     else:
2428     progpath = os.path.join(bin_dir, program)
2429     if is_executable(progpath):
2430     return progpath
2431     for path in os.environ["PATH"].split(os.pathsep):
2432     progpath = os.path.join(path, program)
2433     if is_executable(progpath):
2434     return progpath
2435     return None
2436    
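         # e.g. which("bowtie-build") might return "/usr/bin/bowtie-build",
         # or None if the program is neither next to this script nor on PATH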
2437     def prog_path(program):
2438     progpath=which(program)
2439     if progpath == None:
2440     die("Error locating program: "+program)
2441     return progpath
2442    
2443     # FIXME: this should get set during the make dist autotools phase of the build
2444     def get_version():
2445     return "1.3.2"
2446    
2447     def mlog(msg):
2448     print >> sys.stderr, "[DBGLOG]:"+msg
2449    
2450     def test_input_file(filename):
2451     try:
2452     test_file = open(filename, "r")
2453     test_file.close()
2454     except IOError, o:
2455     die("Error: Opening file %s" % filename)
2456     return
2457    
2458    
2459     def main(argv=None):
2460     warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2461    
2462     # Initialize default parameter values
2463     params = TopHatParams()
2464    
2465     try:
2466     if argv is None:
2467     argv = sys.argv
2468     args = params.parse_options(argv)
2469     params.check()
2470    
2471     bwt_idx_prefix = args[0]
2472     left_reads_list = args[1]
2473     left_quals_list, right_quals_list = [], []
2474     if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2475     if params.read_params.mate_inner_dist == None:
2476     die("Error: you must set the mean inner distance between mates with -r")
2477    
2478     right_reads_list = args[2]
2479     if params.read_params.quals:
2480     left_quals_list = args[3]
2481     right_quals_list = args[4]
2482     else:
2483     right_reads_list = None
2484     if params.read_params.quals:
2485     left_quals_list = args[2]
2486    
2487     print >> sys.stderr
2488     print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
2489     print >> sys.stderr, "-----------------------------------------------"
2490    
2491     start_time = datetime.now()
2492     prepare_output_dir()
2493    
2494     global run_log
2495     run_log = open(logging_dir + "run.log", "w", 0)
2496     global run_cmd
2497     run_cmd = " ".join(argv)
2498     print >> run_log, run_cmd
2499    
2500     # Validate all the input files, check all prereqs before committing
2501     # to the run
2502     (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2503    
2504     check_bowtie()
2505     check_samtools()
2506     print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
2507     sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2508     print >> sys.stderr, "[%s] Preparing reads" % (right_now())
2509    
2510     if not params.skip_check_reads:
2511     reads_list = left_reads_list
2512     if right_reads_list:
2513     reads_list = reads_list + "," + right_reads_list
2514     params.read_params = check_reads_format(params, reads_list)
2515    
2516     user_supplied_juncs = []
2517     user_supplied_insertions = []
2518     user_supplied_deletions = []
2519    
2520     if params.gff_annotation and params.find_GFF_juncs:
2521     test_input_file(params.gff_annotation)
2522     (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2523     if found_juncs:
2524     user_supplied_juncs.append(gtf_juncs)
2525     if params.raw_junctions:
2526     test_input_file(params.raw_junctions)
2527     user_supplied_juncs.append(params.raw_junctions)
2528    
2529     if params.raw_insertions:
2530     test_input_file(params.raw_insertions)
2531     user_supplied_insertions.append(params.raw_insertions)
2532    
2533     if params.raw_deletions:
2534     test_input_file(params.raw_deletions)
2535     user_supplied_deletions.append(params.raw_deletions)
2536    
2537     global unmapped_reads_fifo
2538     unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2539    
2540     # Now start the time consuming stuff
2541     left_kept_reads, left_reads_info = prep_reads(params,
2542     left_reads_list,
2543     left_quals_list,
2544     "left_kept_reads", "Left ")
2545     min_read_len=left_reads_info.min_len
2546     max_read_len=left_reads_info.max_len
2547     if right_reads_list != None:
2548     right_kept_reads, right_reads_info = prep_reads(params,
2549     right_reads_list,
2550     right_quals_list,
2551     "right_kept_reads", "Right")
2552     min_read_len=min(right_reads_info.min_len, min_read_len)
2553     max_read_len=max(right_reads_info.max_len, max_read_len)
2554     else:
2555     right_kept_reads = None
2556     seed_len=params.read_params.seed_length
2557     if seed_len != None:
2558     seed_len = max(seed_len, min_read_len)
2559     else:
2560     seed_len = max_read_len
2561     params.read_params.seed_length=seed_len
2562     # turn off integer-quals
2563     if params.read_params.integer_quals:
2564     params.read_params.integer_quals = False
2565    
2566     mapping = spliced_alignment(params,
2567     bwt_idx_prefix,
2568     sam_header_filename,
2569     ref_fasta,
2570     params.read_params.seed_length,
2571     params.segment_length,
2572     left_kept_reads,
2573     right_kept_reads,
2574     user_supplied_juncs,
2575     user_supplied_insertions,
2576     user_supplied_deletions)
2577    
2578     left_maps = mapping[left_kept_reads]
2579     #left_unmapped_reads = mapping[1]
2580     right_maps = mapping[right_kept_reads]
2581     #right_unmapped_reads = mapping[3]
2582    
2583     compile_reports(params,
2584     sam_header_filename,
2585     left_maps,
2586     left_kept_reads,
2587     right_maps,
2588     right_kept_reads,
2589     params.gff_annotation)
2590    
2591     if not params.system_params.keep_tmp:
2592     for m in left_maps:
2593     os.remove(m)
2594     if left_kept_reads != None:
2595     os.remove(left_kept_reads)
2596     for m in right_maps:
2597     os.remove(m)
2598     if right_kept_reads != None:
2599     os.remove(right_kept_reads)
2600     tmp_files = os.listdir(tmp_dir)
2601     for t in tmp_files:
2602     os.remove(tmp_dir+t)
2603     os.rmdir(tmp_dir)
2604    
2605     finish_time = datetime.now()
2606     duration = finish_time - start_time
2607     print >> sys.stderr,"-----------------------------------------------"
2608     print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
2609    
2610     except Usage, err:
2611     print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
2612     print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
2613     return 2
2614    
2615    
2616     if __name__ == "__main__":
2617     sys.exit(main())
