root/gclib/tophat_cpp/tophat
Revision: 31
Committed: Tue Aug 2 21:58:18 2011 UTC (8 years, 10 months ago) by gpertea
File size: 111077 byte(s)
Log Message:
trying to get rid of unspliced_bwt in Maps (replace by unspliced_sam)

#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
           [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>     [ default: ./tophat_out ]
    -a/--min-anchor                <int>        [ default: 8 ]
    -m/--splice-mismatches         <0-2>        [ default: 0 ]
    -i/--min-intron-length         <int>        [ default: 50 ]
    -I/--max-intron-length         <int>        [ default: 500000 ]
    -g/--max-multihits             <int>        [ default: 20 ]
    -F/--min-isoform-fraction      <float>      [ default: 0.15 ]
    --max-insertion-length         <int>        [ default: 3 ]
    --max-deletion-length          <int>        [ default: 3 ]
    --solexa-quals
    --solexa1.3-quals              (same as phred64-quals)
    --phred64-quals                (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                     (Solid - color space)
    --color-out
    --library-type                 <string>     (fr-unstranded, fr-firststrand,
                                                 fr-secondstrand)
    -p/--num-threads               <int>        [ default: 1 ]
    -G/--GTF                       <filename>
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>        [ default: 20 ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>    [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>    [ default: gzip ]
    -X/--unmapped-fifo             [use mkfifo to compress more temporary
                                    files for color space reads]

Advanced Options:
    --initial-read-mismatches      <int>        [ default: 2 ]
    --segment-mismatches           <int>        [ default: 2 ]
    --segment-length               <int>        [ default: 25 ]
    --bowtie-n                                  [ default: bowtie -v ]
    --min-closure-exon             <int>        [ default: 100 ]
    --min-closure-intron           <int>        [ default: 50 ]
    --max-closure-intron           <int>        [ default: 5000 ]
    --min-coverage-intron          <int>        [ default: 50 ]
    --max-coverage-intron          <int>        [ default: 20000 ]
    --min-segment-intron           <int>        [ default: 50 ]
    --max-segment-intron           <int>        [ default: 500000 ]
    --no-sort-bam                               [Output BAM is not coordinate-sorted]
    --no-convert-bam                            [Do not convert to bam format.
                                                 Output is <output_dir>/accepted_hits.sam.
                                                 Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>     (read group ID)
    --rg-sample                    <string>     (sample ID)
    --rg-library                   <string>     (library ID)
    --rg-description               <string>     (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>     (e.g. Illumina lane ID)
    --rg-center                    <string>     (sequencing center name)
    --rg-date                      <string>     (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>     (Sequencing platform descriptor)
'''

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is used to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor", "--min-anchor-length"):
                    # getopt registers the long form as "min-anchor-length=",
                    # so it must be matched here as well
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

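    # Usage sketch (illustrative example, values taken from the defaults in
    # __init__ below): check() calls die() on any out-of-range value.
    #   sc = TopHatParams.SpliceConstraints(8, 50, 500000, 0, 0.15)
    #   sc.parse_options([("-a", "10"), ("-I", "100000")])
    #   sc.check()   # passes; splice_mismatches = 5 would abort the run
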
    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper = 'gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_cpus > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p'+str(self.num_cpus))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_cpus > 1:
                cmdline.extend(['-p'+str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program "+self.zipper)

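    # Usage sketch (illustrative): with a parallel zipper and several threads,
    # parse_options() adds a thread-count flag for pigz/pbzip2, and cmd()
    # reproduces the settings for the pipeline's helper programs:
    #   sp = TopHatParams.SystemParams(4, False)
    #   sp.parse_options([("-z", "pigz")])  # zipper_opts becomes ['-p4']
    #   sp.cmd()                            # ['-z', 'pigz', '-p4']
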
    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_scale
                                           False,
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           False,   # SOLiD - color out instead of base pair,
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_cpus)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: "+', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd
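
    # Usage sketch (illustrative): with the defaults from __init__, cmd()
    # starts with flags like
    #   ['--min-anchor', '8', '--splice-mismatches', '0',
    #    '--min-report-intron', '50', '--max-report-intron', '500000', ...]
    # which pipeline executables such as prep_reads parse again on their side.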

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args
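
    # Usage sketch (illustrative, file names made up): an invocation such as
    #   tophat -p 4 -o my_out -G genes.gtf bwt_idx reads_1.fq reads_2.fq
    # returns args == ['bwt_idx', 'reads_1.fq', 'reads_2.fq'] from
    # parse_options(), with output_dir, logging_dir and tmp_dir rebased
    # onto my_out/.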


def getFileDir(filepath):
    #if fullpath, returns path including the ending /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > len(fext):
            return fbase
        else:
            return fname
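
# Worked examples for getFileBaseName() (hypothetical file names):
#   getFileBaseName("reads_1.fq")     --> "reads_1"
#   getFileBaseName("reads_1.fq.gz")  --> "reads_1"  (compression suffix peeled first)
#   getFileBaseName("reads_1.fasta")  --> "reads_1"
#   getFileBaseName("f.list")         --> "f.list"   (stem shorter than extension)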

# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
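
# Usage sketch (illustrative): every de/compression pipe spawned below passes
# this handler so an early-exiting reader does not trigger Python's default
# SIGPIPE traceback, e.g.:
#   proc = subprocess.Popen(['gzip', '-cd'], preexec_fn=subprocess_setup,
#                           stdin=open('reads.fq.z', 'rb'),  # hypothetical file
#                           stdout=subprocess.PIPE)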

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null', 'w'))

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                # skip lines carrying the CL (command line) field
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)
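
# Illustrative output of get_index_sam_header() (values made up; the @RG line
# appears only when --rg-id/--rg-sample were given, and @SQ lines are copied
# verbatim from bowtie's own header, sorted by sequence name):
#   @HD   VN:1.0   SO:coordinate
#   @RG   ID:rg1   SM:sample1   LB:lib1   PL:ILLUMINA
#   @SQ   SN:chr1  LN:197195432
#   @PG   ID:TopHat   VN:<version>   CL:<full command line>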

# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path = prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    elif bowtie_version < [0, 12, 3, 0]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str



class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines = 10 # allow maximum 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
            #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen = 0
            #now read next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break #end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of seq string
            #loop until qv has the same length as seq
            if self.isColor and qstr[0] == "!":
                #some color quality strings have a '!' character at the beginning that should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen + self.isColor:
                raise ValueError("Length mismatch between sequence and quality strings "+ \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to readline function of f
        line = fline() # this will use the buffer line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # read more sequence lines until the next '>' record header:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
            #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: #return previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader
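
# Usage sketch (illustrative): reading the first record of a (possibly
# compressed) FASTQ file through ZReader below, given a TopHatParams
# instance named params:
#   zf = ZReader("reads.fq.gz", params.system_params)   # hypothetical file
#   freader = FastxReader(zf.file, is_color=0, fname=zf.fname)
#   seqid, seqstr, seq_len, qstr = freader.nextRecord()
#   zf.close()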

class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        s = filename.lower()
        if s.endswith(".bam"):
            pipecmd = ["bam2fastx", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd = ['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd = ['bzip2']
                if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                if len(pipecmd) > 0:
                    if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but we must still be sure it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd = [sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                pipecmd += ['-cd']
        if pipecmd:
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: #no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None
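
# Usage sketch (illustrative): round-tripping a record through the two classes
# above; with use_zpacker on, "tmp.fq.z" (made-up name) is written and read
# back through the external zipper transparently:
#   zw = ZWriter("tmp.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()   # close() waits for the zipper to flush
#   zr = ZReader("tmp.fq.z", params.system_params, guess=False)
#   print zr.file.read()
#   zr.close()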

# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 #just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    while size > 0 and fbytes+block*1024 > 0:
        if (fbytes+block*1024 > 0):
            ##Seek back once more, if possible
            f.seek(block*1024, 2)
        else:
            #Seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block*1024 > 0):
        f.seek(block*1024, 2)
    else:
        f.seek(0, 0)
    f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "\n".join(lastBlocks[-lines:])

# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
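
# Worked example: for a timedelta of 1h 2m 3s, formatTD() returns '01:02:03';
# td.seconds wraps at 24 hours, so runs spanning whole days lose the day count
# (hence the FIXME above).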

class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception, e:
            die(fail_str+"Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped in too many places"

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file = output_dir + output_name + ".info"
    filter_cmd += ["--aux-outfile="+info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd = []
    if use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    shell_cmd += ' >' + kept_reads_filename
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)
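
# Illustrative shell pipeline logged by prep_reads() above when use_zpacker
# is on (paths made up, most pass-through flags omitted):
#   prep_reads --fastq --aux-outfile=tophat_out/left_kept_reads.info \
#       reads_1.fq | gzip -c - > tophat_out/tmp/left_kept_reads.fq.z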
1419    
1420     # Call bowtie
1421     def bowtie(params,
1422     bwt_idx_prefix,
1423     reads_list,
1424     reads_format,
1425     num_mismatches,
1426     mapped_reads,
1427     unmapped_reads = None, #not needed vs segment_juncs
1428     extra_output = ""):
1429     start_time = datetime.now()
1430     bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1431     readfile_basename=getFileBaseName(reads_list)
1432     print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
1433     readfile_basename, bwt_idx_name, extra_output)
1434    
1435     bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1436     unmapped_reads_bwt=unmapped_reads+".fq"
1437     use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1438     if use_FIFO: #this should always be False unless color reads are used with -X option
1439     global unmapped_reads_fifo
1440     if os.path.exists(unmapped_reads_fifo):
1441     os.remove(unmapped_reads_fifo)
1442     try:
1443     os.mkfifo(unmapped_reads_fifo)
1444     except OSError, o:
1445     die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1446     unmapped_reads_bwt=unmapped_reads_fifo
1447    
1448     # Launch Bowtie
1449     try:
1450     bowtie_cmd = [bowtie_path]
1451     bam_input=0
1452     unzip_cmd=None
1453     if reads_list.endswith('.bam'):
1454     bam_input=1
1455     unzip_cmd=[ prog_path('bam2fastx') ]
1456     if reads_format:
1457     unzip_cmd.append("--"+reads_format)
1458     unzip_cmd+=[reads_list]
1459    
1460     if reads_format == "fastq":
1461     bowtie_cmd += ["-q"]
1462     elif reads_format == "fasta":
1463     bowtie_cmd += ["-f"]
1464    
1465     if params.read_params.color:
1466     bowtie_cmd += ["-C", "--col-keepends"]
1467     if unmapped_reads:
1468     bowtie_cmd += ["--un", unmapped_reads_bwt]
1469     # "--max", "/dev/null"]
1470     if use_zpacker and (unzip_cmd is None):
1471     unzip_cmd=[ params.system_params.zipper ]
1472     unzip_cmd.extend(params.system_params.zipper_opts)
1473     unzip_cmd+=['-cd']
1474    
1475     fifo_pid=None
1476     if use_FIFO:
1477     unm_zipcmd=[ params.system_params.zipper ]
1478     unm_zipcmd.extend(params.system_params.zipper_opts)
1479     unm_zipcmd+=['-c']
1480     unmapped_reads+=".fq.z"
1481     print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads + ' & '
1482     fifo_pid=os.fork()
1483     if fifo_pid==0:
1484     def on_sig_exit(sig, func=None):
1485     os._exit(os.EX_OK)
1486     signal.signal(signal.SIGTERM, on_sig_exit)
1487     subprocess.call(unm_zipcmd,
1488     stdin=open(unmapped_reads_fifo, "r"),
1489     stdout=open(unmapped_reads, "wb"))
1490     os._exit(os.EX_OK)
1491    
1492     bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1493     "-p", str(params.system_params.num_cpus),
1494     "-k", str(params.max_hits),
1495     "-m", str(params.max_hits),
1496     "--max", "/dev/null",
1497     "--sam-nohead"]
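# (For reference: "-k N" asks Bowtie to report up to N alignments per read,
# "-m N" suppresses a read entirely if it has more than N alignments,
# "--max <file>" is where those suppressed reads would be written -- here
# discarded via /dev/null -- and "--sam-nohead" omits the SAM @ header
# lines, since fix_map_ordering supplies the proper header downstream.)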
1498     fix_map_cmd = [prog_path('fix_map_ordering')]
1499     sam_zipcmd=None
1500     if unmapped_reads:
1501     #mapping on the genome
1502     #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1503     mapped_reads += ".bam"
1504     if params.read_params.color:
1505     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
1506     else:
1507     unmapped_reads += ".bam"
1508     fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads]
1509     else:
1510     #mapping on segment_juncs (spliced potential junctions)
1511     # don't use SAM headers, could be slow -- better compress SAM output directly
1512     mapped_reads += ".sam"
1513     fix_map_cmd += ["-"]
1514     if use_zpacker:
1515     fix_map_cmd += ["-"]
1516     sam_zipcmd=[ params.system_params.zipper ]
1517     sam_zipcmd.extend(params.system_params.zipper_opts)
1518     sam_zipcmd += ["-c"]
1519     mapped_reads += ".z"
1520     else:
1521     fix_map_cmd += [mapped_reads]
1522    
1523     bowtie_cmd+=[bwt_idx_prefix]
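# Wire up the pipeline built below; depending on the input type and on
# use_zpacker, the resulting shell-equivalent is one of:
#   bowtie ... reads | fix_map_ordering ...
#   <unzip_cmd> [< reads] | bowtie ... - | fix_map_ordering ...
#   ... | fix_map_ordering - - | <zipper> -c > mapped_reads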
1524     bowtie_proc=None
1525     shellcmd=""
1526     unzip_proc=None
1527     mapzip_proc=None
1528     if unzip_cmd:
1529     # could be bam2fastx
1530     if bam_input:
1531     unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
1532     shellcmd=' '.join(unzip_cmd) + "|"
1533     else:
1534     unzip_proc = subprocess.Popen(unzip_cmd,
1535     stdin=open(reads_list, "rb"),
1536     stdout=subprocess.PIPE)
1537     shellcmd=' '.join(unzip_cmd) + "< " +reads_list +"|"
1538     bowtie_cmd += ['-']
1539     bowtie_proc = subprocess.Popen(bowtie_cmd,
1540     stdin=unzip_proc.stdout,
1541     stdout=subprocess.PIPE,
1542     stderr=bwt_log)
1543     unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
1544     else:
1545     bowtie_cmd += [reads_list]
1546     bowtie_proc = subprocess.Popen(bowtie_cmd,
1547     stdout=subprocess.PIPE,
1548     stderr=bwt_log)
1549     shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
1550     if sam_zipcmd:
1551     shellcmd += "|" + " ".join(sam_zipcmd)+" > " + mapped_reads
1552     fix_order_proc = subprocess.Popen(fix_map_cmd,
1553     stdin=bowtie_proc.stdout,
1554     stdout=subprocess.PIPE)
1555     bowtie_proc.stdout.close()
1556     mapzip_proc = subprocess.Popen(sam_zipcmd,
1557     stdin=fix_order_proc.stdout,
1558     stdout=open(mapped_reads, "wb"))
1559     fix_order_proc.stdout.close()
1560     else:
1561     fix_order_proc = subprocess.Popen(fix_map_cmd,
1562     stdin=bowtie_proc.stdout)
1563     bowtie_proc.stdout.close()
1564    
1565     print >> run_log, shellcmd
1566     if mapzip_proc:
1567     mapzip_proc.communicate()
1568     else:
1569     fix_order_proc.communicate()
1570     if use_FIFO:
1571     if fifo_pid and not os.path.exists(unmapped_reads):
1572     try:
1573     os.kill(fifo_pid, signal.SIGTERM)
1574     except:
1575     pass
1576     except OSError, o:
1577     die(fail_str+"Error: "+str(o))
1578    
1579     # Success
1580     #finish_time = datetime.now()
1581     #duration = finish_time - start_time
1582     #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
1583     if use_FIFO:
1584     try:
1585     os.remove(unmapped_reads_fifo)
1586     except:
1587     pass
1588     return (mapped_reads, unmapped_reads)
1589    
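# Sketch of the FIFO trick used above for color reads with -X (illustration
# only, shown with a generic 'gzip'; the real code uses the configured
# zipper): the parent points Bowtie's --un output at a named pipe, while a
# forked child drains that pipe through the compressor into the final .fq.z
# file, so no uncompressed copy of the unmapped reads ever touches the disk.
#
#   os.mkfifo(fifo_path)
#   pid = os.fork()
#   if pid == 0:  # child: fifo -> gzip -> file
#       subprocess.call(["gzip", "-c"],
#                       stdin=open(fifo_path, "r"),
#                       stdout=open(out_path, "wb"))
#       os._exit(os.EX_OK)
#   # parent: launch bowtie with "--un" pointed at fifo_path ...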
1590     def bowtie_segment(params,
1591     bwt_idx_prefix,
1592     reads_list,
1593     reads_format,
1594     mapped_reads,
1595     unmapped_reads = None,
1596     extra_output = ""):
1597    
1598     backup_bowtie_alignment_option = params.bowtie_alignment_option
1599     params.bowtie_alignment_option = "-v"
1600     params.max_hits *= 2
1601    
1602     result = bowtie(params,
1603     bwt_idx_prefix,
1604     reads_list,
1605     reads_format,
1606     params.segment_mismatches,
1607     mapped_reads,
1608     unmapped_reads,
1609     extra_output)
1610    
1611     params.bowtie_alignment_option = backup_bowtie_alignment_option
1612     params.max_hits /= 2
1613     return result
1614    
1615     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1616     def get_gtf_juncs(gff_annotation):
1617     print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1618     gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1619    
1620     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1621    
1622     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1623     gtf_juncs_out = open(gtf_juncs_out_name, "w")
1624    
1625     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1626     try:
1627     print >> run_log, " ".join(gtf_juncs_cmd)
1628     retcode = subprocess.call(gtf_juncs_cmd,
1629     stderr=gtf_juncs_log,
1630     stdout=gtf_juncs_out)
1631     # gtf_juncs returned an error
1632     if retcode == 1:
1633     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1634     return (False, gtf_juncs_out_name)
1635     elif retcode != 0:
1636     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1637     # gtf_juncs not found
1638     except OSError, o:
1639     errmsg=fail_str+str(o)+"\n"
1640     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1641     errmsg+="Error: gtf_juncs not found on this system"
1642     die(errmsg)
1643     return (True, gtf_juncs_out_name)
1644    
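# For reference, gtf_juncs emits one junction per line in TopHat's
# tab-delimited .juncs format (the same format accepted by -j/--raw-juncs):
# <chrom> <left> <right> <+/->, where left and right are zero-based
# coordinates of the last base of the donor exon and the first base of the
# acceptor exon. An illustrative (made-up) record:
#
#   chr1    14829   14969   +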
1645     # Call bowtie-build on the FASTA file of synthetic splice junction sequences
1646     def build_juncs_bwt_index(external_splice_prefix, color):
1647     print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1648     bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1649    
1650     #user_splices_out_prefix = output_dir + "user_splices_idx"
1651    
1652     bowtie_build_cmd = [prog_path("bowtie-build")]
1653     if color:
1654     bowtie_build_cmd += ["-C"]
1655    
1656     bowtie_build_cmd += [external_splice_prefix + ".fa",
1657     external_splice_prefix]
1658     try:
1659     print >> run_log, " ".join(bowtie_build_cmd)
1660     retcode = subprocess.call(bowtie_build_cmd,
1661     stdout=bowtie_build_log)
1662    
1663     if retcode != 0:
1664     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1665     except OSError, o:
1666     errmsg=fail_str+str(o)+"\n"
1667     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1668     errmsg+="Error: bowtie-build not found on this system"
1669     die(errmsg)
1670     return external_splice_prefix
1671    
1672     # Build a splice index from a .juncs file, suitable for use with specified read
1673     # (or read segment) lengths
1674     def build_juncs_index(min_anchor_length,
1675     read_length,
1676     juncs_prefix,
1677     external_juncs,
1678     external_insertions,
1679     external_deletions,
1680     reference_fasta,
1681     color):
1682     print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1683    
1684     juncs_file_list = ",".join(external_juncs)
1685     insertions_file_list = ",".join(external_insertions)
1686     deletions_file_list = ",".join(external_deletions)
1687    
1688    
1689     juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1690    
1691     external_splices_out_prefix = tmp_dir + juncs_prefix
1692     external_splices_out_name = external_splices_out_prefix + ".fa"
1693    
1694     external_splices_out = open(external_splices_out_name, "w")
1695     # juncs_db_cmd = [bin_dir + "juncs_db",
1696     juncs_db_cmd = [prog_path("juncs_db"),
1697     str(min_anchor_length),
1698     str(read_length),
1699     juncs_file_list,
1700     insertions_file_list,
1701     deletions_file_list,
1702     reference_fasta]
1703     try:
1704     print >> run_log, " ".join(juncs_db_cmd)
1705     retcode = subprocess.call(juncs_db_cmd,
1706     stderr=juncs_db_log,
1707     stdout=external_splices_out)
1708    
1709     if retcode != 0:
1710     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1711     # juncs_db not found
1712     except OSError, o:
1713     errmsg=fail_str+str(o)+"\n"
1714     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1715     errmsg+="Error: juncs_db not found on this system"
1716     die(errmsg)
1717    
1718     external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1719     return external_splices_out_prefix
1720    
1721     # Print out the sam header, embedding the user's specified library properties.
1722     # FIXME: also needs SQ dictionary lines
1723     def write_sam_header(read_params, sam_file):
1724     print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1725     if read_params.read_group_id and read_params.sample_id:
1726     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1727     read_params.sample_id)
1728     if read_params.library_id:
1729     rg_str += "\tLB:%s" % read_params.library_id
1730     if read_params.description:
1731     rg_str += "\tDS:%s" % read_params.description
1732     if read_params.seq_platform_unit:
1733     rg_str += "\tPU:%s" % read_params.seq_platform_unit
1734     if read_params.seq_center:
1735     rg_str += "\tCN:%s" % read_params.seq_center
1736     if read_params.mate_inner_dist:
1737     rg_str += "\tPI:%s" % read_params.mate_inner_dist
1738     if read_params.seq_run_date:
1739     rg_str += "\tDT:%s" % read_params.seq_run_date
1740     if read_params.seq_platform:
1741     rg_str += "\tPL:%s" % read_params.seq_platform
1742    
1743     print >> sam_file, rg_str
1744     print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1745    
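# With a read group configured, the emitted header looks like the following
# (tag values here are illustrative, not defaults):
#
#   @HD     VN:1.0  SO:coordinate
#   @RG     ID:rg1  SM:sample1      LB:lib1 PL:ILLUMINA
#   @PG     ID:TopHat       VN:1.3.2        CL:tophat -o out idx reads.fq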
1746     # Write final TopHat output, via tophat_reports and wiggles
1747     def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
1748     print >> sys.stderr, "[%s] Reporting output tracks" % right_now()
1749    
1750     left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1751     left_maps = ','.join(left_maps)
1752    
1753     if len(right_maps) > 0:
1754     right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1755     right_maps = ','.join(right_maps)
1756     log_fname = logging_dir + "reports.log"
1757     report_log = open(log_fname, "w")
1758     junctions = output_dir + "junctions.bed"
1759     insertions = output_dir + "insertions.bed"
1760     deletions = output_dir + "deletions.bed"
1761     coverage = "coverage.wig"
1762     accepted_hits = output_dir + "accepted_hits"
1763     report_cmdpath = prog_path("tophat_reports")
1764     report_cmd = [report_cmdpath]
1765     report_cmd.extend(params.cmd())
1766     report_cmd.extend(["--samtools="+samtools_path])
1767     report_cmd.extend([junctions,
1768     insertions,
1769     deletions,
1770     "-",
1771     left_maps,
1772     left_reads])
1773     if len(right_maps) > 0 and right_reads != None:
1774     report_cmd.append(right_maps)
1775     report_cmd.append(right_reads)
1776     # -- tophat_reports now produces (uncompressed) BAM stream,
1777     # directly piped into samtools sort
1778     try:
1779     if params.report_params.convert_bam:
1780     if params.report_params.sort_bam:
1781     report_proc=subprocess.Popen(report_cmd,
1782     preexec_fn=subprocess_setup,
1783     stdout=subprocess.PIPE,
1784     stderr=report_log)
1785    
1786     bamsort_cmd = ["samtools", "sort", "-", accepted_hits]
1787     bamsort_proc= subprocess.Popen(bamsort_cmd,
1788     preexec_fn=subprocess_setup,
1789     stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1790     stdin=report_proc.stdout)
1791     report_proc.stdout.close()
1792     print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1793     bamsort_proc.communicate()
1794     retcode = report_proc.poll()
1795     if retcode:
1796     die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1797     else:
1798     print >> run_log, " ".join(report_cmd)
1799     report_proc=subprocess.call(report_cmd,
1800     preexec_fn=subprocess_setup,
1801     stdout=open(accepted_hits,"wb"),
1802     stderr=report_log)
1803     os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1804     else:
1805     print >> run_log, " ".join(report_cmd)
1806     report_proc=subprocess.call(report_cmd,
1807     preexec_fn=subprocess_setup,
1808     stdout=open(accepted_hits,"wb"),
1809     stderr=report_log)
1810     tmp_sam = output_dir + "accepted_hits.sam"
1811    
1812     bam_to_sam_cmd = ["samtools", "view", "-h", accepted_hits]
1813     print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1814     bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1815     tmp_sam_file = open(tmp_sam, "w")
1816     ret = subprocess.call(bam_to_sam_cmd,
1817     stdout=tmp_sam_file,
1818     stderr=bam_to_sam_log)
1819     tmp_sam_file.close()
1820     os.remove(accepted_hits)
1821     #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1822     #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1823    
1824     except OSError, o:
1825     die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1826    
1827     # FIXME: put wiggles back!
1828     # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1829     # print >> run_log, " ".join(wig_cmd)
1830     # subprocess.call(wig_cmd,
1831     # stderr=open("/dev/null"))
1832     return (coverage, junctions)
1833    
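# When BAM conversion and sorting are both enabled (the default), the
# pipeline above is equivalent to this shell command (argument lists
# abbreviated):
#
#   tophat_reports ... junctions.bed insertions.bed deletions.bed - \
#       left_maps left_reads [right_maps right_reads] \
#     | samtools sort - accepted_hits
#
# "samtools sort - <prefix>" reads BAM from stdin and writes <prefix>.bam,
# which is why the sorted output lands in accepted_hits.bam.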
1834    
1835     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1836     # for each segment. This function needs to be fixed to support mixed read-length
1837     # inputs.
1838    
1839     def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1840     i = num_files_prev + 1
1841     while i <= num_files:
1842     segfname=prefix+("_seg%d" % i)+extension
1843     out_segf.append(ZWriter(segfname,params.system_params))
1844     i += 1
1845    
1846     def split_reads(reads_filename,
1847     prefix,
1848     fasta,
1849     params,
1850     segment_length):
1851     #reads_file = open(reads_filename)
1852     zreads = ZReader(reads_filename, params.system_params, False)
1853     out_segfiles = []
1854    
1855     if fasta:
1856     extension = ".fa"
1857     else:
1858     extension = ".fq"
1859     if use_zpacker: extension += ".z"
1860     def convert_color_to_bp(color_seq):
1861     decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
1862     'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
1863     'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
1864     'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
1865     'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
1866     '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
1867    
1868     base = color_seq[0]
1869     bp_seq = base
1870     for ch in color_seq[1:]:
1871     base = decode_dic[base+ch]
1872     bp_seq += base
1873     return bp_seq
1874    
1875     def convert_bp_to_color(bp_seq):
1876     encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
1877     'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
1878     'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
1879     'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
1880     'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
1881     '.A':'4', '.C':'4', '.G':'4', '.T':'4',
1882     '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
1883     'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
1884     'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
1885    
1886     base = bp_seq[0]
1887     color_seq = base
1888     for ch in bp_seq[1:]:
1889     color_seq += encode_dic[base + ch]
1890     base = ch
1891    
1892     return color_seq
1893    
1894     def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
1895     if color:
1896     color_offset = 1
1897     read_seq_temp = convert_color_to_bp(read_seq)
1898    
1899     seg_num = 1
1900     while seg_num + 1 < len(offsets):
1901     if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
1902     return
1903     seg_num += 1
1904     else:
1905     color_offset = 0
1906    
1907     seg_num = 0
1908     last_seq_offset = 0
1909     while seg_num + 1 < len(offsets):
1910     f = out_segf[seg_num].file
1911     seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
1912     print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
1913     if color:
1914     print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
1915     else:
1916     print >> f, seg_seq
1917     if not fasta:
1918     seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
1919     print >> f, "+"
1920     print >> f, seg_qual
1921     seg_num += 1
1922     last_seq_offset = offsets[seg_num]
1923    
1924     line_state = 0
1925     read_name = ""
1926     read_seq = ""
1927     read_qual = ""
1928     num_segments = 0
1929     offsets = []
1930     for line in zreads.file:
1931     if line.strip() == "":
1932     continue
1933     if line_state == 0:
1934     read_name = line.strip()
1935     elif line_state == 1:
1936     read_seq = line.strip()
1937    
1938     read_length = len(read_seq)
1939     tmp_num_segments = read_length / segment_length
1940     offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
1941    
1942     # Bowtie's minimum read length here is 20bp, so if the last segment
1943     # is between 20 and segment_length bp long, go ahead and write it out
1944     if read_length % segment_length >= 20:
1945     offsets.append(read_length)
1946     tmp_num_segments += 1
1947     else:
1948     offsets[-1] = read_length
1949    
1950     if tmp_num_segments == 1:
1951     offsets = [0, read_length]
1952    
1953     if tmp_num_segments > num_segments:
1954     open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
1955     num_segments = tmp_num_segments
1956    
1957     if fasta:
1958     split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
1959     elif line_state == 2:
1960     line = line.strip()
1961     else:
1962     read_quals = line.strip()
1963     if not fasta:
1964     split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
1965    
1966     line_state += 1
1967     if fasta:
1968     line_state %= 2
1969     else:
1970     line_state %= 4
1971     zreads.close()
1972     out_fnames=[]
1973     for zf in out_segfiles:
1974     zf.close()
1975     out_fnames.append(zf.fname)
1976     #return [o.fname for o in out_segfiles]
1977     return out_fnames
1978    
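# Worked examples for the segmentation rule above (segment_length = 25):
#   75bp read: offsets [0, 25, 50, 75] -> segments 0-24, 25-49, 50-74
#   60bp read: 60 % 25 = 10 < 20, so the tail is folded into the last
#              segment: offsets [0, 25, 60] -> segments 0-24, 25-59
#   95bp read: 95 % 25 = 20 >= 20, so the tail becomes its own segment:
#              offsets [0, 25, 50, 75, 95]
# And the color-space helpers round-trip, e.g.:
#   convert_bp_to_color("ATGG") == "A310"
#   convert_color_to_bp("A310") == "ATGG"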
1979     # Find possible splice junctions using the "closure search" strategy, and report
1980     # them in closures.juncs. Calls the executable closure_juncs
1981     def junctions_from_closures(params,
1982     left_maps,
1983     right_maps,
1984     ref_fasta):
1985     #print >> sys.stderr, "[%s] " % right_now()
1986     print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1987    
1988     #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1989     #if len(maps) == 0:
1990     # return None
1991     #print >> sys.stderr, left_maps
1992     slash = left_maps[0].rfind('/')
1993     juncs_out = ""
1994     if slash != -1:
1995     juncs_out += left_maps[0][:slash+1]
1996     juncs_out += "closure.juncs"
1997    
1998     juncs_log = open(logging_dir + "closure.log", "w")
1999     juncs_cmdpath=prog_path("closure_juncs")
2000     juncs_cmd = [juncs_cmdpath]
2001    
2002     left_maps = ','.join(left_maps)
2003     right_maps = ','.join(right_maps)
2004    
2005     juncs_cmd.extend(params.cmd())
2006     juncs_cmd.extend([juncs_out,
2007     ref_fasta,
2008     left_maps,
2009     right_maps])
2010     try:
2011     print >> run_log, ' '.join(juncs_cmd)
2012     retcode = subprocess.call(juncs_cmd,
2013     stderr=juncs_log)
2014    
2015     # closure_juncs returned an error
2016     if retcode != 0:
2017     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2018     # closure_juncs not found
2019     except OSError, o:
2020     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2021     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2022     die(str(o))
2023     return [juncs_out]
2024    
2025     # Find possible junctions by examining coverage and split segments in the initial
2026     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2027     # segment.insertions, and segment.deletions. Calls the executable
2028     # segment_juncs
2029     def junctions_from_segments(params,
2030     left_reads,
2031     left_reads_map,
2032     left_seg_maps,
2033     right_reads,
2034     right_reads_map,
2035     right_seg_maps,
2036     unmapped_reads,
2037     reads_format,
2038     ref_fasta):
2039     if left_reads_map != left_seg_maps[0]:
2040     print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2041     out_path=getFileDir(left_seg_maps[0])
2042     juncs_out=out_path+"segment.juncs"
2043     insertions_out=out_path+"segment.insertions"
2044     deletions_out =out_path+"segment.deletions"
2045    
2046     left_maps = ','.join(left_seg_maps)
2047     align_log = open(logging_dir + "segment_juncs.log", "w")
2048     align_cmd = [prog_path("segment_juncs")]
2049    
2050     align_cmd.extend(params.cmd())
2051    
2052     align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2053     ref_fasta,
2054     juncs_out,
2055     insertions_out,
2056     deletions_out,
2057     left_reads,
2058     left_reads_map,
2059     left_maps])
2060     if right_seg_maps != None:
2061     right_maps = ','.join(right_seg_maps)
2062     align_cmd.extend([right_reads, right_reads_map, right_maps])
2063     try:
2064     print >> run_log, " ".join(align_cmd)
2065     retcode = subprocess.call(align_cmd,
2066     preexec_fn=subprocess_setup,
2067     stderr=align_log)
2068    
2069     # segment_juncs returned an error
2070     if retcode != 0:
2071     die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2072     # segment_juncs not found
2073     except OSError, o:
2074     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2075     print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2076     die(str(o))
2077    
2078     return [juncs_out, insertions_out, deletions_out]
2079    
2080     # Joins mapped segments into full-length read alignments via the executable
2081     # long_spanning_reads
2082     def join_mapped_segments(params,
2083     sam_header_filename,
2084     reads,
2085     ref_fasta,
2086     possible_juncs,
2087     possible_insertions,
2088     possible_deletions,
2089     contig_seg_maps,
2090     spliced_seg_maps,
2091     alignments_out_name):
2092     if len(contig_seg_maps)>1:
2093     print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2094     else:
2095     print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2096     contig_seg_maps = ','.join(contig_seg_maps)
2097    
2098     possible_juncs = ','.join(possible_juncs)
2099     possible_insertions = ",".join(possible_insertions)
2100     possible_deletions = ",".join(possible_deletions)
2101    
2102     align_log = open(logging_dir + "long_spanning_reads.log", "w")
2103     align_cmd = [prog_path("long_spanning_reads")]
2104    
2105     align_cmd.extend(params.cmd())
2106    
2107     align_cmd.append(ref_fasta)
2108     align_cmd.extend([ reads,
2109     possible_juncs,
2110     possible_insertions,
2111     possible_deletions,
2112     contig_seg_maps])
2113    
2114     if spliced_seg_maps != None:
2115     spliced_seg_maps = ','.join(spliced_seg_maps)
2116     align_cmd.append(spliced_seg_maps)
2117    
2118     try:
2119     print >> run_log, " ".join(align_cmd),">",alignments_out_name
2120     join_proc=subprocess.Popen(align_cmd,
2121     preexec_fn=subprocess_setup,
2122     stdout=open(alignments_out_name, "wb"),
2123     stderr=align_log)
2124     join_proc.communicate()
2125     except OSError, o:
2126     die(fail_str+"Error: "+str(o))
2127    
2128    
2129     # This class collects spliced and unspliced alignments for each of the
2130     # left and right read files provided by the user.
2131     class Maps:
2132     def __init__(self,
2133 gpertea 31 # unspliced_bwt,
2134 gpertea 29 unspliced_sam,
2135     seg_maps,
2136     unmapped_segs,
2137     segs):
2138 gpertea 31 #self.unspliced_bwt = unspliced_bwt
2139 gpertea 29 self.unspliced_sam = unspliced_sam
2140     self.seg_maps = seg_maps
2141     self.unmapped_segs = unmapped_segs
2142     self.segs = segs
2143    
2144     # The main alignment routine of TopHat. This function executes most of the
2145     # workflow, producing a set of candidate alignments for each cDNA fragment in a
2146     # pair of SAM alignment files (for paired-end reads).
2147     def spliced_alignment(params,
2148     bwt_idx_prefix,
2149     sam_header_filename,
2150     ref_fasta,
2151     read_len,
2152     segment_len,
2153     left_reads,
2154     right_reads,
2155     user_supplied_junctions,
2156     user_supplied_insertions,
2157     user_supplied_deletions):
2158    
2159     possible_juncs = []
2160     possible_juncs.extend(user_supplied_junctions)
2161    
2162     possible_insertions = []
2163     possible_insertions.extend(user_supplied_insertions)
2164    
2165     possible_deletions = []
2166     possible_deletions.extend(user_supplied_deletions)
2167    
2168     maps = { left_reads : [], right_reads : [] }
2169     #single_segments = False
2170    
2171     # Perform the first part of the TopHat work flow on the left and right
2172     # reads of paired ends separately - we'll use the pairing information later
2173     for reads in [left_reads, right_reads]:
2174     if reads == None or os.path.getsize(reads)<25 :
2175     continue
2176     fbasename=getFileBaseName(reads)
2177     unspliced_out = tmp_dir + fbasename + ".mapped"
2178     unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2179 gpertea 31
2180 gpertea 29 num_segs = read_len / segment_len
2181     if read_len % segment_len >= min(segment_len -2, 20):
2182     num_segs += 1
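# e.g. with segment_len = 25: read_len = 75 gives num_segs = 3
# (75 % 25 = 0 < min(23, 20)); read_len = 95 gives 95 / 25 = 3 with
# remainder 20 >= min(23, 20), so num_segs = 3 + 1 = 4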
2183 gpertea 31
2184 gpertea 29 # daehwan - check this out
2185     if num_segs <= 1:
2186     print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2187    
2188     # Perform the initial Bowtie mapping of the full length reads
2189 gpertea 31 (unspliced_sam, unmapped_reads) = bowtie(params,
2190 gpertea 29 bwt_idx_prefix,
2191     reads,
2192     "fastq",
2193     params.initial_read_mismatches,
2194     unspliced_out,
2195     unmapped_unspliced)
2196    
2197     # Convert the initial Bowtie maps into BAM format.
2198 gpertea 31 # skip this step as fix_map_ordering does it now
2199     #join_mapped_segments(params,
2200     # sam_header_filename,
2201     # reads,
2202     # ref_fasta,
2203     # ["/dev/null"],
2204     # ["/dev/null"],
2205     # ["/dev/null"],
2206     # [unspliced_map],
2207     # [],
2208     # unspliced_sam)
2209 gpertea 29 # Using the num_segs value returned by check_reads(), decide which
2210     # junction discovery strategy to use
2211     if num_segs == 1:
2212     segment_len = read_len
2213     elif num_segs >= 3:
2214     # if we have at least three segments, just use split segment search,
2215     # which is the most sensitive and specific, fastest, and lightest-weight.
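# (closure_search / coverage_search / butterfly_search start out as None,
# meaning "auto"; "if not X: X = False" turns auto into off here while
# leaving an explicit user-requested True untouched)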
2216     if not params.closure_search:
2217     params.closure_search = False
2218     if not params.coverage_search:
2219     params.coverage_search = False
2220     if not params.butterfly_search:
2221     params.butterfly_search = False
2222     if not params.closure_search:
2223     params.closure_search = False
2224     seg_maps = []
2225     unmapped_segs = []
2226     segs = []
2227     if num_segs > 1:
2228     # split up the IUM reads into segments
2229 gpertea 31 # unmapped_reads can be in BAM format
2230     read_segments = split_reads(unmapped_reads,
2231 gpertea 29 tmp_dir + fbasename,
2232     False,
2233     params,
2234     segment_len)
2235    
2236     # Map each segment file independently with Bowtie
2237     for i in range(len(read_segments)):
2238     seg = read_segments[i]
2239     fbasename=getFileBaseName(seg)
2240 gpertea 31 seg_out = tmp_dir + fbasename + ".mapped"
2241 gpertea 29 if use_zpacker: seg_out += ".z"
2242 gpertea 31 unmapped_seg = tmp_dir + fbasename + "_unmapped"
2243 gpertea 29 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2244     (seg_map, unmapped) = bowtie_segment(params,
2245     bwt_idx_prefix,
2246     seg,
2247     "fastq",
2248     seg_out,
2249     unmapped_seg,
2250     extra_output)
2251     seg_maps.append(seg_map)
2252     unmapped_segs.append(unmapped)
2253     segs.append(seg)
2254    
2255     # Collect the segment maps for left and right reads together
2256 gpertea 31 #maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
2257     maps[reads] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
2258 gpertea 29 else:
2259     # if there's only one segment, just collect the initial map as the only
2260     # map to be used downstream for coverage-based junction discovery
2261     read_segments = [reads]
2262 gpertea 31 #maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_reads], [unmapped_reads])
2263     maps[reads] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
2264 gpertea 29
2265     # Unless the user asked not to discover new junctions, start that process
2266     # here
2267    
2268     if params.find_novel_juncs or params.find_novel_indels:
2269 gpertea 31 #left_reads_map = maps[left_reads].unspliced_bwt
2270     left_reads_map = maps[left_reads].unspliced_sam
2271 gpertea 29 left_seg_maps = maps[left_reads].seg_maps
2272     unmapped_reads = maps[left_reads].unmapped_segs
2273     if right_reads != None:
2274 gpertea 31 #right_reads_map = maps[right_reads].unspliced_bwt
2275     right_reads_map = maps[right_reads].unspliced_sam
2276 gpertea 29 right_seg_maps = maps[right_reads].seg_maps
2277     unmapped_reads.extend(maps[right_reads].unmapped_segs)
2278     else:
2279     right_reads_map = None
2280     right_seg_maps = None
2281    
2282     # Call segment_juncs to infer a list of possible splice junctions from
2283     # the regions of the genome covered in the initial and segment maps
2284     juncs = junctions_from_segments(params,
2285     left_reads,
2286     left_reads_map,
2287     left_seg_maps,
2288     right_reads,
2289     right_reads_map,
2290     right_seg_maps,
2291     unmapped_reads,
2292     "fastq",
2293     ref_fasta)
2294     if params.find_novel_juncs:
2295     if os.path.getsize(juncs[0]) != 0:
2296     possible_juncs.append(juncs[0])
2297     if params.find_novel_indels:
2298     if os.path.getsize(juncs[1]) != 0:
2299     possible_insertions.append(juncs[1])
2300     if os.path.getsize(juncs[2]) != 0:
2301     possible_deletions.append(juncs[2])
2302     # Optionally, and for paired reads only, use a closure search to
2303     # discover additional junctions
2304     if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
2305     juncs = junctions_from_closures(params,
2306 gpertea 31 # [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
2307     # [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
2308     [maps[left_reads].unspliced_sam, maps[left_reads].seg_maps[-1]],
2309     [maps[right_reads].unspliced_sam, maps[right_reads].seg_maps[-1]],
2310 gpertea 29 ref_fasta)
2311     if os.path.getsize(juncs[0]) != 0:
2312     possible_juncs.extend(juncs)
2313    
2314     if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2315     spliced_seg_maps = None
2316     junc_idx_prefix = None
2317     else:
2318     junc_idx_prefix = "segment_juncs"
2319     if len(possible_insertions) == 0:
2320     possible_insertions.append(os.devnull)
2321     # print >> sys.stderr, "Warning: insertions database is empty!"
2322     if len(possible_deletions) == 0:
2323     possible_deletions.append(os.devnull)
2324     # print >> sys.stderr, "Warning: deletions database is empty!"
2325     if len(possible_juncs) == 0:
2326     possible_juncs.append(os.devnull)
2327     print >> sys.stderr, "Warning: junction database is empty!"
2328     if junc_idx_prefix != None:
2329     build_juncs_index(3,
2330     segment_len,
2331     junc_idx_prefix,
2332     possible_juncs,
2333     possible_insertions,
2334     possible_deletions,
2335     ref_fasta,
2336     params.read_params.color)
2337    
2338     # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2339     # index with Bowtie
2340     for reads in [left_reads, right_reads]:
2341     spliced_seg_maps = []
2342     if reads == None or reads not in maps:
2343     continue
2344    
2345     if junc_idx_prefix != None:
2346     i = 0
2347     for seg in maps[reads].segs:
2348     fsegname = getFileBaseName(seg)
2349     seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
2350     if use_zpacker: seg_out += ".z"
2351     extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
2352     (seg_map, unmapped) = bowtie_segment(params,
2353     tmp_dir + junc_idx_prefix,
2354     seg,
2355     "fastq",
2356     seg_out,
2357     None,
2358     extra_output)
2359     spliced_seg_maps.append(seg_map)
2360     i += 1
2361    
2362     # Join the contiguous and spliced segment hits into full-length read
2363     # alignments, or convert a spliced segment hit into a genomic-coordinate hit
2364     rfname = getFileBaseName(reads)
2365     rfdir = getFileDir(reads)
2366     mapped_reads = rfdir + rfname + ".candidates.bam"
2367     merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2368 gpertea 31 # unspl_bwtfile = maps[reads].unspliced_bwt
2369 gpertea 29 unspl_samfile = maps[reads].unspliced_sam
2370    
2371     join_mapped_segments(params,
2372     sam_header_filename,
2373     reads,
2374     ref_fasta,
2375     possible_juncs,
2376     possible_insertions,
2377     possible_deletions,
2378     maps[reads].seg_maps,
2379     spliced_seg_maps,
2380     mapped_reads)
2381    
2382     if num_segs > 1:
2383     # Merge the spliced and unspliced full length alignments into a
2384     # single SAM file.
2385     # The individual SAM files are all already sorted in
2386     # increasing read ID order.
2387     # NOTE: We also should be able to address bug #134 here, by replacing
2388     # contiguous alignments that poke into an intron by a small amount by
2389     # the correct spliced alignment.
2390    
2391     try:
2392     merge_cmd = [ prog_path("bam_merge"), merged_map,
2393     maps[reads].unspliced_sam,
2394     mapped_reads ]
2395     print >> run_log, " ".join(merge_cmd)
2396     ret = subprocess.call( merge_cmd,
2397     stderr=open(logging_dir + "sam_merge.log", "w") )
2398     if ret != 0:
2399     die(fail_str+"Error executing: "+" ".join(merge_cmd))
2400     except OSError, o:
2401     die(fail_str+"Error: "+str(o))
2402     maps[reads] = [merged_map]
2403    
2404     if not params.system_params.keep_tmp:
2405     os.remove(mapped_reads)
2406     os.remove(unspl_samfile)
2407 gpertea 31 #os.remove(unspl_bwtfile)
2408 gpertea 29 else:
2409     os.rename(mapped_reads, merged_map)
2410     maps[reads] = [merged_map]
2411     if not params.system_params.keep_tmp:
2412     os.remove(unspl_samfile)
2413 gpertea 31 #os.remove(unspl_bwtfile)
2414 gpertea 29
2415     return maps
2416    
2417     def die(msg=None):
2418     if msg is not None:
2419     print >> sys.stderr, msg
2420     sys.exit(1)
2421    
2422     # rough equivalent of the 'which' command to find external programs
2423     # (current script path is tested first, then PATH envvar)
2424     def which(program):
2425     def is_executable(fpath):
2426     return os.path.exists(fpath) and os.access(fpath, os.X_OK)
2427     fpath, fname = os.path.split(program)
2428     if fpath:
2429     if is_executable(program):
2430     return program
2431     else:
2432     progpath = os.path.join(bin_dir, program)
2433     if is_executable(progpath):
2434     return progpath
2435     for path in os.environ["PATH"].split(os.pathsep):
2436     progpath = os.path.join(path, program)
2437     if is_executable(progpath):
2438     return progpath
2439     return None
2440    
2441     def prog_path(program):
2442     progpath=which(program)
2443     if progpath == None:
2444     die("Error locating program: "+program)
2445     return progpath
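# Example use (illustrative):
#   samtools = prog_path("samtools")  # absolute path, or die()s with a message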
2446    
2447     # FIXME: this should get set during the make dist autotools phase of the build
2448     def get_version():
2449     return "1.3.2"
2450    
2451     def mlog(msg):
2452     print >> sys.stderr, "[DBGLOG]:"+msg
2453    
2454     def test_input_file(filename):
2455     try:
2456     test_file = open(filename, "r")
2457     # input file could not be opened
2458     except IOError, o:
2459     die("Error: cannot open file %s" % filename)
2460     return
2461    
2462    
2463     def main(argv=None):
2464     warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2465    
2466     # Initialize default parameter values
2467     params = TopHatParams()
2468    
2469     try:
2470     if argv is None:
2471     argv = sys.argv
2472     args = params.parse_options(argv)
2473     params.check()
2474    
2475     bwt_idx_prefix = args[0]
2476     left_reads_list = args[1]
2477     left_quals_list, right_quals_list = [], []
2478     if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2479     if params.read_params.mate_inner_dist == None:
2480     die("Error: you must set the mean inner distance between mates with -r")
2481    
2482     right_reads_list = args[2]
2483     if params.read_params.quals:
2484     left_quals_list = args[3]
2485     right_quals_list = args[4]
2486     else:
2487     right_reads_list = None
2488     if params.read_params.quals:
2489     left_quals_list = args[2]
2490    
2491     print >> sys.stderr
2492     print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
2493     print >> sys.stderr, "-----------------------------------------------"
2494    
2495     start_time = datetime.now()
2496     prepare_output_dir()
2497    
2498     global run_log
2499     run_log = open(logging_dir + "run.log", "w", 0)
2500     global run_cmd
2501     run_cmd = " ".join(argv)
2502     print >> run_log, run_cmd
2503    
2504     # Validate all the input files, check all prereqs before committing
2505     # to the run
2506     (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2507    
2508     check_bowtie()
2509     check_samtools()
2510     print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
2511     sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2512     print >> sys.stderr, "[%s] Preparing reads" % (right_now())
2513    
2514     if not params.skip_check_reads:
2515     reads_list = left_reads_list
2516     if right_reads_list:
2517     reads_list = reads_list + "," + right_reads_list
2518     params.read_params = check_reads_format(params, reads_list)
2519    
2520     user_supplied_juncs = []
2521     user_supplied_insertions = []
2522     user_supplied_deletions = []
2523    
2524     if params.gff_annotation and params.find_GFF_juncs:
2525     test_input_file(params.gff_annotation)
2526     (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2527     if found_juncs:
2528     user_supplied_juncs.append(gtf_juncs)
2529     if params.raw_junctions:
2530     test_input_file(params.raw_junctions)
2531     user_supplied_juncs.append(params.raw_junctions)
2532    
2533     if params.raw_insertions:
2534     test_input_file(params.raw_insertions)
2535     user_supplied_insertions.append(params.raw_insertions)
2536    
2537     if params.raw_deletions:
2538     test_input_file(params.raw_deletions)
2539     user_supplied_deletions.append(params.raw_deletions)
2540    
2541     global unmapped_reads_fifo
2542     unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2543    
2544     # Now start the time consuming stuff
2545     left_kept_reads, left_reads_info = prep_reads(params,
2546     left_reads_list,
2547     left_quals_list,
2548     "left_kept_reads", "Left ")
2549     min_read_len=left_reads_info.min_len
2550     max_read_len=left_reads_info.max_len
2551     if right_reads_list != None:
2552     right_kept_reads, right_reads_info = prep_reads(params,
2553     right_reads_list,
2554     right_quals_list,
2555     "right_kept_reads", "Right")
2556     min_read_len=min(right_reads_info.min_len, min_read_len)
2557     max_read_len=max(right_reads_info.max_len, max_read_len)
2558     else:
2559     right_kept_reads = None
2560     seed_len=params.read_params.seed_length
2561     if seed_len != None:
2562     seed_len = max(seed_len, min_read_len)
2563     else:
2564     seed_len = max_read_len
2565     params.read_params.seed_length=seed_len
2566     # turn off integer-quals
2567     if params.read_params.integer_quals:
2568     params.read_params.integer_quals = False
2569    
2570     mapping = spliced_alignment(params,
2571     bwt_idx_prefix,
2572     sam_header_filename,
2573     ref_fasta,
2574     params.read_params.seed_length,
2575     params.segment_length,
2576     left_kept_reads,
2577     right_kept_reads,
2578     user_supplied_juncs,
2579     user_supplied_insertions,
2580     user_supplied_deletions)
2581    
2582     left_maps = mapping[left_kept_reads]
2583     #left_unmapped_reads = mapping[1]
2584     right_maps = mapping[right_kept_reads]
2585     #right_unmapped_reads = mapping[3]
2586    
2587     compile_reports(params,
2588     sam_header_filename,
2589     left_maps,
2590     left_kept_reads,
2591     right_maps,
2592     right_kept_reads,
2593     params.gff_annotation)
2594    
2595     if not params.system_params.keep_tmp:
2596     for m in left_maps:
2597     os.remove(m)
2598     if left_kept_reads != None:
2599     os.remove(left_kept_reads)
2600     for m in right_maps:
2601     os.remove(m)
2602     if right_kept_reads != None:
2603     os.remove(right_kept_reads)
2604     tmp_files = os.listdir(tmp_dir)
2605     for t in tmp_files:
2606     os.remove(tmp_dir+t)
2607     os.rmdir(tmp_dir)
2608    
2609     finish_time = datetime.now()
2610     duration = finish_time - start_time
2611     print >> sys.stderr,"-----------------------------------------------"
2612     print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
2613    
2614     except Usage, err:
2615     print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
2616     print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
2617     return 2
2618    
2619    
2620     if __name__ == "__main__":
2621     sys.exit(main())
