ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat
Revision: 67
Committed: Mon Sep 19 19:39:49 2011 UTC (8 years, 8 months ago) by gpertea
File size: 111262 byte(s)
Log Message:
fixed segment_juncs to work with .bam files

Line User Rev File contents
1 gpertea 29 #!/usr/bin/env python
2    
3     # encoding: utf-8
4     """
5     tophat.py
6    
7     Created by Cole Trapnell on 2008-12-25.
8     Copyright (c) 2008 Cole Trapnell. All rights reserved.
9     """
10    
11     import sys
12     try:
13     import psyco
14     psyco.full()
15     except ImportError:
16     pass
17    
18     import getopt
19     import subprocess
20     import errno
21     import os
22     import warnings
23     import re
24     import signal
25     from datetime import datetime, date, time
26    
# Help/usage text printed for -h/--help and on invalid invocations
# (raised via the Usage exception and shown by the top-level handler).
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>    [ default: ./tophat_out ]
    -a/--min-anchor                <int>       [ default: 8 ]
    -m/--splice-mismatches         <0-2>       [ default: 0 ]
    -i/--min-intron-length         <int>       [ default: 50 ]
    -I/--max-intron-length         <int>       [ default: 500000 ]
    -g/--max-multihits             <int>       [ default: 20 ]
    -F/--min-isoform-fraction      <float>     [ default: 0.15 ]
    --max-insertion-length         <int>       [ default: 3 ]
    --max-deletion-length          <int>       [ default: 3 ]
    --solexa-quals
    --solexa1.3-quals              (same as phred64-quals)
    --phred64-quals                (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                     (Solid - color space)
    --color-out
    --library-type                 <string>    (fr-unstranded, fr-firststrand,
                                                fr-secondstrand)
    -p/--num-threads               <int>       [ default: 1 ]
    -G/--GTF                       <filename>
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>       [ default: 20 ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>   [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>   [ default: gzip ]
    -X/--unmapped-fifo             [use mkfifo to compress more temporary
                                    files for color space reads]

Advanced Options:
    --initial-read-mismatches      <int>       [ default: 2 ]
    --segment-mismatches           <int>       [ default: 2 ]
    --segment-length               <int>       [ default: 25 ]
    --bowtie-n                     [ default: bowtie -v ]
    --min-closure-exon             <int>       [ default: 100 ]
    --min-closure-intron           <int>       [ default: 50 ]
    --max-closure-intron           <int>       [ default: 5000 ]
    --min-coverage-intron          <int>       [ default: 50 ]
    --max-coverage-intron          <int>       [ default: 20000 ]
    --min-segment-intron           <int>       [ default: 50 ]
    --max-segment-intron           <int>       [ default: 500000 ]
    --no-sort-bam                  [Output BAM is not coordinate-sorted]
    --no-convert-bam               [Do not convert to bam format.
                                    Output is <output_dir>accepted_hit.sam.
                                    Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>    (read group ID)
    --rg-sample                    <string>    (sample ID)
    --rg-library                   <string>    (library ID)
    --rg-description               <string>    (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>    (e.g Illumina lane ID)
    --rg-center                    <string>    (sequencing center name)
    --rg-date                      <string>    (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>    (Sequencing platform descriptor)
'''
104    
class Usage(Exception):
    """Raised to abort the run with a usage/error message.

    The message is kept in .msg (read by the top-level handler).
    """
    def __init__(self, msg):
        # BUGFIX: also initialize the Exception base so str(exc) shows the
        # message instead of an empty string (previously only .msg was set).
        Exception.__init__(self, msg)
        self.msg = msg
108    
# Default output locations; all of these may be overridden from the
# command line (-o/--output-dir, --tmp-dir) in TopHatParams.parse_options().
output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None          # file handle for the run log (opened at startup)
run_cmd = None          # the command line this run was started with
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"   # directory containing this script
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
                     # enabled by -X/-unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None    # resolved path to samtools (filled in at startup)
bowtie_path = None      # resolved path to bowtie (filled in at startup)
fail_str = "\t[FAILED]\n"

# stub SAM header file shared by pipeline executables (lives under tmp_dir)
sam_header = tmp_dir + "stub_header.sam"
130     # TopHatParams captures all of the runtime paramaters used by TopHat, and many
131     # of these are passed as command line options to exectubles run by the pipeline
132    
133     # This class and its nested classes also do options parsing through parse_options()
134     # and option validation via the member function check()
135    
class TopHatParams:
    """Captures all of the runtime parameters used by TopHat.

    Many of these are passed as command line options to executables run by
    the pipeline.  This class and its nested classes do options parsing
    through parse_options() and option validation via the check() methods.
    """

    class SpliceConstraints:
        """Constraints on junctions discovered by the program; used to
        filter out spurious/false positive junctions."""

        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            # Pick out only the options this group owns; ignore all others.
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                # BUGFIX: message used to say "greater than 4" but 4 is accepted
                die("Error: arg to --min-anchor-len must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    class SystemParams:
        """How to handle temporary files produced during a run (compression,
        retention) and how many threads to use for threaded stages of the
        pipeline (e.g. Bowtie)."""

        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"       # external compressor for temp files
            self.zipper_opts = []      # extra options passed to the compressor

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z","--zpacker"):
                    # "-z0", "-z none", "-z no" etc. disable external compression
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value=""
                    self.zipper = value
                    #if not self.zipper:
                    #   self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO=True
            if self.zipper:
                use_zpacker=True
                if self.num_cpus>1 and not self.zipper_opts:
                    # parallel compressors can make use of the worker threads
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p'+str(self.num_cpus))
            else:
                use_zpacker=False
                # the unmapped-reads FIFO trick only makes sense with a zpacker
                if use_BWT_FIFO: use_BWT_FIFO=False

        def cmd(self):
            # Command-line fragment forwarding these settings to sub-programs.
            cmdline=[]
            if self.zipper:
                cmdline.extend(['-z',self.zipper])
            if self.num_cpus>1:
                cmdline.extend(['-p'+str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus<1 :
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip=which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program "+self.zipper)

    class ReadParams:
        """Properties of the user's reads (quality scale, fragment geometry,
        color space, SAM read-group metadata, etc.)."""

        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must at least 0")
            # SAM spec: a read group record needs both an ID and a sample
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    class SearchParams:
        """Parameters specifying how TopHat will search for splice junctions
        (closure search, coverage search, split-segment search)."""

        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            # BUGFIX: these messages used to claim "at least 20" while the
            # checks only enforce non-negative values; messages now match.
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        """Output-format switches for the final alignment report."""

        def __init__(self):
            self.sort_bam = True       # coordinate-sort the output BAM
            self.convert_bam = True    # convert SAM output to BAM

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    # staying in SAM format implies no coordinate sorting
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,     # min_anchor
                                                         50,    # min_intron
                                                         500000, # max_intron
                                                         0,     # splice_mismatches
                                                         0.15)  # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_scale
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           False,   # SOLiD - color out instead of base pair,
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,     # bowtie_threads (num_cpus)
                                               False) # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        # tri-state search switches: None means "decide automatically later"
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        """Validate all option groups; die()s with a message on any error."""
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            # BUGFIX: message used to say "at least 4" but 4 itself is rejected
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: "+', '.join(library_types))

        # search-specific intron caps can never exceed the global cap
        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        # BUGFIX: the four messages below concatenated int attributes directly
        # to strings, raising TypeError instead of reporting the error; the
        # values are now wrapped in str().
        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")

    def cmd(self):
        """Build the common command-line fragment passed to the pipeline
        executables, reflecting the current parameter values."""
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error:
            # version-agnostic equivalent of "except getopt.error, msg":
            # wrap the GetoptError instance in a Usage exception
            raise Usage(sys.exc_info()[1])

        # let each option group pick out the options it owns
        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO=False
            use_BAM_Unmapped=True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print("TopHat v%s" % (get_version()))
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        # apply custom directories; tmp_dir/sam_header follow output_dir
        # unless --tmp-dir overrides them
        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args
700    
701    
def getFileDir(filepath):
    """Return the directory part of filepath with a trailing '/',
    or '' when filepath has no directory component."""
    dirpart = os.path.split(filepath)[0]
    if dirpart:
        dirpart += '/'
    return dirpart
707    
def getFileBaseName(filepath):
    """Derive a sample base name from a reads file path.

    Strips the directory, then peels off a known sequence-file extension
    (.fq/.txt/.seq/.bwtout/.fa*), optionally hidden under a compression
    suffix (.z/.gz*/.bz*).  Falls back to the bare file name when the
    extension looks like part of the name itself.
    """
    seq_exts = ['.fq', '.txt', '.seq', '.bwtout']
    fname = os.path.split(filepath)[1]
    stem, ext = os.path.splitext(fname)
    lext = ext.lower()
    if (lext in seq_exts or lext.startswith('.fa')) and stem:
        return stem
    if lext == '.z' or lext.startswith('.gz') or lext.startswith('.bz'):
        # compressed: look one extension deeper
        stem2, ext2 = os.path.splitext(stem)
        lext2 = ext2.lower()
        if (lext2 in seq_exts or lext2.startswith('.fa')) and stem2:
            return stem2
        return stem
    if len(stem) > len(ext):
        return stem
    return fname
726    
# Returns the current time in a nice format
def right_now():
    """Return the current local time formatted with the locale's
    date/time representation ("%c")."""
    return datetime.now().strftime("%c")
731    
732     # Ensures that the output, logging, and temp directories are present. If not,
733     # they are created
734     def prepare_output_dir():
735    
736     print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
737     if os.path.exists(output_dir):
738     pass
739     else:
740     os.mkdir(output_dir)
741    
742     if os.path.exists(logging_dir):
743     pass
744     else:
745     os.mkdir(logging_dir)
746    
747     if os.path.exists(tmp_dir):
748     pass
749     else:
750     try:
751     os.makedirs(tmp_dir)
752     except OSError, o:
753     die("\nError creating directory %s (%s)" % (tmp_dir, o))
754    
755    
# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe";
    # restoring the default disposition lets children die quietly on SIGPIPE.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
762    
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    """Verify that all four .ebwt files of a Bowtie index exist for
    idx_prefix, retrying under $BOWTIE_INDEXES; die()s if not found.
    """
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    def _have_index(prefix):
        # A Bowtie index consists of exactly these four files; all must exist.
        # (factored out: this check was previously duplicated verbatim)
        for ext in (".1.ebwt", ".2.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"):
            if not os.path.exists(prefix + ext):
                return False
        return True

    if _have_index(idx_prefix):
        return
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var == None:
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
    # NOTE(review): prefix is concatenated without a path separator, so
    # $BOWTIE_INDEXES is expected to end with '/' — preserved as-is.
    idx_prefix = bowtie_idx_env_var + idx_prefix
    if not _have_index(idx_prefix):
        die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
795    
796     # Reconstructs the multifasta file from which the Bowtie index was created, if
797     # it's not already there.
798     def bowtie_idx_to_fa(idx_prefix):
799     idx_name = idx_prefix.split('/')[-1]
800     print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())
801    
802     try:
803     tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
804     tmp_fasta_file = open(tmp_fasta_file_name, "w")
805    
806     inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
807    
808     inspect_cmd = [prog_path("bowtie-inspect"),
809     idx_prefix]
810    
811     print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
812     ret = subprocess.call(inspect_cmd,
813     stdout=tmp_fasta_file,
814     stderr=inspect_log)
815    
816     # Bowtie reported an error
817     if ret != 0:
818     die(fail_str+"Error: bowtie-inspect returned an error")
819    
820     # Bowtie not found
821     except OSError, o:
822     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
823     die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
824    
825     return tmp_fasta_file_name
826    
827     # Checks whether the multifasta file for the genome is present alongside the
828     # Bowtie index files for it.
def check_fasta(idx_prefix):
    """Locate the genome FASTA that accompanies the Bowtie index.

    Looks for <idx_prefix>.fa next to the index, then under the
    $BOWTIE_INDEXES directory; as a last resort the FASTA is
    reconstituted from the index itself via bowtie-inspect.
    """
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    # not found beside the index; try the BOWTIE_INDEXES directory
    bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
    if bowtie_idx_env_var != None:
        idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
        if os.path.exists(idx_fasta):
            return idx_fasta
    # fall back to rebuilding the FASTA from the index
    print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
    return bowtie_idx_to_fa(idx_prefix)
844    
845     # Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    """Verify the Bowtie index files exist and locate the genome FASTA.

    Returns a (ref_fasta_file, None) tuple; die()s if the index is missing.
    """
    check_bowtie_index(idx_prefix)
    return (check_fasta(idx_prefix), None)
851    
852     # Retrive a tuple containing the system's version of Bowtie. Parsed from
853     # `bowtie --version`
def get_bowtie_version():
    """Return the installed Bowtie version as a list of four ints.

    Runs `bowtie --version`, parses the "bowtie version X.Y.Z" line and
    returns e.g. [0, 12, 7, 0] (a fourth component of 0 is appended when
    the reported version has only three).  Returns None when no version
    line is found; die()s if bowtie cannot be launched.
    """
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        # NOTE: the output is repr()-escaped below, so the newline is
        # searched for as the two-character sequence backslash + 'n'
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                # normalize to 4 components (e.g. 0.12.7 -> 0.12.7.0)
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)
879    
880     def get_index_sam_header(read_params, idx_prefix):
881     try:
882     bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
883     bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")
884    
885     bowtie_header_cmd = [bowtie_path, "--sam"]
886     if read_params.color:
887     bowtie_header_cmd.append('-C')
888     bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
889     subprocess.call(bowtie_header_cmd,
890     stdout=bowtie_sam_header_file,
891     stderr=open('/dev/null'))
892    
893     bowtie_sam_header_file.close()
894     bowtie_sam_header_file = open(bowtie_sam_header_filename,"r")
895    
896     sam_header_file = open(sam_header, "w")
897    
898     preamble = []
899     sq_dict_lines = []
900    
901     for line in bowtie_sam_header_file.readlines():
902     line = line.strip()
903     if line.find("@SQ") != -1:
904     # Sequence dictionary record
905     cols = line.split('\t')
906     seq_name = None
907     for col in cols:
908     fields = col.split(':')
909     #print fields
910     if len(fields) > 0 and fields[0] == "SN":
911     seq_name = fields[1]
912     if seq_name == None:
913     die("Error: malformed sequence dictionary in sam header")
914     sq_dict_lines.append([seq_name,line])
915     elif line.find("CL"):
916     continue
917     else:
918     preamble.append(line)
919     #for line in preamble:
920     # print >> sam_header_file, line
921     print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
922    
923     if read_params.read_group_id and read_params.sample_id:
924     rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
925     read_params.sample_id)
926     if read_params.library_id:
927     rg_str += "\tLB:%s" % read_params.library_id
928     if read_params.description:
929     rg_str += "\tDS:%s" % read_params.description
930     if read_params.seq_platform_unit:
931     rg_str += "\tPU:%s" % read_params.seq_platform_unit
932     if read_params.seq_center:
933     rg_str += "\tCN:%s" % read_params.seq_center
934     if read_params.mate_inner_dist:
935     rg_str += "\tPI:%s" % read_params.mate_inner_dist
936     if read_params.seq_run_date:
937     rg_str += "\tDT:%s" % read_params.seq_run_date
938     if read_params.seq_platform:
939     rg_str += "\tPL:%s" % read_params.seq_platform
940    
941     print >> sam_header_file, rg_str
942    
943     sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
944     for [name, line] in sq_dict_lines:
945     print >> sam_header_file, line
946     print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
947    
948     sam_header_file.close()
949    
950     return sam_header
951    
952     except OSError, o:
953     errmsg=fail_str+str(o)+"\n"
954     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
955     errmsg+="Error: bowtie not found on this system"
956     die(errmsg)
957    
958     # Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    """Locate bowtie on the PATH and enforce the minimum version.

    Sets the global bowtie_path used by the rest of the pipeline and
    die()s unless Bowtie >= 0.12.3 is installed.
    """
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path=prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    # BUGFIX: compare the version components lexicographically; the old
    # test (v[0] < 1 and v[1] < 12 and v[2] < 3) wrongly accepted
    # releases such as 0.12.2
    elif bowtie_version[:3] < [0, 12, 3]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
969    
970    
971     # Retrive a tuple containing the system's version of samtools. Parsed from
972     # `samtools`
973     def get_samtools_version():
974     try:
975     # Launch Bowtie to capture its version info
976     proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
977     samtools_out = proc.communicate()[1]
978    
979     # Find the version identifier
980     version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
981     samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
982     if version_match.group(4):
983     samtools_version_arr.append(version_match.group(4))
984     else:
985     samtools_version_arr.append(0)
986    
987     return version_match.group(), samtools_version_arr
988     except OSError, o:
989     errmsg=fail_str+str(o)+"\n"
990     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
991     errmsg+="Error: samtools not found on this system"
992     die(errmsg)
993    
994     # Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    """Locate samtools on the PATH and enforce the minimum version (0.1.7).

    Sets the global samtools_path; die()s when samtools is missing or
    older than 0.1.7.
    """
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # BUGFIX: compare components lexicographically; the old test
    # (arr[1] < 1 or arr[2] < 7) wrongly rejected newer releases such
    # as 0.2.0 or 1.2.0
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1005    
1006    
1007    
class FastxReader:
    """Sequential reader for FASTA/FASTQ records.

    The format is auto-detected from the first record marker ('>' or '@');
    nextRecord() is then bound to the matching parser and yields
    (seqid, sequence, seq_len, quality_string_or_None) tuples.
    For SOLiD colorspace input (is_color) the leading primer base is
    stripped from the sequence and lengths are adjusted accordingly.
    """
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline=None       # one-line pushback buffer used by ungetLine()
        self.format=None        # 'fasta' or 'fastq', set during detection below
        self.ifile=i_file
        self.nextRecord=None    # bound to nextFasta or nextFastq
        self.eof=None
        self.fname=fname        # kept only for error messages
        self.lastline=None
        self.numrecords=0
        self.isColor=0          # 1 when reads are SOLiD colorspace
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        # NOTE(review): a completely empty input file would raise
        # IndexError on self.lastline[0] here -- confirm inputs are
        # guaranteed non-empty by the callers
        while hlines>0 and self.lastline[0] not in "@>" :
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        # push the detected record-start line back so the first
        # nextRecord() call sees it
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returns tuple: (seqID, sequence_string, seq_len, qv_string);
        # all-empty tuple at EOF
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    break
                # multi-line sequence: keep accumulating
                seqstr += line.rstrip() #removes trailing newlines
            #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
            #loop until qv has the same length as seq
            if self.isColor and qstr[0]=="!":
                #some color qual strings have a '!' character at the beginning, should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            # for colorspace, the sequence carries one extra (primer) char
            if seq_len != qstrlen+self.isColor :
                raise ValueError("Length mismatch between sequence and quality strings "+ \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        if self.isColor :
            # strip the primer base from colorspace reads
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returns tuple: (seqID, sequence_string, seq_len, None);
        # the quality slot is always None for FASTA input
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # accumulate sequence lines until the next '>' record marker
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
            #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor :
            # strip the primer base from colorspace reads
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        # single-line lookahead: serve the pushback buffer first
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline

    def ungetLine(self):
        # push the most recently read line back for the next getLine()
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader
1152    
class ZReader:
    """Open a reads file for reading, transparently decompressing it.

    BAM input is converted through bam2fastx; .z/.gz/.bz2 files (or,
    when guess=False, internally-produced .z files) are piped through
    the appropriate decompressor (upgraded to the configured parallel
    zipper, pigz/pbzip2, when available).  Clients read from self.file.
    """
    def __init__(self, filename, sysparams, guess=True):
        self.fname=filename
        self.file=None      # the readable end handed to clients
        self.fsrc=None      # the raw on-disk file feeding the pipe, if any
        self.popen=None     # decompressor process, if any
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
            # BAM input: convert to fastx on the fly
            pipecmd=["bam2fastx", "-"]
        else:
            if guess:
                # pick a decompressor from the file extension
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                if len(pipecmd)>0:
                    # prefer the configured parallel zipper when compatible
                    if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but we must still be sure it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd=[sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                # decompress to stdout ('-cd'); bam2fastx never reaches here
                pipecmd+=['-cd']
        if pipecmd:
            try:
                self.fsrc=open(self.fname, 'rb')
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=self.fsrc,
                                            stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdout
        else:
            # plain uncompressed input
            self.file=open(filename)

    def close(self):
        # close both ends and reap the decompressor, if one was started
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen=None
1203    
class ZWriter:
    """Write-through wrapper for an output file.

    When the global use_zpacker flag is set, everything written to
    self.file is piped through sysparams.zipper into the target file;
    otherwise self.file is the plain (uncompressed) file object.
    """
    def __init__(self, filename, sysparams):
        self.fname=filename
        if not use_zpacker:
            #no compression
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None
            return
        zip_cmd=[sysparams.zipper,"-cf", "-"]
        self.ftarget=open(filename, "wb")
        try:
            self.popen=subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=subprocess.PIPE,
                                        stdout=self.ftarget, close_fds=True)
        except Exception:
            die("Error: could not open writer pipe "+' '.join(zip_cmd)+' < '+ self.fname)
        # the client writes to the compressor's stdin
        self.file=self.popen.stdin

    def close(self):
        """Close the stream and wait for the compressor (if any) to flush."""
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen=None
1228    
1229     # check_reads_format() examines the first few records in the user files
1230     # to determines the file format
def check_reads_format(params, reads_files):
    """Sample the first records of each reads file to establish the format.

    Opens every comma-separated file in reads_files (through ZReader so
    compressed/BAM input works), reads up to 4 records from each and
    records the detected format (fasta/fastq) plus the min/max read
    lengths seen.  All files must agree on one format.  Returns a new
    ReadParams with reads_format and seed_length filled in; seed_length
    is only an estimate here and is re-evaluated later by prep_reads.
    """
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        zf = ZReader(f_name, params.system_params)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                # reads shorter than 20bp are excluded from the length stats
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    if seed_len != None:
        # honor a user-supplied seed length, but never below the
        # shortest observed read
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            # colorspace FASTA carries numeric qualities in a separate file
            params.read_params.integer_quals = True

    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)
1302    
def log_tail(logfile, lines=1):
    """Return the last `lines` lines of `logfile` as one string.

    Each returned line keeps its own trailing newline.  Replaces the old
    block-seeking implementation, which joined the collected lines with
    an extra newline (doubling separators whenever lines > 1) and leaked
    the file handle when an error occurred mid-read.
    """
    from collections import deque
    f = open(logfile, "r")
    try:
        # deque with maxlen keeps only the final `lines` lines while
        # streaming the file, so memory use stays bounded
        tail = deque(f, lines)
    finally:
        f.close()
    return "".join(tail)
1328    
1329     # Format a DateTime as a pretty string.
1330     # FIXME: Currently doesn't support days!
def formatTD(td):
    """Format a timedelta as a zero-padded 'HH:MM:SS' string.

    Fixes the old FIXME: spans of a day or more are now folded into the
    hours field (e.g. 1 day -> '24:00:00'); output for durations under
    24 hours is unchanged.
    """
    total_seconds = td.days * 86400 + td.seconds
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
1336    
1337     class PrepReadsInfo:
1338     def __init__(self, fname, side):
1339     try:
1340     f=open(fname,"r")
1341     self.min_len=int(f.readline().split("=")[-1])
1342     self.max_len=int(f.readline().split("=")[-1])
1343     self.in_count=int(f.readline().split("=")[-1])
1344     self.out_count=int(f.readline().split("=")[-1])
1345     if (self.out_count==0) or (self.max_len<16):
1346     raise Exception()
1347     except Exception, e:
1348     die(fail_str+"Error retrieving prep_reads info.")
1349     print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1350     if self.min_len < 20:
1351     print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1352    
1353     # Calls the prep_reads executable, which prepares an internal read library.
1354     # The read library features reads with monotonically increasing integer IDs.
1355     # prep_reads also filters out very low complexy or garbage reads as well as
1356     # polyA reads.
1357     #--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    """Run the prep_reads filter over the input read files.

    prep_reads renumbers reads with monotonically increasing integer IDs
    and drops very low-complexity / garbage / polyA reads.  Its stdout
    (the kept reads) is written to <tmp_dir>/<output_name>.fq, piped
    through the configured zipper (suffix .z) when use_zpacker is set,
    and a summary is written to <output_dir>/<output_name>.info.
    Returns the tuple (kept_reads_filename, PrepReadsInfo).
    """
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file=output_dir+output_name+".info"
    filter_cmd += ["--aux-outfile="+info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    # shell_cmd is only echoed to run_log for reproducibility
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            # prep_reads | zipper > kept_reads
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            # report the first non-zero exit status of the two stages
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)
1419    
1420     # Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads = None, #not needed vs segment_juncs
           extra_output = ""):
    """Map reads_list against a Bowtie index and post-process the output.

    The alignment stream is piped through fix_map_ordering.  Two modes:
    - genome mapping (unmapped_reads given): fix_map_ordering writes a
      BAM file (<mapped_reads>.bam) and, for non-color reads, the
      unmapped reads as BAM too;
    - junction mapping (unmapped_reads None): output is headerless SAM
      (<mapped_reads>.sam), optionally compressed (.z) via the zipper.
    BAM input is converted on the fly with bam2fastx; .z input is
    decompressed through the zipper.  Colorspace runs with use_zpacker
    additionally route bowtie's --un output through a FIFO + forked
    compressor.  Returns (mapped_reads, unmapped_reads_out).
    """
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    readfile_basename=getFileBaseName(reads_list)
    print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
                         readfile_basename, bwt_idx_name, extra_output)

    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    unmapped_reads_out=None
    if unmapped_reads:
        unmapped_reads_out=unmapped_reads+".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO: #this should always be False unless color reads are used with -X option
        global unmapped_reads_fifo
        unmapped_reads_fifo=unmapped_reads+".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
        unmapped_reads_out+=".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        bam_input=0
        unzip_cmd=None
        if reads_list.endswith('.bam'):
            # BAM input is streamed into bowtie through bam2fastx
            bam_input=1
            unzip_cmd=[ prog_path('bam2fastx') ]
            if reads_format:
                unzip_cmd.append("--"+reads_format)
            unzip_cmd+=[reads_list]

        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            # NOTE(review): the --un target is the .fifo path even when
            # use_FIFO is False (then it is just a plain file name, and
            # for non-color reads unmapped output actually comes from
            # fix_map_ordering below) -- confirm against rev 67 intent
            bowtie_cmd += ["--un", unmapped_reads_fifo]
        if use_zpacker and (unzip_cmd is None):
            # compressed (non-BAM) input: decompress through the zipper
            unzip_cmd=[ params.system_params.zipper ]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd+=['-cd']

        fifo_pid=None
        if use_FIFO:
            # fork a child that compresses whatever bowtie writes to the
            # FIFO into unmapped_reads_out
            unm_zipcmd=[ params.system_params.zipper ]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd+=['-c']
            print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
            fifo_pid=os.fork()
            if fifo_pid==0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

        bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                       "-p", str(params.system_params.num_cpus),
                       "-k", str(params.max_hits),
                       "-m", str(params.max_hits),
                       "--max", "/dev/null",
                       "-S", "--sam-nohead"] #always use headerless SAM file
        fix_map_cmd = [prog_path('fix_map_ordering')]
        sam_zipcmd=None
        if unmapped_reads:
            #mapping on the genome
            #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
            mapped_reads += ".bam"
            if params.read_params.color:
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
            else:
                unmapped_reads_out = unmapped_reads+".bam"
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads_out]
        else:
            #mapping on segment_juncs (spliced potential junctions)
            # don't use SAM headers, could be slow -- better compress SAM output directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                sam_zipcmd=[ params.system_params.zipper ]
                sam_zipcmd.extend(params.system_params.zipper_opts)
                sam_zipcmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        bowtie_cmd+=[bwt_idx_prefix]
        bowtie_proc=None
        shellcmd=""     # human-readable pipeline echoed to run_log
        unzip_proc=None
        mapzip_proc=None
        if unzip_cmd:
            # could be bam2fastx
            if bam_input:
                unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "|"
            else:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_list, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "< " +reads_list +"|"
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
        else:
            bowtie_cmd += [reads_list]
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
        shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
        if sam_zipcmd:
            # bowtie | fix_map_ordering | zipper > mapped_reads
            shellcmd += "|" + " ".join(sam_zipcmd)+" > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close()
            mapzip_proc = subprocess.Popen(sam_zipcmd,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            bowtie_proc.stdout.close()

        print >> run_log, shellcmd
        # wait for the last stage of the pipeline to finish
        if mapzip_proc:
            mapzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            # if the compressor child never produced its output file,
            # terminate it so we don't hang on the FIFO
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass
    except OSError, o:
        die(fail_str+"Error: "+str(o))

    # Success
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    return (mapped_reads, unmapped_reads_out)
1591 gpertea 29
def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_list,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):
    """Map read segments with bowtie() using segment-specific settings.

    Temporarily switches the alignment mode to "-v" (mismatch-only) and
    doubles the multihit cap, restoring both params afterwards.
    """
    saved_alignment_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    result = bowtie(params, bwt_idx_prefix, reads_list, reads_format,
                    params.segment_mismatches, mapped_reads,
                    unmapped_reads, extra_output)

    # put the caller's settings back
    params.bowtie_alignment_option = saved_alignment_option
    params.max_hits /= 2
    return result
1616    
1617     # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1618     def get_gtf_juncs(gff_annotation):
1619     print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1620     gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1621    
1622     gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1623    
1624     gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1625     gtf_juncs_out = open(gtf_juncs_out_name, "w")
1626    
1627     gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1628     try:
1629     print >> run_log, " ".join(gtf_juncs_cmd)
1630     retcode = subprocess.call(gtf_juncs_cmd,
1631     stderr=gtf_juncs_log,
1632     stdout=gtf_juncs_out)
1633     # cvg_islands returned an error
1634     if retcode == 1:
1635     print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1636     return (False, gtf_juncs_out_name)
1637     elif retcode != 0:
1638     die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1639     # cvg_islands not found
1640     except OSError, o:
1641     errmsg=fail_str+str(o)+"\n"
1642     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1643     errmsg+="Error: gtf_juncs not found on this system"
1644     die(errmsg)
1645     return (True, gtf_juncs_out_name)
1646    
1647     # Call bowtie-build on the FASTA file of sythetic splice junction sequences
1648     def build_juncs_bwt_index(external_splice_prefix, color):
1649     print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1650     bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1651    
1652     #user_splices_out_prefix = output_dir + "user_splices_idx"
1653    
1654     bowtie_build_cmd = [prog_path("bowtie-build")]
1655     if color:
1656     bowtie_build_cmd += ["-C"]
1657    
1658     bowtie_build_cmd += [external_splice_prefix + ".fa",
1659     external_splice_prefix]
1660     try:
1661     print >> run_log, " ".join(bowtie_build_cmd)
1662     retcode = subprocess.call(bowtie_build_cmd,
1663     stdout=bowtie_build_log)
1664    
1665     if retcode != 0:
1666     die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1667     except OSError, o:
1668     errmsg=fail_str+str(o)+"\n"
1669     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1670     errmsg+="Error: bowtie-build not found on this system"
1671     die(errmsg)
1672     return external_splice_prefix
1673    
1674     # Build a splice index from a .juncs file, suitable for use with specified read
1675     # (or read segment) lengths
1676     def build_juncs_index(min_anchor_length,
1677     read_length,
1678     juncs_prefix,
1679     external_juncs,
1680     external_insertions,
1681     external_deletions,
1682     reference_fasta,
1683     color):
1684     print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1685    
1686     juncs_file_list = ",".join(external_juncs)
1687     insertions_file_list = ",".join(external_insertions)
1688     deletions_file_list = ",".join(external_deletions)
1689    
1690    
1691     juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1692    
1693     external_splices_out_prefix = tmp_dir + juncs_prefix
1694     external_splices_out_name = external_splices_out_prefix + ".fa"
1695    
1696     external_splices_out = open(external_splices_out_name, "w")
1697     # juncs_db_cmd = [bin_dir + "juncs_db",
1698     juncs_db_cmd = [prog_path("juncs_db"),
1699     str(min_anchor_length),
1700     str(read_length),
1701     juncs_file_list,
1702     insertions_file_list,
1703     deletions_file_list,
1704     reference_fasta]
1705     try:
1706     print >> run_log, " ".join(juncs_db_cmd)
1707     retcode = subprocess.call(juncs_db_cmd,
1708     stderr=juncs_db_log,
1709     stdout=external_splices_out)
1710    
1711     if retcode != 0:
1712     die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1713     # juncs_db not found
1714     except OSError, o:
1715     errmsg=fail_str+str(o)+"\n"
1716     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1717     errmsg+="Error: juncs_db not found on this system"
1718     die(errmsg)
1719    
1720     external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1721     return external_splices_out_prefix
1722    
# Print out the sam header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    """Write @HD, optional @RG, and @PG header lines to sam_file.

    The @RG record is emitted only when both read_group_id and sample_id
    are set; the remaining RG tags (LB, DS, PU, CN, PI, DT, PL) are added
    only for fields the user actually supplied.
    """
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        if read_params.library_id:
            rg_str += "\tLB:%s" % read_params.library_id
        if read_params.description:
            rg_str += "\tDS:%s" % read_params.description
        if read_params.seq_platform_unit:
            rg_str += "\tPU:%s" % read_params.seq_platform_unit
        if read_params.seq_center:
            rg_str += "\tCN:%s" % read_params.seq_center
        if read_params.mate_inner_dist:
            rg_str += "\tPI:%s" % read_params.mate_inner_dist
        if read_params.seq_run_date:
            rg_str += "\tDT:%s" % read_params.seq_run_date
        if read_params.seq_platform:
            rg_str += "\tPL:%s" % read_params.seq_platform

        # the print is kept inside the guard so @RG is only written when defined
        print >> sam_file, rg_str
    # @PG records the program version and full command line (run_cmd global)
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1747    
# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
    """Run tophat_reports to produce the final output tracks.

    Produces junctions.bed / insertions.bed / deletions.bed and the
    accepted_hits alignment file (sorted BAM, unsorted BAM, or SAM,
    depending on params.report_params).  Returns (coverage, junctions)
    file names.
    NOTE(review): sam_header_filename and gff_annotation are currently
    unused in this function body.
    """
    print >> sys.stderr, "[%s] Reporting output tracks" % right_now()

    # drop map files that are missing or too small to hold real alignments
    left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)
    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    coverage = "coverage.wig"  # NOTE: wiggle generation is disabled below (FIXME)
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    report_cmd = [report_cmdpath]
    report_cmd.extend(params.cmd())
    report_cmd.extend(["--samtools="+samtools_path])
    # "-" tells tophat_reports to stream the alignments to stdout
    report_cmd.extend([junctions,
                       insertions,
                       deletions,
                       "-",
                       left_maps,
                       left_reads])
    if len(right_maps) > 0 and right_reads != None:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)
    # -- tophat_reports now produces (uncompressed) BAM stream,
    # directly piped into samtools sort
    try:
        if params.report_params.convert_bam:
            if params.report_params.sort_bam:
                # pipeline: tophat_reports | samtools sort - accepted_hits
                report_proc=subprocess.Popen(report_cmd,
                                             preexec_fn=subprocess_setup,
                                             stdout=subprocess.PIPE,
                                             stderr=report_log)

                bamsort_cmd = ["samtools", "sort", "-", accepted_hits]
                bamsort_proc= subprocess.Popen(bamsort_cmd,
                                               preexec_fn=subprocess_setup,
                                               stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
                                               stdin=report_proc.stdout)
                # close our copy so report_proc gets SIGPIPE if sort dies early
                report_proc.stdout.close()
                print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
                bamsort_proc.communicate()
                retcode = report_proc.poll()
                if retcode:
                    die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
            else:
                # unsorted BAM: capture stdout to a file, then rename to .bam
                print >> run_log, " ".join(report_cmd)
                report_proc=subprocess.call(report_cmd,
                                            preexec_fn=subprocess_setup,
                                            stdout=open(accepted_hits,"wb"),
                                            stderr=report_log)
                os.rename(accepted_hits, output_dir + "accepted_hits.bam")
        else:
            # SAM requested: write the BAM stream first, then convert it
            # with "samtools view -h" and remove the intermediate BAM
            print >> run_log, " ".join(report_cmd)
            report_proc=subprocess.call(report_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdout=open(accepted_hits,"wb"),
                                        stderr=report_log)
            tmp_sam = output_dir + "accepted_hits.sam"

            bam_to_sam_cmd = ["samtools", "view", "-h", accepted_hits]
            print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
            bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
            tmp_sam_file = open(tmp_sam, "w")
            ret = subprocess.call(bam_to_sam_cmd,
                                  stdout=tmp_sam_file,
                                  stderr=bam_to_sam_log)
            tmp_sam_file.close()
            os.remove(accepted_hits)

    except OSError, o:
        die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    # FIXME: put wiggles back!
    # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
    # print >> run_log, " ".join(wig_cmd)
    # subprocess.call(wig_cmd,
    #                 stderr=open("/dev/null"))
    return (coverage, junctions)
1835    
1836    
1837     # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1838     # for each segment This function needs to be fixed to support mixed read length
1839     # inputs
1840    
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    """Append ZWriter outputs for segment files num_files_prev+1..num_files.

    Each new file is named <prefix>_seg<i><extension>; the writers are
    appended to out_segf in place.
    """
    for seg_idx in range(num_files_prev + 1, num_files + 1):
        segfname = prefix + ("_seg%d" % seg_idx) + extension
        out_segf.append(ZWriter(segfname, params.system_params))
1847    
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    """Split each read in a FASTA/FASTQ file into segment_length-sized pieces.

    Writes one output file per segment index (opened lazily as longer reads
    are encountered) and returns the list of output file names.  Handles
    SOLiD color-space reads via params.read_params.color.
    NOTE(review): as the header comment says, mixed read lengths are only
    partially supported.
    """
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"

    def convert_color_to_bp(color_seq):
        # Decode a color-space string (primer base + color digits) into bases.
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        # Encode a base string as color-space digits (adjacent-base transitions).
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        # Write one read, split at the given offsets, to the per-segment files.
        # Segment names carry "|offset:index:total" so they can be re-joined.
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            # skip reads with an invalid color call at any segment boundary
            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                # prepend the decoded primer base for this segment
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    # Simple state machine over the input lines:
    # FASTA cycles through states 0 (name) and 1 (sequence);
    # FASTQ through 0 (name), 1 (sequence), 2 ("+"), 3 (qualities).
    line_state = 0
    read_name = ""
    read_seq = ""
    read_qual = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= 20:
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                # otherwise fold the remainder into the last segment
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            # open additional per-segment files as longer reads appear
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            # the FASTQ "+" separator line; nothing to keep
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames
1980    
1981     # Find possible splice junctions using the "closure search" strategy, and report
1982     # them in closures.juncs. Calls the executable closure_juncs
1983     def junctions_from_closures(params,
1984     left_maps,
1985     right_maps,
1986     ref_fasta):
1987     #print >> sys.stderr, "[%s] " % right_now()
1988     print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1989    
1990     #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1991     #if len(maps) == 0:
1992     # return None
1993     #print >> sys.stderr, left_maps
1994     slash = left_maps[0].rfind('/')
1995     juncs_out = ""
1996     if slash != -1:
1997     juncs_out += left_maps[0][:slash+1]
1998     juncs_out += "closure.juncs"
1999    
2000     juncs_log = open(logging_dir + "closure.log", "w")
2001     juncs_cmdpath=prog_path("closure_juncs")
2002     juncs_cmd = [juncs_cmdpath]
2003    
2004     left_maps = ','.join(left_maps)
2005     right_maps = ','.join(right_maps)
2006    
2007     juncs_cmd.extend(params.cmd())
2008     juncs_cmd.extend([juncs_out,
2009     ref_fasta,
2010     left_maps,
2011     right_maps])
2012     try:
2013     print >> run_log, ' '.join(juncs_cmd)
2014     retcode = subprocess.call(juncs_cmd,
2015     stderr=juncs_log)
2016    
2017     # spanning_reads returned an error
2018     if retcode != 0:
2019     die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2020     # cvg_islands not found
2021     except OSError, o:
2022     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2023     print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2024     die(str(o))
2025     return [juncs_out]
2026    
2027     # Find possible junctions by examining coverage and split segments in the initial
2028     # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2029     # segment.insertions, and segment.deletions. Calls the executable
2030     # segment_juncs
2031     def junctions_from_segments(params,
2032     left_reads,
2033     left_reads_map,
2034     left_seg_maps,
2035     right_reads,
2036     right_reads_map,
2037     right_seg_maps,
2038     unmapped_reads,
2039     reads_format,
2040     ref_fasta):
2041     if left_reads_map != left_seg_maps[0]:
2042     print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2043     out_path=getFileDir(left_seg_maps[0])
2044     juncs_out=out_path+"segment.juncs"
2045     insertions_out=out_path+"segment.insertions"
2046     deletions_out =out_path+"segment.deletions"
2047    
2048     left_maps = ','.join(left_seg_maps)
2049     align_log = open(logging_dir + "segment_juncs.log", "w")
2050     align_cmd = [prog_path("segment_juncs")]
2051    
2052     align_cmd.extend(params.cmd())
2053    
2054     align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2055     ref_fasta,
2056     juncs_out,
2057     insertions_out,
2058     deletions_out,
2059     left_reads,
2060     left_reads_map,
2061     left_maps])
2062     if right_seg_maps != None:
2063     right_maps = ','.join(right_seg_maps)
2064     align_cmd.extend([right_reads, right_reads_map, right_maps])
2065     try:
2066     print >> run_log, " ".join(align_cmd)
2067     retcode = subprocess.call(align_cmd,
2068     preexec_fn=subprocess_setup,
2069     stderr=align_log)
2070    
2071     # spanning_reads returned an error
2072     if retcode != 0:
2073     die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2074     # cvg_islands not found
2075     except OSError, o:
2076     if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2077     print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2078     die(str(o))
2079    
2080     return [juncs_out, insertions_out, deletions_out]
2081    
2082     # Joins mapped segments into full-length read alignments via the executable
2083     # long_spanning_reads
2084     def join_mapped_segments(params,
2085     sam_header_filename,
2086     reads,
2087     ref_fasta,
2088     possible_juncs,
2089     possible_insertions,
2090     possible_deletions,
2091     contig_seg_maps,
2092     spliced_seg_maps,
2093     alignments_out_name):
2094     if len(contig_seg_maps)>1:
2095     print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2096     else:
2097     print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2098     contig_seg_maps = ','.join(contig_seg_maps)
2099    
2100     possible_juncs = ','.join(possible_juncs)
2101     possible_insertions = ",".join(possible_insertions)
2102     possible_deletions = ",".join(possible_deletions)
2103    
2104     align_log = open(logging_dir + "long_spanning_reads.log", "w")
2105     align_cmd = [prog_path("long_spanning_reads")]
2106    
2107     align_cmd.extend(params.cmd())
2108    
2109     align_cmd.append(ref_fasta)
2110     align_cmd.extend([ reads,
2111     possible_juncs,
2112     possible_insertions,
2113     possible_deletions,
2114     contig_seg_maps])
2115    
2116     if spliced_seg_maps != None:
2117     spliced_seg_maps = ','.join(spliced_seg_maps)
2118     align_cmd.append(spliced_seg_maps)
2119    
2120     try:
2121     print >> run_log, " ".join(align_cmd),">",alignments_out_name
2122     join_proc=subprocess.Popen(align_cmd,
2123     preexec_fn=subprocess_setup,
2124     stdout=open(alignments_out_name, "wb"),
2125     stderr=align_log)
2126     join_proc.communicate()
2127     except OSError, o:
2128     die(fail_str+"Error: "+str(o))
2129    
2130    
# This class collects spliced and unspliced alignments for each of the
# left and right read files provided by the user.
class Maps:
    """Per-read-file bundle of mapping artifacts used by spliced_alignment()."""
    def __init__(self,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        # path of the initial full-length Bowtie map
        self.unspliced_sam = unspliced_sam
        # list of per-segment Bowtie map files
        self.seg_maps = seg_maps
        # list of per-segment files holding reads that failed to map
        self.unmapped_segs = unmapped_segs
        # list of the segment read files themselves
        self.segs = segs
2145    
# The main aligment routine of TopHat. This function executes most of the
# workflow producing a set of candidate alignments for each cDNA fragment in a
# pair of SAM alignment files (for paired end reads).
def spliced_alignment(params,
                      bwt_idx_prefix,
                      sam_header_filename,
                      ref_fasta,
                      read_len,
                      segment_len,
                      left_reads,
                      right_reads,
                      user_supplied_junctions,
                      user_supplied_insertions,
                      user_supplied_deletions):
    """Run the core TopHat workflow.

    Phase 1: map full-length reads with Bowtie, split the unmapped (IUM)
    reads into segments and map those.  Phase 2: search for novel junctions
    and indels (segment search, optional closure search).  Phase 3: build a
    splice index, map segments against it, and join everything into
    candidate alignment files.

    Returns maps: a dict keyed by reads file, holding a one-element list
    with the final per-side candidate alignment (BAM) file.
    """
    possible_juncs = []
    possible_juncs.extend(user_supplied_junctions)

    possible_insertions = []
    possible_insertions.extend(user_supplied_insertions)

    possible_deletions = []
    possible_deletions.extend(user_supplied_deletions)

    maps = { left_reads : [], right_reads : [] }
    #single_segments = False

    # Perform the first part of the TopHat work flow on the left and right
    # reads of paired ends separately - we'll use the pairing information later
    for reads in [left_reads, right_reads]:
        # skip a missing side or a reads file too small to hold real reads
        if reads == None or os.path.getsize(reads)<25 :
            continue
        fbasename=getFileBaseName(reads)
        unspliced_out = tmp_dir + fbasename + ".mapped"
        unmapped_unspliced = tmp_dir + fbasename + "_unmapped"

        # number of whole segments per read (integer division); a remainder
        # of at least min(segment_len-2, 20) bp earns an extra segment
        num_segs = read_len / segment_len
        if read_len % segment_len >= min(segment_len -2, 20):
            num_segs += 1

        # daehwan - check this out
        if num_segs <= 1:
            print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"

        # Perform the initial Bowtie mapping of the full length reads
        (unspliced_sam, unmapped_reads) = bowtie(params,
                                                 bwt_idx_prefix,
                                                 reads,
                                                 "fastq",
                                                 params.initial_read_mismatches,
                                                 unspliced_out,
                                                 unmapped_unspliced)

        # Convert the initial Bowtie maps into BAM format.
        # skip this step as fix_map_ordering does it now

        # Using the num_segs value returned by check_reads(), decide which
        # junction discovery strategy to use
        if num_segs == 1:
            segment_len = read_len
        elif num_segs >= 3:
            # if we have at least three segments, just use split segment search,
            # which is the most sensitive and specific, fastest, and lightest-weight.
            # NOTE(review): "if not X: X = False" normalizes an unset (None)
            # search flag to False without overriding an explicit True.
            if not params.closure_search:
                params.closure_search = False
            if not params.coverage_search:
                params.coverage_search = False
            if not params.butterfly_search:
                params.butterfly_search = False
        if not params.closure_search:
            params.closure_search = False

        seg_maps = []
        unmapped_segs = []
        segs = []
        if num_segs > 1:
            # split up the IUM reads into segments
            # unmapped_reads can be in BAM format
            read_segments = split_reads(unmapped_reads,
                                        tmp_dir + fbasename,
                                        False,
                                        params,
                                        segment_len)

            # Map each segment file independently with Bowtie
            for i in range(len(read_segments)):
                seg = read_segments[i]
                fbasename=getFileBaseName(seg)
                seg_out = tmp_dir + fbasename + ".mapped"
                #if use_zpacker: seg_out += ".z"
                unmapped_seg = tmp_dir + fbasename + "_unmapped"
                extra_output = "(%d/%d)" % (i+1, len(read_segments))
                (seg_map, unmapped) = bowtie_segment(params,
                                                     bwt_idx_prefix,
                                                     seg,
                                                     "fastq",
                                                     seg_out,
                                                     unmapped_seg,
                                                     extra_output)
                seg_maps.append(seg_map)
                unmapped_segs.append(unmapped)
                segs.append(seg)

            # Collect the segment maps for left and right reads together
            maps[reads] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
        else:
            # if there's only one segment, just collect the initial map as the only
            # map to be used downstream for coverage-based junction discovery
            read_segments = [reads]
            maps[reads] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])

    # Unless the user asked not to discover new junctions, start that process
    # here
    if params.find_novel_juncs or params.find_novel_indels:
        left_reads_map = maps[left_reads].unspliced_sam
        left_seg_maps = maps[left_reads].seg_maps
        unmapped_reads = maps[left_reads].unmapped_segs
        if right_reads != None:
            right_reads_map = maps[right_reads].unspliced_sam
            right_seg_maps = maps[right_reads].seg_maps
            unmapped_reads.extend(maps[right_reads].unmapped_segs)
        else:
            right_reads_map = None
            right_seg_maps = None

        # Call segment_juncs to infer a list of possible splice junctions from
        # the regions of the genome covered in the initial and segment maps
        juncs = junctions_from_segments(params,
                                        left_reads,
                                        left_reads_map,
                                        left_seg_maps,
                                        right_reads,
                                        right_reads_map,
                                        right_seg_maps,
                                        unmapped_reads,
                                        "fastq",
                                        ref_fasta)
        # only keep non-empty result files
        if params.find_novel_juncs:
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.append(juncs[0])
        if params.find_novel_indels:
            if os.path.getsize(juncs[1]) != 0:
                possible_insertions.append(juncs[1])
            if os.path.getsize(juncs[2]) != 0:
                possible_deletions.append(juncs[2])
        # Optionally, and for paired reads only, use a closure search to
        # discover addtional junctions
        if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
            juncs = junctions_from_closures(params,
                                            [maps[left_reads].unspliced_sam, maps[left_reads].seg_maps[-1]],
                                            [maps[right_reads].unspliced_sam, maps[right_reads].seg_maps[-1]],
                                            ref_fasta)
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.extend(juncs)

    if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
        # nothing to index: skip the splice-mapping stage entirely
        spliced_seg_maps = None
        junc_idx_prefix = None
    else:
        junc_idx_prefix = "segment_juncs"
        # substitute /dev/null for whichever databases came up empty
        if len(possible_insertions) == 0:
            possible_insertions.append(os.devnull)
            # print >> sys.stderr, "Warning: insertions database is empty!"
        if len(possible_deletions) == 0:
            possible_deletions.append(os.devnull)
            # print >> sys.stderr, "Warning: deletions database is empty!"
        if len(possible_juncs) == 0:
            possible_juncs.append(os.devnull)
            print >> sys.stderr, "Warning: junction database is empty!"
    if junc_idx_prefix != None:
        build_juncs_index(3,
                          segment_len,
                          junc_idx_prefix,
                          possible_juncs,
                          possible_insertions,
                          possible_deletions,
                          ref_fasta,
                          params.read_params.color)

    # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
    # index with Bowtie
    for reads in [left_reads, right_reads]:
        spliced_seg_maps = []
        if reads == None or reads not in maps:
            continue

        if junc_idx_prefix != None:
            i = 0
            for seg in maps[reads].segs:
                fsegname = getFileBaseName(seg)
                seg_out = tmp_dir + fsegname + ".to_spliced"
                #if use_zpacker: seg_out += ".z"
                extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
                (seg_map, unmapped) = bowtie_segment(params,
                                                     tmp_dir + junc_idx_prefix,
                                                     seg,
                                                     "fastq",
                                                     seg_out,
                                                     None,
                                                     extra_output)
                spliced_seg_maps.append(seg_map)
                i += 1

        # Join the contigous and spliced segment hits into full length read
        # alignments or convert a spliced segment hit into a genomic-coordinate hit
        rfname = getFileBaseName(reads)
        rfdir = getFileDir(reads)
        mapped_reads = rfdir + rfname + ".candidates.bam"
        merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
        unspl_samfile = maps[reads].unspliced_sam

        join_mapped_segments(params,
                             sam_header_filename,
                             reads,
                             ref_fasta,
                             possible_juncs,
                             possible_insertions,
                             possible_deletions,
                             maps[reads].seg_maps,
                             spliced_seg_maps,
                             mapped_reads)

        # NOTE(review): num_segs here is the value left over from the last
        # iteration of the first loop; both sides share read_len so it is
        # the same for left and right -- TODO confirm for mixed-length input
        if num_segs > 1:
            # Merge the spliced and unspliced full length alignments into a
            # single SAM file.
            # The individual SAM files are all already sorted in
            # increasing read ID order.
            # NOTE: We also should be able to address bug #134 here, by replacing
            # contiguous alignments that poke into an intron by a small amount by
            # the correct spliced alignment.
            try:
                merge_cmd = [ prog_path("bam_merge"), merged_map,
                              maps[reads].unspliced_sam,
                              mapped_reads ]
                print >> run_log, " ".join(merge_cmd)
                ret = subprocess.call( merge_cmd,
                                       stderr=open(logging_dir + "sam_merge.log", "w") )
                if ret != 0:
                    die(fail_str+"Error executing: "+" ".join(merge_cmd))
            except OSError, o:
                die(fail_str+"Error: "+str(o))
            maps[reads] = [merged_map]

            if not params.system_params.keep_tmp:
                os.remove(mapped_reads)
                os.remove(unspl_samfile)
        else:
            # single segment: the candidates file already is the final map
            os.rename(mapped_reads, merged_map)
            maps[reads] = [merged_map]
            if not params.system_params.keep_tmp:
                os.remove(unspl_samfile)

    return maps
2418    
def die(msg=None):
    """Print an optional error message to stderr and exit with status 1."""
    if msg is not None:
        print >> sys.stderr, msg
    sys.exit(1)
2423    
# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    """Locate an executable.

    If program carries a path component it is accepted only as given.
    Otherwise the script's own bin_dir is tried first, then every entry
    of the PATH environment variable.  Returns the full path, or None.
    """
    def is_executable(fpath):
        # must be a regular file with the execute bit set
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        # an explicit path was given: use it or fail
        if is_executable(program):
            return program
        return None
    # bare name: search bin_dir first, then each PATH directory in order
    candidates = [os.path.join(bin_dir, program)]
    candidates.extend(os.path.join(path_entry, program)
                      for path_entry in os.environ["PATH"].split(os.pathsep))
    for candidate in candidates:
        if is_executable(candidate):
            return candidate
    return None
2442    
def prog_path(program):
    """Resolve program to a full path via which(); die() if not found."""
    progpath = which(program)
    if progpath is None:
        die("Error locating program: " + program)
    return progpath
2448    
def get_version():
    """Return the TopHat release version string."""
    # FIXME: this should be injected during the 'make dist' autotools
    # phase of the build instead of being hard-coded here
    return "1.3.2"
2452    
def mlog(msg):
    """Emit a debug-tagged line on stderr (ad-hoc developer logging)."""
    sys.stderr.write("[DBGLOG]:" + msg + "\n")
2455    
2456     def test_input_file(filename):
2457     try:
2458     test_file = open(filename, "r")
2459     # Bowtie not found
2460     except IOError, o:
2461     die("Error: Opening file %s" % filename)
2462     return
2463    
2464    
def main(argv=None):
    """Top-level TopHat driver.

    Parses command-line options, validates the Bowtie index / reads /
    annotation inputs, runs read preparation and the spliced-alignment
    pipeline, compiles the final reports and cleans up temporaries.
    Returns 2 on a usage error; fatal runtime errors exit via die().
    """
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()

    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        # positional args: <bowtie_index> <reads1> [<reads2>] [quals...]
        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        # extra positional args mean a paired-end run (offset by one more
        # slot when separate quality files are supplied)
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            if params.read_params.mate_inner_dist == None:
                # default mean inner mate distance when -r was not given
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        print >> sys.stderr
        print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
        print >> sys.stderr, "-----------------------------------------------"

        start_time = datetime.now()
        prepare_output_dir()

        # run.log records every external command line; opened unbuffered
        # (third open() arg 0) so it survives a crash
        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        # Validate all the input files, check all prereqs before committing
        # to the run
        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)

        check_bowtie()
        check_samtools()
        print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
        print >> sys.stderr, "[%s] Preparing reads" % (right_now())

        # probe the reads format (FASTA/FASTQ, quality scale) unless the
        # user asked to skip the check
        if not params.skip_check_reads:
            reads_list = left_reads_list
            if right_reads_list:
                reads_list = reads_list + "," + right_reads_list
            params.read_params = check_reads_format(params, reads_list)

        # junction/indel hints optionally supplied by the user (GTF, -j, ...)
        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []

        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            if found_juncs:
                user_supplied_juncs.append(gtf_juncs)
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        # pid-stamped FIFO used downstream to stream unmapped reads
        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left_kept_reads", "Left ")
        min_read_len=left_reads_info.min_len
        max_read_len=left_reads_info.max_len
        if right_reads_list != None:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right_kept_reads", "Right")
            # track the extremes across both mates
            min_read_len=min(right_reads_info.min_len, min_read_len)
            max_read_len=max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        # seed length may never exceed the shortest read; default is the
        # longest read length when the user gave no -s value
        seed_len=params.read_params.seed_length
        if seed_len != None:
            seed_len = max(seed_len, min_read_len)
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        # the core pipeline: segment mapping, junction discovery, stitching
        mapping = spliced_alignment(params,
                                    bwt_idx_prefix,
                                    sam_header_filename,
                                    ref_fasta,
                                    params.read_params.seed_length,
                                    params.segment_length,
                                    left_kept_reads,
                                    right_kept_reads,
                                    user_supplied_juncs,
                                    user_supplied_insertions,
                                    user_supplied_deletions)

        # mapping is keyed by the kept-reads filenames (None for no right mate)
        left_maps = mapping[left_kept_reads]
        #left_unmapped_reads = mapping[1]
        right_maps = mapping[right_kept_reads]
        #right_unmapped_reads = mapping[3]

        compile_reports(params,
                        sam_header_filename,
                        left_maps,
                        left_kept_reads,
                        right_maps,
                        right_kept_reads,
                        params.gff_annotation)

        # drop intermediate map/read files and the tmp dir unless --keep-tmp
        if not params.system_params.keep_tmp:
            for m in left_maps:
                os.remove(m)
            if left_kept_reads != None:
                os.remove(left_kept_reads)
            for m in right_maps:
                os.remove(m)
            if right_kept_reads != None:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir+t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        print >> sys.stderr,"-----------------------------------------------"
        print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)

    except Usage, err:
        # command-line usage errors: print the message plus a manual pointer
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
        return 2
2621    
2622    
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())

Properties

Name Value
svn:executable *