root/gclib/tophat_cpp/tophat
Revision: 137
Committed: Wed Dec 14 22:58:22 2011 UTC by gpertea
File size: 127371 byte(s)
Log Message:
wip spliceCigar() in bwt_map.cpp needs testing

Line File contents
1 #!/usr/bin/env python
2
3 # encoding: utf-8
4 """
5 tophat.py
6
7 Created by Cole Trapnell on 2008-12-25.
8 Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 """
10 import sys
11 try:
12 import psyco
13 psyco.full()
14 except ImportError:
15 pass
16
17 import getopt
18 import subprocess
19 import errno
20 import os
21 import warnings
22 import re
23 import signal
24 from datetime import datetime, date, time
25
26 use_message = '''
27 TopHat maps short sequences from spliced transcripts to whole genomes.
28
29 Usage:
30 tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
31 [quals1[,quals2,...]] [quals1[,quals2,...]]
32
33 Options:
34 -v/--version
35 -o/--output-dir <string> [ default: ./tophat_out ]
36 -a/--min-anchor <int> [ default: 8 ]
37 -m/--splice-mismatches <0-2> [ default: 0 ]
38 -i/--min-intron-length <int> [ default: 50 ]
39 -I/--max-intron-length <int> [ default: 500000 ]
40 -g/--max-multihits <int> [ default: 20 ]
41 -x/--transcriptome-max-hits <int> [ default: 60 ]
42 -n/--transcriptome-mismatches <int> [ default: 1 ]
43 -M/--prefilter-multihits ( for -G/--GTF option, enable
44 an initial bowtie search
45 against the genome )
46 -F/--min-isoform-fraction <float> [ default: 0.15 ]
47 --max-insertion-length <int> [ default: 3 ]
48 --max-deletion-length <int> [ default: 3 ]
49 --solexa-quals
50 --solexa1.3-quals (same as phred64-quals)
51 --phred64-quals (same as solexa1.3-quals)
52 -Q/--quals
53 --integer-quals
54 -C/--color (SOLiD - color space)
55 --color-out
56 --library-type <string> (fr-unstranded, fr-firststrand,
57 fr-secondstrand)
58 -p/--num-threads <int> [ default: 1 ]
59 -G/--GTF <filename>
60 --transcriptome-index <bwtidx> [pre-built bowtie index for the
61 transcripts in the -G/--GTF file]
62 -T/--transcriptome-only [map only to the transcriptome]
63 -j/--raw-juncs <filename>
64 --insertions <filename>
65 --deletions <filename>
66 -r/--mate-inner-dist <int>
67 --mate-std-dev <int> [ default: 20 ]
68 --no-novel-juncs
69 --no-novel-indels
70 --no-gtf-juncs
71 --no-coverage-search
72 --coverage-search
73 --no-closure-search
74 --closure-search
75 --microexon-search
76 --butterfly-search
77 --no-butterfly-search
78 --keep-tmp
79 --tmp-dir <dirname> [ default: <output_dir>/tmp ]
80 -z/--zpacker <program> [ default: gzip ]
81 -X/--unmapped-fifo [use mkfifo to compress more temporary
82 files for color space reads]
83
84 Advanced Options:
85 -N/--initial-read-mismatches <int> [ default: 2 ]
86 --segment-mismatches <int> [ default: 2 ]
87 --segment-length <int> [ default: 25 ]
88 --bowtie-n [ default: bowtie -v ]
89 --min-closure-exon <int> [ default: 100 ]
90 --min-closure-intron <int> [ default: 50 ]
91 --max-closure-intron <int> [ default: 5000 ]
92 --min-coverage-intron <int> [ default: 50 ]
93 --max-coverage-intron <int> [ default: 20000 ]
94 --min-segment-intron <int> [ default: 50 ]
95 --max-segment-intron <int> [ default: 500000 ]
96 --no-sort-bam [Output BAM is not coordinate-sorted]
97 --no-convert-bam [Do not convert to bam format.
98 Output is <output_dir>/accepted_hits.sam.
99 Implies --no-sort-bam.]
100
101 SAM Header Options (for embedding sequencing run metadata in output):
102 --rg-id <string> (read group ID)
103 --rg-sample <string> (sample ID)
104 --rg-library <string> (library ID)
105 --rg-description <string> (descriptive string, no tabs allowed)
106 --rg-platform-unit <string> (e.g. Illumina lane ID)
107 --rg-center <string> (sequencing center name)
108 --rg-date <string> (ISO 8601 date of the sequencing run)
109 --rg-platform <string> (Sequencing platform descriptor)
110 '''
111
112 class Usage(Exception):
113 def __init__(self, msg):
114 self.msg = msg
115
116 output_dir = "./tophat_out/"
117 logging_dir = output_dir + "logs/"
118 run_log = None
119 run_cmd = None
120 tmp_dir = output_dir + "tmp/"
121 bin_dir = sys.path[0] + "/"
122 use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
123
124 use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads, handles unmapped reads in BAM format
125
126 use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
127 # enabled by the -X/--unmapped-fifo option (unless -z0)
128 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
129 # unmapped reads into a compressed file
130
131 samtools_path = None
132 bowtie_path = None
133 fail_str = "\t[FAILED]\n"
134
135 sam_header = tmp_dir + "stub_header.sam"
136
137 gtf_juncs = None #file name with junctions extracted from given GFF file
138
139
140 # TopHatParams captures all of the runtime parameters used by TopHat, and many
141 # of these are passed as command line options to executables run by the pipeline
142
143 # This class and its nested classes also do options parsing through parse_options()
144 # and option validation via the member function check()
145
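# A minimal driver sketch (hypothetical, for illustration only -- the script's
# real entry point further below follows this same pattern but does much more):
#   params = TopHatParams()
#   args = params.parse_options(sys.argv)  # returns the positional arguments
#   params.check()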
146
147 class BowtieOutputFiles():
148 def __init__(self,
149 mappings=None,
150 unmapped_reads=None,
151 multihit_reads=None):
152 self.mappings=mappings
153 self.unmapped_reads=unmapped_reads
154 self.multihit_reads=multihit_reads
155
156 class TopHatParams:
157
158 # SpliceConstraints is a group of runtime parameters that specify what
159 # constraints to put on junctions discovered by the program. These constraints
160 # are used to filter out spurious/false positive junctions.
161
162 class SpliceConstraints:
163 def __init__(self,
164 min_anchor_length,
165 min_intron_length,
166 max_intron_length,
167 splice_mismatches,
168 min_isoform_fraction):
169 self.min_anchor_length = min_anchor_length
170 self.min_intron_length = min_intron_length
171 self.max_intron_length = max_intron_length
172 self.splice_mismatches = splice_mismatches
173 self.min_isoform_fraction = min_isoform_fraction
174
175 def parse_options(self, opts):
176 for option, value in opts:
177 if option in ("-m", "--splice-mismatches"):
178 self.splice_mismatches = int(value)
180 elif option in ("-a", "--min-anchor", "--min-anchor-length"):
180 self.min_anchor_length = int(value)
181 elif option in ("-F", "--min-isoform-fraction"):
182 self.min_isoform_fraction = float(value)
183 elif option in ("-i", "--min-intron-length"):
184 self.min_intron_length = int(value)
185 elif option in ("-I", "--max-intron-length"):
186 self.max_intron_length = int(value)
187
188 def check(self):
189 if self.splice_mismatches not in [0,1,2]:
190 die("Error: arg to --splice-mismatches must be 0, 1, or 2")
191 if self.min_anchor_length < 4:
192 die("Error: arg to --min-anchor-len must be greater than 4")
193 if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
194 die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
195 if self.min_intron_length <= 0:
196 die("Error: arg to --min-intron-length must be greater than 0")
197 if self.max_intron_length <= 0:
198 die("Error: arg to --max-intron-length must be greater than 0")
199
200 # SystemParams is a group of runtime parameters that determine how to handle
201 # temporary files produced during a run and how many threads to use for threaded
202 # stages of the pipeline (e.g. Bowtie)
203
204 class SystemParams:
205 def __init__(self,
206 num_cpus,
207 keep_tmp):
208 self.num_cpus = num_cpus
209 self.keep_tmp = keep_tmp
210 self.zipper = "gzip"
211 self.zipper_opts= []
212
213 def parse_options(self, opts):
214 global use_zpacker
215 global use_BWT_FIFO
216 for option, value in opts:
217 if option in ("-p", "--num-threads"):
218 self.num_cpus = int(value)
219 elif option == "--keep-tmp":
220 self.keep_tmp = True
221 elif option in ("-z","--zpacker"):
222 if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
223 value=""
224 self.zipper = value
225 #if not self.zipper:
226 # self.zipper='gzip'
227 elif option in ("-X", "--unmapped-fifo"):
228 use_BWT_FIFO=True
229 if self.zipper:
230 use_zpacker=True
231 if self.num_cpus>1 and not self.zipper_opts:
232 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
233 self.zipper_opts.append('-p'+str(self.num_cpus))
234 else:
235 use_zpacker=False
236 if use_BWT_FIFO: use_BWT_FIFO=False
237 def cmd(self):
238 cmdline=[]
239 if self.zipper:
240 cmdline.extend(['-z',self.zipper])
241 if self.num_cpus>1:
242 cmdline.extend(['-p'+str(self.num_cpus)])
243 return cmdline
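# Example (hypothetical values): with zipper "pigz" and 4 CPUs, cmd()
# returns ['-z', 'pigz', '-p4'], ready to splice into a child command line.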
244
245 def check(self):
246 if self.num_cpus<1 :
247 die("Error: arg to --num-threads must be greater than 0")
248 if self.zipper:
249 xzip=which(self.zipper)
250 if not xzip:
251 die("Error: cannot find compression program "+self.zipper)
252
253 # ReadParams is a group of runtime parameters that specify various properties
254 # of the user's reads (e.g. which quality scale they are on, how long the
255 # fragments are, etc).
256 class ReadParams:
257 def __init__(self,
258 solexa_quals,
259 phred64_quals,
260 quals,
261 integer_quals,
262 color,
263 color_out,
264 library_type,
265 seed_length,
266 reads_format,
267 mate_inner_dist,
268 mate_inner_dist_std_dev,
269 read_group_id,
270 sample_id,
271 library_id,
272 description,
273 seq_platform_unit,
274 seq_center,
275 seq_run_date,
276 seq_platform):
277 self.solexa_quals = solexa_quals
278 self.phred64_quals = phred64_quals
279 self.quals = quals
280 self.integer_quals = integer_quals
281 self.color = color
282 self.color_out = color_out
283 self.library_type = library_type
284 self.seed_length = seed_length
285 self.reads_format = reads_format
286 self.mate_inner_dist = mate_inner_dist
287 self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
288 self.read_group_id = read_group_id
289 self.sample_id = sample_id
290 self.library_id = library_id
291 self.description = description
292 self.seq_platform_unit = seq_platform_unit
293 self.seq_center = seq_center
294 self.seq_run_date = seq_run_date
295 self.seq_platform = seq_platform
296
297 def parse_options(self, opts):
298 for option, value in opts:
299 if option == "--solexa-quals":
300 self.solexa_quals = True
301 elif option in ("--solexa1.3-quals", "--phred64-quals"):
302 self.phred64_quals = True
303 elif option in ("-Q", "--quals"):
304 self.quals = True
305 elif option == "--integer-quals":
306 self.integer_quals = True
307 elif option in ("-C", "--color"):
308 self.color = True
309 elif option == "--color-out":
310 self.color_out = True
311 elif option == "--library-type":
312 self.library_type = value
313 elif option in ("-s", "--seed-length"):
314 self.seed_length = int(value)
315 elif option in ("-r", "--mate-inner-dist"):
316 self.mate_inner_dist = int(value)
317 elif option == "--mate-std-dev":
318 self.mate_inner_dist_std_dev = int(value)
319 elif option == "--rg-id":
320 self.read_group_id = value
321 elif option == "--rg-sample":
322 self.sample_id = value
323 elif option == "--rg-library":
324 self.library_id = value
325 elif option == "--rg-description":
326 self.description = value
327 elif option == "--rg-platform-unit":
328 self.seq_platform_unit = value
329 elif option == "--rg-center":
330 self.seq_center = value
331 elif option == "--rg-date":
332 self.seq_run_date = value
333 elif option == "--rg-platform":
334 self.seq_platform = value
335
336 def check(self):
337 if self.seed_length and self.seed_length < 20:
338 die("Error: arg to --seed-length must be at least 20")
339 if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
340 die("Error: arg to --mate-std-dev must be at least 0")
341 if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
342 die("Error: --rg-id and --rg-sample must be specified or omitted together")
343
344 # SearchParams is a group of runtime parameters that specify how TopHat will
345 # search for splice junctions
346
347 class SearchParams:
348 def __init__(self,
349 min_closure_exon,
350 min_closure_intron,
351 max_closure_intron,
352 min_coverage_intron,
353 max_coverage_intron,
354 min_segment_intron,
355 max_segment_intron):
356
357 self.min_closure_exon_length = min_closure_exon
358 self.min_closure_intron_length = min_closure_intron
359 self.max_closure_intron_length = max_closure_intron
360 self.min_coverage_intron_length = min_coverage_intron
361 self.max_coverage_intron_length = max_coverage_intron
362 self.min_segment_intron_length = min_segment_intron
363 self.max_segment_intron_length = max_segment_intron
364
365 def parse_options(self, opts):
366 for option, value in opts:
367 if option == "--min-closure-exon":
368 self.min_closure_exon_length = int(value)
369 if option == "--min-closure-intron":
370 self.min_closure_intron_length = int(value)
371 if option == "--max-closure-intron":
372 self.max_closure_intron_length = int(value)
373 if option == "--min-coverage-intron":
374 self.min_coverage_intron_length = int(value)
375 if option == "--max-coverage-intron":
376 self.max_coverage_intron_length = int(value)
377 if option == "--min-segment-intron":
378 self.min_segment_intron_length = int(value)
379 if option == "--max-segment-intron":
380 self.max_segment_intron_length = int(value)
381
382 def check(self):
383 if self.min_closure_exon_length < 0:
384 die("Error: arg to --min-closure-exon must be at least 0")
385 if self.min_closure_intron_length < 0:
386 die("Error: arg to --min-closure-intron must be at least 0")
387 if self.max_closure_intron_length < 0:
388 die("Error: arg to --max-closure-intron must be at least 0")
389 if self.min_coverage_intron_length < 0:
390 die("Error: arg to --min-coverage-intron must be at least 0")
391 if self.max_coverage_intron_length < 0:
392 die("Error: arg to --max-coverage-intron must be at least 0")
393 if self.min_segment_intron_length < 0:
394 die("Error: arg to --min-segment-intron must be at least 0")
395 if self.max_segment_intron_length < 0:
396 die("Error: arg to --max-segment-intron must be at least 0")
397
398 class ReportParams:
399 def __init__(self):
400 self.sort_bam = True
401 self.convert_bam = True
402
403 def parse_options(self, opts):
404 for option, value in opts:
405 if option == "--no-sort-bam":
406 self.sort_bam = False
407 if option == "--no-convert-bam":
408 self.convert_bam = False
409 self.sort_bam = False
410
411
412 def __init__(self):
413 self.splice_constraints = self.SpliceConstraints(8, # min_anchor
414 50, # min_intron
415 500000, # max_intron
416 0, # splice_mismatches
417 0.15) # min_isoform_frac
418
419 self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]
420
421 self.read_params = self.ReadParams(False, # solexa_scale
422 False,
423 False, # quals
424 None, # integer quals
425 False, # SOLiD - color space
426 False, # SOLiD - color out instead of base pair,
427 "", # library type (e.g. "illumina-stranded-pair-end")
428 None, # seed_length
429 "fastq", # quality_format
430 None, # mate inner distance
431 20, # mate inner dist std dev
432 None, # read group id
433 None, # sample id
434 None, # library id
435 None, # description
436 None, # platform unit (i.e. lane)
437 None, # sequencing center
438 None, # run date
439 None) # sequencing platform
440
441 self.system_params = self.SystemParams(1, # bowtie_threads (num_cpus)
442 False) # keep_tmp
443
444 self.search_params = self.SearchParams(100, # min_closure_exon_length
445 50, # min_closure_intron_length
446 5000, # max_closure_intron_length
447 50, # min_coverage_intron_length
448 20000, # max_coverage_intron_length
449 50, # min_segment_intron_length
450 500000) # max_segment_intron_length
451
452 self.report_params = self.ReportParams()
453 self.gff_annotation = None
454 self.transcriptome_only = False
455 self.transcriptome_index = None
456 self.raw_junctions = None
457 self.find_novel_juncs = True
458 self.find_novel_indels = True
459 self.find_GFF_juncs = True
460 #self.skip_check_reads = False
461 self.max_hits = 20
462 self.t_max_hits = 60
463 self.prefilter_multi = False
464 self.initial_read_mismatches = 2
465 self.t_mismatches = 1
466 self.segment_length = 25
467 self.segment_mismatches = 2
468 self.bowtie_alignment_option = "-v"
469 self.max_insertion_length = 3
470 self.max_deletion_length = 3
471 self.raw_insertions = None
472 self.raw_deletions = None
473 self.coverage_search = None
474 self.closure_search = False
475 self.microexon_search = False
476 self.butterfly_search = None
477
478 def check(self):
479 self.splice_constraints.check()
480 self.read_params.check()
481 self.system_params.check()
482
483 if self.segment_length < 10:
484 die("Error: arg to --segment-length must be at least 10")
485 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
486 die("Error: arg to --segment-mismatches must be in [0, 3]")
487
488 if self.read_params.color and self.butterfly_search:
489 die("Error: butterfly-search in colorspace is not yet supported")
490
491 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
492
493 if self.read_params.library_type and self.read_params.library_type not in library_types:
494 die("Error: library-type should be one of: "+', '.join(library_types))
495
496 self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
497 self.search_params.max_closure_intron_length)
498
499 self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
500 self.search_params.max_segment_intron_length)
501
502 self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
503 self.search_params.max_coverage_intron_length)
504
505 if self.max_insertion_length >= self.segment_length:
506 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be equal to or greater than the segment length ("+str(self.segment_length)+")")
507
508 if self.max_insertion_length < 0:
509 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be less than 0")
510
511 if self.max_deletion_length >= self.splice_constraints.min_intron_length:
512 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")
513
514 if self.max_deletion_length < 0:
515 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be less than 0")
516
517 def cmd(self):
518 cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
519 "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
520 "--min-report-intron", str(self.splice_constraints.min_intron_length),
521 "--max-report-intron", str(self.splice_constraints.max_intron_length),
522 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
523 "--output-dir", output_dir,
524 "--max-multihits", str(self.max_hits),
525 "--segment-length", str(self.segment_length),
526 "--segment-mismatches", str(self.segment_mismatches),
527 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
528 "--min-closure-intron", str(self.search_params.min_closure_intron_length),
529 "--max-closure-intron", str(self.search_params.max_closure_intron_length),
530 "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
531 "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
532 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
533 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
534 "--max-mismatches", str(self.initial_read_mismatches),
535 "--sam-header", sam_header,
536 "--max-insertion-length", str(self.max_insertion_length),
537 "--max-deletion-length", str(self.max_deletion_length)]
538 cmd.extend(self.system_params.cmd())
539
540 if self.read_params.mate_inner_dist != None:
541 cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
542 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
543 if self.gff_annotation != None:
544 cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
545 if gtf_juncs:
546 cmd.extend(["--gtf-juncs", gtf_juncs])
547 if self.closure_search == False:
548 cmd.append("--no-closure-search")
549 if self.coverage_search == False:
550 cmd.append("--no-coverage-search")
551 if self.microexon_search == False:
552 cmd.append("--no-microexon-search")
553 if self.butterfly_search:
554 cmd.append("--butterfly-search")
555 if self.read_params.solexa_quals:
556 cmd.append("--solexa-quals")
557 if self.read_params.quals:
558 cmd.append("--quals")
559 if self.read_params.integer_quals:
560 cmd.append("--integer-quals")
561 if self.read_params.color:
562 cmd.append("--color")
563 if self.read_params.color_out:
564 cmd.append("--color-out")
565 if self.read_params.library_type:
566 cmd.extend(["--library-type", self.read_params.library_type])
567 if self.read_params.read_group_id:
568 cmd.extend(["--rg-id", self.read_params.read_group_id])
569 if self.read_params.phred64_quals:
570 cmd.append("--phred64-quals")
571 return cmd
572
573 # This is the master options parsing routine, which calls parse_options for
574 # the delegate classes (e.g. SpliceConstraints) that handle certain groups
575 # of options.
576 def parse_options(self, argv):
577 try:
578 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
579 ["version",
580 "help",
581 "output-dir=",
582 "solexa-quals",
583 "solexa1.3-quals",
584 "phred64-quals",
585 "quals",
586 "integer-quals",
587 "color",
588 "color-out",
589 "library-type=",
590 "num-threads=",
591 "splice-mismatches=",
592 "max-multihits=",
593 "min-isoform-fraction=",
594 "min-anchor-length=",
595 "min-intron-length=",
596 "max-intron-length=",
597 "GTF=",
598 "transcriptome-only",
599 "transcriptome-max-hits=",
600 "transcriptome-index=",
601 "raw-juncs=",
602 "no-novel-juncs",
603 "no-novel-indels",
604 "no-gtf-juncs",
605 # "skip-check-reads",
606 "mate-inner-dist=",
607 "mate-std-dev=",
608 "no-closure-search",
609 "no-coverage-search",
610 "closure-search",
611 "coverage-search",
612 "prefilter-multihits",
613 "microexon-search",
614 "min-closure-exon=",
615 "min-closure-intron=",
616 "max-closure-intron=",
617 "min-coverage-intron=",
618 "max-coverage-intron=",
619 "min-segment-intron=",
620 "max-segment-intron=",
621 "seed-length=",
622 "initial-read-mismatches=",
623 "read-mismatches=",
624 "max-mismatches=",
625 "transcriptome-mismatches=",
626 "segment-length=",
627 "segment-mismatches=",
628 "bowtie-n",
629 "butterfly-search",
630 "no-butterfly-search",
631 "keep-tmp",
632 "rg-id=",
633 "rg-sample=",
634 "rg-library=",
635 "rg-description=",
636 "rg-platform-unit=",
637 "rg-center=",
638 "rg-date=",
639 "rg-platform=",
640 "tmp-dir=",
641 "zpacker=",
642 "unmapped-fifo",
643 "max-insertion-length=",
644 "max-deletion-length=",
645 "insertions=",
646 "deletions=",
647 "no-sort-bam",
648 "no-convert-bam"])
649 except getopt.error, msg:
650 raise Usage(msg)
651
652 self.splice_constraints.parse_options(opts)
653 self.system_params.parse_options(opts)
654 self.read_params.parse_options(opts)
655 self.search_params.parse_options(opts)
656 self.report_params.parse_options(opts)
657 global use_BWT_FIFO
658 global use_BAM_Unmapped
659 if not self.read_params.color:
660 use_BWT_FIFO=False
661 use_BAM_Unmapped=True
662 global output_dir
663 global logging_dir
664 global tmp_dir
665 global sam_header
666
667 custom_tmp_dir = None
668 custom_out_dir = None
669 # option processing
670 for option, value in opts:
671 if option in ("-v", "--version"):
672 print "TopHat v%s" % (get_version())
673 sys.exit(0)
674 if option in ("-h", "--help"):
675 raise Usage(use_message)
676 if option in ("-g", "--max-multihits"):
677 self.max_hits = int(value)
678 if option in ("-x", "--transcriptome-max-hits"):
679 self.t_max_hits = int(value)
680 if option in ("-G", "--GTF"):
681 self.gff_annotation = value
682 if option in ("-T", "--transcriptome"):
683 self.transcriptome_only = True
684 if option in ("--transcriptome-index"):
685 self.transcriptome_index = value
686 if option in("-M", "--prefilter-multihits"):
687 self.prefilter_multi = True
688 if option in ("-j", "--raw-juncs"):
689 self.raw_junctions = value
690 if option == "--no-novel-juncs":
691 self.find_novel_juncs = False
692 if option == "--no-novel-indels":
693 self.find_novel_indels = False
694 if option == "--no-gtf-juncs":
695 self.find_GFF_juncs = False
696 #if option == "--skip-check-reads":
697 # self.skip_check_reads = True
698 if option == "--no-coverage-search":
699 self.coverage_search = False
700 if option == "--coverage-search":
701 self.coverage_search = True
702 if option == "--no-closure-search":
703 self.closure_search = False
704 if option == "--closure-search":
705 self.closure_search = True
706 if option == "--microexon-search":
707 self.microexon_search = True
708 if option == "--butterfly-search":
709 self.butterfly_search = True
710 if option == "--no-butterfly-search":
711 self.butterfly_search = False
712 if option in ("-N", "--initial-read-mismatches", "--read-mismatches", "--max-mismatches"):
713 self.initial_read_mismatches = int(value)
714 if option in ("-n", "--transcriptome-mismatches"):
715 self.t_mismatches = int(value)
716 if option == "--segment-length":
717 self.segment_length = int(value)
718 if option == "--segment-mismatches":
719 self.segment_mismatches = int(value)
720 if option == "--bowtie-n":
721 self.bowtie_alignment_option = "-n"
722 if option == "--max-insertion-length":
723 self.max_insertion_length = int(value)
724 if option == "--max-deletion-length":
725 self.max_deletion_length = int(value)
726 if option == "--insertions":
727 self.raw_insertions = value
728 if option == "--deletions":
729 self.raw_deletions = value
730 if option in ("-o", "--output-dir"):
731 custom_out_dir = value + "/"
732 if option == "--tmp-dir":
733 custom_tmp_dir = value + "/"
734 if self.transcriptome_only:
735 self.find_novel_juncs=False
736 self.find_novel_indels=False
737 if custom_out_dir:
738 output_dir = custom_out_dir
739 logging_dir = output_dir + "logs/"
740 tmp_dir = output_dir + "tmp/"
741 sam_header = tmp_dir + "stub_header.sam"
742 if custom_tmp_dir:
743 tmp_dir = custom_tmp_dir
744 sam_header = tmp_dir + "stub_header.sam"
745 if len(args) < 2:
746 raise Usage(use_message)
747 return args
748
749
750 def getFileDir(filepath):
751 #if fullpath, returns path including the ending /
752 fpath, fname=os.path.split(filepath)
753 if fpath: fpath+='/'
754 return fpath
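# Example (hypothetical paths): getFileDir("/data/reads/s_1.fq") -> "/data/reads/"
# and getFileDir("s_1.fq") -> ""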
755
756 def getFileBaseName(filepath):
757 fpath, fname=os.path.split(filepath)
758 fbase, fext =os.path.splitext(fname)
759 fx=fext.lower()
760 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
761 return fbase
762 elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
763 fb, fext = os.path.splitext(fbase)
764 fx=fext.lower()
765 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
766 return fb
767 else:
768 return fbase
769 else:
770 if len(fbase)>len(fext):
771 return fbase
772 else:
773 return fname
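# Examples (hypothetical names): "reads_1.fq" -> "reads_1",
# "reads_1.fq.gz" -> "reads_1", "reads_1.bwtout" -> "reads_1"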
774
775 # Returns the current time in a nice format
776 def right_now():
777 curr_time = datetime.now()
778 return curr_time.strftime("%c")
779
780 # The TopHat logging formatter
781 def th_log(out_str):
782 print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
783
784 # Ensures that the output, logging, and temp directories are present. If not,
785 # they are created
786 def prepare_output_dir():
787
788 th_log("Preparing output location "+output_dir)
789 if os.path.exists(output_dir):
790 pass
791 else:
792 os.mkdir(output_dir)
793
794 if os.path.exists(logging_dir):
795 pass
796 else:
797 os.mkdir(logging_dir)
798
799 if os.path.exists(tmp_dir):
800 pass
801 else:
802 try:
803 os.makedirs(tmp_dir)
804 except OSError, o:
805 die("\nError creating directory %s (%s)" % (tmp_dir, o))
806
807
808 # to be added as preexec_fn for every subprocess.Popen() call:
809 # see http://bugs.python.org/issue1652
810 def subprocess_setup():
811 # Python installs a SIGPIPE handler by default, which causes
812 # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
813 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
814
815 # Check that the Bowtie index specified by the user is present and all files
816 # are there.
817 def check_bowtie_index(idx_prefix):
818 th_log("Checking for Bowtie index files")
819
820 idx_fwd_1 = idx_prefix + ".1.ebwt"
821 idx_fwd_2 = idx_prefix + ".2.ebwt"
822 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
823 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
824
825 if os.path.exists(idx_fwd_1) and \
826 os.path.exists(idx_fwd_2) and \
827 os.path.exists(idx_rev_1) and \
828 os.path.exists(idx_rev_2):
829 return
830 else:
831 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
832 if bowtie_idx_env_var == None:
833 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
834 idx_prefix = bowtie_idx_env_var + idx_prefix
835 idx_fwd_1 = idx_prefix + ".1.ebwt"
836 idx_fwd_2 = idx_prefix + ".2.ebwt"
837 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
838 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
839
840 if os.path.exists(idx_fwd_1) and \
841 os.path.exists(idx_fwd_2) and \
842 os.path.exists(idx_rev_1) and \
843 os.path.exists(idx_rev_2):
844 return
845 else:
846 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
847
848 # Reconstructs the multifasta file from which the Bowtie index was created, if
849 # it's not already there.
850 def bowtie_idx_to_fa(idx_prefix):
851 idx_name = idx_prefix.split('/')[-1]
852 th_log("Reconstituting reference FASTA file from Bowtie index")
853
854 try:
855 tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
856 tmp_fasta_file = open(tmp_fasta_file_name, "w")
857
858 inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
859
860 inspect_cmd = [prog_path("bowtie-inspect"),
861 idx_prefix]
862
863 print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
864 ret = subprocess.call(inspect_cmd,
865 stdout=tmp_fasta_file,
866 stderr=inspect_log)
867
868 # Bowtie reported an error
869 if ret != 0:
870 die(fail_str+"Error: bowtie-inspect returned an error")
871
872 # Bowtie not found
873 except OSError, o:
874 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
875 die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
876
877 return tmp_fasta_file_name
878
879 # Checks whether the multifasta file for the genome is present alongside the
880 # Bowtie index files for it.
881 def check_fasta(idx_prefix):
882 th_log("Checking for reference FASTA file")
883 idx_fasta = idx_prefix + ".fa"
884 if os.path.exists(idx_fasta):
885 return idx_fasta
886 else:
887 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
888 if bowtie_idx_env_var:
889 idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
890 if os.path.exists(idx_fasta):
891 return idx_fasta
892
893 print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
894 idx_fa = bowtie_idx_to_fa(idx_prefix)
895 return idx_fa
896
897 # Check that both the Bowtie index and the genome's fasta file are present
898 def check_index(idx_prefix):
899 check_bowtie_index(idx_prefix)
900 ref_fasta_file = check_fasta(idx_prefix)
901
902 return (ref_fasta_file, None)
903
904 # Retrieve a tuple containing the system's version of Bowtie. Parsed from
905 # `bowtie --version`
906 def get_bowtie_version():
907 try:
908 # Launch Bowtie to capture its version info
909 proc = subprocess.Popen([bowtie_path, "--version"],
910 stdout=subprocess.PIPE)
911 stdout_value = proc.communicate()[0]
912 bowtie_version = None
913 bowtie_out = repr(stdout_value)
914
915 # Find the version identifier
916 version_str = "bowtie version "
917 ver_str_idx = bowtie_out.find(version_str)
918 if ver_str_idx != -1:
919 nl = bowtie_out.find("\\n", ver_str_idx)
920 version_val = bowtie_out[ver_str_idx + len(version_str):nl]
921 bowtie_version = [int(x) for x in version_val.split('.')]
922 if len(bowtie_version) == 3:
923 bowtie_version.append(0)
924
925 return bowtie_version
926 except OSError, o:
927 errmsg=fail_str+str(o)+"\n"
928 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
929 errmsg+="Error: bowtie not found on this system"
930 die(errmsg)
931
932 def get_index_sam_header(read_params, idx_prefix):
933 try:
934 bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
935 bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")
936
937 bowtie_header_cmd = [bowtie_path, "--sam"]
938 if read_params.color:
939 bowtie_header_cmd.append('-C')
940 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
941 subprocess.call(bowtie_header_cmd,
942 stdout=bowtie_sam_header_file,
943 stderr=open('/dev/null', 'w'))
944
945 bowtie_sam_header_file.close()
946 bowtie_sam_header_file = open(bowtie_sam_header_filename,"r")
947
948 sam_header_file = open(sam_header, "w")
949
950 preamble = []
951 sq_dict_lines = []
952
953 for line in bowtie_sam_header_file.readlines():
954 line = line.strip()
955 if line.find("@SQ") != -1:
956 # Sequence dictionary record
957 cols = line.split('\t')
958 seq_name = None
959 for col in cols:
960 fields = col.split(':')
961 #print fields
962 if len(fields) > 0 and fields[0] == "SN":
963 seq_name = fields[1]
964 if seq_name == None:
965 die("Error: malformed sequence dictionary in sam header")
966 sq_dict_lines.append([seq_name,line])
967 elif line.find("CL") != -1:
968 continue
969 else:
970 preamble.append(line)
971 #for line in preamble:
972 # print >> sam_header_file, line
973 print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
974
975 if read_params.read_group_id and read_params.sample_id:
976 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
977 read_params.sample_id)
978 if read_params.library_id:
979 rg_str += "\tLB:%s" % read_params.library_id
980 if read_params.description:
981 rg_str += "\tDS:%s" % read_params.description
982 if read_params.seq_platform_unit:
983 rg_str += "\tPU:%s" % read_params.seq_platform_unit
984 if read_params.seq_center:
985 rg_str += "\tCN:%s" % read_params.seq_center
986 if read_params.mate_inner_dist:
987 rg_str += "\tPI:%s" % read_params.mate_inner_dist
988 if read_params.seq_run_date:
989 rg_str += "\tDT:%s" % read_params.seq_run_date
990 if read_params.seq_platform:
991 rg_str += "\tPL:%s" % read_params.seq_platform
992
993 print >> sam_header_file, rg_str
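# e.g. for --rg-id rg1 --rg-sample s1 --rg-library lib1 (hypothetical values),
# the line printed above is "@RG\tID:rg1\tSM:s1\tLB:lib1"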
994
995 sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
996 for [name, line] in sq_dict_lines:
997 print >> sam_header_file, line
998 print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
999
1000 sam_header_file.close()
1001
1002 return sam_header
1003
1004 except OSError, o:
1005 errmsg=fail_str+str(o)+"\n"
1006 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1007 errmsg+="Error: bowtie not found on this system"
1008 die(errmsg)
1009
1010 # Make sure Bowtie is installed and is recent enough to be useful
1011 def check_bowtie():
1012 th_log("Checking for Bowtie")
1013 global bowtie_path
1014 bowtie_path=prog_path("bowtie")
1015 bowtie_version = get_bowtie_version()
1016 if bowtie_version == None:
1017 die("Error: Bowtie not found on this system")
1018 elif bowtie_version < [0, 12, 3, 0]: # lexicographic version comparison
1019 die("Error: TopHat requires Bowtie 0.12.3 or later")
1020 print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1021
1022
1023 # Retrieve a tuple containing the system's version of samtools. Parsed from
1024 # `samtools`
1025 def get_samtools_version():
1026 try:
1027 # Launch samtools to capture its version info
1028 proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1029 samtools_out = proc.communicate()[1]
1030
1031 # Find the version identifier
1032 version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
1033 samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1034 if version_match.group(4):
1035 samtools_version_arr.append(version_match.group(4))
1036 else:
1037 samtools_version_arr.append(0)
1038
1039 return version_match.group(), samtools_version_arr
1040 except OSError, o:
1041 errmsg=fail_str+str(o)+"\n"
1042 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1043 errmsg+="Error: samtools not found on this system"
1044 die(errmsg)
1045
1046 # Make sure the SAM tools are installed and are recent enough to be useful
1047 def check_samtools():
1048 th_log("Checking for Samtools")
1049 global samtools_path
1050 samtools_path=prog_path("samtools")
1051 samtools_version_str, samtools_version_arr = get_samtools_version()
1052 if samtools_version_str == None:
1053 die("Error: Samtools not found on this system")
1054 elif samtools_version_arr[0:3] < [0, 1, 7]: # lexicographic version comparison
1055 die("Error: TopHat requires Samtools 0.1.7 or later")
1056 print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1057
1058
1059
1060 class FastxReader:
1061 def __init__(self, i_file, is_color=0, fname=''):
1062 self.bufline=None
1063 self.format=None
1064 self.ifile=i_file
1065 self.nextRecord=None
1066 self.eof=None
1067 self.fname=fname
1068 self.lastline=None
1069 self.numrecords=0
1070 self.isColor=0
1071 if is_color : self.isColor=1
1072 # determine file type
1073 #no records processed yet, skip custom header lines if any
1074 hlines=10 # allow maximum 10 header lines
1075 self.lastline=" "
1076 while hlines>0 and (not self.lastline or self.lastline[0] not in "@>") :
1077 self.lastline=self.ifile.readline()
1078 hlines-=1
1079 if self.lastline and self.lastline[0] == '@':
1080 self.format='fastq'
1081 self.nextRecord=self.nextFastq
1082 elif self.lastline and self.lastline[0] == '>':
1083 self.format='fasta'
1084 self.nextRecord=self.nextFasta
1085 else:
1086 die("Error: cannot determine record type in input file %s" % fname)
1087 self.bufline=self.lastline
1088 self.lastline=None
1089
1090 def nextFastq(self):
1091 # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1092 seqid,seqstr,qstr,seq_len='','','',0
1093 if self.eof: return (seqid, seqstr, seq_len, qstr)
1094 fline=self.getLine #shortcut to save a bit of time
1095 line=fline()
1096
1097 if not line : return (seqid, seqstr, seq_len, qstr)
1098 while len(line.rstrip())==0: # skip empty lines
1099 line=fline()
1100 if not line : return (seqid, seqstr,seq_len, qstr)
1101 try:
1102 if line[0] != "@":
1103 raise ValueError("Records in Fastq files should start with '@' character")
1104 seqid = line[1:].rstrip()
1105 seqstr = fline().rstrip()
1106
1107 #There may now be more sequence lines, or the "+" quality marker line:
1108 while True:
1109 line = fline()
1110 if not line:
1111 raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1112 if line[0] == "+":
1113 # -- sequence string ended
1114 #qtitle = line[1:].rstrip()
1115 #if qtitle and qtitle != seqid:
1116 # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1117 # % (seqid, qtitle))
1118 break
1119 seqstr += line.rstrip() #removes trailing newlines
1120 #loop until + found
1121 seq_len = len(seqstr)
1122 #at least one line of quality data should follow
1123 qstrlen=0
1124 #now read next lines as quality values until seq_len is reached
1125 while True:
1126 line=fline()
1127 if not line : break #end of file
1128 qstr += line.rstrip()
1129 qstrlen=len(qstr)
1130 if qstrlen + self.isColor >= seq_len :
1131 break # qv string has reached the length of seq string
1132 #loop until qv has the same length as seq
1133 if self.isColor:
1134 # and qstrlen==seq_len :
1135 if qstrlen==seq_len:
1136 #qual string may have a dummy qv at the beginning, should be stripped
1137 qstr = qstr[1:]
1138 qstrlen -= 1
1139 if qstrlen!=seq_len-1:
1140 raise ValueError("Length mismatch between sequence and quality strings "+ \
1141 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1142 else:
1143 if seq_len != qstrlen :
1144 raise ValueError("Length mismatch between sequence and quality strings "+ \
1145 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1146 except ValueError, err:
1147 die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1148 #return the record
1149 self.numrecords+=1
1150 ##--discard the primer base [NO]
1151 if self.isColor :
1152 seq_len-=1
1153 seqstr = seqstr[1:]
1154 return (seqid, seqstr, seq_len, qstr)
1155
1156 def nextFasta(self):
1157 # returning tuple: (seqID, sequence_string, seq_len, None)
1158 seqid,seqstr,seq_len='','',0
1159 fline=self.getLine # shortcut to readline function of f
1160 line=fline() # this will use the buffer line if it's there
1161 if not line : return (seqid, seqstr, seq_len, None)
1162 while len(line.rstrip())==0: # skip empty lines
1163 line=fline()
1164 if not line : return (seqid, seqstr, seq_len, None)
1165 try:
1166 if line[0] != ">":
1167 raise ValueError("Records in Fasta files must start with '>' character")
1168 seqid = line[1:].split()[0]
1169 #more sequence lines, or the ">" quality marker line:
1170 while True:
1171 line = fline()
1172 if not line: break
1173 if line[0] == '>':
1174 #next sequence starts here
1175 self.ungetLine()
1176 break
1177 seqstr += line.rstrip()
1178 #loop until '>' found
1179 seq_len = len(seqstr)
1180 if seq_len < 3:
1181 raise ValueError("Read %s too short (%i)." \
1182 % (seqid, seq_len))
1183 except ValueError, err:
1184 die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1185 #return the record and continue
1186 self.numrecords+=1
1187 if self.isColor : # -- discard primer base
1188 seq_len-=1
1189 seqstr=seqstr[1:]
1190 return (seqid, seqstr, seq_len, None)
1191
1192 def getLine(self):
1193 if self.bufline: #return previously buffered line
1194 r=self.bufline
1195 self.bufline=None
1196 return r
1197 else: #read a new line from stream and return it
1198 if self.eof: return None
1199 self.lastline=self.ifile.readline()
1200 if not self.lastline:
1201 self.eof=1
1202 return None
1203 return self.lastline
1204 def ungetLine(self):
1205 if self.lastline is None:
1206 print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
1207 self.bufline=self.lastline
1208 self.lastline=None
1209 #< class FastxReader
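# Usage sketch (hypothetical file name), mirroring check_reads_format() below:
#   zf = open("reads_1.fq")
#   freader = FastxReader(zf, is_color=0, fname="reads_1.fq")
#   seqid, seqstr, seq_len, qstr = freader.nextRecord()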
1210
1211 def fa_write(fhandle, seq_id, seq):
1212 """
1213 Write to a file in the FASTA format.
1214
1215 Arguments:
1216 - `fhandle`: A file handle open for writing
1217 - `seq_id`: The sequence id string for this sequence
1218 - `seq`: An unformatted string of the sequence to write
1219 """
1220 line_len = 60
1221 fhandle.write(">" + seq_id + "\n")
1222 for i in xrange(0, len(seq), line_len):
1223 start = i
1224 end = min(start + line_len, len(seq))
1225 fhandle.write( seq[ start:end ] + "\n")
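# Example (hypothetical handle/sequence): fa_write(fh, "chr_test", "ACGT"*20)
# writes ">chr_test" followed by the 80 bases wrapped at 60 characters per line.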
1226
1227 class ZReader:
1228 def __init__(self, filename, sysparams, guess=True):
1229 self.fname=filename
1230 self.file=None
1231 self.fsrc=None
1232 self.popen=None
1233 pipecmd=[]
1234 s=filename.lower()
1235 if s.endswith(".bam"):
1236 pipecmd=["bam2fastx", "-"]
1237 else:
1238 if guess:
1239 if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1240 pipecmd=['gzip']
1241 else:
1242 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1243 pipecmd=['bzip2']
1244 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1245 die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
1246 if len(pipecmd)>0:
1247 if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1248 pipecmd[0]=sysparams.zipper
1249 pipecmd.extend(sysparams.zipper_opts)
1250 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1251 pipecmd[0]=sysparams.zipper
1252 pipecmd.extend(sysparams.zipper_opts)
1253 else: #not guessing, but must still check if it's a compressed file
1254 if use_zpacker and filename.endswith(".z"):
1255 pipecmd=[sysparams.zipper]
1256 pipecmd.extend(sysparams.zipper_opts)
1257 if pipecmd:
1258 pipecmd+=['-cd']
1259 if pipecmd:
1260 try:
1261 self.fsrc=open(self.fname, 'rb')
1262 self.popen=subprocess.Popen(pipecmd,
1263 preexec_fn=subprocess_setup,
1264 stdin=self.fsrc,
1265 stdout=subprocess.PIPE, close_fds=True)
1266 except Exception:
1267 die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1268 self.file=self.popen.stdout
1269 else:
1270 self.file=open(filename)
1271 def close(self):
1272 if self.fsrc: self.fsrc.close()
1273 self.file.close()
1274 if self.popen:
1275 self.popen.wait()
1276 self.popen=None
1277
1278 class ZWriter:
1279 def __init__(self, filename, sysparams):
1280 self.fname=filename
1281 if use_zpacker:
1282 pipecmd=[sysparams.zipper,"-cf", "-"]
1283 self.ftarget=open(filename, "wb")
1284 try:
1285 self.popen=subprocess.Popen(pipecmd,
1286 preexec_fn=subprocess_setup,
1287 stdin=subprocess.PIPE,
1288 stdout=self.ftarget, close_fds=True)
1289 except Exception:
1290 die("Error: could not open writer pipe "+' '.join(pipecmd)+' > '+ self.fname)
1291 self.file=self.popen.stdin # client writes to this end of the pipe
1292 else: #no compression
1293 self.file=open(filename, "w")
1294 self.ftarget=None
1295 self.popen=None
1296 def close(self):
1297 self.file.close()
1298 if self.ftarget: self.ftarget.close()
1299 if self.popen:
1300 self.popen.wait() #! required to actually flush the pipes (eek!)
1301 self.popen=None
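# Usage sketch (hypothetical names): both ZReader and ZWriter expose a
# file-like .file member; close() must be called to flush and reap the
# child (de)compression process, if one was started:
#   zw = ZWriter(tmp_dir + "example.fq.z", params.system_params)
#   zw.file.write("@read1\nACGTACGT\n+\nIIIIIIII\n")
#   zw.close()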
1302
1303 # check_reads_format() examines the first few records in the user files
1304 # to determine the file format
1305 def check_reads_format(params, reads_files):
1306 #seed_len = params.read_params.seed_length
1307 fileformat = params.read_params.reads_format
1308
1309 observed_formats = set([])
1310 # observed_scales = set([])
1311 min_seed_len = 99999
1312 max_seed_len = 0
1313 files = reads_files.split(',')
1314
1315 for f_name in files:
1316 #try:
1317 zf = ZReader(f_name, params.system_params)
1318 #except IOError:
1319 # die("Error: could not open file "+f_name)
1320 freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1321 toread=4 #just sample the first 4 reads
1322 while toread>0:
1323 seqid, seqstr, seq_len, qstr = freader.nextRecord()
1324 if not seqid: break
1325 toread-=1
1326 if seq_len < 20:
1327 print >> sys.stderr, "Warning: found a read < 20bp in", f_name
1328 else:
1329 min_seed_len = min(seq_len, min_seed_len)
1330 max_seed_len = max(seq_len, max_seed_len)
1331 zf.close()
1332 observed_formats.add(freader.format)
1333 if len(observed_formats) > 1:
1334 die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1335 fileformat=list(observed_formats)[0]
1336 #if seed_len != None:
1337 # seed_len = max(seed_len, max_seed_len)
1338 #else:
1339 # seed_len = max_seed_len
1340 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1341 print >> sys.stderr, "\tformat:\t\t %s" % fileformat
1342 if fileformat == "fastq":
1343 quality_scale = "phred33 (default)"
1344 if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1345 quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1346 elif params.read_params.phred64_quals:
1347 quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1348 print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
1349 elif fileformat == "fasta":
1350 if params.read_params.color:
1351 params.read_params.integer_quals = True
1352
1353 #print seed_len, format, solexa_scale
1354 #NOTE: seed_len will be re-evaluated later by prep_reads
1355 return TopHatParams.ReadParams(params.read_params.solexa_quals,
1356 params.read_params.phred64_quals,
1357 params.read_params.quals,
1358 params.read_params.integer_quals,
1359 params.read_params.color,
1360 params.read_params.color_out,
1361 params.read_params.library_type,
1362 #seed_len,
1363 params.read_params.seed_length,
1364 fileformat,
1365 params.read_params.mate_inner_dist,
1366 params.read_params.mate_inner_dist_std_dev,
1367 params.read_params.read_group_id,
1368 params.read_params.sample_id,
1369 params.read_params.library_id,
1370 params.read_params.description,
1371 params.read_params.seq_platform_unit,
1372 params.read_params.seq_center,
1373 params.read_params.seq_run_date,
1374 params.read_params.seq_platform)
1375
1376 def log_tail(logfile, lines=1):
1377 f=open(logfile, "r")
1378 f.seek(0, 2)
1379 fbytes= f.tell()
1380 size=lines
1381 block=-1
1382 while size > 0 and fbytes+block*1024 > 0:
1383 if (fbytes+block*1024 > 0):
1384 ##Seek back once more, if possible
1385 f.seek( block*1024, 2 )
1386 else:
1387 #Seek to the beginning
1388 f.seek(0, 0)
1389 data= f.read( 1024 )
1390 linesFound= data.count('\n')
1391 size -= linesFound
1392 block -= 1
1393 if (fbytes + block*1024 > 0):
1394 f.seek(block*1024, 2)
1395 else:
1396 f.seek(0,0)
1397 f.readline() # find a newline
1398 lastBlocks= list( f.readlines() )
1399 f.close()
1400 return "\n".join(lastBlocks[-lines:])
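# e.g. log_tail(log_fname, 5) returns roughly the last 5 lines of the log;
# it is used below to append log context to error messages.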
1401
1402 # Format a DateTime as a pretty string.
1403 # FIXME: Currently doesn't support days!
1404 def formatTD(td):
1405 hours = td.seconds // 3600
1406 minutes = (td.seconds % 3600) // 60
1407 seconds = td.seconds % 60
1408 return '%02d:%02d:%02d' % (hours, minutes, seconds)
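# e.g. a timedelta of 1 hour, 2 minutes and 5 seconds formats as "01:02:05"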
1409
1410 class PrepReadsInfo:
1411 def __init__(self, fname, side):
1412 try:
1413 f=open(fname,"r")
1414 self.min_len=int(f.readline().split("=")[-1])
1415 self.max_len=int(f.readline().split("=")[-1])
1416 self.in_count=int(f.readline().split("=")[-1])
1417 self.out_count=int(f.readline().split("=")[-1])
1418 if (self.out_count==0) or (self.max_len<16):
1419 raise Exception()
1420 except Exception, e:
1421 die(fail_str+"Error retrieving prep_reads info.")
1422 print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1423 if self.min_len < 20:
1424 print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places"
1425
1426
1427 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, filter_reads=None, hits_to_filter=None):
1428 # generate the prep_reads command-line arguments
1429 filter_cmd = [prog_path("prep_reads")]
1430 filter_cmd.extend(params.cmd())
1431
1432 if params.read_params.reads_format == "fastq":
1433 filter_cmd += ["--fastq"]
1434 elif params.read_params.reads_format == "fasta":
1435 filter_cmd += ["--fasta"]
1436 if hits_to_filter:
1437 filter_cmd += ["--flt-hits="+hits_to_filter]
1438 if aux_file:
1439 filter_cmd += ["--aux-outfile="+aux_file]
1440 if filter_reads:
1441 filter_cmd += ["--flt-reads="+filter_reads]
1442 filter_cmd.append(reads_list)
1443 if quals_list:
1444 filter_cmd.append(quals_list)
1445 return filter_cmd
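# The result is a full argv list (hypothetical input file), e.g.:
#   ['prep_reads'] + params.cmd() + ['--fastq', '--aux-outfile=...', 'reads_1.fq']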
1446
1447 # Calls the prep_reads executable, which prepares an internal read library.
1448 # The read library features reads with monotonically increasing integer IDs.
1449 # prep_reads also filters out very low complexity or garbage reads as well as
1450 # polyA reads.
1451 #--> returns the tuple (internal_read_library_file, read_info)
1452 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1453 reads_suffix = ".fq"
1454 if use_zpacker: reads_suffix += ".z"
1455 output_name=side+"_kept_reads"
1456 kept_reads_filename = tmp_dir + output_name + reads_suffix
1457
1458 if os.path.exists(kept_reads_filename):
1459 os.remove(kept_reads_filename)
1460 kept_reads = open(kept_reads_filename, "wb")
1461 log_fname=logging_dir + "prep_reads.log"
1462 filter_log = open(log_fname,"w")
1463
1464 info_file=output_dir+output_name+".info"
1465 filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, prefilter_reads)
1466 shell_cmd = ' '.join(filter_cmd)
1467 #finally, add the compression pipe
1468 zip_cmd=[]
1469 if use_zpacker:
1470 zip_cmd=[ params.system_params.zipper ]
1471 zip_cmd.extend(params.system_params.zipper_opts)
1472 zip_cmd.extend(['-c','-'])
1473 shell_cmd +=' | '+' '.join(zip_cmd)
1474 shell_cmd += ' >' +kept_reads_filename
1475 retcode=0
1476 try:
1477 print >> run_log, shell_cmd
1478 if use_zpacker:
1479 filter_proc = subprocess.Popen(filter_cmd,
1480 stdout=subprocess.PIPE,
1481 stderr=filter_log)
1482 zip_proc=subprocess.Popen(zip_cmd,
1483 preexec_fn=subprocess_setup,
1484 stdin=filter_proc.stdout,
1485 stdout=kept_reads)
1486 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1487 zip_proc.communicate()
1488 retcode=filter_proc.poll()
1489 if retcode==0:
1490 retcode=zip_proc.poll()
1491 else:
1492 retcode = subprocess.call(filter_cmd,
1493 stdout=kept_reads,
1494 stderr=filter_log)
1495 if retcode:
1496 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1497
1498 except OSError, o:
1499 errmsg=fail_str+str(o)
1500 die(errmsg+"\n"+log_tail(log_fname))
1501 kept_reads.close()
1502
1503 return kept_reads_filename, PrepReadsInfo(info_file, side)
1504
1505
1506 # Call bowtie
1507 def bowtie(params,
1508 bwt_idx_prefix,
1509 reads_list,
1510 reads_format,
1511 num_mismatches,
1512 mapped_reads,
1513 unmapped_reads,
1514 extra_output = "",
1515 t_mapping = False,
1516 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1517 start_time = datetime.now()
1518 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1519 reads_file=reads_list[0]
1520 readfile_basename=getFileBaseName(reads_file)
1521 if t_mapping:
1522 print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with Bowtie %s" % (start_time.strftime("%c"),
1523 readfile_basename, bwt_idx_name, extra_output)
1524 else:
1525 print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
1526 readfile_basename, bwt_idx_name, extra_output)
1527 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1528 #bwt_mapped=mapped_reads
1529 unmapped_reads_out=None
1530 if unmapped_reads:
1531 unmapped_reads_out=unmapped_reads+".fq"
1532 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1533 if use_FIFO:
1534 global unmapped_reads_fifo
1535 unmapped_reads_fifo=unmapped_reads+".fifo"
1536 if os.path.exists(unmapped_reads_fifo):
1537 os.remove(unmapped_reads_fifo)
1538 try:
1539 os.mkfifo(unmapped_reads_fifo)
1540 except OSError, o:
1541 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1542 unmapped_reads_out+=".z"
1543
1544 # Launch Bowtie
1545 try:
1546 bowtie_cmd = [bowtie_path]
1547
1548 if reads_format == "fastq":
1549 bowtie_cmd += ["-q"]
1550 elif reads_format == "fasta":
1551 bowtie_cmd += ["-f"]
1552
1553 if params.read_params.color:
1554 bowtie_cmd += ["-C", "--col-keepends"]
1555 if unmapped_reads:
1556 bowtie_cmd += ["--un", unmapped_reads_out]
1557
1558 unzip_cmd=None
1559 bam_input=False
1560 if reads_file.endswith('.bam'):
1561 bam_input=True
1562 unzip_cmd=[ prog_path('bam2fastx') ]
1563 if reads_format:
1564 unzip_cmd.append("--"+reads_format)
1565 unzip_cmd+=[reads_file]
1566
1567 if use_zpacker and (unzip_cmd is None):
1568 unzip_cmd=[ params.system_params.zipper ]
1569 unzip_cmd.extend(params.system_params.zipper_opts)
1570
1571 fifo_pid=None
1572 if use_FIFO:
1573 unm_zipcmd=[ params.system_params.zipper ]
1574 unm_zipcmd.extend(params.system_params.zipper_opts)
1575 unm_zipcmd+=['-c']
1576 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1577 fifo_pid=os.fork()
1578 if fifo_pid==0:
1579 def on_sig_exit(sig, func=None):
1580 os._exit(os.EX_OK)
1581 signal.signal(signal.SIGTERM, on_sig_exit)
1582 subprocess.call(unm_zipcmd,
1583 stdin=open(unmapped_reads_fifo, "r"),
1584 stdout=open(unmapped_reads_out, "wb"))
1585 os._exit(os.EX_OK)
1586
1587 fix_map_cmd = [prog_path('fix_map_ordering')]
1588 samzip_cmd=None
1589 if unmapped_reads:
1590 #mapping on the genome
1591 #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1592 mapped_reads += ".bam"
1593 if params.read_params.color:
1594 fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
1595 else:
1596 unmapped_reads_out = unmapped_reads+".bam"
1597 fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads_out]
1598 else:
1599 #mapping on segment_juncs (spliced potential junctions)
1600 # don't use SAM headers, could be slow -- better use compressed SAM directly
1601 mapped_reads += ".sam"
1602 fix_map_cmd += ["-"]
1603 if use_zpacker:
1604 fix_map_cmd += ["-"]
1605 samzip_cmd=[ params.system_params.zipper ]
1606 samzip_cmd.extend(params.system_params.zipper_opts)
1607 samzip_cmd += ["-c"]
1608 mapped_reads += ".z"
1609 else:
1610 fix_map_cmd += [mapped_reads]
1611
1612 max_hits = params.max_hits
1613 if t_mapping:
1614 max_hits = params.t_max_hits
1615 if params.bowtie_alignment_option == "-n" and num_mismatches>3:
1616 num_mismatches=3
1617 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1618 "-p", str(params.system_params.num_cpus),
1619 "-k", str(max_hits),
1620 "-m", str(max_hits),
1621 "-S", "--sam-nohead"] #always use headerless SAM file
1622
1623 if multihits_out:
1624 bowtie_cmd += ["--max", multihits_out]
1625 else:
1626 bowtie_cmd += ["--max", "/dev/null"]
1627 bowtie_cmd += [ bwt_idx_prefix ]
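# For reference, with the defaults from the usage text (-v alignment mode,
# 2 mismatches, 1 thread, 20 max multihits) the genome-mapping command
# assembled above comes out roughly as (illustrative file names; the reads
# argument, or '-' for piped input, is appended further below):
#   bowtie -q --un unmapped.fq -v 2 -p 1 -k 20 -m 20 -S --sam-nohead \
#          --max /dev/null <bwt_idx_prefix> <reads.fq>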
1628 bowtie_proc=None
1629 shellcmd=""
1630 unzip_proc=None
1631 samzip_proc=None
1632 if multihits_out:
1633 #special prefilter bowtie run: we use prep_reads on the fly
1634 #in order to get multi-mapped reads to exclude later
1635 prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
1636 preplog_fname=logging_dir + "prep_reads_prefilter.log"
1637 prepfilter_log = open(preplog_fname,"w")
1638 unzip_proc = subprocess.Popen(prep_cmd,
1639 stdout=subprocess.PIPE,
1640 stderr=prepfilter_log)
1641 shellcmd=' '.join(prep_cmd) + "|"
1642 else:
1643 z_input=use_zpacker and reads_file.endswith(".z")
1644 if z_input:
1645 unzip_proc = subprocess.Popen(unzip_cmd,
1646 stdin=open(reads_file, "rb"),
1647 stdout=subprocess.PIPE)
1648 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
1649 else:
1650 #must be uncompressed fastq input (unmapped reads from a previous run)
1651 #or a BAM file with unmapped reads
1652 if bam_input:
1653 unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
1654 shellcmd=' '.join(unzip_cmd) + "|"
1655 else:
1656 bowtie_cmd += [reads_file]
1657 bowtie_proc = subprocess.Popen(bowtie_cmd,
1658 stdout=subprocess.PIPE, stderr=bwt_log)
1659 if unzip_proc:
1660 #input is compressed OR prep_reads is used as a filter
1661 bowtie_cmd += ['-']
1662 bowtie_proc = subprocess.Popen(bowtie_cmd,
1663 stdin=unzip_proc.stdout,
1664 stdout=subprocess.PIPE,
1665 stderr=bwt_log)
1666 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
1667
1668 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
1669 if samzip_cmd:
1670 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
1671 fix_order_proc = subprocess.Popen(fix_map_cmd,
1672 stdin=bowtie_proc.stdout,
1673 stdout=subprocess.PIPE)
1674 bowtie_proc.stdout.close() # close our copy so fix_map_ordering sees EOF (same issue7678 pattern as above)
1675 samzip_proc = subprocess.Popen(samzip_cmd,
1676 preexec_fn=subprocess_setup,
1677 stdin=fix_order_proc.stdout,
1678 stdout=open(mapped_reads, "wb"))
1679 fix_order_proc.stdout.close()
1680 else:
1681 #write BAM output directly
1682 fix_order_proc = subprocess.Popen(fix_map_cmd,
1683 stdin=bowtie_proc.stdout)
1684 # stdout=open(mapped_reads, "w"))
1685 bowtie_proc.stdout.close()
1686
1687 # shellcmd += " > " + mapped_reads
1688 print >> run_log, shellcmd
1689 if samzip_proc:
1690 samzip_proc.communicate()
1691 else:
1692 fix_order_proc.communicate()
1693 if use_FIFO:
1694 if fifo_pid and not os.path.exists(unmapped_reads_out):
1695 try:
1696 os.kill(fifo_pid, signal.SIGTERM)
1697 except:
1698 pass
1699 except OSError, o:
1700 die(fail_str+"Error: "+str(o))
1701
1702 # Success
1703 #finish_time = datetime.now()
1704 #duration = finish_time - start_time
1705 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
1706 if use_FIFO:
1707 try:
1708 os.remove(unmapped_reads_fifo)
1709 except:
1710 pass
1711 if multihits_out and not os.path.exists(multihits_out):
1712 open(multihits_out, "w").close()
1713 return (mapped_reads, unmapped_reads_out)
1714
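# bowtie() above chains unzip | bowtie | fix_map_ordering [| zipper] through
# subprocess pipes, closing the parent's copy of each upstream stdout
# (http://bugs.python.org/issue7678) so the consumer sees EOF when its
# producer exits. A minimal sketch of the same wiring -- _demo_three_stage_pipe
# is a hypothetical helper with stand-in commands, not called by this script:
def _demo_three_stage_pipe(infile, outfile):
    import subprocess
    p1 = subprocess.Popen(["gzip", "-dc", infile], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
    p1.stdout.close()   # drop our copy so p2 sees EOF when p1 finishes
    p3 = subprocess.Popen(["gzip", "-c"], stdin=p2.stdout,
                          stdout=open(outfile, "wb"))
    p2.stdout.close()
    p3.communicate()    # wait for the tail of the pipeline
    return p3.returncode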
1715 def bowtie_segment(params,
1716 bwt_idx_prefix,
1717 reads_file,
1718 reads_format,
1719 mapped_reads,
1720 unmapped_reads = None,
1721 extra_output = ""):
1722
1723 backup_bowtie_alignment_option = params.bowtie_alignment_option
1724 params.bowtie_alignment_option = "-v"
1725 params.max_hits *= 2
1726
1727 result = bowtie(params,
1728 bwt_idx_prefix,
1729 [reads_file],
1730 reads_format,
1731 params.segment_mismatches,
1732 mapped_reads,
1733 unmapped_reads,
1734 extra_output)
1735
1736 params.bowtie_alignment_option = backup_bowtie_alignment_option
1737 params.max_hits /= 2
1738 return result
1739
1740 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1741 def get_gtf_juncs(gff_annotation):
1742 th_log("Reading known junctions from GTF file")
1743 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1744
1745 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1746
1747 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1748 gtf_juncs_out = open(gtf_juncs_out_name, "w")
1749
1750 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1751 try:
1752 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
1753 retcode = subprocess.call(gtf_juncs_cmd,
1754 stderr=gtf_juncs_log,
1755 stdout=gtf_juncs_out)
1756 # gtf_juncs returned an error
1757 if retcode == 1:
1758 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1759 return (False, gtf_juncs_out_name)
1760 elif retcode != 0:
1761 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1762 # gtf_juncs not found
1763 except OSError, o:
1764 errmsg=fail_str+str(o)+"\n"
1765 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1766 errmsg+="Error: gtf_juncs not found on this system"
1767 die(errmsg)
1768 return (True, gtf_juncs_out_name)
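# Typical use (illustrative; "genes.gtf" is a stand-in name):
#   found, juncs_file = get_gtf_juncs("genes.gtf")
# 'found' is False when gtf_juncs reports no junctions in the annotation;
# juncs_file is tmp_dir + "genes.juncs" either way.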
1769
1770 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
1771 def build_juncs_bwt_index(external_splice_prefix, color):
1772 th_log("Indexing splices")
1773 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1774
1775 #user_splices_out_prefix = output_dir + "user_splices_idx"
1776
1777 bowtie_build_cmd = [prog_path("bowtie-build")]
1778 if color:
1779 bowtie_build_cmd += ["-C"]
1780
1781 bowtie_build_cmd += [external_splice_prefix + ".fa",
1782 external_splice_prefix]
1783 try:
1784 print >> run_log, " ".join(bowtie_build_cmd)
1785 retcode = subprocess.call(bowtie_build_cmd,
1786 stdout=bowtie_build_log)
1787
1788 if retcode != 0:
1789 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1790 except OSError, o:
1791 errmsg=fail_str+str(o)+"\n"
1792 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1793 errmsg+="Error: bowtie-build not found on this system"
1794 die(errmsg)
1795 return external_splice_prefix
1796
1797 # Build a splice index from a .juncs file, suitable for use with specified read
1798 # (or read segment) lengths
1799 def build_juncs_index(min_anchor_length,
1800 max_seg_len,
1801 juncs_prefix,
1802 external_juncs,
1803 external_insertions,
1804 external_deletions,
1805 reference_fasta,
1806 color):
1807 th_log("Retrieving sequences for splices")
1808
1809 juncs_file_list = ",".join(external_juncs)
1810 insertions_file_list = ",".join(external_insertions)
1811 deletions_file_list = ",".join(external_deletions)
1812
1813
1814 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1815
1816 external_splices_out_prefix = tmp_dir + juncs_prefix
1817 external_splices_out_name = external_splices_out_prefix + ".fa"
1818
1819 external_splices_out = open(external_splices_out_name, "w")
1820 # juncs_db_cmd = [bin_dir + "juncs_db",
1821 juncs_db_cmd = [prog_path("juncs_db"),
1822 str(min_anchor_length),
1823 str(max_seg_len),
1824 juncs_file_list,
1825 insertions_file_list,
1826 deletions_file_list,
1827 reference_fasta]
1828 try:
1829 print >> run_log, " ".join(juncs_db_cmd)
1830 retcode = subprocess.call(juncs_db_cmd,
1831 stderr=juncs_db_log,
1832 stdout=external_splices_out)
1833
1834 if retcode != 0:
1835 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1836 # juncs_db not found
1837 except OSError, o:
1838 errmsg=fail_str+str(o)+"\n"
1839 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1840 errmsg+="Error: juncs_db not found on this system"
1841 die(errmsg)
1842
1843 external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1844 return external_splices_out_prefix
1845
1846 def build_idx_from_fa(fasta_fname, out_dir):
1847 """ Build a bowtie index from a FASTA file.
1848
1849 Arguments:
1850 - `fasta_fname`: File path to FASTA file.
1851 - `out_dir`: Output directory to place the index in (must include the trailing os.sep).
1852
1853 Returns:
1854 - The path to the Bowtie index.
1855 """
1856 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
1857 bowtie_idx_cmd = [prog_path("bowtie-build"),
1858 "-f",
1859 fasta_fname,
1860 bwt_idx_path]
1861 try:
1862 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
1863 print >> run_log, " ".join(bowtie_idx_cmd)
1864 retcode = subprocess.call(bowtie_idx_cmd,
1865 stdout=open(os.devnull, "w"),
1866 stderr=open(os.devnull, "w"))
1867 if retcode != 0:
1868 die(fail_str + "Error: Couldn't build bowtie index with err = "
1869 + str(retcode))
1870 except OSError, o:
1871 errmsg=fail_str+str(o)+"\n"
1872 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1873 errmsg+="Error: bowtie-build not found on this system"
1874 die(errmsg)
1875
1876 return bwt_idx_path
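# Typical use (illustrative; assumes bowtie-build is on the PATH and that
# out_dir ends with os.sep, as the docstring requires):
#   idx = build_idx_from_fa("transcripts.fa", tmp_dir)
#   # writes tmp_dir/transcripts.*.ebwt and returns tmp_dir + "transcripts"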
1877
1878 # Print out the sam header, embedding the user's specified library properties.
1879 # FIXME: also needs SQ dictionary lines
1880 def write_sam_header(read_params, sam_file):
1881 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1882 if read_params.read_group_id and read_params.sample_id:
1883 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1884 read_params.sample_id)
1885 if read_params.library_id:
1886 rg_str += "\tLB:%s" % read_params.library_id
1887 if read_params.description:
1888 rg_str += "\tDS:%s" % read_params.description
1889 if read_params.seq_platform_unit:
1890 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1891 if read_params.seq_center:
1892 rg_str += "\tCN:%s" % read_params.seq_center
1893 if read_params.mate_inner_dist:
1894 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1895 if read_params.seq_run_date:
1896 rg_str += "\tDT:%s" % read_params.seq_run_date
1897 if read_params.seq_platform:
1898 rg_str += "\tPL:%s" % read_params.seq_platform
1899
1900 print >> sam_file, rg_str
1901 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
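# A header produced by the function above looks like this (worked example
# with hypothetical --rg-id/--rg-sample values; fields are tab-separated):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:RG1  SM:S1  LB:lib1
#   @PG  ID:TopHat  VN:1.4.0  CL:tophat ...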
1902
1903 # Write final TopHat output, via tophat_reports and wiggles
1904 def compile_reports(params, sam_header_filename, mappings, readfiles, gff_annotation):
1905 th_log("Reporting output tracks")
1906 left_maps, right_maps = mappings
1907 left_reads, right_reads = readfiles
1908 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1909 left_maps = ','.join(left_maps)
1910
1911 if len(right_maps) > 0:
1912 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1913 right_maps = ','.join(right_maps)
1914 log_fname = logging_dir + "reports.log"
1915 report_log = open(log_fname, "w")
1916 junctions = output_dir + "junctions.bed"
1917 insertions = output_dir + "insertions.bed"
1918 deletions = output_dir + "deletions.bed"
1919 coverage = "coverage.wig"
1920 accepted_hits = output_dir + "accepted_hits"
1921 report_cmdpath = prog_path("tophat_reports")
1922 report_cmd = [report_cmdpath]
1923 report_cmd.extend(params.cmd())
1924 report_cmd.extend(["--samtools="+samtools_path])
1925 report_cmd.extend([junctions,
1926 insertions,
1927 deletions,
1928 "-",
1929 left_maps,
1930 left_reads])
1931 if len(right_maps) > 0 and right_reads:
1932 report_cmd.append(right_maps)
1933 report_cmd.append(right_reads)
1934 # -- tophat_reports now produces (uncompressed) BAM stream,
1935 # directly piped into samtools sort
1936 try:
1937 if params.report_params.convert_bam:
1938 if params.report_params.sort_bam:
1939 report_proc=subprocess.Popen(report_cmd,
1940 preexec_fn=subprocess_setup,
1941 stdout=subprocess.PIPE,
1942 stderr=report_log)
1943
1944 bamsort_cmd = [samtools_path, "sort", "-", accepted_hits]
1945 bamsort_proc= subprocess.Popen(bamsort_cmd,
1946 preexec_fn=subprocess_setup,
1947 stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1948 stdin=report_proc.stdout)
1949 report_proc.stdout.close()
1950 print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1951 bamsort_proc.communicate()
1952 retcode = report_proc.poll()
1953 if retcode:
1954 die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1955 else:
1956 print >> run_log, " ".join(report_cmd)
1957 report_proc=subprocess.call(report_cmd,
1958 preexec_fn=subprocess_setup,
1959 stdout=open(accepted_hits,"wb"),
1960 stderr=report_log)
1961 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1962 else:
1963 print >> run_log, " ".join(report_cmd)
1964 report_proc=subprocess.call(report_cmd,
1965 preexec_fn=subprocess_setup,
1966 stdout=open(accepted_hits,"wb"),
1967 stderr=report_log)
1968 tmp_sam = output_dir + "accepted_hits.sam"
1969
1970 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
1971 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1972 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1973 tmp_sam_file = open(tmp_sam, "w")
1974 ret = subprocess.call(bam_to_sam_cmd,
1975 stdout=tmp_sam_file,
1976 stderr=bam_to_sam_log)
1977 tmp_sam_file.close()
1978 os.remove(accepted_hits)
1979 #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1980 #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1981
1982 except OSError, o:
1983 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1984
1985 # FIXME: put wiggles back!
1986 # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1987 # print >> run_log, " ".join(wig_cmd)
1988 # subprocess.call(wig_cmd,
1989 # stderr=open("/dev/null"))
1990 return (coverage, junctions)
1991
1992
1993 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ
1994 # file for each segment. This function needs to be fixed to support mixed
1995 # read-length inputs.
1996
1997 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1998 i = num_files_prev + 1
1999 while i <= num_files:
2000 segfname=prefix+("_seg%d" % i)+extension
2001 out_segf.append(ZWriter(segfname,params.system_params))
2002 i += 1
2003
2004 def split_reads(reads_filename,
2005 prefix,
2006 fasta,
2007 params,
2008 segment_length):
2009 #reads_file = open(reads_filename)
2010 zreads = ZReader(reads_filename, params.system_params, False)
2011 out_segfiles = []
2012
2013 if fasta:
2014 extension = ".fa"
2015 else:
2016 extension = ".fq"
2017 if use_zpacker: extension += ".z"
2018 def convert_color_to_bp(color_seq):
2019 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2020 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2021 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2022 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2023 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2024 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2025
2026 base = color_seq[0]
2027 bp_seq = base
2028 for ch in color_seq[1:]:
2029 base = decode_dic[base+ch]
2030 bp_seq += base
2031 return bp_seq
2032
2033 def convert_bp_to_color(bp_seq):
2034 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2035 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2036 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2037 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2038 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2039 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2040 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2041 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2042 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2043
2044 base = bp_seq[0]
2045 color_seq = base
2046 for ch in bp_seq[1:]:
2047 color_seq += encode_dic[base + ch]
2048 base = ch
2049
2050 return color_seq
2051
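# Worked example: the two tables above are inverse mappings over the four
# colors, so decoding then re-encoding round-trips:
#   convert_color_to_bp("T320") -> "TAGG"   (T, 3->A, 2->G, 0->G)
#   convert_bp_to_color("TAGG") -> "T320"
# ambiguous positions ('N' or '.') decode to 'N' and re-encode as color '4'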
2052 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2053 if color:
2054 color_offset = 1
2055 read_seq_temp = convert_color_to_bp(read_seq)
2056
2057 seg_num = 1
2058 while seg_num + 1 < len(offsets):
2059 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2060 return
2061 seg_num += 1
2062 else:
2063 color_offset = 0
2064
2065 seg_num = 0
2066 last_seq_offset = 0
2067 while seg_num + 1 < len(offsets):
2068 f = out_segf[seg_num].file
2069 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2070 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2071 if color:
2072 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2073 else:
2074 print >> f, seg_seq
2075 if not fasta:
2076 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2077 print >> f, "+"
2078 print >> f, seg_qual
2079 seg_num += 1
2080 last_seq_offset = offsets[seg_num]
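# Worked example: each segment is renamed "<read>|<offset>:<seg_num>:<num_segs>",
# so a 75bp read @r1 split with segment_length=25 yields records named
#   @r1|0:0:3   @r1|25:1:3   @r1|50:2:3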
2081
2082 line_state = 0
2083 read_name = ""
2084 read_seq = ""
2085 read_quals = ""
2086 num_segments = 0
2087 offsets = []
2088 for line in zreads.file:
2089 if line.strip() == "":
2090 continue
2091 if line_state == 0:
2092 read_name = line.strip()
2093 elif line_state == 1:
2094 read_seq = line.strip()
2095
2096 read_length = len(read_seq)
2097 tmp_num_segments = read_length / segment_length
2098 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2099
2100 # Bowtie's minimum read length here is 20bp, so if the last segment
2101 # is between 20 and segment_length bp long, go ahead and write it out
2102 if read_length % segment_length >= 20:
2103 offsets.append(read_length)
2104 tmp_num_segments += 1
2105 else:
2106 offsets[-1] = read_length
2107
2108 if tmp_num_segments == 1:
2109 offsets = [0, read_length]
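# Worked example, with segment_length=25:
#   read_length=90 -> 90/25 = 3 segments; remainder 15 < 20, so
#                     offsets=[0,25,50,90] (the last segment absorbs it)
#   read_length=95 -> remainder 20 >= 20, so a 4th segment is kept:
#                     offsets=[0,25,50,75,95]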
2110
2111 if tmp_num_segments > num_segments:
2112 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2113 num_segments = tmp_num_segments
2114
2115 if fasta:
2116 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2117 elif line_state == 2:
2118 line = line.strip()
2119 else:
2120 read_quals = line.strip()
2121 if not fasta:
2122 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2123
2124 line_state += 1
2125 if fasta:
2126 line_state %= 2
2127 else:
2128 line_state %= 4
2129 zreads.close()
2130 out_fnames=[]
2131 for zf in out_segfiles:
2132 zf.close()
2133 out_fnames.append(zf.fname)
2134 #return [o.fname for o in out_segfiles]
2135 return out_fnames
2136
2137 # Find possible splice junctions using the "closure search" strategy, and report
2138 # them in closure.juncs. Calls the executable closure_juncs
2139 def junctions_from_closures(params,
2140 left_maps,
2141 right_maps,
2142 ref_fasta):
2143 th_log("Searching for junctions via mate-pair closures")
2144
2145 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2146 #if len(maps) == 0:
2147 # return None
2148 #print >> sys.stderr, left_maps
2149 slash = left_maps[0].rfind('/')
2150 juncs_out = ""
2151 if slash != -1:
2152 juncs_out += left_maps[0][:slash+1]
2153 juncs_out += "closure.juncs"
2154
2155 juncs_log = open(logging_dir + "closure.log", "w")
2156 juncs_cmdpath=prog_path("closure_juncs")
2157 juncs_cmd = [juncs_cmdpath]
2158
2159 left_maps = ','.join(left_maps)
2160 right_maps = ','.join(right_maps)
2161
2162 juncs_cmd.extend(params.cmd())
2163 juncs_cmd.extend([juncs_out,
2164 ref_fasta,
2165 left_maps,
2166 right_maps])
2167 try:
2168 print >> run_log, ' '.join(juncs_cmd)
2169 retcode = subprocess.call(juncs_cmd,
2170 stderr=juncs_log)
2171
2172 # closure_juncs returned an error
2173 if retcode != 0:
2174 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2175 # closure_juncs not found
2176 except OSError, o:
2177 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2178 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2179 die(str(o))
2180 return [juncs_out]
2181
2182 # Find possible junctions by examining coverage and split segments in the initial
2183 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2184 # segment.insertions, and segment.deletions. Calls the executable
2185 # segment_juncs
2186 def junctions_from_segments(params,
2187 left_reads,
2188 left_reads_map,
2189 left_seg_maps,
2190 right_reads,
2191 right_reads_map,
2192 right_seg_maps,
2193 unmapped_reads,
2194 reads_format,
2195 ref_fasta):
2196 if left_reads_map != left_seg_maps[0]:
2197 th_log("Searching for junctions via segment mapping")
2198 out_path=getFileDir(left_seg_maps[0])
2199 juncs_out=out_path+"segment.juncs"
2200 insertions_out=out_path+"segment.insertions"
2201 deletions_out =out_path+"segment.deletions"
2202
2203 left_maps = ','.join(left_seg_maps)
2204 log_fname = logging_dir + "segment_juncs.log"
2205 segj_log = open(log_fname, "w")
2206 segj_cmd = [prog_path("segment_juncs")]
2207
2208 segj_cmd.extend(params.cmd())
2209
2210 segj_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2211 ref_fasta,
2212 juncs_out,
2213 insertions_out,
2214 deletions_out,
2215 left_reads,
2216 left_reads_map,
2217 left_maps])
2218 if right_seg_maps:
2219 right_maps = ','.join(right_seg_maps)
2220 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2221 try:
2222 print >> run_log, " ".join(segj_cmd)
2223 retcode = subprocess.call(segj_cmd,
2224 preexec_fn=subprocess_setup,
2225 stderr=segj_log)
2226
2227 # segment_juncs returned an error
2228 if retcode != 0:
2229 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2230 # segment_juncs not found
2231 except OSError, o:
2232 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2233 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2234 die(str(o))
2235
2236 return [juncs_out, insertions_out, deletions_out]
2237
2238 # Joins mapped segments into full-length read alignments via the executable
2239 # long_spanning_reads
2240 def join_mapped_segments(params,
2241 sam_header_filename,
2242 reads,
2243 ref_fasta,
2244 possible_juncs,
2245 possible_insertions,
2246 possible_deletions,
2247 contig_seg_maps,
2248 spliced_seg_maps,
2249 alignments_out_name):
2250 rn=""
2251 if len(contig_seg_maps)>1:
2252 th_log("Joining segment hits")
2253 rn=".segs"
2254 else:
2255 th_log("Processing bowtie hits")
2256 contig_seg_maps = ','.join(contig_seg_maps)
2257
2258 possible_juncs = ','.join(possible_juncs)
2259 possible_insertions = ",".join(possible_insertions)
2260 possible_deletions = ",".join(possible_deletions)
2261 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2262 align_log = open(log_fname, "w")
2263 align_cmd = [prog_path("long_spanning_reads")]
2264
2265 align_cmd.extend(params.cmd())
2266
2267 align_cmd.append(ref_fasta)
2268 align_cmd.extend([ reads,
2269 possible_juncs,
2270 possible_insertions,
2271 possible_deletions,
2272 contig_seg_maps])
2273
2274 if spliced_seg_maps:
2275 spliced_seg_maps = ','.join(spliced_seg_maps)
2276 align_cmd.append(spliced_seg_maps)
2277
2278 try:
2279 print >> run_log, " ".join(align_cmd),">",alignments_out_name
2280 join_proc=subprocess.Popen(align_cmd,
2281 preexec_fn=subprocess_setup,
2282 stdout=open(alignments_out_name, "wb"),
2283 stderr=align_log)
2284 join_proc.communicate()
2285 retcode=join_proc.poll()
2286 if retcode:
2287 die(fail_str+"Error at 'long_spanning_reads'\n"+log_tail(log_fname))
2288 except OSError, o:
2289 die(fail_str+"Error: "+str(o))
2290
2291
2292 # This class collects spliced and unspliced alignments for each of the
2293 # left and right read files provided by the user.
2294 class Maps:
2295 def __init__(self,
2296 unspliced_sam,
2297 seg_maps,
2298 unmapped_segs,
2299 segs):
2300 self.unspliced_sam = unspliced_sam
2301 self.seg_maps = seg_maps
2302 self.unmapped_segs = unmapped_segs
2303 self.segs = segs
2304
2305 # Map2GTF stuff
2306 def m2g_convert_coords(params, gtf_fname, reads, out_fname):
2307 """ajjkljlks
2308
2309 Arguments:
2310 - `params`: TopHat parameters
2311 - `gtf_fname`: File name pointing to the annotation.
2312 - `reads`: The reads to convert coords (in Bowtie format).
2313 - `out_fname`: The file name pointing to the output.
2314 """
2315 m2g_cmd = [prog_path("map2gtf")]
2316 m2g_cmd.extend(params.cmd())
2317 m2g_cmd.append(gtf_fname)
2318 m2g_cmd.append(reads) #could be BAM file
2319 m2g_cmd.append(out_fname)
2320 fbasename = getFileBaseName(reads)
2321 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2322 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2323
2324 try:
2325 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2326 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2327 ret = subprocess.call(m2g_cmd,
2328 stdout=open(m2g_log, "w"),
2329 stderr=open(m2g_err, "w"))
2330 if ret != 0:
2331 die(fail_str + " Error: map2gtf returned an error")
2332 except OSError, o:
2333 err_msg = fail_str + str(o)
2334 die(err_msg + "\n")
2335
2336
2337 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2338 """ Make the transcriptome from a GTF.
2339
2340 Arguments:
2341 - `trans_gtf`: GTF file describing the transcripts.
2342 - `genome`: the reference genome (FASTA).
2343 - `out_fname`: output FASTA file for the transcript sequences.
2344 """
2345 # build and run the gtf_to_fasta command
2346 g2f_cmd = [prog_path("gtf_to_fasta")]
2347 g2f_cmd.extend(params.cmd())
2348 g2f_cmd.append(trans_gtf)
2349 g2f_cmd.append(genome)
2350 g2f_cmd.append(out_fname)
2351
2352 g2f_log = logging_dir + "g2f.out"
2353 g2f_err = logging_dir + "g2f.err"
2354
2355 try:
2356 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2357 ret = subprocess.call(g2f_cmd,
2358 stdout = open(g2f_log, "w"),
2359 stderr = open(g2f_err, "w"))
2360 if ret != 0:
2361 die(fail_str + " Error: gtf_to_fasta returned an error.")
2362 except OSError, o:
2363 err_msg = fail_str + str(o)
2364 die(err_msg + "\n")
2365
2366 def map2gtf(params, ref_fasta, left_reads, right_reads):
2367 """ Main GTF mapping function
2368
2369 Arguments:
2370 - `params`: The TopHat parameters.
2371 - `ref_fasta`: The reference genome.
2372 - `left_reads`: A list of reads.
2373 - `right_reads`: A list of reads (empty if single-end).
2374
2375 Returns:
2376 A tuple (sam_gtf_list, unmapped_gtf_list): the converted alignment files and the unmapped-read files, one entry per read side.
2377 """
2378 test_input_file(params.gff_annotation)
2379
2380 # th_log("Reading in GTF file: " + params.gff_annotation)
2381 # transcripts = gtf_to_transcripts(params.gff_annotation)
2382
2383 gtf_name = getFileBaseName(params.gff_annotation)
2384 m2g_bwt_idx = None
2385 if params.transcriptome_index:
2386 m2g_bwt_idx = params.transcriptome_index
2387 #th_log("Using pre-built transcriptome index..")
2388 else:
2389 th_log("Creating multi-FASTA from GTF and genome")
2390 m2g_ref_fasta = tmp_dir + gtf_name + ".fa"
2391 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2392 # trans_hash = transcripts_to_fasta(transcripts,
2393 # ref_fasta,
2394 # m2g_ref_fasta)
2395 m2g_bwt_idx = build_idx_from_fa(m2g_ref_fasta, tmp_dir)
2396 mapped_gtf_list = []
2397 unmapped_gtf_list = []
2398 # do the initial mapping in GTF coordinates
2399 for reads in [left_reads, right_reads]:
2400 if reads is None or os.path.getsize(reads) < 25:
2401 continue
2402 fbasename = getFileBaseName(reads)
2403 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
2404 #if use_zpacker:
2405 # mapped_gtf_out+=".z"
2406
2407 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
2408 #if use_BWT_FIFO:
2409 # unmapped_gtf += ".z"
2410
2411 (mapped_gtf_map, unmapped) = bowtie(params,
2412 m2g_bwt_idx,
2413 [reads],
2414 "fastq",
2415 params.t_mismatches,
2416 mapped_gtf_out,
2417 unmapped_gtf,
2418 "", True)
2419 mapped_gtf_list.append(mapped_gtf_map)
2420 unmapped_gtf_list.append(unmapped)
2421
2422 sam_gtf_list = []
2423 for reads in mapped_gtf_list:
2424 fbasename = getFileBaseName(reads)
2425 sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2426 m2g_convert_coords(params,
2427 params.gff_annotation,
2428 reads,
2429 sam_out_fname)
2430 sam_gtf_list.append(sam_out_fname)
2431
2432 if len(sam_gtf_list) < 2:
2433 sam_gtf_list.append(None)
2434 unmapped_gtf_list.append(None)
2435
2436 return (sam_gtf_list, unmapped_gtf_list)
2437 # end Map2GTF
2438
2439 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
2440 ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
2441 ##
2442 #this is accomplished by a special prep_reads usage (triggered by --flt-hits)
2443
2444 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
2445 out_mappings,
2446 target_reads,
2447 params.preflt_data[ri].mappings)
2448 um_reads = open(out_unmapped, "wb")
2449 log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
2450 filter_log = open(log_fname,"w")
2451
2452 shell_cmd = " ".join(prep_cmd)
2453 #add the compression pipe
2454 zip_cmd=[]
2455 if use_zpacker:
2456 zip_cmd=[ params.system_params.zipper ]
2457 zip_cmd.extend(params.system_params.zipper_opts)
2458 zip_cmd.extend(['-c','-'])
2459 shell_cmd +=' | '+' '.join(zip_cmd)
2460 shell_cmd += ' >' + out_unmapped
2461 retcode=0
2462 try:
2463 print >> run_log, shell_cmd
2464 if use_zpacker:
2465 prep_proc = subprocess.Popen(prep_cmd,
2466 stdout=subprocess.PIPE,
2467 stderr=filter_log)
2468 zip_proc = subprocess.Popen(zip_cmd,
2469 preexec_fn=subprocess_setup,
2470 stdin=prep_proc.stdout,
2471 stdout=um_reads)
2472 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
2473 zip_proc.communicate()
2474 retcode=prep_proc.poll()
2475 if retcode==0:
2476 retcode=zip_proc.poll()
2477 else:
2478 retcode = subprocess.call(prep_cmd,
2479 stdout=um_reads,
2480 stderr=filter_log)
2481 if retcode:
2482 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2483
2484 except OSError, o:
2485 errmsg=fail_str+str(o)
2486 die(errmsg+"\n"+log_tail(log_fname))
2487 um_reads.close()
2488
2489 return (out_mappings, out_unmapped)
2490
2491
2492 # The main alignment routine of TopHat. This function executes most of the
2493 # workflow, producing a set of candidate alignments for each cDNA fragment in a
2494 # pair of SAM alignment files (for paired-end reads).
2495 def spliced_alignment(params,
2496 bwt_idx_prefix,
2497 sam_header_filename,
2498 ref_fasta,
2499 read_len,
2500 segment_len,
2501 prepared_reads,
2502 user_supplied_junctions,
2503 user_supplied_insertions,
2504 user_supplied_deletions):
2505
2506 possible_juncs = []
2507 possible_juncs.extend(user_supplied_junctions)
2508
2509 possible_insertions = []
2510 possible_insertions.extend(user_supplied_insertions)
2511 possible_deletions = []
2512 possible_deletions.extend(user_supplied_deletions)
2513
2514 left_reads, right_reads = prepared_reads
2515
2516 maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data
2517 #single_segments = False
2518
2519 # Before anything, map the reads using Map2GTF (if using annotation)
2520 m2g_maps = [ None, None ] # left, right
2521 initial_reads = [ left_reads, right_reads ]
2522
2523 if params.gff_annotation:
2524 (mapped_gtf_list, unmapped_gtf_list) = \
2525 map2gtf(params, ref_fasta, left_reads, right_reads)
2526
2527 m2g_left_maps, m2g_right_maps = mapped_gtf_list
2528 m2g_maps = [m2g_left_maps, m2g_right_maps]
2529 if params.transcriptome_only or not os.path.exists(unmapped_gtf_list[0]):
2530 # The case where the user doesn't want to map to anything other
2531 # than the transcriptome OR we have no unmapped reads
2532 maps[0] = [m2g_left_maps]
2533 if right_reads:
2534 maps[1] = [m2g_right_maps]
2535 return maps
2536 # Feed the unmapped reads into spliced_alignment()
2537 initial_reads = unmapped_gtf_list[:]
2538
2539 th_log("Resuming TopHat pipeline with unmapped reads")
2540
2541 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
2542 #to be extracted into segment_juncs.fa
2543
2544 num_segs = int(read_len / segment_len)
2545 if (read_len % segment_len) >= min(segment_len-2, 20):
2546 #remainder is shorter but long enough to become a new segment
2547 num_segs += 1
2548 else:
2549 # the last segment is longer
2550 if num_segs>1: max_seg_len += (read_len % segment_len)
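# Worked example, with segment_len=25: read_len=60 gives num_segs=2 and
# remainder 10 < 20, so the last segment is extended (max_seg_len=25+10=35);
# read_len=70 gives remainder 20 >= 20, so a third, 20bp segment is added.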
2551
2552 # daehwan - check this out
2553 if num_segs <= 1:
2554 print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2555
2556 # Using the num_segs value returned by check_reads(),
2557 # decide which junction discovery strategy to use
2558 if num_segs<3:
2559 #if params.butterfly_search != False:
2560 # params.butterfly_search = True
2561 if params.coverage_search != False:
2562 params.coverage_search = True
2563 if num_segs == 1:
2564 segment_len = read_len
2565 else: #num_segs >= 3:
2566 # if we have at least three segments, just use split segment search,
2567 # which is the most sensitive and specific, fastest, and lightest-weight.
2568 # so unless specifically requested, disable the other junction searches
2569 if params.closure_search != True:
2570 params.closure_search = False
2571 if params.coverage_search != True:
2572 params.coverage_search = False
2573 if params.butterfly_search != True:
2574 params.butterfly_search = False
2575 # Perform the first part of the TopHat work flow on the left and right
2576 # reads of paired ends separately - we'll use the pairing information later
2577 for ri in (0,1):
2578 reads=initial_reads[ri]
2579 if reads is None or os.path.getsize(reads) < 25:
2580 continue
2581 fbasename=getFileBaseName(reads)
2582 unspliced_out = tmp_dir + fbasename + ".mapped"
2583 unspliced_sam = None
2584 unmapped_reads = None
2585 #if use_zpacker: unspliced_out+=".z"
2586 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2587 if params.prefilter_multi:
2588 #unmapped_unspliced += ".z"
2589 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
2590 else:
2591 # Perform the initial Bowtie mapping of the full length reads
2592 (unspliced_sam, unmapped_reads) = bowtie(params,
2593 bwt_idx_prefix,
2594 [reads],
2595 "fastq",
2596 params.initial_read_mismatches,
2597 unspliced_out,
2598 unmapped_unspliced)
2599
2600 # unspliced_sam = tmp_dir+fbasename+'.unspl.bam'
2601 # join_mapped_segments(params,
2602 # sam_header_filename,
2603 # reads,
2604 # ref_fasta,
2605 # ["/dev/null"],
2606 # ["/dev/null"],
2607 # ["/dev/null"],
2608 # [unspliced_map],
2609 # [],
2610 # unspliced_sam)
2611 seg_maps = []
2612 unmapped_segs = []
2613 segs = []
2614 if num_segs > 1:
2615 # split up the IUM reads into segments
2616 # unmapped_reads can be in BAM format
2617 read_segments = split_reads(unmapped_reads,
2618 tmp_dir + fbasename,
2619 False,
2620 params,
2621 segment_len)
2622
2623 # Map each segment file independently with Bowtie
2624 for i in range(len(read_segments)):
2625 seg = read_segments[i]
2626 fbasename=getFileBaseName(seg)
2627 seg_out = tmp_dir + fbasename + ".bwtout"
2628 if use_zpacker: seg_out += ".z"
2629 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
2630 if use_BWT_FIFO:
2631 unmapped_seg += ".z"
2632 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2633 (seg_map, unmapped) = bowtie_segment(params,
2634 bwt_idx_prefix,
2635 seg,
2636 "fastq",
2637 seg_out,
2638 unmapped_seg,
2639 extra_output)
2640 seg_maps.append(seg_map)
2641 unmapped_segs.append(unmapped)
2642 segs.append(seg)
2643
2644 # Collect the segment maps for left and right reads together
2645 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
2646 else:
2647 # if there's only one segment, just collect the initial map as the only
2648 # map to be used downstream for coverage-based junction discovery
2649 read_segments = [reads]
2650 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
2651
2652 # XXX: At this point if using M2G, have three sets of reads:
2653 # mapped to transcriptome, mapped to genome, and unmapped (potentially
2654 # spliced or poly-A tails) - hp
2655
2656 # Unless the user asked not to discover new junctions, start that process
2657 # here
2658
2659 left_reads_map = maps[0].unspliced_sam
2660 left_seg_maps = maps[0].seg_maps
2661 unmapped_reads = maps[0].unmapped_segs
2662 if right_reads:
2663 right_reads_map = maps[1].unspliced_sam
2664 right_seg_maps = maps[1].seg_maps
2665 unmapped_reads.extend(maps[1].unmapped_segs)
2666 else:
2667 right_reads_map = None
2668 right_seg_maps = None
2669
2670 if params.find_novel_juncs: # or params.find_novel_indels:
2671
2672 # Call segment_juncs to infer a list of possible splice junctions from
2673 # the regions of the genome covered in the initial and segment maps
2674 #if params.find_novel_juncs:
2675 #TODO: in m2g case, we might want to pass the m2g mappings as well,
2676 # or perhaps the GTF file directly
2677 # -> this could improve alternative junction detection?
2678 juncs = junctions_from_segments(params,
2679 left_reads,
2680 left_reads_map,
2681 left_seg_maps,
2682 right_reads,
2683 right_reads_map,
2684 right_seg_maps,
2685 unmapped_reads,
2686 "fastq",
2687 ref_fasta)
2688 if os.path.getsize(juncs[0]) != 0:
2689 possible_juncs.append(juncs[0])
2690 if params.find_novel_indels:
2691 if os.path.getsize(juncs[1]) != 0:
2692 possible_insertions.append(juncs[1])
2693 if os.path.getsize(juncs[2]) != 0:
2694 possible_deletions.append(juncs[2])
2695 # Optionally, and for paired reads only, use a closure search to
2696 # discover additional junctions
2697 if params.closure_search and left_reads and right_reads:
2698 juncs = junctions_from_closures(params,
2699 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
2700 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
2701 ref_fasta)
2702 if os.path.getsize(juncs[0]) != 0:
2703 possible_juncs.extend(juncs)
2704
2705 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2706 spliced_seg_maps = None
2707 junc_idx_prefix = None
2708 else:
2709 junc_idx_prefix = "segment_juncs"
2710 if len(possible_insertions) == 0:
2711 possible_insertions.append(os.devnull)
2712 # print >> sys.stderr, "Warning: insertions database is empty!"
2713 if len(possible_deletions) == 0:
2714 possible_deletions.append(os.devnull)
2715 # print >> sys.stderr, "Warning: deletions database is empty!"
2716 if len(possible_juncs) == 0:
2717 possible_juncs.append(os.devnull)
2718 print >> sys.stderr, "Warning: junction database is empty!"
2719 if junc_idx_prefix:
2720 build_juncs_index(3,
2721 #segment_len,
2722 max_seg_len,
2723 junc_idx_prefix,
2724 possible_juncs,
2725 possible_insertions,
2726 possible_deletions,
2727 ref_fasta,
2728 params.read_params.color)
2729
2730 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2731 # index with Bowtie
2732 # for reads in [left_reads, right_reads]:
2733 for ri in (0,1):
2734 reads = initial_reads[ri]
2735 spliced_seg_maps = []
2736
2737 if reads is None or os.path.getsize(reads) < 25:
2738 continue
2739
2740 m2g_map = m2g_maps[ri]
2741
2742 if junc_idx_prefix :
2743 i = 0
2744 for seg in maps[ri].segs:
2745 #search each segment
2746 fsegname = getFileBaseName(seg)
2747 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
2748 if use_zpacker: seg_out += ".z"
2749 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
2750 (seg_map, unmapped) = bowtie_segment(params,
2751 tmp_dir + junc_idx_prefix,
2752 seg,
2753 "fastq",
2754 seg_out,
2755 None,
2756 extra_output)
2757 spliced_seg_maps.append(seg_map)
2758 i += 1
2759
2760 # Join the contiguous and spliced segment hits into full-length
2761 # read alignments
2762 rfname=getFileBaseName(reads)
2763 rfdir=getFileDir(reads)
2764 mapped_reads = rfdir + rfname + ".candidates.bam"
2765 # -- spliced mappings built from all segment mappings vs genome and junc_db
2766 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2767 #unspl_bwtfile = maps[ri].unspliced_bwt
2768 unspl_samfile = maps[ri].unspliced_sam
2769 join_mapped_segments(params,
2770 sam_header_filename,
2771 reads,
2772 ref_fasta,
2773 possible_juncs,
2774 possible_insertions,
2775 possible_deletions,
2776 maps[ri].seg_maps,
2777 spliced_seg_maps,
2778 mapped_reads)
2779
2780 if num_segs > 1 or m2g_map:
2781 # Merge the spliced and unspliced full length alignments into
2782 # a single SAM file.
2783 # The individual SAM files are all already sorted in
2784 # increasing read ID order.
2785 # NOTE: We also should be able to address bug #134 here, by replacing
2786 # contiguous alignments that poke a small distance into an intron with
2787 # the correct spliced alignment.
2788 try:
2789 merge_cmd = [ prog_path("bam_merge"), merged_map, mapped_reads ]
2790 if num_segs > 1:
2791 merge_cmd += [ maps[ri].unspliced_sam ]
2792 if m2g_map:
2793 merge_cmd += [ m2g_map ]
2794 print >> run_log, " ".join(merge_cmd)
2795 ret = subprocess.call( merge_cmd,
2796 stderr=open(logging_dir + "sam_merge.log", "w") )
2797 if ret != 0:
2798 die(fail_str+"Error executing: "+" ".join(merge_cmd))
2799 except OSError, o:
2800 die(fail_str+"Error: "+str(o))
2801 maps[ri] = [merged_map]
2802
2803 if not params.system_params.keep_tmp:
2804 os.remove(mapped_reads)
2805 else:
2806 # no segments or transcriptome mappings, so no merge is needed
2807 # because join_mapped_segments() produced the final BAM already
2808 os.rename(mapped_reads, merged_map)
2809 maps[ri] = [merged_map]
2810 if not params.system_params.keep_tmp:
2811 os.remove(unspl_samfile)
2812 #for ri
2813 return maps
2814
2815 def die(msg=None):
2816 if msg is not None:
2817 print >> sys.stderr, msg
2818 sys.exit(1)
2819
2820 # rough equivalent of the 'which' command to find external programs
2821 # (current script path is tested first, then PATH envvar)
2822 def which(program):
2823 def is_executable(fpath):
2824 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
2825 fpath, fname = os.path.split(program)
2826 if fpath:
2827 if is_executable(program):
2828 return program
2829 else:
2830 progpath = os.path.join(bin_dir, program)
2831 if is_executable(progpath):
2832 return progpath
2833 for path in os.environ["PATH"].split(os.pathsep):
2834 progpath = os.path.join(path, program)
2835 if is_executable(progpath):
2836 return progpath
2837 return None
2838
2839 def prog_path(program):
2840 progpath=which(program)
2841 if progpath is None:
2842 die("Error locating program: "+program)
2843 return progpath
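# Typical use (illustrative paths):
#   which("bowtie")     -> e.g. "/usr/local/bin/bowtie", or None if not found
#   prog_path("bowtie") -> the same path, but die()s if the program is missing
# (the script's own directory is checked before the PATH entries)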
2844
2845 # FIXME: this should get set during the make dist autotools phase of the build
2846 def get_version():
2847 return "1.4.0"
2848
2849 def mlog(msg):
2850 print >> sys.stderr, "[DBGLOG]:"+msg
2851
2852 def test_input_file(filename):
2853 try:
2854 test_file = open(filename, "r")
2855 # input file could not be opened
2856 except IOError, o:
2857 die("Error opening file %s: %s" % (filename, o))
2858 return
2859
2860
2861 def main(argv=None):
2862 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2863
2864 # Initialize default parameter values
2865 params = TopHatParams()
2866
2867 try:
2868 if argv is None:
2869 argv = sys.argv
2870 args = params.parse_options(argv)
2871 params.check()
2872
2873 bwt_idx_prefix = args[0]
2874 left_reads_list = args[1]
2875 left_quals_list, right_quals_list = [], []
2876 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2877 if params.read_params.mate_inner_dist is None:
2878 params.read_params.mate_inner_dist = 50
2879 #die("Error: you must set the mean inner distance between mates with -r")
2880
2881 right_reads_list = args[2]
2882 if params.read_params.quals:
2883 left_quals_list = args[3]
2884 right_quals_list = args[4]
2885 else:
2886 right_reads_list = None
2887 if params.read_params.quals:
2888 left_quals_list = args[2]
2889
2890 print >> sys.stderr
2891 th_log("Beginning TopHat run (v"+get_version()+")")
2892 print >> sys.stderr, "-----------------------------------------------"
2893
2894 start_time = datetime.now()
2895 prepare_output_dir()
2896
2897 global run_log
2898 run_log = open(logging_dir + "run.log", "w", 0)
2899 global run_cmd
2900 run_cmd = " ".join(argv)
2901 print >> run_log, run_cmd
2902
2903 # Validate all the input files, check all prereqs before committing
2904 # to the run
2905 if params.transcriptome_index:
2906 check_bowtie_index(params.transcriptome_index)
2907 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2908
2909 check_bowtie()
2910 check_samtools()
2911 th_log("Generating SAM header for "+bwt_idx_prefix)
2912 sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2913
2914 #if not params.skip_check_reads:
2915 reads_list = left_reads_list
2916 if right_reads_list:
2917 reads_list = reads_list + "," + right_reads_list
2918 params.read_params = check_reads_format(params, reads_list)
2919
2920 user_supplied_juncs = []
2921 user_supplied_insertions = []
2922 user_supplied_deletions = []
2923 global gtf_juncs
2924 if params.gff_annotation and params.find_GFF_juncs:
2925 test_input_file(params.gff_annotation)
2926 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2927 ##-- we don't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
2928 ## but we still need them loaded in gtf_juncs for later splice verification
2929 #if found_juncs and not params.gff_annotation:
2930 # user_supplied_juncs.append(gtf_juncs)
2931 #else:
2932 # gtf_juncs = None
2933 if params.raw_junctions:
2934 test_input_file(params.raw_junctions)
2935 user_supplied_juncs.append(params.raw_junctions)
2936
2937 if params.raw_insertions:
2938 test_input_file(params.raw_insertions)
2939 user_supplied_insertions.append(params.raw_insertions)
2940
2941 if params.raw_deletions:
2942 test_input_file(params.raw_deletions)
2943 user_supplied_deletions.append(params.raw_deletions)
2944
2945 global unmapped_reads_fifo
2946 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2947
2948 # Now start the time-consuming stuff
2949 if params.prefilter_multi:
2950 sides=("left","right")
2951 read_lists=(left_reads_list, right_reads_list)
2952 for ri in (0,1):
2953 reads_list=read_lists[ri]
2954 if not reads_list:
2955 continue
2956 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
2957 side_imap = tmp_dir + sides[ri]+"_im.bwtout"
2958 if use_zpacker:
2959 side_imap+=".z"
2960 side_ium = tmp_dir + sides[ri]+"_ium.fq"
2961 if use_BWT_FIFO:
2962 side_ium += ".z"
2963 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
2964 rdlist=reads_list.split(',')
2965 bwt=bowtie(params, bwt_idx_prefix, rdlist,
2966 params.read_params.reads_format,
2967 params.initial_read_mismatches,
2968 side_imap, side_ium,
2969 "", False, params.preflt_data[ri].multihit_reads)
2970 params.preflt_data[ri].mappings = bwt[0] # initial mappings
2971 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
2972
2973 th_log("Preparing reads")
2974 left_kept_reads, left_reads_info = prep_reads(params,
2975 left_reads_list,
2976 left_quals_list,
2977 "left",
2978 params.preflt_data[0].multihit_reads)
2979 min_read_len=left_reads_info.min_len
2980 max_read_len=left_reads_info.max_len
2981 if right_reads_list:
2982 right_kept_reads, right_reads_info = prep_reads(params,
2983 right_reads_list,
2984 right_quals_list,
2985 "right",
2986 params.preflt_data[1].multihit_reads)
2987 min_read_len=min(right_reads_info.min_len, min_read_len)
2988 max_read_len=max(right_reads_info.max_len, max_read_len)
2989 else:
2990 right_kept_reads = None
2991 seed_len=params.read_params.seed_length
2992 if seed_len: #if read len was explicitly given
2993 seed_len = max(seed_len, min_read_len)
2994 #can't be smaller than minimum length observed
2995 else:
2996 seed_len = max_read_len
2997 params.read_params.seed_length=seed_len
2998 # turn off integer-quals
2999 if params.read_params.integer_quals:
3000 params.read_params.integer_quals = False
3001 input_reads = [left_kept_reads, right_kept_reads]
3002 mappings = spliced_alignment(params,
3003 bwt_idx_prefix,
3004 sam_header_filename,
3005 ref_fasta,
3006 params.read_params.seed_length,
3007 params.segment_length,
3008 input_reads,
3009 user_supplied_juncs,
3010 user_supplied_insertions,
3011 user_supplied_deletions)
3012
3013 compile_reports(params,
3014 sam_header_filename,
3015 mappings,
3016 input_reads,
3017 params.gff_annotation)
3018
3019 if not params.system_params.keep_tmp:
3020 for m in mappings[0]:
3021 os.remove(m)
3022 if left_kept_reads:
3023 os.remove(left_kept_reads)
3024 for m in mappings[1]:
3025 os.remove(m)
3026 if right_kept_reads:
3027 os.remove(right_kept_reads)
3028 tmp_files = os.listdir(tmp_dir)
3029 for t in tmp_files:
3030 os.remove(tmp_dir+t)
3031 os.rmdir(tmp_dir)
3032
3033 finish_time = datetime.now()
3034 duration = finish_time - start_time
3035 print >> sys.stderr,"-----------------------------------------------"
3036 print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
3037
3038 except Usage, err:
3039 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
3040 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
3041 return 2
3042
3043
3044 if __name__ == "__main__":
3045 sys.exit(main())
