#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                     [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir            <string>   [ default: ./tophat_out ]
    -a/--min-anchor            <int>      [ default: 8            ]
    -m/--splice-mismatches     <0-2>      [ default: 0            ]
    -i/--min-intron-length     <int>      [ default: 50           ]
    -I/--max-intron-length     <int>      [ default: 500000       ]
    -g/--max-multihits         <int>      [ default: 20           ]
    -F/--min-isoform-fraction  <float>    [ default: 0.15         ]
    --max-insertion-length     <int>      [ default: 3            ]
    --max-deletion-length      <int>      [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals          (same as phred64-quals)
    --phred64-quals            (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                 (SOLiD - color space)
    --color-out
    --library-type             <string>   (fr-unstranded, fr-firststrand,
                                           fr-secondstrand)
    -p/--num-threads           <int>      [ default: 1            ]
    -G/--GTF                   <filename>
    -j/--raw-juncs             <filename>
    --insertions               <filename>
    --deletions                <filename>
    -r/--mate-inner-dist       <int>
    --mate-std-dev             <int>      [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                  <dirname>  [ default: <output_dir>/tmp ]
    -z/--zpacker               <program>  [ default: gzip         ]
    -X/--unmapped-fifo         [use mkfifo to compress more temporary
                                files for color space reads]

Advanced Options:
    --initial-read-mismatches  <int>      [ default: 2            ]
    --segment-mismatches       <int>      [ default: 2            ]
    --segment-length           <int>      [ default: 25           ]
    --bowtie-n                 [ default: bowtie -v              ]
    --min-closure-exon         <int>      [ default: 100          ]
    --min-closure-intron       <int>      [ default: 50           ]
    --max-closure-intron       <int>      [ default: 5000         ]
    --min-coverage-intron      <int>      [ default: 50           ]
    --max-coverage-intron      <int>      [ default: 20000        ]
    --min-segment-intron       <int>      [ default: 50           ]
    --max-segment-intron       <int>      [ default: 500000       ]
    --no-sort-bam              [Output BAM is not coordinate-sorted]
    --no-convert-bam           [Do not convert to BAM format.
                                Output is <output_dir>/accepted_hits.sam.
                                Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                    <string>   (read group ID)
    --rg-sample                <string>   (sample ID)
    --rg-library               <string>   (library ID)
    --rg-description           <string>   (descriptive string, no tabs allowed)
    --rg-platform-unit         <string>   (e.g. Illumina lane ID)
    --rg-center                <string>   (sequencing center name)
    --rg-date                  <string>   (ISO 8601 date of the sequencing run)
    --rg-platform              <string>   (sequencing platform descriptor)
'''
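
# Example invocations (index and read file names below are hypothetical,
# shown for illustration only):
#   tophat -p 4 -o tophat_out hg19 reads_1.fq reads_2.fq
#   tophat --solexa1.3-quals -G genes.gtf -r 200 hg19 reads.fq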

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads; handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True, and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is used to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0, 1, 2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor-len must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_cpus > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p' + str(self.num_cpus))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_cpus > 1:
                cmdline.extend(['-p' + str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program " + self.zipper)

    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,       # min_anchor
                                                         50,      # min_intron
                                                         500000,  # max_intron
                                                         0,       # splice_mismatches
                                                         0.15)    # min_isoform_frac

        self.read_params = self.ReadParams(False,    # solexa_quals
                                           False,    # phred64_quals
                                           False,    # quals
                                           None,     # integer quals
                                           False,    # SOLiD - color space
                                           False,    # SOLiD - color out instead of base pair,
                                           "",       # library type (e.g. "illumina-stranded-pair-end")
                                           None,     # seed_length
                                           "fastq",  # quality_format
                                           None,     # mate inner distance
                                           20,       # mate inner dist std dev
                                           None,     # read group id
                                           None,     # sample id
                                           None,     # library id
                                           None,     # description
                                           None,     # platform unit (i.e. lane)
                                           None,     # sequencing center
                                           None,     # run date
                                           None)     # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_cpus)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: " + ', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be equal to or greater than the segment length (" + str(self.segment_length) + ")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be equal to or greater than the min intron length (" + str(self.splice_constraints.min_intron_length) + ")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


def getFileDir(filepath):
    # if given a full path, returns the path including the ending /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > len(fext):
            return fbase
        else:
            return fname
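
# Illustrative results (hypothetical file names):
#   getFileBaseName("sample.fq")     -> "sample"
#   getFileBaseName("sample.fq.gz")  -> "sample"
#   getFileBaseName("sample.fasta")  -> "sample"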

# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str + "Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str + "Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null', 'w')) # must be opened for writing to discard stderr

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1: # skip command-line records; find() returns -1 when "CL" is absent
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x, y: cmp(x[0], y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)
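
# The stub header assembled above looks roughly like this (sequence names
# and lengths come from the Bowtie index; @RG appears only when --rg-id and
# --rg-sample were given; the values shown are illustrative):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:rg1  SM:sample1
#   @SQ  SN:chr1  LN:247249719
#   @PG  ID:TopHat  VN:...  CL:...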

# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path = prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    elif bowtie_version < [0, 12, 3, 0]: # lexicographic comparison of version fields
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1, 2, 3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]: # lexicographic comparison of [major, minor, point]
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str


class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine file type
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow a maximum of 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if not self.lastline: # readline() returned '' (EOF) before any record was found
            die("Error: cannot determine record type in input file %s" % fname)
        if self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returns the tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for " + seqid + ")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
                # loop until '+' is found
            seq_len = len(seqstr)
            # at least one line of quality data should follow
            qstrlen = 0
            # now read the next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of the seq string
                # loop until qv has the same length as seq
            if self.isColor and qstr[0] == "!":
                # some color quality strings have a '!' character at the beginning, which should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen + self.isColor:
                raise ValueError("Length mismatch between sequence and quality strings " + \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file " + self.fname + ":\n " + str(err))
        # return the record
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returns the tuple: (seqID, sequence_string, seq_len, None)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to the readline function of f
        line = fline() # this will use the buffer line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # more sequence lines may follow, or the '>' marker of the next record:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                # loop until '>' is found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file " + self.fname + "\n " + str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return the previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from the stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader
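
# Sketch of typical use (hypothetical file name):
#   fr = FastxReader(open("reads.fq"), is_color=0, fname="reads.fq")
#   seqid, seqstr, seq_len, qstr = fr.nextRecord()
# nextRecord() is bound to nextFastq or nextFasta depending on whether the
# first record marker is '@' or '>'; at EOF it returns empty fields.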

class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        s = filename.lower()
        if s.endswith(".bam"):
            pipecmd = ["bam2fastx", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd = ['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd = ['bzip2']
                if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
                if len(pipecmd) > 0:
                    if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: # not guessing, but we must still be sure it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd = [sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                pipecmd += ['-cd']
        if pipecmd:
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdin # the client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None
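
# Sketch of how these two wrappers pair up (hypothetical file name; with
# use_zpacker enabled the data is piped through sysparams.zipper, e.g. gzip):
#   zw = ZWriter("reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close() # close() waits for the compressor to flush its output
#   zr = ZReader("reads.fq.z", params.system_params)
#   first_line = zr.file.readline()
#   zr.close()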

# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    while size > 0 and fbytes + block * 1024 > 0:
        if (fbytes + block * 1024 > 0):
            ## Seek back once more, if possible
            f.seek(block * 1024, 2)
        else:
            # Seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block * 1024 > 0):
        f.seek(block * 1024, 2)
    else:
        f.seek(0, 0)
    f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "\n".join(lastBlocks[-lines:])
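
# For example, log_tail(logging_dir + "prep_reads.log", 5) returns roughly
# the last 5 lines of that log; it is used below to surface the tail of a
# failed tool's log file in TopHat's own error message.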

# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
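
# For example, formatTD(datetime(2011, 1, 1, 13, 2, 5) - datetime(2011, 1, 1, 12, 0, 0))
# returns '01:02:05'; only td.seconds is used, so whole days are dropped.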

class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception, e:
            die(fail_str + "Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places"

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low-complexity or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file = output_dir + output_name + ".info"
    filter_cmd += ["--aux-outfile=" + info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    # finally, add the compression pipe
    zip_cmd = []
    if use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    shell_cmd += ' >' + kept_reads_filename
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stdout=kept_reads)
            filter_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str + "Error running 'prep_reads'\n" + log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg + "\n" + log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)
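
# With use_zpacker enabled, the pipeline assembled above is roughly
# (hypothetical paths):
#   prep_reads --fastq ... reads_1.fq | gzip -c - > <tmp_dir>/left_kept_reads.fq.z
# with prep_reads' stderr captured in <logging_dir>/prep_reads.log.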

# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads = None, # not needed vs segment_juncs
           extra_output = ""):
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    readfile_basename = getFileBaseName(reads_list)
    print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
                         readfile_basename, bwt_idx_name, extra_output)

    bwt_log = open(logging_dir + 'bowtie.' + readfile_basename + '.fixmap.log', "w")
    unmapped_reads_bwt = None
    if unmapped_reads: # unmapped_reads may be None when mapping against segment_juncs
        unmapped_reads_bwt = unmapped_reads + ".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO: # this should always be False unless color reads are used with the -X option
        global unmapped_reads_fifo
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str + "Error at mkfifo(" + unmapped_reads_fifo + '). ' + str(o))
        unmapped_reads_bwt = unmapped_reads_fifo

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        bam_input = 0
        unzip_cmd = None
        if reads_list.endswith('.bam'):
            bam_input = 1
            unzip_cmd = [prog_path('bam2fastx')]
            if reads_format:
                unzip_cmd.append("--" + reads_format)
            unzip_cmd += [reads_list]

        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            bowtie_cmd += ["--un", unmapped_reads_bwt]
            # "--max", "/dev/null"]
        if use_zpacker and (unzip_cmd is None):
            unzip_cmd = [params.system_params.zipper]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd += ['-cd']

        fifo_pid = None
        if use_FIFO:
            unm_zipcmd = [params.system_params.zipper]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd += ['-c']
            unmapped_reads += ".fq.z"
            print >> run_log, ' '.join(unm_zipcmd) + ' < ' + unmapped_reads_fifo + ' > ' + unmapped_reads + ' & '
            fifo_pid = os.fork()
            if fifo_pid == 0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads, "wb"))
                os._exit(os.EX_OK)

        bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                       "-p", str(params.system_params.num_cpus),
                       "-k", str(params.max_hits),
                       "-m", str(params.max_hits),
                       "--max", "/dev/null",
                       "-S", "--sam-nohead"] # always use a headerless SAM file
        fix_map_cmd = [prog_path('fix_map_ordering')]
        sam_zipcmd = None
        if unmapped_reads:
            # mapping on the genome
            # fix_map_ordering will write the BAM file directly, and unmapped_reads as a BAM file too
            mapped_reads += ".bam"
            if params.read_params.color:
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
            else:
                unmapped_reads += ".bam"
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads]
        else:
            # mapping on segment_juncs (spliced potential junctions)
            # don't use SAM headers, could be slow -- better compress SAM output directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                sam_zipcmd = [params.system_params.zipper]
                sam_zipcmd.extend(params.system_params.zipper_opts)
                sam_zipcmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        bowtie_cmd += [bwt_idx_prefix]
        bowtie_proc = None
        shellcmd = ""
        unzip_proc = None
        mapzip_proc = None
        if unzip_cmd:
            # could be bam2fastx
            if bam_input:
                unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                shellcmd = ' '.join(unzip_cmd) + "|"
            else:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_list, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd = ' '.join(unzip_cmd) + "< " + reads_list + "|"
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
        else:
            bowtie_cmd += [reads_list]
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
        shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
        if sam_zipcmd:
            shellcmd += "|" + " ".join(sam_zipcmd) + " > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close()
            mapzip_proc = subprocess.Popen(sam_zipcmd,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            bowtie_proc.stdout.close()

        print >> run_log, shellcmd
        if mapzip_proc:
            mapzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            if fifo_pid and not os.path.exists(unmapped_reads):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass
    except OSError, o:
        die(fail_str + "Error: " + str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    return (mapped_reads, unmapped_reads)
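
# For a genome mapping with compressed reads, the pipeline assembled above
# is roughly (hypothetical names; the exact flags depend on the parameters):
#   gzip -cd < left_kept_reads.fq.z | bowtie -v 2 -p 4 -k 20 -m 20 \
#     --max /dev/null -S --sam-nohead --un unmapped_left.fq <bwt_idx> - \
#     | fix_map_ordering --sam-header stub_header.sam - mapped_left.bam unmapped_left.bam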

def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_list,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):

    backup_bowtie_alignment_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    result = bowtie(params,
                    bwt_idx_prefix,
                    reads_list,
                    reads_format,
                    params.segment_mismatches,
                    mapped_reads,
                    unmapped_reads,
                    extra_output)

    params.bowtie_alignment_option = backup_bowtie_alignment_option
    params.max_hits /= 2
    return result

1615 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1616 def get_gtf_juncs(gff_annotation):
1617 print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1618 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1619
1620 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1621
1622 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1623 gtf_juncs_out = open(gtf_juncs_out_name, "w")
1624
1625 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1626 try:
1627 print >> run_log, " ".join(gtf_juncs_cmd)
1628 retcode = subprocess.call(gtf_juncs_cmd,
1629 stderr=gtf_juncs_log,
1630 stdout=gtf_juncs_out)
1631 # gtf_juncs returned an error
1632 if retcode == 1:
1633 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1634 return (False, gtf_juncs_out_name)
1635 elif retcode != 0:
1636 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1637 # gtf_juncs not found
1638 except OSError, o:
1639 errmsg=fail_str+str(o)+"\n"
1640 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1641 errmsg+="Error: gtf_juncs not found on this system"
1642 die(errmsg)
1643 return (True, gtf_juncs_out_name)
1644
1645 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
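# A sketch of the command assembled below (the prefix path is illustrative):
#   bowtie-build [-C] <tmp_dir>/segment_juncs.fa <tmp_dir>/segment_juncs
# -C is appended only when indexing colorspace (SOLiD) sequences.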
1646 def build_juncs_bwt_index(external_splice_prefix, color):
1647 print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1648 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1649
1650 #user_splices_out_prefix = output_dir + "user_splices_idx"
1651
1652 bowtie_build_cmd = [prog_path("bowtie-build")]
1653 if color:
1654 bowtie_build_cmd += ["-C"]
1655
1656 bowtie_build_cmd += [external_splice_prefix + ".fa",
1657 external_splice_prefix]
1658 try:
1659 print >> run_log, " ".join(bowtie_build_cmd)
1660 retcode = subprocess.call(bowtie_build_cmd,
1661 stdout=bowtie_build_log)
1662
1663 if retcode != 0:
1664 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1665 except OSError, o:
1666 errmsg=fail_str+str(o)+"\n"
1667 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1668 errmsg+="Error: bowtie-build not found on this system"
1669 die(errmsg)
1670 return external_splice_prefix
1671
1672 # Build a splice index from a .juncs file, suitable for use with specified read
1673 # (or read segment) lengths
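# Sketch of the juncs_db call built below (positional arguments; file
# names are illustrative):
#   juncs_db <min_anchor> <read_len> a.juncs[,b.juncs] ins[,...] del[,...] ref.fa > <prefix>.fa
# The resulting FASTA is then indexed by build_juncs_bwt_index().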
1674 def build_juncs_index(min_anchor_length,
1675 read_length,
1676 juncs_prefix,
1677 external_juncs,
1678 external_insertions,
1679 external_deletions,
1680 reference_fasta,
1681 color):
1682 print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1683
1684 juncs_file_list = ",".join(external_juncs)
1685 insertions_file_list = ",".join(external_insertions)
1686 deletions_file_list = ",".join(external_deletions)
1687
1688
1689 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1690
1691 external_splices_out_prefix = tmp_dir + juncs_prefix
1692 external_splices_out_name = external_splices_out_prefix + ".fa"
1693
1694 external_splices_out = open(external_splices_out_name, "w")
1695 # juncs_db_cmd = [bin_dir + "juncs_db",
1696 juncs_db_cmd = [prog_path("juncs_db"),
1697 str(min_anchor_length),
1698 str(read_length),
1699 juncs_file_list,
1700 insertions_file_list,
1701 deletions_file_list,
1702 reference_fasta]
1703 try:
1704 print >> run_log, " ".join(juncs_db_cmd)
1705 retcode = subprocess.call(juncs_db_cmd,
1706 stderr=juncs_db_log,
1707 stdout=external_splices_out)
1708
1709 if retcode != 0:
1710 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1711 # juncs_db not found
1712 except OSError, o:
1713 errmsg=fail_str+str(o)+"\n"
1714 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1715 errmsg+="Error: juncs_db not found on this system"
1716 die(errmsg)
1717
1718 external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1719 return external_splices_out_prefix
1720
1721 # Print out the SAM header, embedding the user's specified library properties.
1722 # FIXME: also needs SQ dictionary lines
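# With --rg-id and --rg-sample given, the emitted header looks roughly
# like this (tag values are placeholders; optional tags appear in the
# order they are tested below):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:run1  SM:sampleA  [LB:... DS:... PU:... CN:... PI:... DT:... PL:...]
#   @PG  ID:TopHat  VN:1.3.2  CL:<full command line>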
1723 def write_sam_header(read_params, sam_file):
1724 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1725 if read_params.read_group_id and read_params.sample_id:
1726 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1727 read_params.sample_id)
1728 if read_params.library_id:
1729 rg_str += "\tLB:%s" % read_params.library_id
1730 if read_params.description:
1731 rg_str += "\tDS:%s" % read_params.description
1732 if read_params.seq_platform_unit:
1733 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1734 if read_params.seq_center:
1735 rg_str += "\tCN:%s" % read_params.seq_center
1736 if read_params.mate_inner_dist:
1737 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1738 if read_params.seq_run_date:
1739 rg_str += "\tDT:%s" % read_params.seq_run_date
1740 if read_params.seq_platform:
1741 rg_str += "\tPL:%s" % read_params.seq_platform
1742
1743 print >> sam_file, rg_str
1744 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1745
1746 # Write final TopHat output, via tophat_reports and wiggles
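# In the default convert+sort mode this function effectively runs the
# pipeline below (options elided; names as used in the code):
#   tophat_reports ... junctions.bed insertions.bed deletions.bed - ... | samtools sort - accepted_hits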
1747 def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
1748 print >> sys.stderr, "[%s] Reporting output tracks" % right_now()
1749
1750 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1751 left_maps = ','.join(left_maps)
1752
1753 if len(right_maps) > 0:
1754 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1755 right_maps = ','.join(right_maps)
1756 log_fname = logging_dir + "reports.log"
1757 report_log = open(log_fname, "w")
1758 junctions = output_dir + "junctions.bed"
1759 insertions = output_dir + "insertions.bed"
1760 deletions = output_dir + "deletions.bed"
1761 coverage = "coverage.wig"
1762 accepted_hits = output_dir + "accepted_hits"
1763 report_cmdpath = prog_path("tophat_reports")
1764 report_cmd = [report_cmdpath]
1765 report_cmd.extend(params.cmd())
1766 report_cmd.extend(["--samtools="+samtools_path])
1767 report_cmd.extend([junctions,
1768 insertions,
1769 deletions,
1770 "-",
1771 left_maps,
1772 left_reads])
1773 if len(right_maps) > 0 and right_reads != None:
1774 report_cmd.append(right_maps)
1775 report_cmd.append(right_reads)
1776 # -- tophat_reports now produces (uncompressed) BAM stream,
1777 # directly piped into samtools sort
1778 try:
1779 if params.report_params.convert_bam:
1780 if params.report_params.sort_bam:
1781 report_proc=subprocess.Popen(report_cmd,
1782 preexec_fn=subprocess_setup,
1783 stdout=subprocess.PIPE,
1784 stderr=report_log)
1785
1786 bamsort_cmd = ["samtools", "sort", "-", accepted_hits]
1787 bamsort_proc= subprocess.Popen(bamsort_cmd,
1788 preexec_fn=subprocess_setup,
1789 stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1790 stdin=report_proc.stdout)
1791 report_proc.stdout.close()
1792 print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1793 bamsort_proc.communicate()
1794 retcode = report_proc.poll()
1795 if retcode:
1796 die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1797 else:
1798 print >> run_log, " ".join(report_cmd)
1799 report_proc=subprocess.call(report_cmd,
1800 preexec_fn=subprocess_setup,
1801 stdout=open(accepted_hits,"wb"),
1802 stderr=report_log)
1803 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1804 else:
1805 print >> run_log, " ".join(report_cmd)
1806 report_proc=subprocess.call(report_cmd,
1807 preexec_fn=subprocess_setup,
1808 stdout=open(accepted_hits,"wb"),
1809 stderr=report_log)
1810 tmp_sam = output_dir + "accepted_hits.sam"
1811
1812 bam_to_sam_cmd = ["samtools", "view", "-h", accepted_hits]
1813 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1814 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1815 tmp_sam_file = open(tmp_sam, "w")
1816 ret = subprocess.call(bam_to_sam_cmd,
1817 stdout=tmp_sam_file,
1818 stderr=bam_to_sam_log)
1819 tmp_sam_file.close()
1820 os.remove(accepted_hits)
1821 #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1822 #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1823
1824 except OSError, o:
1825 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1826
1827 # FIXME: put wiggles back!
1828 # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1829 # print >> run_log, " ".join(wig_cmd)
1830 # subprocess.call(wig_cmd,
1831 # stderr=open("/dev/null"))
1832 return (coverage, junctions)
1833
1834
1835 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1836 # for each segment. This function needs to be fixed to support mixed read length
1837 # inputs
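# Worked example (hypothetical input): a 75bp read split with
# segment_length=25 yields offsets [0, 25, 50, 75] and three output
# files <prefix>_seg1.fq[.z] .. <prefix>_seg3.fq[.z].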
1838
1839 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1840 i = num_files_prev + 1
1841 while i <= num_files:
1842 segfname=prefix+("_seg%d" % i)+extension
1843 out_segf.append(ZWriter(segfname,params.system_params))
1844 i += 1
1845
1846 def split_reads(reads_filename,
1847 prefix,
1848 fasta,
1849 params,
1850 segment_length):
1851 #reads_file = open(reads_filename)
1852 zreads = ZReader(reads_filename, params.system_params, False)
1853 out_segfiles = []
1854
1855 if fasta:
1856 extension = ".fa"
1857 else:
1858 extension = ".fq"
1859 if use_zpacker: extension += ".z"
1860 def convert_color_to_bp(color_seq):
1861 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
1862 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
1863 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
1864 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
1865 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
1866 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
1867
1868 base = color_seq[0]
1869 bp_seq = base
1870 for ch in color_seq[1:]:
1871 base = decode_dic[base+ch]
1872 bp_seq += base
1873 return bp_seq
1874
1875 def convert_bp_to_color(bp_seq):
1876 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
1877 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
1878 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
1879 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
1880 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
1881 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
1882 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
1883 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
1884 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
1885
1886 base = bp_seq[0]
1887 color_seq = base
1888 for ch in bp_seq[1:]:
1889 color_seq += encode_dic[base + ch]
1890 base = ch
1891
1892 return color_seq
1893
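# Round-trip sketch for the two converters above (the leading "A" is the
# colorspace primer base):
#   convert_color_to_bp("A0123") -> "AACTA"
#   convert_bp_to_color("AACTA") -> "A0123"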
1894 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
1895 if color:
1896 color_offset = 1
1897 read_seq_temp = convert_color_to_bp(read_seq)
1898
1899 seg_num = 1
1900 while seg_num + 1 < len(offsets):
1901 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
1902 return
1903 seg_num += 1
1904 else:
1905 color_offset = 0
1906
1907 seg_num = 0
1908 last_seq_offset = 0
1909 while seg_num + 1 < len(offsets):
1910 f = out_segf[seg_num].file
1911 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
1912 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
1913 if color:
1914 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
1915 else:
1916 print >> f, seg_seq
1917 if not fasta:
1918 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
1919 print >> f, "+"
1920 print >> f, seg_qual
1921 seg_num += 1
1922 last_seq_offset = offsets[seg_num]
1923
1924 line_state = 0
1925 read_name = ""
1926 read_seq = ""
1927 read_qual = ""
1928 num_segments = 0
1929 offsets = []
1930 for line in zreads.file:
1931 if line.strip() == "":
1932 continue
1933 if line_state == 0:
1934 read_name = line.strip()
1935 elif line_state == 1:
1936 read_seq = line.strip()
1937
1938 read_length = len(read_seq)
1939 tmp_num_segments = read_length / segment_length
1940 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
1941
1942 # Bowtie's minimum read length here is 20bp, so if the last segment
1943 # is between 20 and segment_length bp long, go ahead and write it out
1944 if read_length % segment_length >= 20:
1945 offsets.append(read_length)
1946 tmp_num_segments += 1
1947 else:
1948 offsets[-1] = read_length
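# e.g. for a hypothetical 60bp read with segment_length=25:
# 60 % 25 == 10 < 20, so the 10bp remainder is folded into the last
# segment and the read is split 25 + 35.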
1949
1950 if tmp_num_segments == 1:
1951 offsets = [0, read_length]
1952
1953 if tmp_num_segments > num_segments:
1954 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
1955 num_segments = tmp_num_segments
1956
1957 if fasta:
1958 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
1959 elif line_state == 2:
1960 line = line.strip()
1961 else:
1962 read_quals = line.strip()
1963 if not fasta:
1964 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
1965
1966 line_state += 1
1967 if fasta:
1968 line_state %= 2
1969 else:
1970 line_state %= 4
1971 zreads.close()
1972 out_fnames=[]
1973 for zf in out_segfiles:
1974 zf.close()
1975 out_fnames.append(zf.fname)
1976 #return [o.fname for o in out_segfiles]
1977 return out_fnames
1978
1979 # Find possible splice junctions using the "closure search" strategy, and report
1980 # them in closures.juncs. Calls the executable closure_juncs
1981 def junctions_from_closures(params,
1982 left_maps,
1983 right_maps,
1984 ref_fasta):
1985 #print >> sys.stderr, "[%s] " % right_now()
1986 print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1987
1988 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1989 #if len(maps) == 0:
1990 # return None
1991 #print >> sys.stderr, left_maps
1992 slash = left_maps[0].rfind('/')
1993 juncs_out = ""
1994 if slash != -1:
1995 juncs_out += left_maps[0][:slash+1]
1996 juncs_out += "closure.juncs"
1997
1998 juncs_log = open(logging_dir + "closure.log", "w")
1999 juncs_cmdpath=prog_path("closure_juncs")
2000 juncs_cmd = [juncs_cmdpath]
2001
2002 left_maps = ','.join(left_maps)
2003 right_maps = ','.join(right_maps)
2004
2005 juncs_cmd.extend(params.cmd())
2006 juncs_cmd.extend([juncs_out,
2007 ref_fasta,
2008 left_maps,
2009 right_maps])
2010 try:
2011 print >> run_log, ' '.join(juncs_cmd)
2012 retcode = subprocess.call(juncs_cmd,
2013 stderr=juncs_log)
2014
2015 # closure_juncs returned an error
2016 if retcode != 0:
2017 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2018 # closure_juncs not found
2019 except OSError, o:
2020 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2021 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2022 die(str(o))
2023 return [juncs_out]
2024
2025 # Find possible junctions by examining coverage and split segments in the initial
2026 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2027 # segment.insertions, and segment.deletions. Calls the executable
2028 # segment_juncs
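# Sketch of the command assembled below (file names illustrative; the
# right-side triple is appended only for paired-end runs):
#   segment_juncs ... --ium-reads unmapped1[,unmapped2] ref.fa \
#       segment.juncs segment.insertions segment.deletions \
#       left_reads left_reads.map seg1.map[,seg2.map] [right_reads right_reads.map segs...]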
2029 def junctions_from_segments(params,
2030 left_reads,
2031 left_reads_map,
2032 left_seg_maps,
2033 right_reads,
2034 right_reads_map,
2035 right_seg_maps,
2036 unmapped_reads,
2037 reads_format,
2038 ref_fasta):
2039 if left_reads_map != left_seg_maps[0]:
2040 print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2041 out_path=getFileDir(left_seg_maps[0])
2042 juncs_out=out_path+"segment.juncs"
2043 insertions_out=out_path+"segment.insertions"
2044 deletions_out=out_path+"segment.deletions"
2045
2046 left_maps = ','.join(left_seg_maps)
2047 align_log = open(logging_dir + "segment_juncs.log", "w")
2048 align_cmd = [prog_path("segment_juncs")]
2049
2050 align_cmd.extend(params.cmd())
2051
2052 align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2053 ref_fasta,
2054 juncs_out,
2055 insertions_out,
2056 deletions_out,
2057 left_reads,
2058 left_reads_map,
2059 left_maps])
2060 if right_seg_maps != None:
2061 right_maps = ','.join(right_seg_maps)
2062 align_cmd.extend([right_reads, right_reads_map, right_maps])
2063 try:
2064 print >> run_log, " ".join(align_cmd)
2065 retcode = subprocess.call(align_cmd,
2066 preexec_fn=subprocess_setup,
2067 stderr=align_log)
2068
2069 # segment_juncs returned an error
2070 if retcode != 0:
2071 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2072 # segment_juncs not found
2073 except OSError, o:
2074 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2075 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2076 die(str(o))
2077
2078 return [juncs_out, insertions_out, deletions_out]
2079
2080 # Joins mapped segments into full-length read alignments via the executable
2081 # long_spanning_reads
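# Sketch of the invocation built below (positional arguments; names are
# illustrative):
#   long_spanning_reads ... ref.fa reads juncs[,...] ins[,...] del[,...] \
#       seg1.map[,...] [spliced_seg1.map[,...]] > alignments_out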
2082 def join_mapped_segments(params,
2083 sam_header_filename,
2084 reads,
2085 ref_fasta,
2086 possible_juncs,
2087 possible_insertions,
2088 possible_deletions,
2089 contig_seg_maps,
2090 spliced_seg_maps,
2091 alignments_out_name):
2092 if len(contig_seg_maps)>1:
2093 print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2094 else:
2095 print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2096 contig_seg_maps = ','.join(contig_seg_maps)
2097
2098 possible_juncs = ','.join(possible_juncs)
2099 possible_insertions = ",".join(possible_insertions)
2100 possible_deletions = ",".join(possible_deletions)
2101
2102 align_log = open(logging_dir + "long_spanning_reads.log", "w")
2103 align_cmd = [prog_path("long_spanning_reads")]
2104
2105 align_cmd.extend(params.cmd())
2106
2107 align_cmd.append(ref_fasta)
2108 align_cmd.extend([ reads,
2109 possible_juncs,
2110 possible_insertions,
2111 possible_deletions,
2112 contig_seg_maps])
2113
2114 if spliced_seg_maps != None:
2115 spliced_seg_maps = ','.join(spliced_seg_maps)
2116 align_cmd.append(spliced_seg_maps)
2117
2118 try:
2119 print >> run_log, " ".join(align_cmd),">",alignments_out_name
2120 join_proc=subprocess.Popen(align_cmd,
2121 preexec_fn=subprocess_setup,
2122 stdout=open(alignments_out_name, "wb"),
2123 stderr=align_log)
2124 join_proc.communicate()
2125 except OSError, o:
2126 die(fail_str+"Error: "+str(o))
2127
2128
2129 # This class collects spliced and unspliced alignments for each of the
2130 # left and right read files provided by the user.
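# e.g. for a two-segment run, maps[left_reads] might hold (hypothetical
# file names):
#   Maps("left_kept_reads.mapped",
#        ["left_kept_reads_seg1.mapped", "left_kept_reads_seg2.mapped"],
#        ["left_kept_reads_seg1_unmapped", "left_kept_reads_seg2_unmapped"],
#        ["left_kept_reads_seg1.fq", "left_kept_reads_seg2.fq"])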
2131 class Maps:
2132 def __init__(self,
2133 # unspliced_bwt,
2134 unspliced_sam,
2135 seg_maps,
2136 unmapped_segs,
2137 segs):
2138 #self.unspliced_bwt = unspliced_bwt
2139 self.unspliced_sam = unspliced_sam
2140 self.seg_maps = seg_maps
2141 self.unmapped_segs = unmapped_segs
2142 self.segs = segs
2143
2144 # The main alignment routine of TopHat. This function executes most of the
2145 # workflow, producing a set of candidate alignments for each cDNA fragment
2146 # in a pair of SAM alignment files (for paired-end reads).
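# Roughly: (1) map full-length reads with Bowtie; (2) split the unmapped
# (IUM) reads into segments and map those; (3) infer junctions and indels
# from the segment maps (plus an optional closure search for paired ends);
# (4) build a splice index and map the segments against it; (5) stitch
# segment hits into full-length alignments and merge with unspliced hits.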
2147 def spliced_alignment(params,
2148 bwt_idx_prefix,
2149 sam_header_filename,
2150 ref_fasta,
2151 read_len,
2152 segment_len,
2153 left_reads,
2154 right_reads,
2155 user_supplied_junctions,
2156 user_supplied_insertions,
2157 user_supplied_deletions):
2158
2159 possible_juncs = []
2160 possible_juncs.extend(user_supplied_junctions)
2161
2162 possible_insertions = []
2163 possible_insertions.extend(user_supplied_insertions)
2164
2165 possible_deletions = []
2166 possible_deletions.extend(user_supplied_deletions)
2167
2168 maps = { left_reads : [], right_reads : [] }
2169 #single_segments = False
2170
2171 # Perform the first part of the TopHat work flow on the left and right
2172 # reads of paired ends separately - we'll use the pairing information later
2173 for reads in [left_reads, right_reads]:
2174 if reads == None or os.path.getsize(reads) < 25:
2175 continue
2176 fbasename=getFileBaseName(reads)
2177 unspliced_out = tmp_dir + fbasename + ".mapped"
2178 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2179
2180 num_segs = read_len / segment_len
2181 if read_len % segment_len >= min(segment_len -2, 20):
2182 num_segs += 1
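# e.g. read_len=75 with segment_len=25 gives num_segs=3, while a 60bp
# read leaves a 10bp remainder, below min(segment_len-2, 20), so
# num_segs stays 2.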
2183
2184 # daehwan - check this out
2185 if num_segs <= 1:
2186 print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2187
2188 # Perform the initial Bowtie mapping of the full length reads
2189 (unspliced_sam, unmapped_reads) = bowtie(params,
2190 bwt_idx_prefix,
2191 reads,
2192 "fastq",
2193 params.initial_read_mismatches,
2194 unspliced_out,
2195 unmapped_unspliced)
2196
2197 # Convert the initial Bowtie maps into BAM format.
2198 # skip this step as fix_map_ordering does it now
2199 #join_mapped_segments(params,
2200 # sam_header_filename,
2201 # reads,
2202 # ref_fasta,
2203 # ["/dev/null"],
2204 # ["/dev/null"],
2205 # ["/dev/null"],
2206 # [unspliced_map],
2207 # [],
2208 # unspliced_sam)
2209 # Using the num_segs value returned by check_reads(), decide which
2210 # junction discovery strategy to use
2211 if num_segs == 1:
2212 segment_len = read_len
2213 elif num_segs >= 3:
2214 # if we have at least three segments, just use split segment search,
2215 # which is the most sensitive and specific, fastest, and lightest-weight.
2216 if not params.closure_search:
2217 params.closure_search = False
2218 if not params.coverage_search:
2219 params.coverage_search = False
2220 if not params.butterfly_search:
2221 params.butterfly_search = False
2222 if not params.closure_search:
2223 params.closure_search = False
2224 seg_maps = []
2225 unmapped_segs = []
2226 segs = []
2227 if num_segs > 1:
2228 # split up the IUM reads into segments
2229 # unmapped_reads can be in BAM format
2230 read_segments = split_reads(unmapped_reads,
2231 tmp_dir + fbasename,
2232 False,
2233 params,
2234 segment_len)
2235
2236 # Map each segment file independently with Bowtie
2237 for i in range(len(read_segments)):
2238 seg = read_segments[i]
2239 fbasename=getFileBaseName(seg)
2240 seg_out = tmp_dir + fbasename + ".mapped"
2241 #if use_zpacker: seg_out += ".z"
2242 unmapped_seg = tmp_dir + fbasename + "_unmapped"
2243 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2244 (seg_map, unmapped) = bowtie_segment(params,
2245 bwt_idx_prefix,
2246 seg,
2247 "fastq",
2248 seg_out,
2249 unmapped_seg,
2250 extra_output)
2251 seg_maps.append(seg_map)
2252 unmapped_segs.append(unmapped)
2253 segs.append(seg)
2254
2255 # Collect the segment maps for left and right reads together
2256 #maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
2257 maps[reads] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
2258 else:
2259 # if there's only one segment, just collect the initial map as the only
2260 # map to be used downstream for coverage-based junction discovery
2261 read_segments = [reads]
2262 #maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_reads], [unmapped_reads])
2263 maps[reads] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
2264
2265 # Unless the user asked not to discover new junctions, start that process
2266 # here
2267
2268 if params.find_novel_juncs or params.find_novel_indels:
2269 #left_reads_map = maps[left_reads].unspliced_bwt
2270 left_reads_map = maps[left_reads].unspliced_sam
2271 left_seg_maps = maps[left_reads].seg_maps
2272 unmapped_reads = maps[left_reads].unmapped_segs
2273 if right_reads != None:
2274 #right_reads_map = maps[right_reads].unspliced_bwt
2275 right_reads_map = maps[right_reads].unspliced_sam
2276 right_seg_maps = maps[right_reads].seg_maps
2277 unmapped_reads.extend(maps[right_reads].unmapped_segs)
2278 else:
2279 right_reads_map = None
2280 right_seg_maps = None
2281
2282 # Call segment_juncs to infer a list of possible splice junctions from
2283 # the regions of the genome covered in the initial and segment maps
2284 juncs = junctions_from_segments(params,
2285 left_reads,
2286 left_reads_map,
2287 left_seg_maps,
2288 right_reads,
2289 right_reads_map,
2290 right_seg_maps,
2291 unmapped_reads,
2292 "fastq",
2293 ref_fasta)
2294 if params.find_novel_juncs:
2295 if os.path.getsize(juncs[0]) != 0:
2296 possible_juncs.append(juncs[0])
2297 if params.find_novel_indels:
2298 if os.path.getsize(juncs[1]) != 0:
2299 possible_insertions.append(juncs[1])
2300 if os.path.getsize(juncs[2]) != 0:
2301 possible_deletions.append(juncs[2])
2302 # Optionally, and for paired reads only, use a closure search to
2303 # discover additional junctions
2304 if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
2305 juncs = junctions_from_closures(params,
2306 # [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
2307 # [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
2308 [maps[left_reads].unspliced_sam, maps[left_reads].seg_maps[-1]],
2309 [maps[right_reads].unspliced_sam, maps[right_reads].seg_maps[-1]],
2310 ref_fasta)
2311 if os.path.getsize(juncs[0]) != 0:
2312 possible_juncs.extend(juncs)
2313
2314 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2315 spliced_seg_maps = None
2316 junc_idx_prefix = None
2317 else:
2318 junc_idx_prefix = "segment_juncs"
2319 if len(possible_insertions) == 0:
2320 possible_insertions.append(os.devnull)
2321 # print >> sys.stderr, "Warning: insertions database is empty!"
2322 if len(possible_deletions) == 0:
2323 possible_deletions.append(os.devnull)
2324 # print >> sys.stderr, "Warning: deletions database is empty!"
2325 if len(possible_juncs) == 0:
2326 possible_juncs.append(os.devnull)
2327 print >> sys.stderr, "Warning: junction database is empty!"
2328 if junc_idx_prefix != None:
2329 build_juncs_index(3,
2330 segment_len,
2331 junc_idx_prefix,
2332 possible_juncs,
2333 possible_insertions,
2334 possible_deletions,
2335 ref_fasta,
2336 params.read_params.color)
2337
2338 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2339 # index with Bowtie
2340 for reads in [left_reads, right_reads]:
2341 spliced_seg_maps = []
2342 if reads == None or reads not in maps:
2343 continue
2344
2345 if junc_idx_prefix != None:
2346 i = 0
2347 for seg in maps[reads].segs:
2348 fsegname = getFileBaseName(seg)
2349 seg_out = tmp_dir + fsegname + ".to_spliced"
2350 #if use_zpacker: seg_out += ".z"
2351 extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
2352 (seg_map, unmapped) = bowtie_segment(params,
2353 tmp_dir + junc_idx_prefix,
2354 seg,
2355 "fastq",
2356 seg_out,
2357 None,
2358 extra_output)
2359 spliced_seg_maps.append(seg_map)
2360 i += 1
2361
2362 # Join the contiguous and spliced segment hits into full-length read
2363 # alignments or convert a spliced segment hit into a genomic-coordinate hit
2364 rfname = getFileBaseName(reads)
2365 rfdir = getFileDir(reads)
2366 mapped_reads = rfdir + rfname + ".candidates.bam"
2367 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2368 # unspl_bwtfile = maps[reads].unspliced_bwt
2369 unspl_samfile = maps[reads].unspliced_sam
2370
2371 join_mapped_segments(params,
2372 sam_header_filename,
2373 reads,
2374 ref_fasta,
2375 possible_juncs,
2376 possible_insertions,
2377 possible_deletions,
2378 maps[reads].seg_maps,
2379 spliced_seg_maps,
2380 mapped_reads)
2381
2382 if num_segs > 1:
2383 # Merge the spliced and unspliced full length alignments into a
2384 # single SAM file.
2385 # The individual SAM files are all already sorted in
2386 # increasing read ID order.
2387 # NOTE: We also should be able to address bug #134 here, by replacing
2388 # contiguous alignments that poke into an intron by a small amount by
2389 # the correct spliced alignment.
2390
2391 try:
2392 merge_cmd = [ prog_path("bam_merge"), merged_map,
2393 maps[reads].unspliced_sam,
2394 mapped_reads ]
2395 print >> run_log, " ".join(merge_cmd)
2396 ret = subprocess.call( merge_cmd,
2397 stderr=open(logging_dir + "sam_merge.log", "w") )
2398 if ret != 0:
2399 die(fail_str+"Error executing: "+" ".join(merge_cmd))
2400 except OSError, o:
2401 die(fail_str+"Error: "+str(o))
2402 maps[reads] = [merged_map]
2403
2404 if not params.system_params.keep_tmp:
2405 os.remove(mapped_reads)
2406 os.remove(unspl_samfile)
2407 #os.remove(unspl_bwtfile)
2408 else:
2409 os.rename(mapped_reads, merged_map)
2410 maps[reads] = [merged_map]
2411 if not params.system_params.keep_tmp:
2412 os.remove(unspl_samfile)
2413 #os.remove(unspl_bwtfile)
2414
2415 return maps
2416
2417 def die(msg=None):
2418 if msg is not None:
2419 print >> sys.stderr, msg
2420 sys.exit(1)
2421
2422 # rough equivalent of the 'which' command to find external programs
2423 # (current script path is tested first, then PATH envvar)
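# e.g. which("bowtie") might return "<bin_dir>/bowtie" or
# "/usr/bin/bowtie" (illustrative paths), or None when not found.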
2424 def which(program):
2425 def is_executable(fpath):
2426 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
2427 fpath, fname = os.path.split(program)
2428 if fpath:
2429 if is_executable(program):
2430 return program
2431 else:
2432 progpath = os.path.join(bin_dir, program)
2433 if is_executable(progpath):
2434 return progpath
2435 for path in os.environ["PATH"].split(os.pathsep):
2436 progpath = os.path.join(path, program)
2437 if is_executable(progpath):
2438 return progpath
2439 return None
2440
2441 def prog_path(program):
2442 progpath=which(program)
2443 if progpath == None:
2444 die("Error locating program: "+program)
2445 return progpath
2446
2447 # FIXME: this should get set during the make dist autotools phase of the build
2448 def get_version():
2449 return "1.3.2"
2450
2451 def mlog(msg):
2452 print >> sys.stderr, "[DBGLOG]:"+msg
2453
2454 def test_input_file(filename):
2455 try:
2456 test_file = open(filename, "r")
2457 # input file could not be opened
2458 except IOError, o:
2459 die("Error: Cannot open file %s" % filename)
2460 return
2461
2462
2463 def main(argv=None):
2464 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2465
2466 # Initialize default parameter values
2467 params = TopHatParams()
2468
2469 try:
2470 if argv is None:
2471 argv = sys.argv
2472 args = params.parse_options(argv)
2473 params.check()
2474
2475 bwt_idx_prefix = args[0]
2476 left_reads_list = args[1]
2477 left_quals_list, right_quals_list = [], []
2478 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2479 if params.read_params.mate_inner_dist == None:
2480 die("Error: you must set the mean inner distance between mates with -r")
2481
2482 right_reads_list = args[2]
2483 if params.read_params.quals:
2484 left_quals_list = args[3]
2485 right_quals_list = args[4]
2486 else:
2487 right_reads_list = None
2488 if params.read_params.quals:
2489 left_quals_list = args[2]
2490
2491 print >> sys.stderr
2492 print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
2493 print >> sys.stderr, "-----------------------------------------------"
2494
2495 start_time = datetime.now()
2496 prepare_output_dir()
2497
2498 global run_log
2499 run_log = open(logging_dir + "run.log", "w", 0)
2500 global run_cmd
2501 run_cmd = " ".join(argv)
2502 print >> run_log, run_cmd
2503
2504 # Validate all the input files, check all prereqs before committing
2505 # to the run
2506 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2507
2508 check_bowtie()
2509 check_samtools()
2510 print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
2511 sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2512 print >> sys.stderr, "[%s] Preparing reads" % (right_now())
2513
2514 if not params.skip_check_reads:
2515 reads_list = left_reads_list
2516 if right_reads_list:
2517 reads_list = reads_list + "," + right_reads_list
2518 params.read_params = check_reads_format(params, reads_list)
2519
2520 user_supplied_juncs = []
2521 user_supplied_insertions = []
2522 user_supplied_deletions = []
2523
2524 if params.gff_annotation and params.find_GFF_juncs:
2525 test_input_file(params.gff_annotation)
2526 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2527 if found_juncs:
2528 user_supplied_juncs.append(gtf_juncs)
2529 if params.raw_junctions:
2530 test_input_file(params.raw_junctions)
2531 user_supplied_juncs.append(params.raw_junctions)
2532
2533 if params.raw_insertions:
2534 test_input_file(params.raw_insertions)
2535 user_supplied_insertions.append(params.raw_insertions)
2536
2537 if params.raw_deletions:
2538 test_input_file(params.raw_deletions)
2539 user_supplied_deletions.append(params.raw_deletions)
2540
2541 global unmapped_reads_fifo
2542 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2543
2544 # Now start the time-consuming stuff
2545 left_kept_reads, left_reads_info = prep_reads(params,
2546 left_reads_list,
2547 left_quals_list,
2548 "left_kept_reads", "Left ")
2549 min_read_len=left_reads_info.min_len
2550 max_read_len=left_reads_info.max_len
2551 if right_reads_list != None:
2552 right_kept_reads, right_reads_info = prep_reads(params,
2553 right_reads_list,
2554 right_quals_list,
2555 "right_kept_reads", "Right")
2556 min_read_len=min(right_reads_info.min_len, min_read_len)
2557 max_read_len=max(right_reads_info.max_len, max_read_len)
2558 else:
2559 right_kept_reads = None
2560 seed_len=params.read_params.seed_length
2561 if seed_len != None:
2562 seed_len = max(seed_len, min_read_len)
2563 else:
2564 seed_len = max_read_len
2565 params.read_params.seed_length=seed_len
2566 # turn off integer-quals
2567 if params.read_params.integer_quals:
2568 params.read_params.integer_quals = False
2569
2570 mapping = spliced_alignment(params,
2571 bwt_idx_prefix,
2572 sam_header_filename,
2573 ref_fasta,
2574 params.read_params.seed_length,
2575 params.segment_length,
2576 left_kept_reads,
2577 right_kept_reads,
2578 user_supplied_juncs,
2579 user_supplied_insertions,
2580 user_supplied_deletions)
2581
2582 left_maps = mapping[left_kept_reads]
2583 #left_unmapped_reads = mapping[1]
2584 right_maps = mapping[right_kept_reads]
2585 #right_unmapped_reads = mapping[3]
2586
2587 compile_reports(params,
2588 sam_header_filename,
2589 left_maps,
2590 left_kept_reads,
2591 right_maps,
2592 right_kept_reads,
2593 params.gff_annotation)
2594
2595 if not params.system_params.keep_tmp:
2596 for m in left_maps:
2597 os.remove(m)
2598 if left_kept_reads != None:
2599 os.remove(left_kept_reads)
2600 for m in right_maps:
2601 os.remove(m)
2602 if right_kept_reads != None:
2603 os.remove(right_kept_reads)
2604 tmp_files = os.listdir(tmp_dir)
2605 for t in tmp_files:
2606 os.remove(tmp_dir+t)
2607 os.rmdir(tmp_dir)
2608
2609 finish_time = datetime.now()
2610 duration = finish_time - start_time
2611 print >> sys.stderr,"-----------------------------------------------"
2612 print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
2613
2614 except Usage, err:
2615 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
2616 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
2617 return 2
2618
2619
2620 if __name__ == "__main__":
2621 sys.exit(main())
