root/gclib/tophat_cpp/tophat
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC by gpertea
File size: 110617 byte(s)
Log Message:
adding tophat source work

#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>     [ default: ./tophat_out ]
    -a/--min-anchor                <int>        [ default: 8            ]
    -m/--splice-mismatches         <0-2>        [ default: 0            ]
    -i/--min-intron-length         <int>        [ default: 50           ]
    -I/--max-intron-length         <int>        [ default: 500000       ]
    -g/--max-multihits             <int>        [ default: 20           ]
    -F/--min-isoform-fraction      <float>      [ default: 0.15         ]
    --max-insertion-length         <int>        [ default: 3            ]
    --max-deletion-length          <int>        [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals              (same as phred64-quals)
    --phred64-quals                (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                     (SOLiD - color space)
    --color-out
    --library-type                 <string>     (fr-unstranded, fr-firststrand,
                                                 fr-secondstrand)
    -p/--num-threads               <int>        [ default: 1            ]
    -G/--GTF                       <filename>
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>        [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>    [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>    [ default: gzip         ]
    -X/--unmapped-fifo             [use mkfifo to compress more temporary
                                    files for color space reads]

Advanced Options:
    --initial-read-mismatches      <int>        [ default: 2            ]
    --segment-mismatches           <int>        [ default: 2            ]
    --segment-length               <int>        [ default: 25           ]
    --bowtie-n                     [ default: bowtie -v ]
    --min-closure-exon             <int>        [ default: 100          ]
    --min-closure-intron           <int>        [ default: 50           ]
    --max-closure-intron           <int>        [ default: 5000         ]
    --min-coverage-intron          <int>        [ default: 50           ]
    --max-coverage-intron          <int>        [ default: 20000        ]
    --min-segment-intron           <int>        [ default: 50           ]
    --max-segment-intron           <int>        [ default: 500000       ]
    --no-sort-bam                  [Output BAM is not coordinate-sorted]
    --no-convert-bam               [Do not convert to bam format.
                                    Output is <output_dir>/accepted_hits.sam.
                                    Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>     (read group ID)
    --rg-sample                    <string>     (sample ID)
    --rg-library                   <string>     (library ID)
    --rg-description               <string>     (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>     (e.g. Illumina lane ID)
    --rg-center                    <string>     (sequencing center name)
    --rg-date                      <string>     (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>     (sequencing platform descriptor)
'''

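# Illustrative invocation (not part of the script): a typical paired-end run,
# where "hg19_bwt_idx" is a hypothetical Bowtie index prefix and the FASTQ
# files are hypothetical reads:
#
#   tophat -p 4 -o my_sample_out -r 120 --mate-std-dev 30 \
#          hg19_bwt_idx sample_1.fq sample_2.fq
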
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads; handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is used to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command-line options to the executables run by the pipeline

# This class and its nested classes also do option parsing through parse_options()
# and option validation via the member function check()

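# A minimal sketch of how TopHatParams is driven (the index/read arguments are
# hypothetical; parse_options/check/cmd are the methods defined below):
#
#   params = TopHatParams()
#   args = params.parse_options(sys.argv)  # fills in all nested option groups
#   params.check()                         # validates the combined settings
#   common_flags = params.cmd()            # flags passed on to pipeline executables
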
class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor", "--min-anchor-length"):
                    # getopt registers the long form as "min-anchor-length=" (and returns
                    # that full name for the abbreviated "--min-anchor"), so accept both
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0, 1, 2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper = 'gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_cpus > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p' + str(self.num_cpus))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_cpus > 1:
                cmdline.extend(['-p' + str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program " + self.zipper)

    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            # note: these checks only reject negative values; the messages now match
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False

    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,       # min_anchor
                                                         50,      # min_intron
                                                         500000,  # max_intron
                                                         0,       # splice_mismatches
                                                         0.15)    # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_quals
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer_quals
                                           False,   # color (SOLiD - color space)
                                           False,   # color_out (SOLiD - color instead of base pairs)
                                           "",      # library_type (e.g. "fr-unstranded")
                                           None,    # seed_length
                                           "fastq", # reads_format
                                           None,    # mate_inner_dist
                                           20,      # mate_inner_dist_std_dev
                                           None,    # read_group_id
                                           None,    # sample_id
                                           None,    # library_id
                                           None,    # description
                                           None,    # seq_platform_unit (i.e. lane)
                                           None,    # seq_center
                                           None,    # seq_run_date
                                           None)    # seq_platform

        self.system_params = self.SystemParams(1,      # num_cpus (bowtie threads)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,     # min_closure_exon_length
                                               50,      # min_closure_intron_length
                                               5000,    # max_closure_intron_length
                                               50,      # min_coverage_intron_length
                                               20000,   # max_coverage_intron_length
                                               50,      # min_segment_intron_length
                                               500000)  # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: " + ', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        # str() is required in the messages below: these values are ints and
        # cannot be concatenated to strings directly
        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be equal to or greater than the segment length (" + str(self.segment_length) + ")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be equal to or greater than the min intron length (" + str(self.splice_constraints.min_intron_length) + ")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master option parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


def getFileDir(filepath):
    # if given a full path, returns the directory part including the ending /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > len(fext):
            return fbase
        else:
            return fname

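# Behavior sketch for getFileBaseName(), following the suffix rules above
# (paths are hypothetical):
#   getFileBaseName("/data/sample.fq")    -> "sample"
#   getFileBaseName("/data/sample.fq.gz") -> "sample"  (compression suffix peeled first)
#   getFileBaseName("/data/sample.fasta") -> "sample"  (matches the '.fa' prefix rule)
#   getFileBaseName("/data/README")       -> "README"
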
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str + "Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str + "Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null', 'w'))

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                # skip the @PG line emitted by bowtie (it carries a CL field)
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x, y: cmp(x[0], y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path = prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    elif bowtie_version[:3] < [0, 12, 3]:
        # lexicographic list comparison; the original chained '<' tests would
        # mis-classify versions like 0.12.2 as acceptable
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])

# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1, 2, 3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]:
        # lexicographic comparison; the original 'arr[1] < 1 or arr[2] < 7' test
        # would wrongly reject versions like 0.2.5
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str


class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine the file type;
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow at most 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returns the tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for " + seqid + ")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
                # loop until + found
            seq_len = len(seqstr)
            # at least one line of quality data should follow;
            qstrlen = 0
            # now read the next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of the seq string
                # loop until qv has the same length as seq
            if self.isColor and qstr[0] == "!":
                # some color quality strings have a '!' character at the beginning; it should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen + self.isColor:
                raise ValueError("Length mismatch between sequence and quality strings " + \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file " + self.fname + ":\n " + str(err))
        # return the record
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returns the tuple: (seqID, sequence_string, seq_len)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to the readline function of f
        line = fline() # this will use the buffered line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # more sequence lines may follow, or the '>' marker of the next record:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                # loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file " + self.fname + "\n " + str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return the previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from the stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader

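# Usage sketch for FastxReader (the uncompressed FASTQ file name is hypothetical):
#
#   fr = FastxReader(open("reads.fq"), is_color=0, fname="reads.fq")
#   seqid, seqstr, seq_len, qstr = fr.nextRecord()
#   # fr.format is 'fastq' or 'fasta' as detected from the first record marker;
#   # at end of file nextRecord() returns an empty seqid
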
class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        if guess:
            s = filename.lower()
            if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                pipecmd = ['gzip']
            else:
                if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                    pipecmd = ['bzip2']
            if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
            if len(pipecmd) > 0:
                if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                    pipecmd[0] = sysparams.zipper
                    pipecmd.extend(sysparams.zipper_opts)
                elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                    pipecmd[0] = sysparams.zipper
                    pipecmd.extend(sysparams.zipper_opts)
        else: # not guessing, but we must still be sure it's a compressed file
            if use_zpacker and filename.endswith(".z"):
                pipecmd = [sysparams.zipper]
                pipecmd.extend(sysparams.zipper_opts)

        if pipecmd:
            pipecmd += ['-cd']
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None

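# Usage sketch for the two classes above (file names are hypothetical; sysparams
# is a TopHatParams.SystemParams instance):
#
#   zin = ZReader("reads.fq.gz", sysparams)        # guesses gzip from the suffix
#   first = zin.file.readline()
#   zin.close()
#   zout = ZWriter(tmp_dir + "filtered.fq.z", sysparams)
#   zout.file.write(first)
#   zout.close()   # close() wait()s on the zipper so the output is fully flushed
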
# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads to be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    # NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    while size > 0 and fbytes + block * 1024 > 0:
        if (fbytes + block * 1024 > 0):
            # Seek back once more, if possible
            f.seek(block * 1024, 2)
        else:
            # Seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block * 1024 > 0):
        f.seek(block * 1024, 2)
    else:
        f.seek(0, 0)
    f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "\n".join(lastBlocks[-lines:])

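# Example of how log_tail() is used below when a pipeline step fails
# (this log file name is one actually written by prep_reads):
#
#   die(fail_str + "Error running 'prep_reads'\n" + log_tail(logging_dir + "prep_reads.log"))
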
# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)

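# Example: for a run lasting 1h 2m 5s, formatTD(finish - start) yields "01:02:05";
# per the FIXME above, whole days in the timedelta are ignored.
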
class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception, e:
            die(fail_str + "Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: reads shorter than 20bp will make TopHat quite slow and use a large amount of memory, because they are likely to map to too many places"

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
# --> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file = output_dir + output_name + ".info"
    filter_cmd += ["--aux-outfile=" + info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    # finally, add the compression pipe
    zip_cmd = []
    if use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    shell_cmd += ' >' + kept_reads_filename
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stdout=kept_reads)
            filter_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str + "Error running 'prep_reads'\n" + log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg + "\n" + log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)

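# Usage sketch (mirrors how the pipeline calls prep_reads; the read/qual list
# strings are hypothetical):
#
#   left_kept_reads, left_info = prep_reads(params,
#                                           "sample_1.fq", "sample_1.qual",
#                                           "left_kept_reads", "Left ")
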
# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads = None, # not needed vs segment_juncs
           extra_output = ""):
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    readfile_basename = getFileBaseName(reads_list)
    print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
                         readfile_basename, bwt_idx_name, extra_output)

    bwt_log = open(logging_dir + 'bowtie.' + readfile_basename + '.fixmap.log', "w")
    unmapped_reads_bwt = None
    if unmapped_reads: # guard: unmapped_reads may be None for segment mapping
        unmapped_reads_bwt = unmapped_reads + ".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO: # this should always be False unless color reads are used with the -X option
        global unmapped_reads_fifo
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str + "Error at mkfifo(" + unmapped_reads_fifo + '). ' + str(o))
        unmapped_reads_bwt = unmapped_reads_fifo

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        bam_input = 0
        unzip_cmd = None
        if reads_list.endswith('.bam'):
            bam_input = 1
            unzip_cmd = [prog_path('bam2fastx')]
            if reads_format:
                unzip_cmd.append("--" + reads_format)
            unzip_cmd += [reads_list]

        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            bowtie_cmd += ["--un", unmapped_reads_bwt]
            # "--max", "/dev/null"]
        if use_zpacker and (unzip_cmd is None):
            unzip_cmd = [params.system_params.zipper]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd += ['-cd']

        fifo_pid = None
        if use_FIFO:
            unm_zipcmd = [params.system_params.zipper]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd += ['-c']
            unmapped_reads += ".fq.z"
            print >> run_log, ' '.join(unm_zipcmd) + ' < ' + unmapped_reads_fifo + ' > ' + unmapped_reads + ' & '
            fifo_pid = os.fork()
            if fifo_pid == 0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads, "wb"))
                os._exit(os.EX_OK)

        bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                       "-p", str(params.system_params.num_cpus),
                       "-k", str(params.max_hits),
                       "-m", str(params.max_hits),
                       "--max", "/dev/null",
                       "--sam-nohead"]
        fix_map_cmd = [prog_path('fix_map_ordering')]
        sam_zipcmd = None
        if unmapped_reads:
            # mapping on the genome;
            # fix_map_ordering will write the BAM file directly, and unmapped_reads as a BAM file too
            mapped_reads += ".bam"
            if params.read_params.color:
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
            else:
                unmapped_reads += ".bam"
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads]
        else:
            # mapping on segment_juncs (spliced potential junctions);
            # don't use SAM headers, could be slow -- better compress SAM output directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                sam_zipcmd = [params.system_params.zipper]
                sam_zipcmd.extend(params.system_params.zipper_opts)
                sam_zipcmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        bowtie_cmd += [bwt_idx_prefix]
        bowtie_proc = None
        shellcmd = ""
        unzip_proc = None
        mapzip_proc = None
        if unzip_cmd:
            # could be bam2fastx
            if bam_input:
                unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                shellcmd = ' '.join(unzip_cmd) + "|"
            else:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_list, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd = ' '.join(unzip_cmd) + "< " + reads_list + "|"
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
        else:
            bowtie_cmd += [reads_list]
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
        shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
        if sam_zipcmd:
            shellcmd += "|" + " ".join(sam_zipcmd) + " > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close()
            mapzip_proc = subprocess.Popen(sam_zipcmd,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            bowtie_proc.stdout.close()

        print >> run_log, shellcmd
        if mapzip_proc:
            mapzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            if fifo_pid and not os.path.exists(unmapped_reads):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass
    except OSError, o:
        die(fail_str + "Error: " + str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    return (mapped_reads, unmapped_reads)

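# Usage sketch for the bowtie() wrapper above (index prefix and output names
# are hypothetical):
#
#   (mapped, unmapped) = bowtie(params, "hg19_bwt_idx", left_kept_reads,
#                               "fastq", params.initial_read_mismatches,
#                               tmp_dir + "left_map",
#                               tmp_dir + "left_unmapped")
#   # the returned names come back with .bam/.sam(.z) suffixes appended as needed
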
def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_list,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):

    backup_bowtie_alignment_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    result = bowtie(params,
                    bwt_idx_prefix,
                    reads_list,
                    reads_format,
                    params.segment_mismatches,
                    mapped_reads,
                    unmapped_reads,
                    extra_output)

    params.bowtie_alignment_option = backup_bowtie_alignment_option
    params.max_hits /= 2
    return result

1612 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1613 def get_gtf_juncs(gff_annotation):
1614 print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1615 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1616
1617 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1618
1619 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1620 gtf_juncs_out = open(gtf_juncs_out_name, "w")
1621
1622 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1623 try:
1624 print >> run_log, " ".join(gtf_juncs_cmd)
1625 retcode = subprocess.call(gtf_juncs_cmd,
1626 stderr=gtf_juncs_log,
1627 stdout=gtf_juncs_out)
1628 # cvg_islands returned an error
1629 if retcode == 1:
1630 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1631 return (False, gtf_juncs_out_name)
1632 elif retcode != 0:
1633 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1634 # gtf_juncs not found
1635 except OSError, o:
1636 errmsg=fail_str+str(o)+"\n"
1637 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1638 errmsg+="Error: gtf_juncs not found on this system"
1639 die(errmsg)
1640 return (True, gtf_juncs_out_name)
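# A note on the output (based on TopHat's documented raw junction format, an
# assumption not verified here): each line of the .juncs file written by
# gtf_juncs is expected to be tab-separated,
#   <chrom> <left> <right> <+/->
# with zero-based coordinates pointing at the last base of the upstream exon
# and the first base of the downstream exon.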
1641
1642 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
1643 def build_juncs_bwt_index(external_splice_prefix, color):
1644 print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1645 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1646
1647 #user_splices_out_prefix = output_dir + "user_splices_idx"
1648
1649 bowtie_build_cmd = [prog_path("bowtie-build")]
1650 if color:
1651 bowtie_build_cmd += ["-C"]
1652
1653 bowtie_build_cmd += [external_splice_prefix + ".fa",
1654 external_splice_prefix]
1655 try:
1656 print >> run_log, " ".join(bowtie_build_cmd)
1657 retcode = subprocess.call(bowtie_build_cmd,
1658 stdout=bowtie_build_log)
1659
1660 if retcode != 0:
1661 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1662 except OSError, o:
1663 errmsg=fail_str+str(o)+"\n"
1664 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1665 errmsg+="Error: bowtie-build not found on this system"
1666 die(errmsg)
1667 return external_splice_prefix
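# For example (a sketch with a hypothetical prefix), with color space off this
# issues something like:
#   bowtie-build segment_juncs.fa segment_juncs
# (plus -C when in color space), producing the usual set of .ebwt index files
# next to the prefix.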
1668
1669 # Build a splice index from a .juncs file, suitable for use with specified read
1670 # (or read segment) lengths
1671 def build_juncs_index(min_anchor_length,
1672 read_length,
1673 juncs_prefix,
1674 external_juncs,
1675 external_insertions,
1676 external_deletions,
1677 reference_fasta,
1678 color):
1679 print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1680
1681 juncs_file_list = ",".join(external_juncs)
1682 insertions_file_list = ",".join(external_insertions)
1683 deletions_file_list = ",".join(external_deletions)
1684
1685
1686 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1687
1688 external_splices_out_prefix = tmp_dir + juncs_prefix
1689 external_splices_out_name = external_splices_out_prefix + ".fa"
1690
1691 external_splices_out = open(external_splices_out_name, "w")
1692 # juncs_db_cmd = [bin_dir + "juncs_db",
1693 juncs_db_cmd = [prog_path("juncs_db"),
1694 str(min_anchor_length),
1695 str(read_length),
1696 juncs_file_list,
1697 insertions_file_list,
1698 deletions_file_list,
1699 reference_fasta]
1700 try:
1701 print >> run_log, " ".join(juncs_db_cmd)
1702 retcode = subprocess.call(juncs_db_cmd,
1703 stderr=juncs_db_log,
1704 stdout=external_splices_out)
1705
1706 if retcode != 0:
1707 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1708 # juncs_db not found
1709 except OSError, o:
1710 errmsg=fail_str+str(o)+"\n"
1711 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1712 errmsg+="Error: juncs_db not found on this system"
1713 die(errmsg)
1714
1715 external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1716 return external_splices_out_prefix
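# The juncs_db invocation above has this shape (hypothetical names):
#   juncs_db <min_anchor> <read_len> <juncs1,juncs2,...> <insertions,...> \
#     <deletions,...> <ref.fa> > <prefix>.fa
# i.e. the FASTA of synthetic splice sequences arrives on stdout and is
# captured into external_splices_out before being indexed.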
1717
1718 # Print out the sam header, embedding the user's specified library properties.
1719 # FIXME: also needs SQ dictionary lines
1720 def write_sam_header(read_params, sam_file):
1721 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1722 if read_params.read_group_id and read_params.sample_id:
1723 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1724 read_params.sample_id)
1725 if read_params.library_id:
1726 rg_str += "\tLB:%s" % read_params.library_id
1727 if read_params.description:
1728 rg_str += "\tDS:%s" % read_params.description
1729 if read_params.seq_platform_unit:
1730 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1731 if read_params.seq_center:
1732 rg_str += "\tCN:%s" % read_params.seq_center
1733 if read_params.mate_inner_dist:
1734 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1735 if read_params.seq_run_date:
1736 rg_str += "\tDT:%s" % read_params.seq_run_date
1737 if read_params.seq_platform:
1738 rg_str += "\tPL:%s" % read_params.seq_platform
1739
1740 print >> sam_file, rg_str
1741 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
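# For example, running with --rg-id run1 --rg-sample s1 emits a header like
# (shown space-separated here; the actual output is tab-delimited):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:run1  SM:s1
#   @PG  ID:TopHat  VN:1.3.2  CL:<full command line>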
1742
1743 # Write final TopHat output, via tophat_reports and wiggles
1744 def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
1745 print >> sys.stderr, "[%s] Reporting output tracks" % right_now()
1746
1747 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1748 left_maps = ','.join(left_maps)
1749
1750 if len(right_maps) > 0:
1751 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1752 right_maps = ','.join(right_maps)
1753 log_fname = logging_dir + "reports.log"
1754 report_log = open(log_fname, "w")
1755 junctions = output_dir + "junctions.bed"
1756 insertions = output_dir + "insertions.bed"
1757 deletions = output_dir + "deletions.bed"
1758 coverage = "coverage.wig"
1759 accepted_hits = output_dir + "accepted_hits"
1760 report_cmdpath = prog_path("tophat_reports")
1761 report_cmd = [report_cmdpath]
1762 report_cmd.extend(params.cmd())
1763 report_cmd.extend(["--samtools="+samtools_path])
1764 report_cmd.extend([junctions,
1765 insertions,
1766 deletions,
1767 "-",
1768 left_maps,
1769 left_reads])
1770 if len(right_maps) > 0 and right_reads != None:
1771 report_cmd.append(right_maps)
1772 report_cmd.append(right_reads)
1773 # -- tophat_reports now produces (uncompressed) BAM stream,
1774 # directly piped into samtools sort
1775 try:
1776 if params.report_params.convert_bam:
1777 if params.report_params.sort_bam:
1778 report_proc=subprocess.Popen(report_cmd,
1779 preexec_fn=subprocess_setup,
1780 stdout=subprocess.PIPE,
1781 stderr=report_log)
1782
1783 bamsort_cmd = [samtools_path, "sort", "-", accepted_hits]
1784 bamsort_proc= subprocess.Popen(bamsort_cmd,
1785 preexec_fn=subprocess_setup,
1786 stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1787 stdin=report_proc.stdout)
1788 report_proc.stdout.close()
1789 print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1790 bamsort_proc.communicate()
1791 retcode = report_proc.poll()
1792 if retcode:
1793 die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1794 else:
1795 print >> run_log, " ".join(report_cmd)
1796 report_proc=subprocess.call(report_cmd,
1797 preexec_fn=subprocess_setup,
1798 stdout=open(accepted_hits,"wb"),
1799 stderr=report_log)
1800 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1801 else:
1802 print >> run_log, " ".join(report_cmd)
1803 report_proc=subprocess.call(report_cmd,
1804 preexec_fn=subprocess_setup,
1805 stdout=open(accepted_hits,"wb"),
1806 stderr=report_log)
1807 tmp_sam = output_dir + "accepted_hits.sam"
1808
1809 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
1810 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1811 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1812 tmp_sam_file = open(tmp_sam, "w")
1813 ret = subprocess.call(bam_to_sam_cmd,
1814 stdout=tmp_sam_file,
1815 stderr=bam_to_sam_log)
1816 tmp_sam_file.close()
1817 os.remove(accepted_hits)
1818 #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1819 #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1820
1821 except OSError, o:
1822 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1823
1824 # FIXME: put wiggles back!
1825 # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1826 # print >> run_log, " ".join(wig_cmd)
1827 # subprocess.call(wig_cmd,
1828 # stderr=open("/dev/null"))
1829 return (coverage, junctions)
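# When BAM conversion and sorting are both enabled, the logged pipeline is:
#   tophat_reports <params> junctions.bed insertions.bed deletions.bed - \
#     <left_maps> <left_reads> [<right_maps> <right_reads>] | samtools sort - accepted_hits
# leaving the coordinate-sorted alignments in accepted_hits.bam (samtools sort
# appends the .bam suffix to the given prefix).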
1830
1831
1832 # Split up each read in a FASTQ file into multiple segments, creating a FASTQ
1833 # file for each segment. This function needs to be fixed to support mixed
1834 # read-length inputs.
1835
1836 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1837 i = num_files_prev + 1
1838 while i <= num_files:
1839 segfname=prefix+("_seg%d" % i)+extension
1840 out_segf.append(ZWriter(segfname,params.system_params))
1841 i += 1
1842
1843 def split_reads(reads_filename,
1844 prefix,
1845 fasta,
1846 params,
1847 segment_length):
1848 #reads_file = open(reads_filename)
1849 zreads = ZReader(reads_filename, params.system_params, False)
1850 out_segfiles = []
1851
1852 if fasta:
1853 extension = ".fa"
1854 else:
1855 extension = ".fq"
1856 if use_zpacker: extension += ".z"
1857 def convert_color_to_bp(color_seq):
1858 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
1859 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
1860 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
1861 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
1862 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
1863 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
1864
1865 base = color_seq[0]
1866 bp_seq = base
1867 for ch in color_seq[1:]:
1868 base = decode_dic[base+ch]
1869 bp_seq += base
1870 return bp_seq
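# Worked example: convert_color_to_bp("T0123") keeps the primer base 'T' and
# decodes each color against the previous base (T0->T, T1->G, G2->A, A3->T),
# yielding "TTGAT".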
1871
1872 def convert_bp_to_color(bp_seq):
1873 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
1874 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
1875 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
1876 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
1877 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
1878 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
1879 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
1880 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
1881 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
1882
1883 base = bp_seq[0]
1884 color_seq = base
1885 for ch in bp_seq[1:]:
1886 color_seq += encode_dic[base + ch]
1887 base = ch
1888
1889 return color_seq
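# Worked example: convert_bp_to_color("ATCG") keeps the first base and encodes
# each transition (AT->3, TC->2, CG->3), yielding "A323".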
1890
1891 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
1892 if color:
1893 color_offset = 1
1894 read_seq_temp = convert_color_to_bp(read_seq)
1895
1896 seg_num = 1
1897 while seg_num + 1 < len(offsets):
1898 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
1899 return
1900 seg_num += 1
1901 else:
1902 color_offset = 0
1903
1904 seg_num = 0
1905 last_seq_offset = 0
1906 while seg_num + 1 < len(offsets):
1907 f = out_segf[seg_num].file
1908 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
1909 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
1910 if color:
1911 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
1912 else:
1913 print >> f, seg_seq
1914 if not fasta:
1915 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
1916 print >> f, "+"
1917 print >> f, seg_qual
1918 seg_num += 1
1919 last_seq_offset = offsets[seg_num]
1920
1921 line_state = 0
1922 read_name = ""
1923 read_seq = ""
1924 read_qual = ""
1925 num_segments = 0
1926 offsets = []
1927 for line in zreads.file:
1928 if line.strip() == "":
1929 continue
1930 if line_state == 0:
1931 read_name = line.strip()
1932 elif line_state == 1:
1933 read_seq = line.strip()
1934
1935 read_length = len(read_seq)
1936 tmp_num_segments = read_length / segment_length
1937 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
1938
1939 # Bowtie's minimum read length here is 20bp, so if the last segment
1940 # is between 20 and segment_length bp long, go ahead and write it out
1941 if read_length % segment_length >= 20:
1942 offsets.append(read_length)
1943 tmp_num_segments += 1
1944 else:
1945 offsets[-1] = read_length
1946
1947 if tmp_num_segments == 1:
1948 offsets = [0, read_length]
1949
1950 if tmp_num_segments > num_segments:
1951 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
1952 num_segments = tmp_num_segments
1953
1954 if fasta:
1955 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
1956 elif line_state == 2:
1957 line = line.strip()
1958 else:
1959 read_quals = line.strip()
1960 if not fasta:
1961 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
1962
1963 line_state += 1
1964 if fasta:
1965 line_state %= 2
1966 else:
1967 line_state %= 4
1968 zreads.close()
1969 out_fnames=[]
1970 for zf in out_segfiles:
1971 zf.close()
1972 out_fnames.append(zf.fname)
1973 #return [o.fname for o in out_segfiles]
1974 return out_fnames
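# Worked example of the splitting rule above (segment_length = 25):
#   a 75bp read gets offsets [0, 25, 50, 75] -> three 25bp segments;
#   a 60bp read has a 10bp remainder (< 20), so the last boundary is
#   stretched instead: offsets [0, 25, 60] -> segments of 25bp and 35bp.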
1975
1976 # Find possible splice junctions using the "closure search" strategy, and report
1977 # them in closures.juncs. Calls the executable closure_juncs
1978 def junctions_from_closures(params,
1979 left_maps,
1980 right_maps,
1981 ref_fasta):
1982 #print >> sys.stderr, "[%s] " % right_now()
1983 print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1984
1985 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1986 #if len(maps) == 0:
1987 # return None
1988 #print >> sys.stderr, left_maps
1989 slash = left_maps[0].rfind('/')
1990 juncs_out = ""
1991 if slash != -1:
1992 juncs_out += left_maps[0][:slash+1]
1993 juncs_out += "closure.juncs"
1994
1995 juncs_log = open(logging_dir + "closure.log", "w")
1996 juncs_cmdpath=prog_path("closure_juncs")
1997 juncs_cmd = [juncs_cmdpath]
1998
1999 left_maps = ','.join(left_maps)
2000 right_maps = ','.join(right_maps)
2001
2002 juncs_cmd.extend(params.cmd())
2003 juncs_cmd.extend([juncs_out,
2004 ref_fasta,
2005 left_maps,
2006 right_maps])
2007 try:
2008 print >> run_log, ' '.join(juncs_cmd)
2009 retcode = subprocess.call(juncs_cmd,
2010 stderr=juncs_log)
2011
2012 # closure_juncs returned an error
2013 if retcode != 0:
2014 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2015 # closure_juncs not found
2016 except OSError, o:
2017 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2018 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2019 die(str(o))
2020 return [juncs_out]
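# The logged command has this shape (hypothetical paths):
#   closure_juncs <params> closure.juncs <ref.fa> \
#     <left_bwt,left_last_seg_map> <right_bwt,right_last_seg_map>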
2021
2022 # Find possible junctions by examining coverage and split segments in the initial
2023 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2024 # segment.insertions, and segment.deletions. Calls the executable
2025 # segment_juncs
2026 def junctions_from_segments(params,
2027 left_reads,
2028 left_reads_map,
2029 left_seg_maps,
2030 right_reads,
2031 right_reads_map,
2032 right_seg_maps,
2033 unmapped_reads,
2034 reads_format,
2035 ref_fasta):
2036 if left_reads_map != left_seg_maps[0]:
2037 print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2038 out_path=getFileDir(left_seg_maps[0])
2039 juncs_out=out_path+"segment.juncs"
2040 insertions_out=out_path+"segment.insertions"
2041 deletions_out =out_path+"segment.deletions"
2042
2043 left_maps = ','.join(left_seg_maps)
2044 align_log = open(logging_dir + "segment_juncs.log", "w")
2045 align_cmd = [prog_path("segment_juncs")]
2046
2047 align_cmd.extend(params.cmd())
2048
2049 align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2050 ref_fasta,
2051 juncs_out,
2052 insertions_out,
2053 deletions_out,
2054 left_reads,
2055 left_reads_map,
2056 left_maps])
2057 if right_seg_maps != None:
2058 right_maps = ','.join(right_seg_maps)
2059 align_cmd.extend([right_reads, right_reads_map, right_maps])
2060 try:
2061 print >> run_log, " ".join(align_cmd)
2062 retcode = subprocess.call(align_cmd,
2063 preexec_fn=subprocess_setup,
2064 stderr=align_log)
2065
2066 # segment_juncs returned an error
2067 if retcode != 0:
2068 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2069 # segment_juncs not found
2070 except OSError, o:
2071 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2072 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2073 die(str(o))
2074
2075 return [juncs_out, insertions_out, deletions_out]
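# The logged command has this shape (hypothetical paths):
#   segment_juncs <params> --ium-reads <unmapped,...> <ref.fa> segment.juncs \
#     segment.insertions segment.deletions <left_reads> <left_map> <left_segs,...> \
#     [<right_reads> <right_map> <right_segs,...>]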
2076
2077 # Joins mapped segments into full-length read alignments via the executable
2078 # long_spanning_reads
2079 def join_mapped_segments(params,
2080 sam_header_filename,
2081 reads,
2082 ref_fasta,
2083 possible_juncs,
2084 possible_insertions,
2085 possible_deletions,
2086 contig_seg_maps,
2087 spliced_seg_maps,
2088 alignments_out_name):
2089 if len(contig_seg_maps)>1:
2090 print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2091 else:
2092 print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2093 contig_seg_maps = ','.join(contig_seg_maps)
2094
2095 possible_juncs = ','.join(possible_juncs)
2096 possible_insertions = ",".join(possible_insertions)
2097 possible_deletions = ",".join(possible_deletions)
2098
2099 align_log = open(logging_dir + "long_spanning_reads.log", "w")
2100 align_cmd = [prog_path("long_spanning_reads")]
2101
2102 align_cmd.extend(params.cmd())
2103
2104 align_cmd.append(ref_fasta)
2105 align_cmd.extend([ reads,
2106 possible_juncs,
2107 possible_insertions,
2108 possible_deletions,
2109 contig_seg_maps])
2110
2111 if spliced_seg_maps != None:
2112 spliced_seg_maps = ','.join(spliced_seg_maps)
2113 align_cmd.append(spliced_seg_maps)
2114
2115 try:
2116 print >> run_log, " ".join(align_cmd),">",alignments_out_name
2117 join_proc=subprocess.Popen(align_cmd,
2118 preexec_fn=subprocess_setup,
2119 stdout=open(alignments_out_name, "wb"),
2120 stderr=align_log)
2121 join_proc.communicate()
2122 except OSError, o:
2123 die(fail_str+"Error: "+str(o))
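# Note that spliced_alignment() also calls this function with ["/dev/null"]
# for the junction, insertion, and deletion lists, which effectively reduces
# long_spanning_reads to a plain Bowtie-map-to-BAM conversion for the initial
# (unspliced) map.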
2124
2125
2126 # This class collects spliced and unspliced alignments for each of the
2127 # left and right read files provided by the user.
2128 class Maps:
2129 def __init__(self,
2130 unspliced_bwt,
2131 unspliced_sam,
2132 seg_maps,
2133 unmapped_segs,
2134 segs):
2135 self.unspliced_bwt = unspliced_bwt
2136 self.unspliced_sam = unspliced_sam
2137 self.seg_maps = seg_maps
2138 self.unmapped_segs = unmapped_segs
2139 self.segs = segs
2140
2141 # The main alignment routine of TopHat. This function executes most of the
2142 # workflow, producing a set of candidate alignments for each cDNA fragment in
2143 # a pair of SAM alignment files (for paired-end reads).
2144 def spliced_alignment(params,
2145 bwt_idx_prefix,
2146 sam_header_filename,
2147 ref_fasta,
2148 read_len,
2149 segment_len,
2150 left_reads,
2151 right_reads,
2152 user_supplied_junctions,
2153 user_supplied_insertions,
2154 user_supplied_deletions):
2155
2156 possible_juncs = []
2157 possible_juncs.extend(user_supplied_junctions)
2158
2159 possible_insertions = []
2160 possible_insertions.extend(user_supplied_insertions)
2161
2162 possible_deletions = []
2163 possible_deletions.extend(user_supplied_deletions)
2164
2165 maps = { left_reads : [], right_reads : [] }
2166 #single_segments = False
2167
2168 # Perform the first part of the TopHat work flow on the left and right
2169 # reads of paired ends separately - we'll use the pairing information later
2170 for reads in [left_reads, right_reads]:
2171 if reads == None or os.path.getsize(reads) < 25:
2172 continue
2173 fbasename=getFileBaseName(reads)
2174 unspliced_out = tmp_dir + fbasename + ".mapped"
2175 #if use_zpacker: unspliced_out+=".z"
2176 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2177
2178 num_segs = read_len / segment_len
2179 if read_len % segment_len >= min(segment_len -2, 20):
2180 num_segs += 1
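# e.g. with read_len 75 and segment_len 25 this yields num_segs = 3; a 70bp
# read leaves a 20bp remainder (>= min(23, 20)), so it is also split into
# three segments, the last one shorter.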
2181
2182 # daehwan - check this out
2183 if num_segs <= 1:
2184 print >> sys.stderr, "Warning: you have only one segment per read.\n\tWe strongly recommend that you decrease --segment-length to about half the read length,\n\tbecause TopHat works better with multiple segments."
2185
2186 # Perform the initial Bowtie mapping of the full length reads
2187 (unspliced_map, unmapped) = bowtie(params,
2188 bwt_idx_prefix,
2189 reads,
2190 "fastq",
2191 params.initial_read_mismatches,
2192 unspliced_out,
2193 unmapped_unspliced)
2194
2195 unspliced_sam = tmp_dir+fbasename+'.unspl.bam'
2196
2197 # Convert the initial Bowtie maps into BAM format.
2198 # TODO: this step should be replaced by a more direct conversion
2199 # because Bowtie can actually output SAM format now
2200 join_mapped_segments(params,
2201 sam_header_filename,
2202 reads,
2203 ref_fasta,
2204 ["/dev/null"],
2205 ["/dev/null"],
2206 ["/dev/null"],
2207 [unspliced_map],
2208 [],
2209 unspliced_sam)
2210 # Using the num_segs value computed above, decide which junction
2211 # discovery strategy to use
2212 if num_segs == 1:
2213 segment_len = read_len
2214 elif num_segs >= 3:
2215 # if we have at least three segments, just use split segment search,
2216 # which is the most sensitive and specific, fastest, and lightest-weight.
2217 # So unless the user explicitly enabled them, pin the other search
2218 # strategies to an explicit False (an unset option is left falsy).
2219 if not params.closure_search:
2220 params.closure_search = False
2221 if not params.coverage_search:
2222 params.coverage_search = False
2223 if not params.butterfly_search:
2224 params.butterfly_search = False
2225 seg_maps = []
2226 unmapped_segs = []
2227 segs = []
2228 if num_segs > 1:
2229 # split up the IUM reads into segments
2230 read_segments = split_reads(unmapped_unspliced,
2231 tmp_dir + fbasename,
2232 False,
2233 params,
2234 segment_len)
2235
2236 # Map each segment file independently with Bowtie
2237 for i in range(len(read_segments)):
2238 seg = read_segments[i]
2239 fbasename=getFileBaseName(seg)
2240 seg_out = tmp_dir + fbasename + ".bwtout"
2241 if use_zpacker: seg_out += ".z"
2242 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
2243 if use_BWT_FIFO:
2244 unmapped_seg += ".z"
2245 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2246 (seg_map, unmapped) = bowtie_segment(params,
2247 bwt_idx_prefix,
2248 seg,
2249 "fastq",
2250 seg_out,
2251 unmapped_seg,
2252 extra_output)
2253 seg_maps.append(seg_map)
2254 unmapped_segs.append(unmapped)
2255 segs.append(seg)
2256
2257 # Collect the segment maps for left and right reads together
2258 maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
2259 else:
2260 # if there's only one segment, just collect the initial map as the only
2261 # map to be used downstream for coverage-based junction discovery
2262 read_segments = [reads]
2263 maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_unspliced], [unmapped_unspliced])
2264
2265 # Unless the user asked not to discover new junctions, start that process
2266 # here
2267
2268 if params.find_novel_juncs or params.find_novel_indels:
2269 left_reads_map = maps[left_reads].unspliced_bwt
2270 left_seg_maps = maps[left_reads].seg_maps
2271 unmapped_reads = maps[left_reads].unmapped_segs
2272 if right_reads != None:
2273 right_reads_map = maps[right_reads].unspliced_bwt
2274 right_seg_maps = maps[right_reads].seg_maps
2275 unmapped_reads.extend(maps[right_reads].unmapped_segs)
2276 else:
2277 right_reads_map = None
2278 right_seg_maps = None
2279
2280 # Call segment_juncs to infer a list of possible splice junctions from
2281 # the regions of the genome covered in the initial and segment maps
2282 juncs = junctions_from_segments(params,
2283 left_reads,
2284 left_reads_map,
2285 left_seg_maps,
2286 right_reads,
2287 right_reads_map,
2288 right_seg_maps,
2289 unmapped_reads,
2290 "fastq",
2291 ref_fasta)
2292 if params.find_novel_juncs:
2293 if os.path.getsize(juncs[0]) != 0:
2294 possible_juncs.append(juncs[0])
2295 if params.find_novel_indels:
2296 if os.path.getsize(juncs[1]) != 0:
2297 possible_insertions.append(juncs[1])
2298 if os.path.getsize(juncs[2]) != 0:
2299 possible_deletions.append(juncs[2])
2300 # Optionally, and for paired reads only, use a closure search to
2301 # discover additional junctions
2302 if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
2303 juncs = junctions_from_closures(params,
2304 [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
2305 [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
2306 ref_fasta)
2307 if os.path.getsize(juncs[0]) != 0:
2308 possible_juncs.extend(juncs)
2309
2310 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2311 spliced_seg_maps = None
2312 junc_idx_prefix = None
2313 else:
2314 junc_idx_prefix = "segment_juncs"
2315 if len(possible_insertions) == 0:
2316 possible_insertions.append(os.devnull)
2317 # print >> sys.stderr, "Warning: insertions database is empty!"
2318 if len(possible_deletions) == 0:
2319 possible_deletions.append(os.devnull)
2320 # print >> sys.stderr, "Warning: deletions database is empty!"
2321 if len(possible_juncs) == 0:
2322 possible_juncs.append(os.devnull)
2323 print >> sys.stderr, "Warning: junction database is empty!"
2324 if junc_idx_prefix != None:
2325 build_juncs_index(3,
2326 segment_len,
2327 junc_idx_prefix,
2328 possible_juncs,
2329 possible_insertions,
2330 possible_deletions,
2331 ref_fasta,
2332 params.read_params.color)
2333
2334 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2335 # index with Bowtie
2336 for reads in [left_reads, right_reads]:
2337 spliced_seg_maps = []
2338 if reads == None or reads not in maps:
2339 continue
2340
2341 if junc_idx_prefix != None:
2342 i = 0
2343 for seg in maps[reads].segs:
2344 fsegname = getFileBaseName(seg)
2345 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
2346 if use_zpacker: seg_out += ".z"
2347 extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
2348 (seg_map, unmapped) = bowtie_segment(params,
2349 tmp_dir + junc_idx_prefix,
2350 seg,
2351 "fastq",
2352 seg_out,
2353 None,
2354 extra_output)
2355 spliced_seg_maps.append(seg_map)
2356 i += 1
2357
2358 # Join the contiguous and spliced segment hits into full length read
2359 # alignments or convert a spliced segment hit into a genomic-coordinate hit
2360 rfname = getFileBaseName(reads)
2361 rfdir = getFileDir(reads)
2362 mapped_reads = rfdir + rfname + ".candidates.bam"
2363 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2364 unspl_bwtfile = maps[reads].unspliced_bwt
2365 unspl_samfile = maps[reads].unspliced_sam
2366
2367 join_mapped_segments(params,
2368 sam_header_filename,
2369 reads,
2370 ref_fasta,
2371 possible_juncs,
2372 possible_insertions,
2373 possible_deletions,
2374 maps[reads].seg_maps,
2375 spliced_seg_maps,
2376 mapped_reads)
2377
2378 if num_segs > 1:
2379 # Merge the spliced and unspliced full length alignments into a
2380 # single SAM file.
2381 # The individual SAM files are all already sorted in
2382 # increasing read ID order.
2383 # NOTE: We also should be able to address bug #134 here, by replacing
2384 # contiguous alignments that poke into an intron by a small amount by
2385 # the correct spliced alignment.
2386
2387 try:
2388 merge_cmd = [ prog_path("bam_merge"), merged_map,
2389 maps[reads].unspliced_sam,
2390 mapped_reads ]
2391 print >> run_log, " ".join(merge_cmd)
2392 ret = subprocess.call( merge_cmd,
2393 stderr=open(logging_dir + "sam_merge.log", "w") )
2394 if ret != 0:
2395 die(fail_str+"Error executing: "+" ".join(merge_cmd))
2396 except OSError, o:
2397 die(fail_str+"Error: "+str(o))
2398 maps[reads] = [merged_map]
2399
2400 if not params.system_params.keep_tmp:
2401 os.remove(mapped_reads)
2402 os.remove(unspl_samfile)
2403 os.remove(unspl_bwtfile)
2404 else:
2405 os.rename(mapped_reads, merged_map)
2406 maps[reads] = [merged_map]
2407 if not params.system_params.keep_tmp:
2408 os.remove(unspl_samfile)
2409 os.remove(unspl_bwtfile)
2410
2411 return maps
2412
2413 def die(msg=None):
2414 if msg is not None:
2415 print >> sys.stderr, msg
2416 sys.exit(1)
2417
2418 # rough equivalent of the 'which' command to find external programs
2419 # (current script path is tested first, then PATH envvar)
2420 def which(program):
2421 def is_executable(fpath):
2422 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
2423 fpath, fname = os.path.split(program)
2424 if fpath:
2425 if is_executable(program):
2426 return program
2427 else:
2428 progpath = os.path.join(bin_dir, program)
2429 if is_executable(progpath):
2430 return progpath
2431 for path in os.environ["PATH"].split(os.pathsep):
2432 progpath = os.path.join(path, program)
2433 if is_executable(progpath):
2434 return progpath
2435 return None
2436
2437 def prog_path(program):
2438 progpath=which(program)
2439 if progpath == None:
2440 die("Error locating program: "+program)
2441 return progpath
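# Usage sketch: prog_path("samtools") returns a path to the executable,
# e.g. "/usr/local/bin/samtools" (the script's own bin directory is tried
# before the PATH), or aborts the run via die() if it cannot be located.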
2442
2443 # FIXME: this should get set during the make dist autotools phase of the build
2444 def get_version():
2445 return "1.3.2"
2446
2447 def mlog(msg):
2448 print >> sys.stderr, "[DBGLOG]:"+msg
2449
2450 def test_input_file(filename):
2451 try:
2452 test_file = open(filename, "r")
2453 # input file missing or unreadable
2454 except IOError, o:
2455 die("Error: cannot open input file %s" % filename)
2456 return
2457
2458
2459 def main(argv=None):
2460 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2461
2462 # Initialize default parameter values
2463 params = TopHatParams()
2464
2465 try:
2466 if argv is None:
2467 argv = sys.argv
2468 args = params.parse_options(argv)
2469 params.check()
2470
2471 bwt_idx_prefix = args[0]
2472 left_reads_list = args[1]
2473 left_quals_list, right_quals_list = [], []
2474 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2475 if params.read_params.mate_inner_dist == None:
2476 die("Error: you must set the mean inner distance between mates with -r")
2477
2478 right_reads_list = args[2]
2479 if params.read_params.quals:
2480 left_quals_list = args[3]
2481 right_quals_list = args[4]
2482 else:
2483 right_reads_list = None
2484 if params.read_params.quals:
2485 left_quals_list = args[2]
2486
2487 print >> sys.stderr
2488 print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
2489 print >> sys.stderr, "-----------------------------------------------"
2490
2491 start_time = datetime.now()
2492 prepare_output_dir()
2493
2494 global run_log
2495 run_log = open(logging_dir + "run.log", "w", 0)
2496 global run_cmd
2497 run_cmd = " ".join(argv)
2498 print >> run_log, run_cmd
2499
2500 # Validate all the input files, check all prereqs before committing
2501 # to the run
2502 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2503
2504 check_bowtie()
2505 check_samtools()
2506 print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
2507 sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2508 print >> sys.stderr, "[%s] Preparing reads" % (right_now())
2509
2510 if not params.skip_check_reads:
2511 reads_list = left_reads_list
2512 if right_reads_list:
2513 reads_list = reads_list + "," + right_reads_list
2514 params.read_params = check_reads_format(params, reads_list)
2515
2516 user_supplied_juncs = []
2517 user_supplied_insertions = []
2518 user_supplied_deletions = []
2519
2520 if params.gff_annotation and params.find_GFF_juncs:
2521 test_input_file(params.gff_annotation)
2522 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2523 if found_juncs:
2524 user_supplied_juncs.append(gtf_juncs)
2525 if params.raw_junctions:
2526 test_input_file(params.raw_junctions)
2527 user_supplied_juncs.append(params.raw_junctions)
2528
2529 if params.raw_insertions:
2530 test_input_file(params.raw_insertions)
2531 user_supplied_insertions.append(params.raw_insertions)
2532
2533 if params.raw_deletions:
2534 test_input_file(params.raw_deletions)
2535 user_supplied_deletions.append(params.raw_deletions)
2536
2537 global unmapped_reads_fifo
2538 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2539
2540 # Now start the time consuming stuff
2541 left_kept_reads, left_reads_info = prep_reads(params,
2542 left_reads_list,
2543 left_quals_list,
2544 "left_kept_reads", "Left ")
2545 min_read_len=left_reads_info.min_len
2546 max_read_len=left_reads_info.max_len
2547 if right_reads_list != None:
2548 right_kept_reads, right_reads_info = prep_reads(params,
2549 right_reads_list,
2550 right_quals_list,
2551 "right_kept_reads", "Right")
2552 min_read_len=min(right_reads_info.min_len, min_read_len)
2553 max_read_len=max(right_reads_info.max_len, max_read_len)
2554 else:
2555 right_kept_reads = None
2556 seed_len=params.read_params.seed_length
2557 if seed_len != None:
2558 seed_len = max(seed_len, min_read_len)
2559 else:
2560 seed_len = max_read_len
2561 params.read_params.seed_length=seed_len
2562 # turn off integer-quals
2563 if params.read_params.integer_quals:
2564 params.read_params.integer_quals = False
2565
2566 mapping = spliced_alignment(params,
2567 bwt_idx_prefix,
2568 sam_header_filename,
2569 ref_fasta,
2570 params.read_params.seed_length,
2571 params.segment_length,
2572 left_kept_reads,
2573 right_kept_reads,
2574 user_supplied_juncs,
2575 user_supplied_insertions,
2576 user_supplied_deletions)
2577
2578 left_maps = mapping[left_kept_reads]
2579 #left_unmapped_reads = mapping[1]
2580 right_maps = mapping[right_kept_reads]
2581 #right_unmapped_reads = mapping[3]
2582
2583 compile_reports(params,
2584 sam_header_filename,
2585 left_maps,
2586 left_kept_reads,
2587 right_maps,
2588 right_kept_reads,
2589 params.gff_annotation)
2590
2591 if not params.system_params.keep_tmp:
2592 for m in left_maps:
2593 os.remove(m)
2594 if left_kept_reads != None:
2595 os.remove(left_kept_reads)
2596 for m in right_maps:
2597 os.remove(m)
2598 if right_kept_reads != None:
2599 os.remove(right_kept_reads)
2600 tmp_files = os.listdir(tmp_dir)
2601 for t in tmp_files:
2602 os.remove(tmp_dir+t)
2603 os.rmdir(tmp_dir)
2604
2605 finish_time = datetime.now()
2606 duration = finish_time - start_time
2607 print >> sys.stderr,"-----------------------------------------------"
2608 print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
2609
2610 except Usage, err:
2611 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
2612 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
2613 return 2
2614
2615
2616 if __name__ == "__main__":
2617 sys.exit(main())
