root/gclib/tophat_cpp/tophat
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC by gpertea
File size: 147456 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #!/usr/bin/env python
2
3 # encoding: utf-8
4 """
5 tophat.py
6
7 Created by Cole Trapnell on 2008-12-25.
8 Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 """
11 import sys
12 try:
13 import psyco
14 psyco.full()
15 except ImportError:
16 pass
17
18 import getopt
19 import subprocess
20 import errno
21 import os
22 import warnings
23 import re
24 import signal
25 from datetime import datetime, date, time
26 from shutil import copy
27
28 use_message = '''
29 TopHat maps short sequences from spliced transcripts to whole genomes.
30
31 Usage:
32 tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
33 [quals1,[quals2,...]] [quals1[,quals2,...]]
34
35 Options:
36 -v/--version
37 -o/--output-dir <string> [ default: ./tophat_out ]
38 --bowtie1 [ default: bowtie2 ]
39 -a/--min-anchor <int> [ default: 8 ]
40 -m/--splice-mismatches <0-2> [ default: 0 ]
41 -i/--min-intron-length <int> [ default: 50 ]
42 -I/--max-intron-length <int> [ default: 500000 ]
43 -g/--max-multihits <int> [ default: 20 ]
44 -x/--transcriptome-max-hits <int> [ default: 60 ]
45 -n/--transcriptome-mismatches <int> [ default: 1 ]
46 -M/--prefilter-multihits ( for -G/--GTF option, enable
47 an initial bowtie search
48 against the genome )
49 --max-insertion-length <int> [ default: 3 ]
50 --max-deletion-length <int> [ default: 3 ]
51 --solexa-quals
52 --solexa1.3-quals (same as phred64-quals)
53 --phred64-quals (same as solexa1.3-quals)
54 -Q/--quals
55 --integer-quals
56 -C/--color (Solid - color space)
57 --color-out
58 --library-type <string> (fr-unstranded, fr-firststrand,
59 fr-secondstrand)
60 -p/--num-threads <int> [ default: 1 ]
61 -G/--GTF <filename> (GTF/GFF with known transcripts)
62 --transcriptome-index <bwtidx> (transcriptome bowtie index)
63 -T/--transcriptome-only (map only to the transcriptome)
64 -j/--raw-juncs <filename>
65 --insertions <filename>
66 --deletions <filename>
67 -r/--mate-inner-dist <int>
68 --mate-std-dev <int> [ default: 20 ]
69 --no-novel-juncs
70 --no-novel-indels
71 --fusion-search
72 --fusion-anchor-length <int> [ default: 20 ]
73 --fusion-min-dist <int> [ default: 10000000 ]
74 --fusion-read-mismatches <int> [ default: 2 ]
75 --fusion-multireads <int> [ default: 2 ]
76 --fusion-multipairs <int> [ default: 2 ]
77 --no-gtf-juncs
78 --no-coverage-search
79 --coverage-search
80 --no-closure-search
81 --closure-search
82 --microexon-search
83 --butterfly-search
84 --no-butterfly-search
85 --keep-tmp
86 --tmp-dir <dirname> [ default: <output_dir>/tmp ]
87 -z/--zpacker <program> [ default: gzip ]
88 -X/--unmapped-fifo [use mkfifo to compress more temporary
89 files for color space reads]
90
91 Advanced Options:
92 --genome-read-mismatches <int> [ default: 2 ]
93 -N/--read-mismatches <int> [ default: 2 ]
94 --segment-mismatches <int> [ default: 2 ]
95 --segment-length <int> [ default: 25 ]
96 --bowtie-n [ default: bowtie -v ]
97 --min-closure-exon <int> [ default: 100 ]
98 --min-closure-intron <int> [ default: 50 ]
99 --max-closure-intron <int> [ default: 5000 ]
100 --min-coverage-intron <int> [ default: 50 ]
101 --max-coverage-intron <int> [ default: 20000 ]
102 --min-segment-intron <int> [ default: 50 ]
103 --max-segment-intron <int> [ default: 500000 ]
104 --no-sort-bam (Output BAM is not coordinate-sorted)
105 --no-convert-bam (Do not convert to bam format.
106 Output is <output_dir>/accepted_hits.sam.
107 Implies --no-sort-bam)
108
109 Bowtie2 related options:
110 Preset options in --end-to-end mode (local alignment is not used in TopHat2)
111 --b2-very-fast
112 --b2-fast
113 --b2-sensitive
114 --b2-very-sensitive
115
116 Alignment options
117 --b2-N <int> [ default: 0 ]
118 --b2-L <int> [ default: 20 ]
119 --b2-i <func> [ default: S,1,1.25 ]
120 --b2-n-ceil <func> [ default: L,0,0.15 ]
121 --b2-gbar <int> [ default: 4 ]
122
123 Scoring options
124 --b2-mp <int>,<int> [ default: 6,2 ]
125 --b2-np <int> [ default: 1 ]
126 --b2-rdg <int>,<int> [ default: 5,3 ]
127 --b2-rfg <int>,<int> [ default: 5,3 ]
128 --b2-score-min <func> [ default: L,-0.6,-0.6 ]
129
130 Effort options
131 --b2-D <int> [ default: 15 ]
132 --b2-R <int> [ default: 2 ]
133
134
135 SAM Header Options (for embedding sequencing run metadata in output):
136 --rg-id <string> (read group ID)
137 --rg-sample <string> (sample ID)
138 --rg-library <string> (library ID)
139 --rg-description <string> (descriptive string, no tabs allowed)
140 --rg-platform-unit <string> (e.g. Illumina lane ID)
141 --rg-center <string> (sequencing center name)
142 --rg-date <string> (ISO 8601 date of the sequencing run)
143 --rg-platform <string> (Sequencing platform descriptor)
144 '''
145
146 # -F/--min-isoform-fraction <float> [ default: 0.15 ]
147 # (obsolete)
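#
# A typical paired-end invocation (file names purely illustrative):
#   tophat -p 4 -G genes.gtf -o tophat_out bowtie_idx reads_1.fq reads_2.fq
# which maps reads_1.fq/reads_2.fq against the Bowtie index "bowtie_idx",
# guided by the known transcripts in genes.gtf.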
148
149 class Usage(Exception):
150 def __init__(self, msg):
151 self.msg = msg
152
153 output_dir = "./tophat_out/"
154 logging_dir = output_dir + "logs/"
155 run_log = None
156 run_cmd = None
157 tmp_dir = output_dir + "tmp/"
158 bin_dir = sys.path[0] + "/"
159 use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
160
161 use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format
162
163 use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
164 # enabled by -X/--unmapped-fifo option (unless -z0)
165 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
166 # unmapped reads into a compressed file
167
168 samtools_path = None
169 bowtie_path = None
170 fail_str = "\t[FAILED]\n"
171
172 sam_header = tmp_dir + "stub_header.sam"
173
174 gtf_juncs = None #file name with junctions extracted from given GFF file
175
176
177 # TopHatParams captures all of the runtime parameters used by TopHat, and many
178 # of these are passed as command line options to executables run by the pipeline
179
180 # This class and its nested classes also do options parsing through parse_options()
181 # and option validation via the member function check()
182
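# A minimal usage sketch (illustrative of how the driver code is expected
# to use this class):
#   params = TopHatParams()
#   args = params.parse_options(sys.argv) # leftover non-option arguments
#   params.check()                        # validate every option value
#   flags = params.cmd()                  # flags forwarded to pipeline tools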
183
184 class BowtieOutputFiles:
185 def __init__(self,
186 mappings=None,
187 unmapped_reads=None,
188 multihit_reads=None):
189 self.mappings=mappings
190 self.unmapped_reads=unmapped_reads
191 self.multihit_reads=multihit_reads
192
193 class TopHatParams:
194
195 # SpliceConstraints is a group of runtime parameters that specify what
196 # constraints to put on junctions discovered by the program. These constraints
197 # are used to filter out spurious/false positive junctions.
198
199 class SpliceConstraints:
200 def __init__(self,
201 min_anchor_length,
202 min_intron_length,
203 max_intron_length,
204 splice_mismatches,
205 min_isoform_fraction):
206 self.min_anchor_length = min_anchor_length
207 self.min_intron_length = min_intron_length
208 self.max_intron_length = max_intron_length
209 self.splice_mismatches = splice_mismatches
210 self.min_isoform_fraction = min_isoform_fraction
211
212 def parse_options(self, opts):
213 for option, value in opts:
214 if option in ("-m", "--splice-mismatches"):
215 self.splice_mismatches = int(value)
216 elif option in ("-a", "--min-anchor"):
217 self.min_anchor_length = int(value)
218 elif option in ("-F", "--min-isoform-fraction"):
219 self.min_isoform_fraction = float(value)
220 elif option in ("-i", "--min-intron-length"):
221 self.min_intron_length = int(value)
222 elif option in ("-I", "--max-intron-length"):
223 self.max_intron_length = int(value)
224
225 def check(self):
226 if self.splice_mismatches not in [0,1,2]:
227 die("Error: arg to --splice-mismatches must be 0, 1, or 2")
228 if self.min_anchor_length < 4:
229 die("Error: arg to --min-anchor-len must be greater than 4")
230 if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
231 die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
232 if self.min_intron_length <= 0:
233 die("Error: arg to --min-intron-length must be greater than 0")
234 if self.max_intron_length <= 0:
235 die("Error: arg to --max-intron-length must be greater than 0")
236
237 # SystemParams is a group of runtime parameters that determine how to handle
238 # temporary files produced during a run and how many threads to use for threaded
239 # stages of the pipeline (e.g. Bowtie)
240
241 class SystemParams:
242 def __init__(self,
243 num_threads,
244 keep_tmp):
245 self.num_threads = num_threads
246 self.keep_tmp = keep_tmp
247 self.zipper = "gzip"
248 self.zipper_opts= []
249
250 def parse_options(self, opts):
251 global use_zpacker
252 global use_BWT_FIFO
253 for option, value in opts:
254 if option in ("-p", "--num-threads"):
255 self.num_threads = int(value)
256 elif option == "--keep-tmp":
257 self.keep_tmp = True
258 elif option in ("-z","--zpacker"):
259 if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
260 value=""
261 self.zipper = value
262 #if not self.zipper:
263 # self.zipper='gzip'
264 elif option in ("-X", "--unmapped-fifo"):
265 use_BWT_FIFO=True
266 if self.zipper:
267 use_zpacker=True
268 if self.num_threads>1 and not self.zipper_opts:
269 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
270 self.zipper_opts.append('-p'+str(self.num_threads))
271 else:
272 use_zpacker=False
273 if use_BWT_FIFO: use_BWT_FIFO=False
274 def cmd(self):
275 cmdline=[]
276 if self.zipper:
277 cmdline.extend(['-z',self.zipper])
278 if self.num_threads>1:
279 cmdline.extend(['-p'+str(self.num_threads)])
280 return cmdline
281
282 def check(self):
283 if self.num_threads<1 :
284 die("Error: arg to --num-threads must be greater than 0")
285 if self.zipper:
286 xzip=which(self.zipper)
287 if not xzip:
288 die("Error: cannot find compression program "+self.zipper)
289
290 # ReadParams is a group of runtime parameters that specify various properties
291 # of the user's reads (e.g. which quality scale they are on, how long the
292 # fragments are, etc.)
293 class ReadParams:
294 def __init__(self,
295 solexa_quals,
296 phred64_quals,
297 quals,
298 integer_quals,
299 color,
300 color_out,
301 library_type,
302 seed_length,
303 reads_format,
304 mate_inner_dist,
305 mate_inner_dist_std_dev,
306 read_group_id,
307 sample_id,
308 library_id,
309 description,
310 seq_platform_unit,
311 seq_center,
312 seq_run_date,
313 seq_platform):
314 self.solexa_quals = solexa_quals
315 self.phred64_quals = phred64_quals
316 self.quals = quals
317 self.integer_quals = integer_quals
318 self.color = color
319 self.color_out = color_out
320 self.library_type = library_type
321 self.seed_length = seed_length
322 self.reads_format = reads_format
323 self.mate_inner_dist = mate_inner_dist
324 self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
325 self.read_group_id = read_group_id
326 self.sample_id = sample_id
327 self.library_id = library_id
328 self.description = description
329 self.seq_platform_unit = seq_platform_unit
330 self.seq_center = seq_center
331 self.seq_run_date = seq_run_date
332 self.seq_platform = seq_platform
333
334 def parse_options(self, opts):
335 for option, value in opts:
336 if option == "--solexa-quals":
337 self.solexa_quals = True
338 elif option in ("--solexa1.3-quals", "--phred64-quals"):
339 self.phred64_quals = True
340 elif option in ("-Q", "--quals"):
341 self.quals = True
342 elif option == "--integer-quals":
343 self.integer_quals = True
344 elif option in ("-C", "--color"):
345 self.color = True
346 elif option == "--color-out":
347 self.color_out = True
348 elif option == "--library-type":
349 self.library_type = value
350 elif option in ("-s", "--seed-length"):
351 self.seed_length = int(value)
352 elif option in ("-r", "--mate-inner-dist"):
353 self.mate_inner_dist = int(value)
354 elif option == "--mate-std-dev":
355 self.mate_inner_dist_std_dev = int(value)
356 elif option == "--rg-id":
357 self.read_group_id = value
358 elif option == "--rg-sample":
359 self.sample_id = value
360 elif option == "--rg-library":
361 self.library_id = value
362 elif option == "--rg-description":
363 self.description = value
364 elif option == "--rg-platform-unit":
365 self.seq_platform_unit = value
366 elif option == "--rg-center":
367 self.seq_center = value
368 elif option == "--rg-date":
369 self.seq_run_date = value
370 elif option == "--rg-platform":
371 self.seq_platform = value
372
373 def check(self):
374 if self.seed_length and self.seed_length < 20:
375 die("Error: arg to --seed-length must be at least 20")
376
377 if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
378 die("Error: arg to --mate-std-dev must at least 0")
379 if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
380 die("Error: --rg-id and --rg-sample must be specified or omitted together")
381
382 # SearchParams is a group of runtime parameters that specify how TopHat will
383 # search for splice junctions
384
385 class SearchParams:
386 def __init__(self,
387 min_closure_exon,
388 min_closure_intron,
389 max_closure_intron,
390 min_coverage_intron,
391 max_coverage_intron,
392 min_segment_intron,
393 max_segment_intron):
394
395 self.min_closure_exon_length = min_closure_exon
396 self.min_closure_intron_length = min_closure_intron
397 self.max_closure_intron_length = max_closure_intron
398 self.min_coverage_intron_length = min_coverage_intron
399 self.max_coverage_intron_length = max_coverage_intron
400 self.min_segment_intron_length = min_segment_intron
401 self.max_segment_intron_length = max_segment_intron
402
403 def parse_options(self, opts):
404 for option, value in opts:
405 if option == "--min-closure-exon":
406 self.min_closure_exon_length = int(value)
407 if option == "--min-closure-intron":
408 self.min_closure_intron_length = int(value)
409 if option == "--max-closure-intron":
410 self.max_closure_intron_length = int(value)
411 if option == "--min-coverage-intron":
412 self.min_coverage_intron_length = int(value)
413 if option == "--max-coverage-intron":
414 self.max_coverage_intron_length = int(value)
415 if option == "--min-segment-intron":
416 self.min_segment_intron_length = int(value)
417 if option == "--max-segment-intron":
418 self.max_segment_intron_length = int(value)
419
420 def check(self):
421 if self.min_closure_exon_length < 0:
422 die("Error: arg to --min-closure-exon must be at least 0")
423 if self.min_closure_intron_length < 0:
424 die("Error: arg to --min-closure-intron must be at least 0")
425 if self.max_closure_intron_length < 0:
426 die("Error: arg to --max-closure-intron must be at least 0")
427 if self.min_coverage_intron_length < 0:
428 die("Error: arg to --min-coverage-intron must be at least 0")
429 if self.max_coverage_intron_length < 0:
430 die("Error: arg to --max-coverage-intron must be at least 0")
431 if self.min_segment_intron_length < 0:
432 die("Error: arg to --min-segment-intron must be at least 0")
433 if self.max_segment_intron_length < 0:
434 die("Error: arg to --max-segment-intron must be at least 0")
435
436 class ReportParams:
437 def __init__(self):
438 self.sort_bam = True
439 self.convert_bam = True
440
441 def parse_options(self, opts):
442 for option, value in opts:
443 if option == "--no-sort-bam":
444 self.sort_bam = False
445 if option == "--no-convert-bam":
446 self.convert_bam = False
447 self.sort_bam = False
448
449 class Bowtie2Params:
450 def __init__(self):
451 self.very_fast = False
452 self.fast = False
453 self.sensitive = False
454 self.very_sensitive = False
455
456 self.N = 0
457 self.L = 20
458 self.i = "S,1,1.25"
459 self.n_ceil = "L,0,0.15"
460 self.gbar = 4
461
462 self.mp = "6,2"
463 self.np = 1
464 self.rdg = "5,3"
465 self.rfg = "5,3"
466 self.score_min = "L,-0.6,-0.6"
467
468 self.D = 15
469 self.R = 2
470
471 def parse_options(self, opts):
472 for option, value in opts:
473 if option == "--b2-very-fast":
474 self.very_fast = True
475 if option == "--b2-fast":
476 self.fast = True
477 if option == "--b2-sensitive":
478 self.sensitive = True
479 if option == "--b2-very-sensitive":
480 self.very_sensitive = True
481
482 if option == "--b2-N":
483 self.N = int(value)
484 if option == "--b2-L":
485 self.L = int(value)
486 if option == "--b2-i":
487 self.i = value
488 if option == "--b2-n-ceil":
489 self.n_ceil = value
490 if option == "--b2-gbar":
491 self.gbar = int(value)
492
493 if option == "--b2-mp":
494 self.mp = value
495 if option == "--b2-np":
496 self.np = int(value)
497 if option == "--b2-rdg":
498 self.rdg = value
499 if option == "--b2-rfg":
500 self.rfg = value
501 if option == "--b2-score-min":
502 self.score_min = value
503
504 if option == "--b2-D":
505 self.D = int(value)
506 if option == "--b2-R":
507 self.R = int(value)
508
509 def check(self):
510 more_than_once = False
511 if self.very_fast:
512 if self.fast or self.sensitive or self.very_sensitive:
513 more_than_once = True
514 else:
515 if self.fast:
516 if self.sensitive or self.very_sensitive:
517 more_than_once = True
518 else:
519 if self.sensitive and self.very_sensitive:
520 more_than_once = True
521
522 if more_than_once:
523 die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")
524
525 if not self.N in [0, 1]:
526 die("Error: arg to --b2-N must be either 0 or 1")
527
528 function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
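# e.g. this accepts function strings such as "S,1,1.25" or "L,-0.6,-0.6"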
529 function_match = re.search(function_re, self.i)
530
531 if not function_match:
532 die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
533
534 function_match = re.search(function_re, self.n_ceil)
535 if not function_match:
536 die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
537
538 function_match = re.search(function_re, self.score_min)
539 if not function_match:
540 die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")
541
542 pair_re = r'^\d+,\d+$'
543 pair_match = re.search(pair_re, self.mp)
544 if not pair_match:
545 die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
546
547 pair_match = re.search(pair_re, self.rdg)
548 if not pair_match:
549 die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")
550
551 pair_match = re.search(pair_re, self.rfg)
552 if not pair_match:
553 die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
554
555
556 def __init__(self):
557 self.splice_constraints = self.SpliceConstraints(8, # min_anchor
558 50, # min_intron
559 500000, # max_intron
560 0, # splice_mismatches
561 0.15) # min_isoform_frac
562
563 self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]
564
565 self.read_params = self.ReadParams(False, # solexa_scale
566 False,
567 False, # quals
568 None, # integer quals
569 False, # SOLiD - color space
570 False, # SOLiD - color out instead of base pair,
571 "", # library type (e.g. "illumina-stranded-pair-end")
572 None, # seed_length
573 "fastq", # quality_format
574 None, # mate inner distance
575 20, # mate inner dist std dev
576 None, # read group id
577 None, # sample id
578 None, # library id
579 None, # description
580 None, # platform unit (i.e. lane)
581 None, # sequencing center
582 None, # run date
583 None) # sequencing platform
584
585 self.system_params = self.SystemParams(1, # bowtie_threads (num_threads)
586 False) # keep_tmp
587
588 self.search_params = self.SearchParams(100, # min_closure_exon_length
589 50, # min_closure_intron_length
590 5000, # max_closure_intron_length
591 50, # min_coverage_intron_length
592 20000, # max_coverage_intron_length
593 50, # min_segment_intron_length
594 500000) # max_segment_intron_length
595
596 self.report_params = self.ReportParams()
597
598 self.bowtie2_params = self.Bowtie2Params()
599
600 self.bowtie2 = True
601 self.gff_annotation = None
602 self.transcriptome_only = False
603 self.transcriptome_index = None
604 self.transcriptome_outdir = None
605 self.raw_junctions = None
606 self.find_novel_juncs = True
607 self.find_novel_indels = True
608 self.find_novel_fusions = True
609 self.find_GFF_juncs = True
610 self.max_hits = 20
611 self.t_max_hits = 60
612 self.prefilter_multi = False
613 self.genome_read_mismatches = 2
614 self.max_read_mismatches = 2
615 self.t_mismatches = 1
616 self.segment_length = 25
617 self.segment_mismatches = 2
618 self.bowtie_alignment_option = "-v"
619 self.max_insertion_length = 3
620 self.max_deletion_length = 3
621 self.raw_insertions = None
622 self.raw_deletions = None
623 self.coverage_search = None
624 self.closure_search = False
625 self.microexon_search = False
626 self.butterfly_search = None
627 self.fusion_search = False
628 self.fusion_anchor_length = 20
629 self.fusion_min_dist = 10000000
630 self.fusion_read_mismatches = 2
631 self.fusion_multireads = 2
632 self.fusion_multipairs = 2
633
634 def check(self):
635 self.splice_constraints.check()
636 self.read_params.check()
637 self.system_params.check()
638 self.bowtie2_params.check()
639 if self.segment_length < 10:
640 die("Error: arg to --segment-length must be at least 10")
641 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
642 die("Error: arg to --segment-mismatches must be in [0, 3]")
643 if self.t_mismatches < 0 or self.t_mismatches > 3:
644 die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")
645
646 if self.read_params.color:
647 if self.bowtie2:
648 die("Error: bowtie2 in colorspace is not yet supported\nuse --bowtie1 option")
649 if self.fusion_search:
650 die("Error: fusion-search in colorspace is not yet supported")
651 if self.butterfly_search:
652 die("Error: butterfly-search in colorspace is not yet supported")
653
654 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
655
656 if self.read_params.library_type and self.read_params.library_type not in library_types:
657 die("Error: library-type should be one of: "+', '.join(library_types))
658
659 self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
660 self.search_params.max_closure_intron_length)
661
662 self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
663 self.search_params.max_segment_intron_length)
664
665 self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
666 self.search_params.max_coverage_intron_length)
667
668 if self.max_insertion_length >= self.segment_length:
669 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be equal to or greater than the segment length ("+str(self.segment_length)+")")
670
671 if self.max_insertion_length < 0:
672 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be less than 0")
673
674 if self.max_deletion_length >= self.splice_constraints.min_intron_length:
675 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")
676
677 if self.max_deletion_length < 0:
678 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be less than 0")
679
680 def cmd(self):
681 cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
682 "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
683 "--min-report-intron", str(self.splice_constraints.min_intron_length),
684 "--max-report-intron", str(self.splice_constraints.max_intron_length),
685 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
686 "--output-dir", output_dir,
687 "--max-multihits", str(self.max_hits),
688 "--segment-length", str(self.segment_length),
689 "--segment-mismatches", str(self.segment_mismatches),
690 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
691 "--min-closure-intron", str(self.search_params.min_closure_intron_length),
692 "--max-closure-intron", str(self.search_params.max_closure_intron_length),
693 "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
694 "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
695 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
696 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
697 "--max-mismatches", str(self.max_read_mismatches),
698 "--sam-header", sam_header,
699 "--max-insertion-length", str(self.max_insertion_length),
700 "--max-deletion-length", str(self.max_deletion_length)]
701
702 if not self.bowtie2:
703 cmd.extend(["--bowtie1"])
704
705 if self.fusion_search:
706 cmd.extend(["--fusion-search",
707 "--fusion-anchor-length", str(self.fusion_anchor_length),
708 "--fusion-min-dist", str(self.fusion_min_dist),
709 "--fusion-read-mismatches", str(self.fusion_read_mismatches),
710 "--fusion-multireads", str(self.fusion_multireads),
711 "--fusion-multipairs", str(self.fusion_multipairs)])
712
713 cmd.extend(self.system_params.cmd())
714
715 if self.read_params.mate_inner_dist != None:
716 cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
717 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
718 if self.gff_annotation != None:
719 cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
720 if gtf_juncs:
721 cmd.extend(["--gtf-juncs", gtf_juncs])
722 if self.closure_search == False:
723 cmd.append("--no-closure-search")
724 if not self.coverage_search:
725 cmd.append("--no-coverage-search")
726 if not self.microexon_search:
727 cmd.append("--no-microexon-search")
728 if self.butterfly_search:
729 cmd.append("--butterfly-search")
730 if self.read_params.solexa_quals:
731 cmd.append("--solexa-quals")
732 if self.read_params.quals:
733 cmd.append("--quals")
734 if self.read_params.integer_quals:
735 cmd.append("--integer-quals")
736 if self.read_params.color:
737 cmd.append("--color")
738 if self.read_params.color_out:
739 cmd.append("--color-out")
740 if self.read_params.library_type:
741 cmd.extend(["--library-type", self.read_params.library_type])
742 if self.read_params.read_group_id:
743 cmd.extend(["--rg-id", self.read_params.read_group_id])
744 if self.read_params.phred64_quals:
745 cmd.append("--phred64-quals")
746 return cmd
747
748 # This is the master options parsing routine, which calls parse_options for
749 # the delegate classes (e.g. SpliceConstraints) that handle certain groups
750 # of options.
751 def parse_options(self, argv):
752 try:
753 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
754 ["version",
755 "help",
756 "output-dir=",
757 "bowtie1",
758 "solexa-quals",
759 "solexa1.3-quals",
760 "phred64-quals",
761 "quals",
762 "integer-quals",
763 "color",
764 "color-out",
765 "library-type=",
766 "num-threads=",
767 "splice-mismatches=",
768 "max-multihits=",
769 "min-isoform-fraction=",
770 "min-anchor-length=",
771 "min-intron-length=",
772 "max-intron-length=",
773 "GTF=",
774 "transcriptome-only",
775 "transcriptome-max-hits=",
776 "transcriptome-index=",
777 "raw-juncs=",
778 "no-novel-juncs",
779 "allow-fusions",
780 "fusion-search",
781 "fusion-anchor-length=",
782 "fusion-min-dist=",
783 "fusion-read-mismatches=",
784 "fusion-multireads=",
785 "fusion-multipairs=",
786 "no-novel-indels",
787 "no-gtf-juncs",
788 "mate-inner-dist=",
789 "mate-std-dev=",
790 "no-closure-search",
791 "no-coverage-search",
792 "closure-search",
793 "coverage-search",
794 "prefilter-multihits",
795 "microexon-search",
796 "min-closure-exon=",
797 "min-closure-intron=",
798 "max-closure-intron=",
799 "min-coverage-intron=",
800 "max-coverage-intron=",
801 "min-segment-intron=",
802 "max-segment-intron=",
803 "seed-length=",
804 "genome-read-mismatches=",
805 "read-mismatches=",
806 "max-read-mismatches=",
807 "transcriptome-mismatches=",
808 "segment-length=",
809 "segment-mismatches=",
810 "bowtie-n",
811 "butterfly-search",
812 "no-butterfly-search",
813 "keep-tmp",
814 "rg-id=",
815 "rg-sample=",
816 "rg-library=",
817 "rg-description=",
818 "rg-platform-unit=",
819 "rg-center=",
820 "rg-date=",
821 "rg-platform=",
822 "tmp-dir=",
823 "zpacker=",
824 "unmapped-fifo",
825 "max-insertion-length=",
826 "max-deletion-length=",
827 "insertions=",
828 "deletions=",
829 "no-sort-bam",
830 "no-convert-bam",
831 "b2-very-fast",
832 "b2-fast",
833 "b2-sensitive",
834 "b2-very-sensitive",
835 "b2-N=",
836 "b2-L=",
837 "b2-i=",
838 "b2-n-ceil=",
839 "b2-gbar=",
840 "b2-ma=",
841 "b2-mp=",
842 "b2-np=",
843 "b2-rdg=",
844 "b2-rfg=",
845 "b2-score-min=",
846 "b2-D=",
847 "b2-R="])
848 except getopt.error, msg:
849 raise Usage(msg)
850
851 self.splice_constraints.parse_options(opts)
852 self.system_params.parse_options(opts)
853 self.read_params.parse_options(opts)
854 self.search_params.parse_options(opts)
855 self.report_params.parse_options(opts)
856 self.bowtie2_params.parse_options(opts)
857 global use_BWT_FIFO
858 global use_BAM_Unmapped
859 if not self.read_params.color:
860 use_BWT_FIFO=False
861 use_BAM_Unmapped=True
862 global output_dir
863 global logging_dir
864 global tmp_dir
865 global sam_header
866
867 custom_tmp_dir = None
868 custom_out_dir = None
869 # option processing
870 for option, value in opts:
871 if option in ("-v", "--version"):
872 print "TopHat v%s" % (get_version())
873 sys.exit(0)
874 if option in ("-h", "--help"):
875 raise Usage(use_message)
876 if option == "--bowtie1":
877 self.bowtie2 = False
878 if option in ("-g", "--max-multihits"):
879 self.max_hits = int(value)
880 if option in ("-x", "--transcriptome-max-hits"):
881 self.t_max_hits = int(value)
882 if option in ("-G", "--GTF"):
883 self.gff_annotation = value
884 if option in ("-T", "--transcriptome"):
885 self.transcriptome_only = True
886 if option == "--transcriptome-index":
887 self.transcriptome_index = value
888 if option in("-M", "--prefilter-multihits"):
889 self.prefilter_multi = True
890 if option in ("-j", "--raw-juncs"):
891 self.raw_junctions = value
892 if option == "--no-novel-juncs":
893 self.find_novel_juncs = False
894 if option == "--no-novel-indels":
895 self.find_novel_indels = False
896 if option == "--fusion-search":
897 self.fusion_search = True
898 if option == "--fusion-anchor-length":
899 self.fusion_anchor_length = int(value)
900 if option == "--fusion-min-dist":
901 self.fusion_min_dist = int(value)
902 if option == "--fusion-read-mismatches":
903 self.fusion_read_mismatches = int(value)
904 if option == "--fusion-multireads":
905 self.fusion_multireads = int(value)
906 if option == "--fusion-multipairs":
907 self.fusion_multipairs = int(value)
908 if option == "--no-gtf-juncs":
909 self.find_GFF_juncs = False
910 #if option == "--skip-check-reads":
911 # self.skip_check_reads = True
912 if option == "--no-coverage-search":
913 self.coverage_search = False
914 if option == "--coverage-search":
915 self.coverage_search = True
916 if option == "--no-closure-search":
917 self.closure_search = False
918 if option == "--closure-search":
919 self.closure_search = True
920 if option == "--microexon-search":
921 self.microexon_search = True
922 if option == "--butterfly-search":
923 self.butterfly_search = True
924 if option == "--no-butterfly-search":
925 self.butterfly_search = False
926 if option == "--genome-read-mismatches":
927 self.genome_read_mismatches = int(value)
928 if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
929 self.max_read_mismatches = int(value)
930 if option in ("-n", "--transcriptome-mismatches"):
931 self.t_mismatches = int(value)
932 if option == "--segment-length":
933 self.segment_length = int(value)
934 if option == "--segment-mismatches":
935 self.segment_mismatches = int(value)
936 if option == "--bowtie-n":
937 self.bowtie_alignment_option = "-n"
938 if option == "--max-insertion-length":
939 self.max_insertion_length = int(value)
940 if option == "--max-deletion-length":
941 self.max_deletion_length = int(value)
942 if option == "--insertions":
943 self.raw_insertions = value
944 if option == "--deletions":
945 self.raw_deletions = value
946 if option in ("-o", "--output-dir"):
947 custom_out_dir = value + "/"
948 if option == "--tmp-dir":
949 custom_tmp_dir = value + "/"
950 if self.transcriptome_only:
951 self.find_novel_juncs=False
952 self.find_novel_indels=False
953 if custom_out_dir:
954 output_dir = custom_out_dir
955 logging_dir = output_dir + "logs/"
956 tmp_dir = output_dir + "tmp/"
957 sam_header = tmp_dir + "stub_header.sam"
958 if custom_tmp_dir:
959 tmp_dir = custom_tmp_dir
960 sam_header = tmp_dir + "stub_header.sam"
961 if len(args) < 2:
962 raise Usage(use_message)
963 return args
964
965
966 # check if a file exists and has non-zero (or minimum) size
967 def fileExists(filepath, minfsize=1):
968 if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
969 return True
970 else:
971 return False
972
973
974 def getFileDir(filepath):
975 #if fullpath given, returns path including the ending /
976 fpath, fname=os.path.split(filepath)
977 if fpath: fpath+='/'
978 return fpath
979
980 def getFileBaseName(filepath):
981 fpath, fname=os.path.split(filepath)
982 fbase, fext =os.path.splitext(fname)
983 fx=fext.lower()
984 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
985 return fbase
986 elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
987 fb, fext = os.path.splitext(fbase)
988 fx=fext.lower()
989 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
990 return fb
991 else:
992 return fbase
993 else:
994 if len(fbase)>0:
995 return fbase
996 else:
997 return fname
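# Examples (illustrative):
#   getFileBaseName("reads_1.fq")       -> "reads_1"
#   getFileBaseName("reads_1.fastq.gz") -> "reads_1"
#   getFileBaseName("sample.bz2")       -> "sample"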
998
999 # Returns the current time in a nice format
1000 def right_now():
1001 curr_time = datetime.now()
1002 return curr_time.strftime("%c")
1003
1004 # The TopHat logging formatter
1005 def th_log(out_str):
1006 print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1007
1008 # Ensures that the output, logging, and temp directories are present. If not,
1009 # they are created
1010 def prepare_output_dir():
1011
1012 th_log("Preparing output location "+output_dir)
1013 if os.path.exists(output_dir):
1014 pass
1015 else:
1016 os.mkdir(output_dir)
1017
1018 if os.path.exists(logging_dir):
1019 pass
1020 else:
1021 os.mkdir(logging_dir)
1022
1023 if os.path.exists(tmp_dir):
1024 pass
1025 else:
1026 try:
1027 os.makedirs(tmp_dir)
1028 except OSError, o:
1029 die("\nError creating directory %s (%s)" % (tmp_dir, o))
1030
1031
1032 # to be added as preexec_fn for every subprocess.Popen() call:
1033 # see http://bugs.python.org/issue1652
1034 def subprocess_setup():
1035 # Python installs a SIGPIPE handler by default, which causes
1036 # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
1037 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
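# For example, ZReader below starts its decompression pipes as:
#   subprocess.Popen(pipecmd, preexec_fn=subprocess_setup,
#                    stdin=self.fsrc, stdout=subprocess.PIPE)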
1038
1039 # Check that the Bowtie index specified by the user is present and all files
1040 # are there.
1041 def check_bowtie_index(idx_prefix):
1042 th_log("Checking for Bowtie index files")
1043
1044 idx_fwd_1 = idx_prefix + ".1.ebwt"
1045 idx_fwd_2 = idx_prefix + ".2.ebwt"
1046 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1047 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1048
1049 if os.path.exists(idx_fwd_1) and \
1050 os.path.exists(idx_fwd_2) and \
1051 os.path.exists(idx_rev_1) and \
1052 os.path.exists(idx_rev_2):
1053 return
1054 else:
1055 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1056 if bowtie_idx_env_var == None:
1057 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1058 idx_prefix = bowtie_idx_env_var + idx_prefix
1059
1060 idx_fwd_1 = idx_prefix + ".1.ebwt"
1061 idx_fwd_2 = idx_prefix + ".2.ebwt"
1062 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1063 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1064
1065 if os.path.exists(idx_fwd_1) and \
1066 os.path.exists(idx_fwd_2) and \
1067 os.path.exists(idx_rev_1) and \
1068 os.path.exists(idx_rev_2):
1069 return
1070 else:
1071 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1072
1073 # Reconstructs the multifasta file from which the Bowtie index was created, if
1074 # it's not already there.
1075 def bowtie_idx_to_fa(idx_prefix):
1076 idx_name = idx_prefix.split('/')[-1]
1077 th_log("Reconstituting reference FASTA file from Bowtie index")
1078
1079 try:
1080 tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1081 tmp_fasta_file = open(tmp_fasta_file_name, "w")
1082
1083 inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1084
1085 inspect_cmd = [prog_path("bowtie-inspect"),
1086 idx_prefix]
1087
1088 print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
1089 ret = subprocess.call(inspect_cmd,
1090 stdout=tmp_fasta_file,
1091 stderr=inspect_log)
1092
1093 # Bowtie reported an error
1094 if ret != 0:
1095 die(fail_str+"Error: bowtie-inspect returned an error")
1096
1097 # Bowtie not found
1098 except OSError, o:
1099 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1100 die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1101
1102 return tmp_fasta_file_name
1103
1104 # Checks whether the multifasta file for the genome is present alongside the
1105 # Bowtie index files for it.
1106 def check_fasta(idx_prefix):
1107 th_log("Checking for reference FASTA file")
1108 idx_fasta = idx_prefix + ".fa"
1109 if os.path.exists(idx_fasta):
1110 return idx_fasta
1111 else:
1112 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1113 if bowtie_idx_env_var:
1114 idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
1115 if os.path.exists(idx_fasta):
1116 return idx_fasta
1117
1118 print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
1119 idx_fa = bowtie_idx_to_fa(idx_prefix)
1120 return idx_fa
1121
1122 # Check that both the Bowtie index and the genome's fasta file are present
1123 def check_index(idx_prefix):
1124 check_bowtie_index(idx_prefix)
1125 ref_fasta_file = check_fasta(idx_prefix)
1126
1127 return (ref_fasta_file, None)
1128
1129 # Retrieve a tuple containing the system's version of Bowtie. Parsed from
1130 # `bowtie --version`
1131 def get_bowtie_version(is_bowtie2):
1132 try:
1133 # Launch Bowtie to capture its version info
1134 proc = subprocess.Popen([bowtie_path, "--version"],
1135 stdout=subprocess.PIPE)
1136
1137 stdout_value = proc.communicate()[0]
1138
1139 bowtie_version = None
1140 bowtie_out = repr(stdout_value)
1141
1142 # Find the version identifier
1143 if is_bowtie2:
1144 version_str = "bowtie2-align version "
1145 else:
1146 version_str = "bowtie version "
1147
1148 ver_str_idx = bowtie_out.find(version_str)
1149 if ver_str_idx != -1:
1150 nl = bowtie_out.find("\\n", ver_str_idx)
1151 version_val = bowtie_out[ver_str_idx + len(version_str):nl]
1152
1153 if is_bowtie2:
1154 ver_numbers = version_val.split('.')
1155 ver_numbers[2:] = ver_numbers[2].split('-')
1156 bowtie_version = [int(x) for x in ver_numbers[:3]] + ([int(ver_numbers[3][4:])] if len(ver_numbers) > 3 else [])
1157 else:
1158 bowtie_version = [int(x) for x in version_val.split('.')]
1159
1160 if len(bowtie_version) == 3:
1161 bowtie_version.append(0)
1162
1163 return bowtie_version
1164 except OSError, o:
1165 errmsg=fail_str+str(o)+"\n"
1166 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1167 errmsg+="Error: bowtie not found on this system"
1168 die(errmsg)
1169
1170 def get_index_sam_header(is_bowtie2, read_params, idx_prefix, sort = True):
1171 try:
1172 bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
1173 bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")
1174
1175 bowtie_header_cmd = [bowtie_path]
1176 if not is_bowtie2:
1177 bowtie_header_cmd += ["--sam"]
1178
1179 if read_params.color:
1180 bowtie_header_cmd.append('-C')
1181
1182 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1183 subprocess.call(bowtie_header_cmd,
1184 stdout=bowtie_sam_header_file,
1185 stderr=open('/dev/null', 'w'))
1186
1187 bowtie_sam_header_file.close()
1188
1189 if not sort:
1190 return bowtie_sam_header_filename
1191
1192 bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")
1193 sam_header_file = open(sam_header, "w")
1194
1195 preamble = []
1196 sq_dict_lines = []
1197
1198 for line in bowtie_sam_header_file.readlines():
1199 line = line.strip()
1200 if line.find("@SQ") != -1:
1201 # Sequence dictionary record
1202 cols = line.split('\t')
1203 seq_name = None
1204 for col in cols:
1205 fields = col.split(':')
1206 #print fields
1207 if len(fields) > 0 and fields[0] == "SN":
1208 seq_name = fields[1]
1209 if seq_name == None:
1210 die("Error: malformed sequence dictionary in sam header")
1211 sq_dict_lines.append([seq_name,line])
1212 elif line.find("CL"):
1213 continue
1214 else:
1215 preamble.append(line)
1216 #for line in preamble:
1217 # print >> sam_header_file, line
1218 print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1219 if read_params.read_group_id and read_params.sample_id:
1220 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1221 read_params.sample_id)
1222 if read_params.library_id:
1223 rg_str += "\tLB:%s" % read_params.library_id
1224 if read_params.description:
1225 rg_str += "\tDS:%s" % read_params.description
1226 if read_params.seq_platform_unit:
1227 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1228 if read_params.seq_center:
1229 rg_str += "\tCN:%s" % read_params.seq_center
1230 if read_params.mate_inner_dist:
1231 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1232 if read_params.seq_run_date:
1233 rg_str += "\tDT:%s" % read_params.seq_run_date
1234 if read_params.seq_platform:
1235 rg_str += "\tPL:%s" % read_params.seq_platform
1236
1237 print >> sam_header_file, rg_str
1238
1239 sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1240 for [name, line] in sq_dict_lines:
1241 print >> sam_header_file, line
1242 print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1243
1244 sam_header_file.close()
1245
1246 copy(sam_header, bowtie_sam_header_filename)
1247 return sam_header
1248
1249 except OSError, o:
1250 errmsg=fail_str+str(o)+"\n"
1251 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1252 errmsg+="Error: bowtie not found on this system"
1253 die(errmsg)
1254
1255 # Make sure Bowtie is installed and is recent enough to be useful
1256 def check_bowtie(is_bowtie2):
1257 log_msg = "Checking for Bowtie"
1258 if is_bowtie2:
1259 log_msg += "2"
1260 th_log(log_msg)
1261
1262 bowtie_bin = "bowtie"
1263 if is_bowtie2:
1264 bowtie_bin += "2"
1265
1266 global bowtie_path
1267 bowtie_path=prog_path(bowtie_bin)
1268 bowtie_version = get_bowtie_version(is_bowtie2)
1269 if bowtie_version == None:
1270 die("Error: Bowtie not found on this system")
1271 elif is_bowtie2:
1272 if bowtie_version < [2, 0, 0, 5]:
1273 die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1274 elif not is_bowtie2:
1275 if bowtie_version < [0, 12, 3, 0]:
1276 die("Error: TopHat requires Bowtie 0.12.3 or later")
1277 print >> sys.stderr, "\t version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1278
1279
1280 # Retrieve a tuple containing the system's version of samtools. Parsed from
1281 # `samtools`
1282 def get_samtools_version():
1283 try:
1284 # Launch samtools to capture its version info
1285 proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1286 samtools_out = proc.communicate()[1]
1287
1288 # Find the version identifier
1289 version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
1290 samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1291 if version_match.group(4):
1292 samtools_version_arr.append(version_match.group(4))
1293 else:
1294 samtools_version_arr.append(0)
1295
1296 return version_match.group(), samtools_version_arr
1297 except OSError, o:
1298 errmsg=fail_str+str(o)+"\n"
1299 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1300 errmsg+="Error: samtools not found on this system"
1301 die(errmsg)
1302
1303 # Make sure the SAM tools are installed and are recent enough to be useful
1304 def check_samtools():
1305 th_log("Checking for Samtools")
1306 global samtools_path
1307 samtools_path=prog_path("samtools")
1308 samtools_version_str, samtools_version_arr = get_samtools_version()
1309 if samtools_version_str == None:
1310 die("Error: Samtools not found on this system")
1311 elif samtools_version_arr < [0, 1, 7, 0]:
1312 die("Error: TopHat requires Samtools 0.1.7 or later")
1313 print >> sys.stderr, "\tSamtools %s" % samtools_version_str
1314
1315
1316 class FastxReader:
1317 def __init__(self, i_file, is_color=0, fname=''):
1318 self.bufline=None
1319 self.format=None
1320 self.ifile=i_file
1321 self.nextRecord=None
1322 self.eof=None
1323 self.fname=fname
1324 self.lastline=None
1325 self.numrecords=0
1326 self.isColor=0
1327 if is_color : self.isColor=1
1328 # determine file type
1329 #no records processed yet, skip custom header lines if any
1330 hlines=10 # allow maximum 10 header lines
1331 self.lastline=" "
1332 while hlines>0 and self.lastline and self.lastline[0] not in "@>" :
1333 self.lastline=self.ifile.readline()
1334 hlines-=1
1335 if self.lastline.startswith('@'):
1336 self.format='fastq'
1337 self.nextRecord=self.nextFastq
1338 elif self.lastline.startswith('>'):
1339 self.format='fasta'
1340 self.nextRecord=self.nextFasta
1341 else:
1342 die("Error: cannot determine record type in input file %s" % fname)
1343 self.bufline=self.lastline
1344 self.lastline=None
1345
1346 def nextFastq(self):
1347 # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1348 seqid,seqstr,qstr,seq_len='','','',0
1349 if self.eof: return (seqid, seqstr, seq_len, qstr)
1350 fline=self.getLine #shortcut to save a bit of time
1351 line=fline()
1352
1353 if not line : return (seqid, seqstr, seq_len, qstr)
1354 while len(line.rstrip())==0: # skip empty lines
1355 line=fline()
1356 if not line : return (seqid, seqstr,seq_len, qstr)
1357 try:
1358 if line[0] != "@":
1359 raise ValueError("Records in Fastq files should start with '@' character")
1360
1361 seqid = line[1:].rstrip()
1362 seqstr = fline().rstrip()
1363
1364 #There may now be more sequence lines, or the "+" quality marker line:
1365 while True:
1366 line = fline()
1367 if not line:
1368 raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1369 if line[0] == "+":
1370 # -- sequence string ended
1371 #qtitle = line[1:].rstrip()
1372 #if qtitle and qtitle != seqid:
1373 # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1374 # % (seqid, qtitle))
1375 break
1376 seqstr += line.rstrip() #removes trailing newlines
1377 #loop until + found
1378 seq_len = len(seqstr)
1379 #at least one line of quality data should follow
1380 qstrlen=0
1381 #now read next lines as quality values until seq_len is reached
1382 while True:
1383 line=fline()
1384 if not line : break #end of file
1385 qstr += line.rstrip()
1386 qstrlen=len(qstr)
1387 if qstrlen + self.isColor >= seq_len :
1388 break # qv string has reached the length of seq string
1389 #loop until qv has the same length as seq
1390
1391 if self.isColor:
1392 # and qstrlen==seq_len :
1393 if qstrlen==seq_len:
1394 #qual string may have a dummy qv at the beginning, should be stripped
1395 qstr = qstr[1:]
1396 qstrlen -= 1
1397 if qstrlen!=seq_len-1:
1398 raise ValueError("Length mismatch between sequence and quality strings "+ \
1399 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1400 else:
1401 if seq_len != qstrlen :
1402 raise ValueError("Length mismatch between sequence and quality strings "+ \
1403 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1404 except ValueError, err:
1405 die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1406 #return the record
1407 self.numrecords+=1
1408 ##--discard the primer base [NO]
1409 if self.isColor :
1410 seq_len-=1
1411 seqstr = seqstr[1:]
1412 return (seqid, seqstr, seq_len, qstr)
1413
1414 def nextFasta(self):
1415 # returning tuple: (seqID, sequence_string, seq_len)
1416 seqid,seqstr,seq_len='','',0
1417 fline=self.getLine # shortcut to readline function of f
1418 line=fline() # this will use the buffer line if it's there
1419 if not line : return (seqid, seqstr, seq_len, None)
1420 while len(line.rstrip())==0: # skip empty lines
1421 line=fline()
1422 if not line : return (seqid, seqstr, seq_len, None)
1423 try:
1424 if line[0] != ">":
1425 raise ValueError("Records in Fasta files must start with '>' character")
1426 seqid = line[1:].split()[0]
1427 #more sequence lines, or the ">" quality marker line:
1428 while True:
1429 line = fline()
1430 if not line: break
1431 if line[0] == '>':
1432 #next sequence starts here
1433 self.ungetLine()
1434 break
1435 seqstr += line.rstrip()
1436 #loop until '>' found
1437 seq_len = len(seqstr)
1438 if seq_len < 3:
1439 raise ValueError("Read %s too short (%i)." \
1440 % (seqid, seq_len))
1441 except ValueError, err:
1442 die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1443 #return the record and continue
1444 self.numrecords+=1
1445 if self.isColor : # -- discard primer base
1446 seq_len-=1
1447 seqstr=seqstr[1:]
1448 return (seqid, seqstr, seq_len, None)
1449
1450 def getLine(self):
1451 if self.bufline: #return previously buffered line
1452 r=self.bufline
1453 self.bufline=None
1454 return r
1455 else: #read a new line from stream and return it
1456 if self.eof: return None
1457 self.lastline=self.ifile.readline()
1458 if not self.lastline:
1459 self.eof=1
1460 return None
1461 return self.lastline
1462 def ungetLine(self):
1463 if self.lastline is None:
1464 print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
1465 self.bufline=self.lastline
1466 self.lastline=None
1467 #< class FastxReader
1468
1469 def fa_write(fhandle, seq_id, seq):
1470 """
1471 Write to a file in the FASTA format.
1472
1473 Arguments:
1474 - `fhandle`: A file handle open for writing
1475 - `seq_id`: The sequence id string for this sequence
1476 - `seq`: An unformatted string of the sequence to write
1477 """
1478 line_len = 60
1479 fhandle.write(">" + seq_id + "\n")
1480 for i in xrange((len(seq) + line_len - 1) // line_len):
1481 start = i * line_len
1482 #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq)
1483 if (i+1) * line_len < len(seq):
1484 end = (i+1) * line_len
1485 else:
1486 end = len(seq)
1487 fhandle.write( seq[ start:end ] + "\n")
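# A small usage sketch (hypothetical file name, 240bp example sequence):
#   fh = open("example.fa", "w")
#   fa_write(fh, "chr_test", "ACGT" * 60)   # wrapped at 60 bases per line
#   fh.close()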
1488
1489 class ZReader:
1490 def __init__(self, filename, sysparams, guess=True):
1491 self.fname=filename
1492 self.file=None
1493 self.fsrc=None
1494 self.popen=None
1495 pipecmd=[]
1496 s=filename.lower()
1497 if s.endswith(".bam"):
1498 # daehwan - want to output mapped reads as well
1499 pipecmd=[prog_path("bam2fastx"), "--all", "-"]
1500 else:
1501 if guess:
1502 if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1503 pipecmd=['gzip']
1504 else:
1505 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1506 pipecmd=['bzip2']
1507 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1508 die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
1509 if len(pipecmd)>0:
1510 if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1511 pipecmd[0]=sysparams.zipper
1512 pipecmd.extend(sysparams.zipper_opts)
1513 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1514 pipecmd[0]=sysparams.zipper
1515 pipecmd.extend(sysparams.zipper_opts)
1516 else: #not guessing, but must still check if it's a compressed file
1517 if use_zpacker and filename.endswith(".z"):
1518 pipecmd=[sysparams.zipper]
1519 pipecmd.extend(sysparams.zipper_opts)
1520
1521 if pipecmd:
1522 pipecmd+=['-cd']
1523 if pipecmd:
1524 try:
1525 self.fsrc=open(self.fname, 'rb')
1526 self.popen=subprocess.Popen(pipecmd,
1527 preexec_fn=subprocess_setup,
1528 stdin=self.fsrc,
1529 stdout=subprocess.PIPE, close_fds=True)
1530 except Exception:
1531 die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1532 self.file=self.popen.stdout
1533 else:
1534 self.file=open(filename)
1535 def close(self):
1536 if self.fsrc: self.fsrc.close()
1537 self.file.close()
1538 if self.popen:
1539 self.popen.wait()
1540 self.popen=None
1541
1542 class ZWriter:
1543 def __init__(self, filename, sysparams):
1544 self.fname=filename
1545 if use_zpacker:
1546 pipecmd=[sysparams.zipper,"-cf", "-"]
1547 self.ftarget=open(filename, "wb")
1548 try:
1549 self.popen=subprocess.Popen(pipecmd,
1550 preexec_fn=subprocess_setup,
1551 stdin=subprocess.PIPE,
1552 stdout=self.ftarget, close_fds=True)
1553 except Exception:
1554 die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
1555 self.file=self.popen.stdin # client writes to this end of the pipe
1556 else: #no compression
1557 self.file=open(filename, "w")
1558 self.ftarget=None
1559 self.popen=None
1560 def close(self):
1561 self.file.close()
1562 if self.ftarget: self.ftarget.close()
1563 if self.popen:
1564 self.popen.wait() #! required to actually flush the pipes (eek!)
1565 self.popen=None
1566
1567 # check_reads_format() examines the first few records in the user files
1568 # to determine the file format
1569 def check_reads_format(params, reads_files):
1570 #seed_len = params.read_params.seed_length
1571 fileformat = params.read_params.reads_format
1572
1573 observed_formats = set([])
1574 # observed_scales = set([])
1575 min_seed_len = 99999
1576 max_seed_len = 0
1577 files = reads_files.split(',')
1578
1579 for f_name in files:
1580 #try:
1581 zf = ZReader(f_name, params.system_params)
1582 #except IOError:
1583 # die("Error: could not open file "+f_name)
1584 freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1585 toread=4 #just sample the first 4 reads
1586 while toread>0:
1587 seqid, seqstr, seq_len, qstr = freader.nextRecord()
1588 if not seqid: break
1589 toread-=1
1590 if seq_len < 20:
1591 print >> sys.stderr, "Warning: found a read < 20bp in", f_name
1592 else:
1593 min_seed_len = min(seq_len, min_seed_len)
1594 max_seed_len = max(seq_len, max_seed_len)
1595 zf.close()
1596 observed_formats.add(freader.format)
1597 if len(observed_formats) > 1:
1598 die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1599 fileformat=list(observed_formats)[0]
1600 #if seed_len != None:
1601 # seed_len = max(seed_len, max_seed_len)
1602 #else:
1603 # seed_len = max_seed_len
1604 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1605 print >> sys.stderr, "\tformat:\t\t %s" % fileformat
1606 if fileformat == "fastq":
1607 quality_scale = "phred33 (default)"
1608 if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1609 quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1610 elif params.read_params.phred64_quals:
1611 quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1612 print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
1613 elif fileformat == "fasta":
1614 if params.read_params.color:
1615 params.read_params.integer_quals = True
1616
1617 #print seed_len, format, solexa_scale
1618 #NOTE: seed_len will be re-evaluated later by prep_reads
1619 return TopHatParams.ReadParams(params.read_params.solexa_quals,
1620 params.read_params.phred64_quals,
1621 params.read_params.quals,
1622 params.read_params.integer_quals,
1623 params.read_params.color,
1624 params.read_params.color_out,
1625 params.read_params.library_type,
1626 #seed_len,
1627 params.read_params.seed_length,
1628 fileformat,
1629 params.read_params.mate_inner_dist,
1630 params.read_params.mate_inner_dist_std_dev,
1631 params.read_params.read_group_id,
1632 params.read_params.sample_id,
1633 params.read_params.library_id,
1634 params.read_params.description,
1635 params.read_params.seq_platform_unit,
1636 params.read_params.seq_center,
1637 params.read_params.seq_run_date,
1638 params.read_params.seq_platform)
1639
1640 def log_tail(logfile, lines=1):
1641 f=open(logfile, "r")
1642 f.seek(0, 2)
1643 fbytes= f.tell()
1644 size=lines
1645 block=-1
1646 while size > 0 and fbytes+block*1024 > 0:
1647 if (fbytes+block*1024 > 0):
1648 ##Seek back once more, if possible
1649 f.seek( block*1024, 2 )
1650 else:
1651 #Seek to the beginning
1652 f.seek(0, 0)
1653 data= f.read( 1024 )
1654 linesFound= data.count('\n')
1655 size -= linesFound
1656 block -= 1
1657 if (fbytes + block*1024 > 0):
1658 f.seek(block*1024, 2)
1659 else:
1660 f.seek(0,0)
1661 f.readline() # find a newline
1662 lastBlocks= list( f.readlines() )
1663 f.close()
1664 return "\n".join(lastBlocks[-lines:])
1665
1666 # Format a timedelta as a pretty string.
1667 # (days are included in the output only when the interval spans days)
1668 def formatTD(td):
1669 days = td.days
1670 hours = td.seconds // 3600
1671 minutes = (td.seconds % 3600) // 60
1672 seconds = td.seconds % 60
1673
1674 if days > 0:
1675 return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
1676 else:
1677 return '%02d:%02d:%02d' % (hours, minutes, seconds)
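# A quick sanity example (values chosen for illustration): a timedelta of
# 3723 seconds has hours == 1, minutes == 2, seconds == 3, so formatTD()
# returns '01:02:03'; with td.days == 2 it would return '2 days 01:02:03'.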
1678
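# The .info file parsed by PrepReadsInfo below is a small key=value report
# written by prep_reads; only the text after the last '=' on each line is
# actually used. A hypothetical example (key names illustrative, not verbatim):
#   min_read_len=50
#   max_read_len=75
#   reads_in=20000000
#   reads_out=19978456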
1679 class PrepReadsInfo:
1680 def __init__(self, fname, side):
1681 try:
1682 f=open(fname,"r")
1683 self.min_len=int(f.readline().split("=")[-1])
1684 self.max_len=int(f.readline().split("=")[-1])
1685 self.in_count=int(f.readline().split("=")[-1])
1686 self.out_count=int(f.readline().split("=")[-1])
1687 if (self.out_count==0) or (self.max_len<16):
1688 raise Exception()
1689 except Exception, e:
1690 die(fail_str+"Error retrieving prep_reads info.")
1691 print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1692 if self.min_len < 20:
1693 print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places"
1694
1695
1696 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
1697 #generate a prep_reads cmd arguments
1698 filter_cmd = [prog_path("prep_reads")]
1699 filter_cmd.extend(params.cmd())
1700
1701 if params.read_params.reads_format == "fastq":
1702 filter_cmd += ["--fastq"]
1703 elif params.read_params.reads_format == "fasta":
1704 filter_cmd += ["--fasta"]
1705 if hits_to_filter:
1706 filter_cmd += ["--flt-hits="+hits_to_filter]
1707 if aux_file:
1708 filter_cmd += ["--aux-outfile="+aux_file]
1709 if index_file:
1710 filter_cmd += ["--index-outfile="+index_file]
1711 if filter_reads:
1712 filter_cmd += ["--flt-reads="+filter_reads]
1713 filter_cmd.append(reads_list)
1714 if quals_list:
1715 filter_cmd.append(quals_list)
1716 return filter_cmd
1717
1718 # Calls the prep_reads executable, which prepares an internal read library.
1719 # The read library features reads with monotonically increasing integer IDs.
1720 # prep_reads also filters out very low-complexity or garbage reads, as well as
1721 # polyA reads.
1722 #--> returns the tuple (internal_read_library_file, read_info)
1723 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1724 reads_suffix = ".fq"
1725
1726 # for parallelization, we don't compress the read files
1727 do_use_zpacker = use_zpacker
1728 if params.system_params.num_threads > 1:
1729 do_use_zpacker = False
1730
1731 if do_use_zpacker: reads_suffix += ".z"
1732
1733 output_name=side+"_kept_reads"
1734 kept_reads_filename = tmp_dir + output_name + reads_suffix
1735
1736 if os.path.exists(kept_reads_filename):
1737 os.remove(kept_reads_filename)
1738 kept_reads = open(kept_reads_filename, "wb")
1739 log_fname=logging_dir + "prep_reads.log"
1740 filter_log = open(log_fname,"w")
1741
1742 info_file=output_dir+output_name+".info"
1743 index_file=kept_reads_filename+".index"
1744
1745 filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
1746 shell_cmd = ' '.join(filter_cmd)
1747 #finally, add the compression pipe
1748 zip_cmd=[]
1749 if do_use_zpacker:
1750 zip_cmd=[ params.system_params.zipper ]
1751 zip_cmd.extend(params.system_params.zipper_opts)
1752 zip_cmd.extend(['-c','-'])
1753 shell_cmd +=' | '+' '.join(zip_cmd)
1754 shell_cmd += ' >' +kept_reads_filename
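# With the default gzip zpacker, shell_cmd now reads roughly as follows
# (paths hypothetical, shown only to sketch the composed pipeline):
#   prep_reads [options] reads_1.fq,reads_2.fq | gzip -c - > tmp/left_kept_reads.fq.z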
1755 retcode=0
1756 try:
1757 print >> run_log, shell_cmd
1758 if do_use_zpacker:
1759 filter_proc = subprocess.Popen(filter_cmd,
1760 stdout=subprocess.PIPE,
1761 stderr=filter_log)
1762 zip_proc=subprocess.Popen(zip_cmd,
1763 preexec_fn=subprocess_setup,
1764 stdin=filter_proc.stdout,
1765 stdout=kept_reads)
1766 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1767 zip_proc.communicate()
1768 retcode=filter_proc.poll()
1769 if retcode==0:
1770 retcode=zip_proc.poll()
1771 else:
1772 retcode = subprocess.call(filter_cmd,
1773 stdout=kept_reads,
1774 stderr=filter_log)
1775 if retcode:
1776 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1777
1778 except OSError, o:
1779 errmsg=fail_str+str(o)
1780 die(errmsg+"\n"+log_tail(log_fname))
1781
1782 kept_reads.close()
1783 return kept_reads_filename, PrepReadsInfo(info_file, side)
1784
1785
1786 # Call bowtie
1787 def bowtie(params,
1788 bwt_idx_prefix,
1789 reads_list,
1790 reads_format,
1791 num_mismatches,
1792 mapped_reads,
1793 unmapped_reads,
1794 extra_output = "",
1795 mapping_type = "read_against_genome",
1796 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1797 start_time = datetime.now()
1798 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1799 reads_file=reads_list[0]
1800 readfile_basename=getFileBaseName(reads_file)
1801
1802 g_mapping, t_mapping, seg_mapping = False, False, False
1803 if mapping_type == "read_against_transcriptome":
1804 t_mapping = True
1805 elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
1806 seg_mapping = True
1807 else:
1808 g_mapping = True
1809
1810 bowtie_str = "Bowtie"
1811 if params.bowtie2:
1812 bowtie_str += "2"
1813
1814 if seg_mapping:
1815 # we multiply max_hits by 2 when mapping segments
1816 # this multiplication factor (2) is also assumed in segment_juncs and long_spanning_reads,
1817 # which means "2" should not be changed without modifying those sub-programs.
1818 params.max_hits *= 2
1819 if not params.bowtie2:
1820 backup_bowtie_alignment_option = params.bowtie_alignment_option
1821 params.bowtie_alignment_option = "-v"
1822
1823 if t_mapping:
1824 print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with %s %s" % (start_time.strftime("%c"),
1825 readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1826 else:
1827 print >> sys.stderr, "[%s] Mapping %s against %s with %s %s" % (start_time.strftime("%c"),
1828 readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1829 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1830 #bwt_mapped=mapped_reads
1831 unmapped_reads_out=None
1832 if unmapped_reads:
1833 unmapped_reads_out=unmapped_reads+".fq"
1834 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1835 if use_FIFO:
1836 global unmapped_reads_fifo
1837 unmapped_reads_fifo=unmapped_reads+".fifo"
1838 if os.path.exists(unmapped_reads_fifo):
1839 os.remove(unmapped_reads_fifo)
1840 try:
1841 os.mkfifo(unmapped_reads_fifo)
1842 except OSError, o:
1843 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1844 unmapped_reads_out+=".z"
1845
1846 # Launch Bowtie
1847 try:
1848 bowtie_cmd = [bowtie_path]
1849 if reads_format == "fastq":
1850 bowtie_cmd += ["-q"]
1851 elif reads_format == "fasta":
1852 bowtie_cmd += ["-f"]
1853
1854 if params.read_params.color:
1855 bowtie_cmd += ["-C", "--col-keepends"]
1856 if unmapped_reads:
1857 bowtie_cmd += ["--un", unmapped_reads_out]
1858
1859 unzip_cmd=None
1860 bam_input=False
1861 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1862 bam_input=True
1863 unzip_cmd=[ prog_path('bam2fastx') ]
1864 if reads_format:
1865 unzip_cmd.append("--" + reads_format)
1866 unzip_cmd+=[reads_file] # bam2fastx takes a single BAM file (reads_file == reads_list[0])
1867
1868 if use_zpacker and (unzip_cmd is None):
1869 unzip_cmd=[ params.system_params.zipper ]
1870 unzip_cmd.extend(params.system_params.zipper_opts)
1871 unzip_cmd+=['-cd']
1872
1873 fifo_pid=None
1874 if use_FIFO:
1875 unm_zipcmd=[ params.system_params.zipper ]
1876 unm_zipcmd.extend(params.system_params.zipper_opts)
1877 unm_zipcmd+=['-c']
1878 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1879 fifo_pid=os.fork()
1880 if fifo_pid==0:
1881 def on_sig_exit(sig, func=None):
1882 os._exit(os.EX_OK)
1883 signal.signal(signal.SIGTERM, on_sig_exit)
1884 subprocess.call(unm_zipcmd,
1885 stdin=open(unmapped_reads_fifo, "r"),
1886 stdout=open(unmapped_reads_out, "wb"))
1887 os._exit(os.EX_OK)
1888
1889 fix_map_cmd = [prog_path('fix_map_ordering')]
1890
1891 if params.bowtie2:
1892 if t_mapping or g_mapping:
1893 max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1894 max_penalty, min_penalty = int(max_penalty), int(min_penalty)
1895 min_score = (max_penalty - 1) * num_mismatches
1896 fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1897
1898 samzip_cmd=None
1899
1900 # daehwan - just use bam format for multi-threading.
1901 if unmapped_reads or params.system_params.num_threads >= 1:
1902 #mapping on the genome
1903 #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1904 mapped_reads += ".bam"
1905
1906 temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1907 fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1908 if not params.read_params.color and unmapped_reads:
1909 unmapped_reads_out = unmapped_reads+".bam"
1910 fix_map_cmd += [unmapped_reads_out]
1911 else:
1912 #mapping on segment_juncs (spliced potential junctions)
1913 # don't use SAM headers, could be slow -- better use compressed SAM directly
1914 mapped_reads += ".sam"
1915 fix_map_cmd += ["-"]
1916 if use_zpacker:
1917 fix_map_cmd += ["-"]
1918 samzip_cmd=[ params.system_params.zipper ]
1919 samzip_cmd.extend(params.system_params.zipper_opts)
1920 samzip_cmd += ["-c"]
1921 mapped_reads += ".z"
1922 else:
1923 fix_map_cmd += [mapped_reads]
1924
1925 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1926
1927 max_hits = params.max_hits
1928 if t_mapping:
1929 max_hits = params.t_max_hits
1930 if num_mismatches > 3:
1931 num_mismatches = 3
1932
1933 if params.bowtie2:
1934 if seg_mapping:
1935 # since bowtie2 does not suppress reads that map to many places,
1936 # we suppress those in segment_juncs and long_spanning_reads.
1937 bowtie_cmd += ["-k", str(max_hits + 1)]
1938 else:
1939 bowtie_cmd += ["-k", str(max_hits)]
1940
1941 bowtie2_params = params.bowtie2_params
1942 if seg_mapping:
1943 # after intensive testing, the following parameters seem to run
1944 # faster than Bowtie1 while being just as sensitive as Bowtie1,
1945 # but there is still room for further improvement.
1946 bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1947 bowtie_cmd += ["-i", "C,2,0"]
1948 bowtie_cmd += ["-L", "20"]
1949 # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
1950 else:
1951 bowtie2_preset = ""
1952 if bowtie2_params.very_fast:
1953 bowtie2_preset = "--very-fast"
1954 elif bowtie2_params.fast:
1955 bowtie2_preset = "--fast"
1956 elif bowtie2_params.sensitive:
1957 bowtie2_preset = "--sensitive"
1958 elif bowtie2_params.very_sensitive:
1959 bowtie2_preset = "--very-sensitive"
1960
1961 if bowtie2_preset != "":
1962 bowtie_cmd += [bowtie2_preset]
1963 else:
1964 bowtie_cmd += ["-D", str(bowtie2_params.D),
1965 "-R", str(bowtie2_params.R),
1966 "-N", str(bowtie2_params.N),
1967 "-L", str(bowtie2_params.L),
1968 "-i", bowtie2_params.i]
1969
1970 # "--n-ceil" is not correctly parsed in Bowtie2,
1971 # I (daehwan) already talked to Ben who will fix the problem.
1972 bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
1973 "--gbar", str(bowtie2_params.gbar),
1974 "--mp", bowtie2_params.mp,
1975 "--np", str(bowtie2_params.np),
1976 "--rdg", bowtie2_params.rdg,
1977 "--rfg", bowtie2_params.rfg,
1978 "--score-min", bowtie2_params.score_min]
1979
1980 else:
1981 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1982 "-k", str(max_hits),
1983 "-m", str(max_hits),
1984 "-S"]
1985
1986 bowtie_cmd += ["-p", str(params.system_params.num_threads)]
1987
1988 if params.bowtie2: #always use headerless SAM file
1989 bowtie_cmd += ["--sam-no-hd"]
1990 else:
1991 bowtie_cmd += ["--sam-nohead"]
1992
1993 if not params.bowtie2:
1994 if multihits_out:
1995 bowtie_cmd += ["--max", multihits_out]
1996 else:
1997 bowtie_cmd += ["--max", "/dev/null"]
1998
1999 if params.bowtie2:
2000 bowtie_cmd += ["-x"]
2001
2002 bowtie_cmd += [ bwt_idx_prefix ]
2003 bowtie_proc=None
2004 shellcmd=""
2005 unzip_proc=None
2006 samzip_proc=None
2007 if multihits_out:
2008 #special prefilter bowtie run: we use prep_reads on the fly
2009 #in order to get multi-mapped reads to exclude later
2010 prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2011 preplog_fname=logging_dir + "prep_reads_prefilter.log"
2012 prepfilter_log = open(preplog_fname,"w")
2013 unzip_proc = subprocess.Popen(prep_cmd,
2014 stdout=subprocess.PIPE,
2015 stderr=prepfilter_log)
2016 shellcmd=' '.join(prep_cmd) + "|"
2017 else:
2018 z_input=use_zpacker and reads_file.endswith(".z")
2019 if z_input:
2020 unzip_proc = subprocess.Popen(unzip_cmd,
2021 stdin=open(reads_file, "rb"),
2022 stdout=subprocess.PIPE)
2023 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2024 else:
2025 #must be uncompressed fastq input (unmapped reads from a previous run)
2026 #or a BAM file with unmapped reads
2027 if bam_input:
2028 unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
2029 shellcmd=' '.join(unzip_cmd) + "|"
2030 bowtie_cmd += [reads_file]
2031 bowtie_proc = subprocess.Popen(bowtie_cmd,
2032 stdout=subprocess.PIPE,
2033 stderr=bwt_log)
2034 if unzip_proc:
2035 #input is compressed OR prep_reads is used as a filter
2036 bowtie_cmd += ['-']
2037 bowtie_proc = subprocess.Popen(bowtie_cmd,
2038 stdin=unzip_proc.stdout,
2039 stdout=subprocess.PIPE,
2040 stderr=bwt_log)
2041 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2042
2043 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2044 if samzip_cmd:
2045 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2046 fix_order_proc = subprocess.Popen(fix_map_cmd,
2047 stdin=bowtie_proc.stdout,
2048 stdout=subprocess.PIPE)
2049 bowtie_proc.stdout.close() # needed?
2050 samzip_proc = subprocess.Popen(samzip_cmd,
2051 preexec_fn=subprocess_setup,
2052 stdin=fix_order_proc.stdout,
2053 stdout=open(mapped_reads, "wb"))
2054 fix_order_proc.stdout.close()
2055 else:
2056 #write BAM output directly
2057 fix_order_proc = subprocess.Popen(fix_map_cmd,
2058 stdin=bowtie_proc.stdout)
2059 # stdout=open(mapped_reads, "w"))
2060 bowtie_proc.stdout.close()
2061
2062 # shellcmd += " > " + mapped_reads
2063 print >> run_log, shellcmd
2064 if samzip_proc:
2065 samzip_proc.communicate()
2066 else:
2067 fix_order_proc.communicate()
2068 if use_FIFO:
2069 if fifo_pid and not os.path.exists(unmapped_reads_out):
2070 try:
2071 os.kill(fifo_pid, signal.SIGTERM)
2072 except:
2073 pass
2074
2075 except OSError, o:
2076 die(fail_str+"Error: "+str(o))
2077
2078 # Success
2079 #finish_time = datetime.now()
2080 #duration = finish_time - start_time
2081 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2082 if use_FIFO:
2083 try:
2084 os.remove(unmapped_reads_fifo)
2085 except:
2086 pass
2087 if multihits_out and not os.path.exists(multihits_out):
2088 open(multihits_out, "w").close()
2089
2090 if seg_mapping:
2091 params.max_hits /= 2
2092 if not params.bowtie2:
2093 params.bowtie_alignment_option = backup_bowtie_alignment_option
2094
2095 return (mapped_reads, unmapped_reads_out)
2096
2097
2098 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2099 def get_gtf_juncs(gff_annotation):
2100 th_log("Reading known junctions from GTF file")
2101 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2102
2103 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2104
2105 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2106 gtf_juncs_out = open(gtf_juncs_out_name, "w")
2107
2108 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2109 try:
2110 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2111 retcode = subprocess.call(gtf_juncs_cmd,
2112 stderr=gtf_juncs_log,
2113 stdout=gtf_juncs_out)
2114 # gtf_juncs returned an error
2115 if retcode == 1:
2116 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
2117 return (False, gtf_juncs_out_name)
2118 elif retcode != 0:
2119 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2120
2121 # gtf_juncs not found
2122 except OSError, o:
2123 errmsg=fail_str+str(o)+"\n"
2124 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2125 errmsg+="Error: gtf_juncs not found on this system"
2126 die(errmsg)
2127 return (True, gtf_juncs_out_name)
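# The .juncs file returned above lists one junction per line in TopHat's
# raw-juncs convention: <chrom> <left> <right> <+/->, tab-separated, with
# zero-based coordinates, e.g. (values illustrative): chr1  14828  14969  +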
2128
2129 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
2130 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2131 th_log("Indexing splices")
2132 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2133
2134 #user_splices_out_prefix = output_dir + "user_splices_idx"
2135
2136 if is_bowtie2:
2137 bowtie_build_cmd = [prog_path("bowtie2-build")]
2138 else:
2139 bowtie_build_cmd = [prog_path("bowtie-build")]
2140
2141 if color:
2142 bowtie_build_cmd += ["-C"]
2143
2144 bowtie_build_cmd += [external_splice_prefix + ".fa",
2145 external_splice_prefix]
2146 try:
2147 print >> run_log, " ".join(bowtie_build_cmd)
2148 retcode = subprocess.call(bowtie_build_cmd,
2149 stdout=bowtie_build_log)
2150
2151 if retcode != 0:
2152 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2153 except OSError, o:
2154 errmsg=fail_str+str(o)+"\n"
2155 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2156 errmsg+="Error: bowtie-build not found on this system"
2157 die(errmsg)
2158 return external_splice_prefix
2159
2160 # Build a splice index from a .juncs file, suitable for use with specified read
2161 # (or read segment) lengths
2162 def build_juncs_index(is_bowtie2,
2163 min_anchor_length,
2164 max_seg_len,
2165 juncs_prefix,
2166 external_juncs,
2167 external_insertions,
2168 external_deletions,
2169 external_fusions,
2170 reference_fasta,
2171 color):
2172 th_log("Retrieving sequences for splices")
2173
2174 juncs_file_list = ",".join(external_juncs)
2175 insertions_file_list = ",".join(external_insertions)
2176 deletions_file_list = ",".join(external_deletions)
2177 fusions_file_list = ",".join(external_fusions)
2178
2179 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2180
2181 external_splices_out_prefix = tmp_dir + juncs_prefix
2182 external_splices_out_name = external_splices_out_prefix + ".fa"
2183
2184 external_splices_out = open(external_splices_out_name, "w")
2185 # juncs_db_cmd = [bin_dir + "juncs_db",
2186 juncs_db_cmd = [prog_path("juncs_db"),
2187 str(min_anchor_length),
2188 str(max_seg_len),
2189 juncs_file_list,
2190 insertions_file_list,
2191 deletions_file_list,
2192 fusions_file_list,
2193 reference_fasta]
2194 try:
2195 print >> run_log, " ".join(juncs_db_cmd)
2196 retcode = subprocess.call(juncs_db_cmd,
2197 stderr=juncs_db_log,
2198 stdout=external_splices_out)
2199
2200 if retcode != 0:
2201 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2202 # juncs_db not found
2203 except OSError, o:
2204 errmsg=fail_str+str(o)+"\n"
2205 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2206 errmsg+="Error: juncs_db not found on this system"
2207 die(errmsg)
2208
2209 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2210 return external_splices_out_prefix
2211
2212 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2213 """ Build a bowtie index from a FASTA file.
2214
2215 Arguments:
2216 - `fasta_fname`: File path to FASTA file.
2217 - `out_dir`: Output directory to place index in. (includes os.sep)
2218
2219 Returns:
2220 - The path to the Bowtie index.
2221 """
2222 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2223
2224 if is_bowtie2:
2225 bowtie_idx_cmd = [prog_path("bowtie2-build")]
2226 else:
2227 bowtie_idx_cmd = [prog_path("bowtie-build")]
2228
2229 if color:
2230 bowtie_idx_cmd += ["-C"]
2231
2232 bowtie_idx_cmd += [fasta_fname,
2233 bwt_idx_path]
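# at this point the command is fully assembled, e.g. (names illustrative):
#   bowtie2-build tmp/transcripts.fa tmp/transcripts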
2234 try:
2235 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2236 print >> run_log, " ".join(bowtie_idx_cmd)
2237 retcode = subprocess.call(bowtie_idx_cmd,
2238 stdout=open(os.devnull, "w"),
2239 stderr=open(os.devnull, "w"))
2240 if retcode != 0:
2241 die(fail_str + "Error: Couldn't build bowtie index with err = "
2242 + str(retcode))
2243 except OSError, o:
2244 errmsg=fail_str+str(o)+"\n"
2245 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2246 errmsg+="Error: bowtie-build not found on this system"
2247 die(errmsg)
2248
2249 return bwt_idx_path
2250
2251 # Print out the sam header, embedding the user's specified library properties.
2252 # FIXME: also needs SQ dictionary lines
2253 def write_sam_header(read_params, sam_file):
2254 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2255 if read_params.read_group_id and read_params.sample_id:
2256 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2257 read_params.sample_id)
2258 if read_params.library_id:
2259 rg_str += "\tLB:%s" % read_params.library_id
2260 if read_params.description:
2261 rg_str += "\tDS:%s" % read_params.description
2262 if read_params.seq_platform_unit:
2263 rg_str += "\tPU:%s" % read_params.seq_platform_unit
2264 if read_params.seq_center:
2265 rg_str += "\tCN:%s" % read_params.seq_center
2266 if read_params.mate_inner_dist:
2267 rg_str += "\tPI:%s" % read_params.mate_inner_dist
2268 if read_params.seq_run_date:
2269 rg_str += "\tDT:%s" % read_params.seq_run_date
2270 if read_params.seq_platform:
2271 rg_str += "\tPL:%s" % read_params.seq_platform
2272
2273 print >> sam_file, rg_str
2274 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
2275
2276 # Write final TopHat output, via tophat_reports and wiggles
2277 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2278 th_log("Reporting output tracks")
2279 left_maps, right_maps = mappings
2280 left_reads, right_reads = readfiles
2281 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2282 left_maps = ','.join(left_maps)
2283
2284 if len(right_maps) > 0:
2285 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2286 right_maps = ','.join(right_maps)
2287 log_fname = logging_dir + "reports.log"
2288 report_log = open(log_fname, "w")
2289 junctions = output_dir + "junctions.bed"
2290 insertions = output_dir + "insertions.bed"
2291 deletions = output_dir + "deletions.bed"
2292 accepted_hits = output_dir + "accepted_hits"
2293 report_cmdpath = prog_path("tophat_reports")
2294 fusions = output_dir + "fusions.out"
2295 report_cmd = [report_cmdpath]
2296
2297 alignments_output_filename = tmp_dir + "accepted_hits"
2298
2299 report_cmd.extend(params.cmd())
2300 report_cmd.extend(["--samtools="+samtools_path])
2301 report_cmd.extend([ref_fasta,
2302 junctions,
2303 insertions,
2304 deletions,
2305 fusions,
2306 alignments_output_filename,
2307 left_maps,
2308 left_reads])
2309
2310 if len(right_maps) > 0 and right_reads:
2311 report_cmd.append(right_maps)
2312 report_cmd.append(right_reads)
2313
2314 # -- tophat_reports now produces an (uncompressed) BAM stream
2315 try:
2316 if params.report_params.convert_bam:
2317 if params.report_params.sort_bam:
2318 print >> run_log, " ".join(report_cmd)
2319 report_proc=subprocess.call(report_cmd,
2320 preexec_fn=subprocess_setup,
2321 stderr=report_log)
2322
2323 bam_parts = []
2324 for i in range(params.system_params.num_threads):
2325 bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2326 if os.path.exists(bam_part_filename):
2327 bam_parts.append(bam_part_filename)
2328 else:
2329 break
2330
2331 num_bam_parts = len(bam_parts)
2332 pids = [0 for i in range(num_bam_parts)]
2333 sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2334 #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2335 #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2336 for i in range(num_bam_parts):
2337 bamsort_cmd = [samtools_path,
2338 "sort",
2339 bam_parts[i],
2340 sorted_bam_parts[i]]
2341
2342 sorted_bam_parts[i] += ".bam"
2343 print >> run_log, " ".join(bamsort_cmd)
2344 pid = os.fork()
2345 if pid == 0:
2346 subprocess.call(bamsort_cmd,
2347 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2348 os._exit(os.EX_OK)
2349 else:
2350 pids[i] = pid
2351
2352 for i in range(len(pids)):
2353 if pids[i] > 0:
2354 os.waitpid(pids[i], 0)
2355 pids[i] = 0
2356
2357 # for bam_part in bam_parts:
2358 # os.remove(bam_part)
2359
2360 if num_bam_parts > 1:
2361 bammerge_cmd = [samtools_path,
2362 "merge",
2363 "-h",
2364 sam_header_filename,
2365 "%s.bam" % accepted_hits]
2366 bammerge_cmd += sorted_bam_parts
2367
2368 print >> run_log, " ".join(bammerge_cmd)
2369 subprocess.call(bammerge_cmd,
2370 stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2371
2372 for sorted_bam_part in sorted_bam_parts:
2373 os.remove(sorted_bam_part)
2374 else:
2375 os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2376
2377 else:
2378 print >> run_log, " ".join(report_cmd)
2379 report_proc=subprocess.call(report_cmd,
2380 preexec_fn=subprocess_setup,
2381 stdout=open(accepted_hits,"wb"),
2382 stderr=report_log)
2383 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2384 else:
2385 print >> run_log, " ".join(report_cmd)
2386 report_proc=subprocess.call(report_cmd,
2387 preexec_fn=subprocess_setup,
2388 stdout=open(accepted_hits,"wb"),
2389 stderr=report_log)
2390 tmp_sam = output_dir + "accepted_hits.sam"
2391
2392 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2393 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2394 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2395 tmp_sam_file = open(tmp_sam, "w")
2396 ret = subprocess.call(bam_to_sam_cmd,
2397 stdout=tmp_sam_file,
2398 stderr=bam_to_sam_log)
2399 tmp_sam_file.close()
2400 os.remove(accepted_hits)
2401
2402 except OSError, o:
2403 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2404
2405 return junctions
2406
2407
2408 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2409 # for each segment. This function needs to be fixed to support mixed read-length
2410 # inputs
2411 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
2412 i = num_files_prev + 1
2413 while i <= num_files:
2414 segfname=prefix+("_seg%d" % i)+extension
2415 out_segf.append(ZWriter(segfname,params.system_params))
2416 i += 1
2417
2418 def split_reads(reads_filename,
2419 prefix,
2420 fasta,
2421 params,
2422 segment_length):
2423 #reads_file = open(reads_filename)
2424 zreads = ZReader(reads_filename, params.system_params, False)
2425 out_segfiles = []
2426
2427 if fasta:
2428 extension = ".fa"
2429 else:
2430 extension = ".fq"
2431 if use_zpacker: extension += ".z"
2432 def convert_color_to_bp(color_seq):
2433 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2434 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2435 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2436 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2437 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2438 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2439
2440 base = color_seq[0]
2441 bp_seq = base
2442 for ch in color_seq[1:]:
2443 base = decode_dic[base+ch]
2444 bp_seq += base
2445 return bp_seq
2446
2447 def convert_bp_to_color(bp_seq):
2448 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2449 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2450 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2451 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2452 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2453 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2454 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2455 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2456 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2457
2458 base = bp_seq[0]
2459 color_seq = base
2460 for ch in bp_seq[1:]:
2461 color_seq += encode_dic[base + ch]
2462 base = ch
2463
2464 return color_seq
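# Round-trip example derived from the two translation tables above:
#   convert_color_to_bp("A0123") -> "AACTA"
#   convert_bp_to_color("AACTA") -> "A0123"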
2465
2466 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2467 if color:
2468 color_offset = 1
2469 read_seq_temp = convert_color_to_bp(read_seq)
2470
2471 seg_num = 1
2472 while seg_num + 1 < len(offsets):
2473 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2474 return
2475 seg_num += 1
2476 else:
2477 color_offset = 0
2478
2479 seg_num = 0
2480 last_seq_offset = 0
2481 while seg_num + 1 < len(offsets):
2482 f = out_segf[seg_num].file
2483 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2484 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2485 if color:
2486 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2487 else:
2488 print >> f, seg_seq
2489 if not fasta:
2490 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2491 print >> f, "+"
2492 print >> f, seg_qual
2493 seg_num += 1
2494 last_seq_offset = offsets[seg_num]
2495
2496 line_state = 0
2497 read_name = ""
2498 read_seq = ""
2499 read_quals = ""
2500 num_segments = 0
2501 offsets = []
2502 for line in zreads.file:
2503 if line.strip() == "":
2504 continue
2505 if line_state == 0:
2506 read_name = line.strip()
2507 elif line_state == 1:
2508 read_seq = line.strip()
2509
2510 read_length = len(read_seq)
2511 tmp_num_segments = read_length / segment_length
2512 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2513
2514 # Bowtie's minimum read length here is 20bp, so if the last segment
2515 # is between 20 and segment_length bp long, go ahead and write it out
2516 if read_length % segment_length >= min(segment_length - 2, 20):
2517 offsets.append(read_length)
2518 tmp_num_segments += 1
2519 else:
2520 offsets[-1] = read_length
2521
2522 if tmp_num_segments == 1:
2523 offsets = [0, read_length]
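# Worked example: read_length == 60 and segment_length == 25 give
# tmp_num_segments == 2 and offsets [0, 25, 50]; the 10bp remainder is
# below min(23, 20) == 20, so the last offset is stretched to 60 and the
# read is split into one 25bp and one 35bp segment.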
2524
2525 if tmp_num_segments > num_segments:
2526 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2527 num_segments = tmp_num_segments
2528
2529 if fasta:
2530 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2531 elif line_state == 2:
2532 line = line.strip()
2533 else:
2534 read_quals = line.strip()
2535 if not fasta:
2536 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2537
2538 line_state += 1
2539 if fasta:
2540 line_state %= 2
2541 else:
2542 line_state %= 4
2543 zreads.close()
2544 out_fnames=[]
2545 for zf in out_segfiles:
2546 zf.close()
2547 out_fnames.append(zf.fname)
2548 #return [o.fname for o in out_segfiles]
2549 return out_fnames
2550
2551 # Find possible splice junctions using the "closure search" strategy, and report
2552 # them in closure.juncs. Calls the executable closure_juncs
2553 def junctions_from_closures(params,
2554 left_maps,
2555 right_maps,
2556 ref_fasta):
2557 th_log("Searching for junctions via mate-pair closures")
2558
2559
2560 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2561 #if len(maps) == 0:
2562 # return None
2563 #print >> sys.stderr, left_maps
2564 slash = left_maps[0].rfind('/')
2565 juncs_out = ""
2566 if slash != -1:
2567 juncs_out += left_maps[0][:slash+1]
2568 fusions_out = juncs_out
2569
2570 juncs_out += "closure.juncs"
2571 fusions_out += "closure.fusions"
2572
2573 juncs_log = open(logging_dir + "closure.log", "w")
2574 juncs_cmdpath=prog_path("closure_juncs")
2575 juncs_cmd = [juncs_cmdpath]
2576
2577 left_maps = ','.join(left_maps)
2578 right_maps = ','.join(right_maps)
2579
2580 juncs_cmd.extend(params.cmd())
2581 juncs_cmd.extend([juncs_out,
2582 fusions_out,
2583 ref_fasta,
2584 left_maps,
2585 right_maps])
2586 try:
2587 print >> run_log, ' '.join(juncs_cmd)
2588 retcode = subprocess.call(juncs_cmd,
2589 stderr=juncs_log)
2590
2591 # closure_juncs returned an error
2592 if retcode != 0:
2593 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2594 # closure_juncs not found
2595 except OSError, o:
2596 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2597 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2598 die(str(o))
2599 return [juncs_out]
2600
2601 # Find possible junctions by examining coverage and split segments in the initial
2602 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2603 # segment.insertions, and segment.deletions. Calls the executable
2604 # segment_juncs
2605 def junctions_from_segments(params,
2606 left_reads,
2607 left_reads_map,
2608 left_seg_maps,
2609 right_reads,
2610 right_reads_map,
2611 right_seg_maps,
2612 unmapped_reads,
2613 reads_format,
2614 ref_fasta):
2615 if left_reads_map != left_seg_maps[0]:
2616 th_log("Searching for junctions via segment mapping")
2617 out_path=getFileDir(left_seg_maps[0])
2618 juncs_out=out_path+"segment.juncs"
2619 insertions_out=out_path+"segment.insertions"
2620 deletions_out =out_path+"segment.deletions"
2621 fusions_out = out_path+"segment.fusions"
2622
2623 left_maps = ','.join(left_seg_maps)
2624 log_fname = logging_dir + "segment_juncs.log"
2625 segj_log = open(log_fname, "w")
2626 segj_cmd = [prog_path("segment_juncs")]
2627
2628 segj_cmd.extend(params.cmd())
2629 segj_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2630 ref_fasta,
2631 juncs_out,
2632 insertions_out,
2633 deletions_out,
2634 fusions_out,
2635 left_reads,
2636 left_reads_map,
2637 left_maps])
2638 if right_seg_maps:
2639 right_maps = ','.join(right_seg_maps)
2640 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2641 try:
2642 print >> run_log, " ".join(segj_cmd)
2643 retcode = subprocess.call(segj_cmd,
2644 preexec_fn=subprocess_setup,
2645 stderr=segj_log)
2646
2647 # segment_juncs returned an error
2648 if retcode != 0:
2649 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2650
2651 # segment_juncs not found
2652 except OSError, o:
2653 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2654 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2655 die(str(o))
2656
2657 return [juncs_out, insertions_out, deletions_out, fusions_out]
2658
2659 # Joins mapped segments into full-length read alignments via the executable
2660 # long_spanning_reads
2661 def join_mapped_segments(params,
2662 sam_header_filename,
2663 reads,
2664 ref_fasta,
2665 possible_juncs,
2666 possible_insertions,
2667 possible_deletions,
2668 possible_fusions,
2669 contig_seg_maps,
2670 spliced_seg_maps,
2671 alignments_out_name):
2672 rn=""
2673 if len(contig_seg_maps)>1:
2674 th_log("Joining segment hits")
2675 rn=".segs"
2676 else:
2677 th_log("Processing bowtie hits")
2678 contig_seg_maps = ','.join(contig_seg_maps)
2679
2680 possible_juncs = ','.join(possible_juncs)
2681 possible_insertions = ",".join(possible_insertions)
2682 possible_deletions = ",".join(possible_deletions)
2683 possible_fusions = ",".join(possible_fusions)
2684
2685 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2686 align_log = open(log_fname, "w")
2687 align_cmd = [prog_path("long_spanning_reads")]
2688
2689 align_cmd.extend(params.cmd())
2690
2691 align_cmd.append(ref_fasta)
2692 align_cmd.extend([reads,
2693 possible_juncs,
2694 possible_insertions,
2695 possible_deletions,
2696 possible_fusions,
2697 alignments_out_name,
2698 contig_seg_maps])
2699
2700 if spliced_seg_maps:
2701 spliced_seg_maps = ','.join(spliced_seg_maps)
2702 align_cmd.append(spliced_seg_maps)
2703
2704 try:
2705 print >> run_log, " ".join(align_cmd)
2706 join_proc=subprocess.call(align_cmd,
2707 stderr=align_log)
2708
2709 except OSError, o:
2710 die(fail_str+"Error: "+str(o))
2711
2712 # This class collects spliced and unspliced alignments for each of the
2713 # left and right read files provided by the user.
2714 class Maps:
2715 def __init__(self,
2716 unspliced_sam,
2717 seg_maps,
2718 unmapped_segs,
2719 segs):
2720 self.unspliced_sam = unspliced_sam
2721 self.seg_maps = seg_maps
2722 self.unmapped_segs = unmapped_segs
2723 self.segs = segs
2724
2725 # Map2GTF stuff
2726 def m2g_convert_coords(params, gtf_fname, reads, out_fname):
2727 """ajjkljlks
2728
2729 Arguments:
2730 - `params`: TopHat parameters
2731 - `gtf_fname`: File name pointing to the annotation.
2732 - `reads`: The alignments to convert (Bowtie map or BAM file).
2733 - `out_fname`: The file name pointing to the output.
2734 """
2735 m2g_cmd = [prog_path("map2gtf")]
2736 m2g_cmd.extend(params.cmd())
2737 m2g_cmd.append(gtf_fname)
2738 m2g_cmd.append(reads) #could be BAM file
2739 m2g_cmd.append(out_fname)
2740 fbasename = getFileBaseName(reads)
2741 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2742 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2743
2744 try:
2745 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2746 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2747 ret = subprocess.call(m2g_cmd,
2748 stdout=open(m2g_log, "w"),
2749 stderr=open(m2g_err, "w"))
2750 if ret != 0:
2751 die(fail_str + " Error: map2gtf returned an error")
2752 except OSError, o:
2753 err_msg = fail_str + str(o)
2754 die(err_msg + "\n")
2755
2756
2757 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2758 """ Make the transcriptome from a GTF.
2759
2760 Arguments:
2761 - `trans_gtf`: GTF/GFF file with the transcripts to extract.
2762 - `genome`: reference genome FASTA file.
2763 - `out_fname`: output FASTA file for the transcript sequences.
2764 """
2765 # run the gtf_to_fasta binary to extract the transcript sequences
2766 g2f_cmd = [prog_path("gtf_to_fasta")]
2767 g2f_cmd.extend(params.cmd())
2768 g2f_cmd.append(trans_gtf)
2769 g2f_cmd.append(genome)
2770 g2f_cmd.append(out_fname)
2771
2772 g2f_log = logging_dir + "g2f.out"
2773 g2f_err = logging_dir + "g2f.err"
2774
2775 try:
2776 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2777 ret = subprocess.call(g2f_cmd,
2778 stdout = open(g2f_log, "w"),
2779 stderr = open(g2f_err, "w"))
2780 if ret != 0:
2781 die(fail_str + " Error: gtf_to_fasta returned an error.")
2782 except OSError, o:
2783 err_msg = fail_str + str(o)
2784 die(err_msg + "\n")
2785
2786 def map2gtf(params, ref_fasta, left_reads, right_reads):
2787 """ Main GTF mapping function
2788
2789 Arguments:
2790 - `params`: The TopHat parameters.
2791 - `ref_fasta`: The reference genome.
2792 - `left_reads`: A list of reads.
2793 - `right_reads`: A list of reads (empty if single-end).
2794
2795 """
2796 test_input_file(params.gff_annotation)
2797
2798 # th_log("Reading in GTF file: " + params.gff_annotation)
2799 # transcripts = gtf_to_transcripts(params.gff_annotation)
2800
2801 gtf_name = getFileBaseName(params.gff_annotation)
2802 m2g_bwt_idx = None
2803 t_out_dir = tmp_dir
2804 if params.transcriptome_index and not params.transcriptome_outdir:
2805 m2g_bwt_idx = params.transcriptome_index
2806 th_log("Using pre-built transcriptome index..")
2807 else:
2808 th_log("Creating transcriptome data files..")
2809 if params.transcriptome_outdir:
2810 t_out_dir=params.transcriptome_outdir+"/"
2811 m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
2812 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2813 # trans_hash = transcripts_to_fasta(transcripts,
2814 # ref_fasta,
2815 # m2g_ref_fasta)
2816 m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
2817 mapped_gtf_list = []
2818 unmapped_gtf_list = []
2819 # do the initial mapping in GTF coordinates
2820 for reads in [left_reads, right_reads]:
2821 if reads == None or os.path.getsize(reads) < 25 :
2822 continue
2823 fbasename = getFileBaseName(reads)
2824 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
2825 #if use_zpacker:
2826 # mapped_gtf_out+=".z"
2827
2828 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
2829 #if use_BWT_FIFO:
2830 # unmapped_gtf += ".z"
2831
2832 (mapped_gtf_map, unmapped) = bowtie(params,
2833 m2g_bwt_idx,
2834 [reads],
2835 "fastq",
2836 params.t_mismatches,
2837 mapped_gtf_out,
2838 unmapped_gtf,
2839 "", "read_against_transcriptome")
2840 mapped_gtf_list.append(mapped_gtf_map)
2841 unmapped_gtf_list.append(unmapped)
2842
2843 sam_gtf_list = []
2844 for reads in mapped_gtf_list:
2845 fbasename = getFileBaseName(reads)
2846 sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2847 m2g_convert_coords(params,
2848 params.gff_annotation,
2849 reads,
2850 sam_out_fname)
2851 sam_gtf_list.append(sam_out_fname)
2852
2853 if len(sam_gtf_list) < 2:
2854 sam_gtf_list.append(None)
2855 unmapped_gtf_list.append(None)
2856
2857 return (sam_gtf_list, unmapped_gtf_list)
2858 # end Map2GTF
2859
2860 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
2861 ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
2862 ##
2863 #this is accomplished by a special prep_reads usage (triggered by --flt-hits)
2864
2865 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
2866 out_mappings,
2867 target_reads,
2868 params.preflt_data[ri].mappings)
2869 um_reads = open(out_unmapped, "wb")
2870 log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
2871 filter_log = open(log_fname,"w")
2872
2873 shell_cmd = " ".join(prep_cmd)
2874 #add the compression pipe
2875 zip_cmd=[]
2876 if use_zpacker:
2877 zip_cmd=[ params.system_params.zipper ]
2878 zip_cmd.extend(params.system_params.zipper_opts)
2879 zip_cmd.extend(['-c','-'])
2880 shell_cmd +=' | '+' '.join(zip_cmd)
2881 shell_cmd += ' >' + out_unmapped
2882 retcode=0
2883 try:
2884 print >> run_log, shell_cmd
2885 if use_zpacker:
2886 prep_proc = subprocess.Popen(prep_cmd,
2887 stdout=subprocess.PIPE,
2888 stderr=filter_log)
2889 zip_proc = subprocess.Popen(zip_cmd,
2890 preexec_fn=subprocess_setup,
2891 stdin=prep_proc.stdout,
2892 stdout=um_reads)
2893 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
2894 zip_proc.communicate()
2895 retcode=prep_proc.poll()
2896 if retcode==0:
2897 retcode=zip_proc.poll()
2898 else:
2899 retcode = subprocess.call(prep_cmd,
2900 stdout=um_reads,
2901 stderr=filter_log)
2902 if retcode:
2903 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2904
2905 except OSError, o:
2906 errmsg=fail_str+str(o)
2907 die(errmsg+"\n"+log_tail(log_fname))
2908 um_reads.close()
2909
2910 return (out_mappings, out_unmapped)
2911
2912
2913 # The main alignment routine of TopHat. This function executes most of the
2914 # workflow, producing a set of candidate alignments for each cDNA fragment in
2915 # a pair of SAM alignment files (for paired-end reads).
2916 def spliced_alignment(params,
2917 bwt_idx_prefix,
2918 sam_header_filename,
2919 ref_fasta,
2920 read_len,
2921 segment_len,
2922 prepared_reads,
2923 user_supplied_junctions,
2924 user_supplied_insertions,
2925 user_supplied_deletions):
2926
2927 possible_juncs = []
2928 possible_juncs.extend(user_supplied_junctions)
2929
2930 possible_insertions = []
2931 possible_insertions.extend(user_supplied_insertions)
2932 possible_deletions = []
2933 possible_deletions.extend(user_supplied_deletions)
2934 possible_fusions = []
2935
2936 left_reads, right_reads = prepared_reads
2937
2938 maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads mapping data
2939
2940 #single_segments = False
2941
2942 # Before anything, map the reads using Map2GTF (if using annotation)
2943 m2g_maps = [ None, None ] # left, right
2944 initial_reads = [ left_reads, right_reads ]
2945
2946 if params.gff_annotation:
2947 (mapped_gtf_list, unmapped_gtf_list) = \
2948 map2gtf(params, ref_fasta, left_reads, right_reads)
2949
2950 m2g_left_maps, m2g_right_maps = mapped_gtf_list
2951 m2g_maps = [m2g_left_maps, m2g_right_maps]
2952 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
2953 # The case where the user doesn't want to map to anything other
2954 # than the transcriptome OR we have no unmapped reads
2955 maps[0] = [m2g_left_maps]
2956 if right_reads:
2957 maps[1] = [m2g_right_maps]
2958 return maps
2959 # Feed the unmapped reads into spliced_alignment()
2960 initial_reads = unmapped_gtf_list[:]
2961 th_log("Resuming TopHat pipeline with unmapped reads")
2962
2963 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
2964 #to be extracted into segment_juncs.fa
2965
2966 num_segs = int(read_len / segment_len)
2967 if (read_len % segment_len) >= min(segment_len-2, 20):
2968 #remainder is shorter but long enough to become a new segment
2969 num_segs += 1
2970 else:
2971 # the last segment is longer
2972 if num_segs>1: max_seg_len += (read_len % segment_len)
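# e.g. (illustrative): read_len 75 with segment_len 25 gives num_segs == 3;
# read_len 60 with segment_len 25 leaves a 10bp remainder (< 20), so
# num_segs stays 2 and max_seg_len grows to 35.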
2973
2974 if num_segs <= 1:
2975 print >> sys.stderr, "Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2976
2977 # Using the num_segs value returned by check_reads(),
2978 # decide which junction discovery strategy to use
2979 if num_segs < 3:
2980 #if params.butterfly_search != False:
2981 # params.butterfly_search = True
2982 if params.coverage_search != False:
2983 params.coverage_search = True
2984 if num_segs == 1:
2985 segment_len = read_len
2986 else: #num_segs >= 3:
2987 # if we have at least three segments, just use split segment search,
2988 # which is the most sensitive and specific, fastest, and lightest-weight;
2989 # so unless specifically requested, disable the other junction searches
2990 if params.closure_search != True:
2991 params.closure_search = False
2992 if params.coverage_search != True:
2993 params.coverage_search = False
2994 if params.butterfly_search != True:
2995 params.butterfly_search = False
2996
2997 # Perform the first part of the TopHat work flow on the left and right
2998 # reads of paired ends separately - we'll use the pairing information later
2999 have_left_IUM=False
3000 for ri in (0,1):
3001 reads=initial_reads[ri]
3002 if reads == None or not fileExists(reads,25):
3003 continue
3004 fbasename=getFileBaseName(reads)
3005 unspliced_out = tmp_dir + fbasename + ".mapped"
3006 unspliced_sam = None
3007 unmapped_reads = None
3008 #if use_zpacker: unspliced_out+=".z"
3009 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3010 if params.prefilter_multi:
3011 #unmapped_unspliced += ".z"
3012 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3013 else:
3014 # Perform the initial Bowtie mapping of the full length reads
3015 (unspliced_sam, unmapped_reads) = bowtie(params,
3016 bwt_idx_prefix,
3017 [reads],
3018 "fastq",
3019 params.genome_read_mismatches,
3020 unspliced_out,
3021 unmapped_unspliced,
3022 "",
3023 "read_against_genome")
3024
3025 # daehwan - check this out
3026 # params.bowtie2 = False
3027 # check_bowtie(params.bowtie2)
3028
3029 seg_maps = []
3030 unmapped_segs = []
3031 segs = []
3032 have_IUM = fileExists(unmapped_reads,25)
3033 if ri==0 and have_IUM:
3034 have_left_IUM = True
3035 if num_segs > 1 and have_IUM:
3036 # split up the IUM reads into segments
3037 # unmapped_reads can be in BAM format
3038 read_segments = split_reads(unmapped_reads,
3039 tmp_dir + fbasename,
3040 False,
3041 params,
3042 segment_len)
3043
3044 # Map each segment file independently with Bowtie
3045 for i in range(len(read_segments)):
3046 seg = read_segments[i]
3047 fbasename=getFileBaseName(seg)
3048 seg_out = tmp_dir + fbasename + ".bwtout"
3049 if use_zpacker: seg_out += ".z"
3050 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3051 if use_BWT_FIFO:
3052 unmapped_seg += ".z"
3053 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3054 (seg_map, unmapped) = bowtie(params,
3055 bwt_idx_prefix,
3056 [seg],
3057 "fastq",
3058 params.segment_mismatches,
3059 seg_out,
3060 unmapped_seg,
3061 extra_output,
3062 "seg_against_genome")
3063 seg_maps.append(seg_map)
3064 unmapped_segs.append(unmapped)
3065 segs.append(seg)
3066
3067 # Collect the segment maps for left and right reads together
3068 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3069 else:
3070 # if there's only one segment, just collect the initial map as the only
3071 # map to be used downstream for coverage-based junction discovery
3072 read_segments = [reads]
3073 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3074
3075 # XXX: At this point, if using M2G, we have three sets of reads:
3076 # mapped to transcriptome, mapped to genome, and unmapped (potentially
3077 # spliced or poly-A tails) - hp
3078
3079 # Unless the user asked not to discover new junctions, start that process
3080 # here
3081 left_reads_map = maps[0].unspliced_sam
3082 left_seg_maps = maps[0].seg_maps
3083 unmapped_reads = maps[0].unmapped_segs
3084 if right_reads:
3085 right_reads_map = maps[1].unspliced_sam
3086 right_seg_maps = maps[1].seg_maps
3087 unmapped_reads.extend(maps[1].unmapped_segs)
3088 else:
3089 right_reads_map = None
3090 right_seg_maps = None
3091
3092 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3093 # Call segment_juncs to infer a list of possible splice junctions from
3094 # the regions of the genome covered in the initial and segment maps
3095 #if params.find_novel_juncs:
3096 #TODO: in m2g case, we might want to pass the m2g mappings as well,
3097 # or perhaps the GTF file directly
3098 # -> this could improve alternative junction detection?
3099 juncs = junctions_from_segments(params,
3100 left_reads,
3101 left_reads_map,
3102 left_seg_maps,
3103 right_reads,
3104 right_reads_map,
3105 right_seg_maps,
3106 unmapped_reads,
3107 "fastq",
3108 ref_fasta)
3109 if os.path.getsize(juncs[0]) != 0:
3110 possible_juncs.append(juncs[0])
3111 if params.find_novel_indels:
3112 if os.path.getsize(juncs[1]) != 0:
3113 possible_insertions.append(juncs[1])
3114 if os.path.getsize(juncs[2]) != 0:
3115 possible_deletions.append(juncs[2])
3116 if params.find_novel_fusions:
3117 if os.path.getsize(juncs[3]) != 0:
3118 possible_fusions.append(juncs[3])
3119 # Optionally, and for paired reads only, use a closure search to
3120 # discover additional junctions
3121 if params.closure_search and left_reads and right_reads:
3122 juncs = junctions_from_closures(params,
3123 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
3124 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
3125 ref_fasta)
3126 if os.path.getsize(juncs[0]) != 0:
3127 possible_juncs.extend(juncs)
3128
3129 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3130 spliced_seg_maps = None
3131 junc_idx_prefix = None
3132 else:
3133 junc_idx_prefix = "segment_juncs"
3134 if len(possible_insertions) == 0:
3135 possible_insertions.append(os.devnull)
3136 # print >> sys.stderr, "Warning: insertions database is empty!"
3137 if len(possible_deletions) == 0:
3138 possible_deletions.append(os.devnull)
3139 # print >> sys.stderr, "Warning: deletions database is empty!"
3140 if len(possible_juncs) == 0:
3141 possible_juncs.append(os.devnull)
3142 print >> sys.stderr, "Warning: junction database is empty!"
3143 if len(possible_fusions) == 0:
3144 possible_fusions.append(os.devnull)
3145 if junc_idx_prefix:
3146 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3147 3,
3148 max_seg_len,
3149 junc_idx_prefix,
3150 possible_juncs,
3151 possible_insertions,
3152 possible_deletions,
3153 possible_fusions,
3154 ref_fasta,
3155 params.read_params.color)
3156
3157 juncs_bwt_samheader = get_index_sam_header(params.bowtie2, params.read_params, juncs_bwt_idx, False)
3158
3159 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3160 # index with Bowtie
3161 # for reads in [left_reads, right_reads]:
3162 for ri in (0,1):
3163 reads = initial_reads[ri]
3164 spliced_seg_maps = []
3165
3166 if reads == None or os.path.getsize(reads)<25:
3167 continue
3168
3169 m2g_map = m2g_maps[ri]
3170
3171 if junc_idx_prefix :
3172 i = 0
3173 for seg in maps[ri].segs:
3174 #search each segment
3175 fsegname = getFileBaseName(seg)
3176 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
3177 if use_zpacker: seg_out += ".z"
3178 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3179 (seg_map, unmapped) = bowtie(params,
3180 tmp_dir + junc_idx_prefix,
3181 [seg],
3182 "fastq",
3183 params.segment_mismatches,
3184 seg_out,
3185 None,
3186 extra_output,
3187 "seg_against_junctions")
3188 spliced_seg_maps.append(seg_map)
3189 i += 1
3190
3191 # Join the contiguous and spliced segment hits into full length
3192 # read alignments
3193 rfname=getFileBaseName(reads)
3194 rfdir=getFileDir(reads)
3195 mapped_reads = rfdir + rfname + ".candidates.bam"
3196 # -- spliced mappings built from all segment mappings vs genome and junc_db
3197 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3198 #unspl_bwtfile = maps[ri].unspliced_bwt
3199 unspl_samfile = maps[ri].unspliced_sam
3200 join_mapped_segments(params,
3201 sam_header_filename,
3202 reads,
3203 ref_fasta,
3204 possible_juncs,
3205 possible_insertions,
3206 possible_deletions,
3207 possible_fusions,
3208 maps[ri].seg_maps,
3209 spliced_seg_maps,
3210 mapped_reads)
3211
3212 if num_segs > 1 or m2g_map:
3213 # Merge the spliced and unspliced full length alignments into
3214 # a single SAM file.
3215 # The individual SAM files are all already sorted in
3216 # increasing read ID order.
3217 # NOTE: We also should be able to address bug #134 here, by replacing
3218 # contiguous alignments that poke into an intron by a small amount by
3219 # the correct spliced alignment.
3220
3221 try:
3222 merge_cmd = [ prog_path("bam_merge"), "--index-outfile", merged_map + ".index",
3223 merged_map ]
3224
3225 if num_segs > 1:
3226 merge_cmd += [ maps[ri].unspliced_sam ]
3227 if m2g_map:
3228 merge_cmd += [ m2g_map ]
3229
3230 if os.path.exists(mapped_reads):
3231 merge_cmd += [ mapped_reads ]
3232 else:
3233 for bam_i in range(0, params.system_params.num_threads):
3234 temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3235 if os.path.exists(temp_bam):
3236 merge_cmd += [ temp_bam ]
3237 else:
3238 break
3239
3240 print >> run_log, " ".join(merge_cmd)
3241 ret = subprocess.call( merge_cmd,
3242 stderr=open(logging_dir + "sam_merge.log", "w") )
3243 if ret != 0:
3244 die(fail_str+"Error executing: "+" ".join(merge_cmd))
3245 except OSError, o:
3246 die(fail_str+"Error: "+str(o))
3247 maps[ri] = [merged_map]
3248
3249 if not params.system_params.keep_tmp:
3250 os.remove(mapped_reads)
3251 else:
3252 # no segments or transcriptome mappings, so no merge is needed
3253 # because join_mapped_segments() produced the final BAM already
3254 os.rename(mapped_reads, merged_map)
3255 maps[ri] = [merged_map]
3256 if not params.system_params.keep_tmp:
3257 os.remove(unspl_samfile)
3258
3259 return maps
3260
3261 def die(msg=None):
3262 if msg is not None:
3263 print >> sys.stderr, msg
3264 sys.exit(1)
3265
3266 # rough equivalent of the 'which' command to find external programs
3267 # (current script path is tested first, then PATH envvar)
3268 def which(program):
3269 def is_executable(fpath):
3270 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
3271 fpath, fname = os.path.split(program)
3272 if fpath:
3273 if is_executable(program):
3274 return program
3275 else:
3276 progpath = os.path.join(bin_dir, program)
3277 if is_executable(progpath):
3278 return progpath
3279 for path in os.environ["PATH"].split(os.pathsep):
3280 progpath = os.path.join(path, program)
3281 if is_executable(progpath):
3282 return progpath
3283 return None
3284
3285 def prog_path(program):
3286 progpath=which(program)
3287 if progpath == None:
3288 die("Error locating program: "+program)
3289 return progpath
3290
3291 # FIXME: this should get set during the make dist autotools phase of the build
3292 def get_version():
3293 return "2.0.0"
3294
3295 def mlog(msg):
3296 print >> sys.stderr, "[DBGLOG]:"+msg
3297
3298 def test_input_file(filename):
3299 try:
3300 test_file = open(filename, "r")
3301 except IOError:
3302 die("Error opening file %s" % filename)
3303 return
3304
3305 def main(argv=None):
3306 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
3307
3308 # Initialize default parameter values
3309 params = TopHatParams()
3310 try:
3311 if argv is None:
3312 argv = sys.argv
3313 args = params.parse_options(argv)
3314 params.check()
3315
3316 bwt_idx_prefix = args[0]
3317 left_reads_list = args[1]
3318 left_quals_list, right_quals_list = [], []
3319 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
3320 if params.read_params.mate_inner_dist == None:
3321 params.read_params.mate_inner_dist = 50
3322 #die("Error: you must set the mean inner distance between mates with -r")
3323
3324 right_reads_list = args[2]
3325 if params.read_params.quals:
3326 left_quals_list = args[3]
3327 right_quals_list = args[4]
3328 else:
3329 right_reads_list = None
3330 if params.read_params.quals:
3331 left_quals_list = args[2]
3332
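# --- Note (derived from the argument parsing above) ---
# Accepted positional layouts:
#   no -Q, single-end : <bwt_idx> <left_reads>
#   no -Q, paired-end : <bwt_idx> <left_reads> <right_reads>
#   -Q,    single-end : <bwt_idx> <left_reads> <left_quals>
#   -Q,    paired-end : <bwt_idx> <left_reads> <right_reads> <left_quals> <right_quals>
# Paired-end input is detected from the argument count alone, and a
# default mate inner distance of 50 is assumed when -r was not given.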
3333 print >> sys.stderr
3334 th_log("Beginning TopHat run (v"+get_version()+")")
3335 print >> sys.stderr, "-----------------------------------------------"
3336
3337 start_time = datetime.now()
3338 prepare_output_dir()
3339
3340 global run_log
3341 run_log = open(logging_dir + "run.log", "w", 0)
3342 global run_cmd
3343 run_cmd = " ".join(argv)
3344 print >> run_log, run_cmd
3345
3346 # Validate all the input files, check all prereqs before committing
3347 # to the run
3348 if params.gff_annotation:
3349 if not os.path.exists(params.gff_annotation):
3350 die("Error: cannot find transcript file %s" % params.gff_annotation)
3351 if os.path.getsize(params.gff_annotation) < 10:
3352 die("Error: invalid (nearly empty) transcript file %s" % params.gff_annotation)
3353
3354 if params.transcriptome_index:
3355 if params.gff_annotation:
3356 # a GFF file was given, so the transcriptome data files will be written there
3357 gff_basename = getFileBaseName(params.gff_annotation)
3358 # check whether the data is already in place, in case -G/--GTF was given again by mistake
3359 tpath, tname = os.path.split(params.transcriptome_index)
3360 new_subdir = False
3361 if tpath in (".", "./") or not tpath:
3362 if not os.path.exists(params.transcriptome_index):
3363 os.makedirs(params.transcriptome_index)
3364 new_subdir=True
3365 if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
3366 params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
3367 gff_out = params.transcriptome_index + ".gff"
3368 if not (os.path.exists(gff_out) and os.path.getsize(gff_out) == os.path.getsize(params.gff_annotation)):
3369 # (re)generate the transcriptome data files
3370 tpath, tname = os.path.split(params.transcriptome_index)
3371 params.transcriptome_outdir = tpath
3372 t_gff = params.transcriptome_index + ".gff"
3373 if params.transcriptome_outdir:
3374 # the transcriptome data files will be (re)built in this directory
3375 if not os.path.exists(params.transcriptome_outdir):
3376 os.makedirs(params.transcriptome_outdir)
3377 copy(params.gff_annotation, t_gff)
3378 else:
3379 # try to reuse existing transcriptome data files, after checking
3380 # that the GFF copy stored with the index is present and non-trivial
3381 if not (os.path.exists(t_gff) and os.path.getsize(t_gff) > 10):
3382 die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3383 check_bowtie_index(params.transcriptome_index)
3384 params.gff_annotation = t_gff
3385 #end @ transcriptome_index given
3386
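# --- Summary of the transcriptome-index handling above (annotation only) ---
# * --transcriptome-index with -G/--GTF: the annotation is copied next to
#   the index and the transcriptome files are (re)built there, unless a
#   GFF copy of identical size already exists at that location.
# * --transcriptome-index alone: the existing <index>.gff and bowtie
#   files are reused; the run dies if they are missing or invalid.
# Either way, params.gff_annotation is redirected to the GFF copy that
# lives next to the transcriptome index.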
3387 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
3388
3389 check_bowtie(params.bowtie2)
3390 check_samtools()
3391 th_log("Generating SAM header for "+bwt_idx_prefix)
3392 sam_header_filename = get_index_sam_header(params.bowtie2, params.read_params, bwt_idx_prefix)
3393 # reads are always checked now (skip_check_reads is no longer applied here)
3394 reads_list = left_reads_list
3395 if right_reads_list:
3396 reads_list = reads_list + "," + right_reads_list
3397 params.read_params = check_reads_format(params, reads_list)
3398
3399 user_supplied_juncs = []
3400 user_supplied_insertions = []
3401 user_supplied_deletions = []
3402 user_supplied_fusions = []
3403 global gtf_juncs
3404 if params.gff_annotation and params.find_GFF_juncs:
3405 test_input_file(params.gff_annotation)
3406 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3407 # These junctions no longer need to be added to user_supplied_juncs:
3408 # map2gtf now does a much better job of handling the known transcripts
3409 # directly. They must still be loaded into gtf_juncs, though, because
3410 # the later splice verification step uses them.
3413 if params.raw_junctions:
3414 test_input_file(params.raw_junctions)
3415 user_supplied_juncs.append(params.raw_junctions)
3416
3417 if params.raw_insertions:
3418 test_input_file(params.raw_insertions)
3419 user_supplied_insertions.append(params.raw_insertions)
3420
3421 if params.raw_deletions:
3422 test_input_file(params.raw_deletions)
3423 user_supplied_deletions.append(params.raw_deletions)
3424
3425 global unmapped_reads_fifo
3426 unmapped_reads_fifo = tmp_dir + str(os.getpid()) + ".bwt_unmapped.z.fifo"
3427
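# --- Illustrative sketch of the FIFO trick behind -X/--unmapped-fifo
# (annotation only; hypothetical paths, gzip assumed on the PATH).
# A named pipe lets the compressor consume the unmapped-read stream
# directly, so no large uncompressed temporary file is ever written:
#
#   import os, subprocess
#   fifo = "/tmp/demo_unmapped.fifo"
#   os.mkfifo(fifo)
#   out = open("unmapped.fq.z", "wb")
#   # NB: opening a FIFO for reading blocks until a writer connects,
#   # so the mapper (the writer) must be launched concurrently
#   zproc = subprocess.Popen(["gzip", "-c"], stdin=open(fifo, "rb"), stdout=out)
#   # ... the mapper is told to write its unmapped reads to `fifo`,
#   # and gzip compresses them as they are produced ...
#   zproc.wait(); out.close(); os.remove(fifo)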
3428 # Now start the time-consuming stuff
3429 if params.prefilter_multi:
3430 sides=("left","right")
3431 read_lists=(left_reads_list, right_reads_list)
3432 for ri in (0,1):
3433 reads_list=read_lists[ri]
3434 if not reads_list:
3435 continue
3436 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
3437 side_imap = tmp_dir + sides[ri]+"_im.bwtout"
3438 if use_zpacker:
3439 side_imap+=".z"
3440 side_ium = tmp_dir + sides[ri]+"_ium.fq"
3441 if use_BWT_FIFO:
3442 side_ium += ".z"
3443 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
3444 rdlist=reads_list.split(',')
3445 bwt=bowtie(params, bwt_idx_prefix, rdlist,
3446 params.read_params.reads_format,
3447 params.genome_read_mismatches,
3448 side_imap, side_ium,
3449 "", "read_against_genome", params.preflt_data[ri].multihit_reads)
3450 params.preflt_data[ri].mappings = bwt[0] # initial mappings
3451 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
3452
3453 th_log("Preparing reads")
3454 left_kept_reads, left_reads_info = prep_reads(params,
3455 left_reads_list,
3456 left_quals_list,
3457 "left",
3458 params.preflt_data[0].multihit_reads)
3459 min_read_len=left_reads_info.min_len
3460 max_read_len=left_reads_info.max_len
3461 if right_reads_list:
3462 right_kept_reads, right_reads_info = prep_reads(params,
3463 right_reads_list,
3464 right_quals_list,
3465 "right",
3466 params.preflt_data[1].multihit_reads)
3467 min_read_len=min(right_reads_info.min_len, min_read_len)
3468 max_read_len=max(right_reads_info.max_len, max_read_len)
3469 else:
3470 right_kept_reads = None
3471 seed_len=params.read_params.seed_length
3472 if seed_len: # a read/seed length was explicitly given
3473 seed_len = max(seed_len, min_read_len)
3474 # it cannot be smaller than the minimum read length observed
3475 else:
3476 seed_len = max_read_len
3477 params.read_params.seed_length=seed_len
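# worked example: an explicitly given length of 25 with a shortest kept
# read of 30bp yields seed_len = max(25, 30) = 30; with no explicit
# value, seed_len defaults to the longest read length observed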
3478 # turn off integer-quals (quality values were already normalized by prep_reads)
3479 if params.read_params.integer_quals:
3480 params.read_params.integer_quals = False
3481
3482 input_reads = [left_kept_reads, right_kept_reads]
3483 mappings = spliced_alignment(params,
3484 bwt_idx_prefix,
3485 sam_header_filename,
3486 ref_fasta,
3487 params.read_params.seed_length,
3488 params.segment_length,
3489 input_reads,
3490 user_supplied_juncs,
3491 user_supplied_insertions,
3492 user_supplied_deletions)
3493
3494 compile_reports(params,
3495 sam_header_filename,
3496 ref_fasta,
3497 mappings,
3498 input_reads,
3499 params.gff_annotation)
3500
3501 if not params.system_params.keep_tmp:
3502 for m in mappings[0]:
3503 os.remove(m)
3504 if left_kept_reads:
3505 os.remove(left_kept_reads)
3506 for m in mappings[1]:
3507 os.remove(m)
3508 if right_kept_reads:
3509 os.remove(right_kept_reads)
3510 tmp_files = os.listdir(tmp_dir)
3511 for t in tmp_files:
3512 os.remove(tmp_dir+t)
3513 os.rmdir(tmp_dir)
3514
3515 finish_time = datetime.now()
3516 duration = finish_time - start_time
3517 print >> sys.stderr,"-----------------------------------------------"
3518 th_log("Run complete: %s elapsed" % formatTD(duration))
3519
3520 except Usage, err:
3521 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
3522 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
3523 return 2
3524
3525
3526 if __name__ == "__main__":
3527 sys.exit(main())
