root/gclib/tophat_cpp/tophat
Revision: 167
Committed: Fri Feb 10 05:51:56 2012 UTC by gpertea
File size: 150082 byte(s)
Log Message:
bam_merge needs fixing!

#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
"""
import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time
from shutil import copy
import logging
use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
           [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir <string>           [ default: ./tophat_out ]
    --bowtie1                          [ default: bowtie2 ]
    -a/--min-anchor <int>              [ default: 8 ]
    -m/--splice-mismatches <0-2>       [ default: 0 ]
    -i/--min-intron-length <int>       [ default: 50 ]
    -I/--max-intron-length <int>       [ default: 500000 ]
    -g/--max-multihits <int>           [ default: 20 ]
    -x/--transcriptome-max-hits <int>  [ default: 60 ]
    -n/--transcriptome-mismatches <int> [ default: 1 ]
    -M/--prefilter-multihits           ( for -G/--GTF option, enable
                                         an initial bowtie search
                                         against the genome )
    --max-insertion-length <int>       [ default: 3 ]
    --max-deletion-length <int>        [ default: 3 ]
    --solexa-quals
    --solexa1.3-quals                  (same as phred64-quals)
    --phred64-quals                    (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                         (SOLiD - color space)
    --color-out
    --library-type <string>            (fr-unstranded, fr-firststrand,
                                        fr-secondstrand)
    -p/--num-threads <int>             [ default: 1 ]
    -G/--GTF <filename>                (GTF/GFF with known transcripts)
    --transcriptome-index <bwtidx>     (transcriptome bowtie index)
    -T/--transcriptome-only            (map only to the transcriptome)
    -j/--raw-juncs <filename>
    --insertions <filename>
    --deletions <filename>
    -r/--mate-inner-dist <int>
    --mate-std-dev <int>               [ default: 20 ]
    --no-novel-juncs
    --no-novel-indels
    --fusion-search
    --fusion-anchor-length <int>       [ default: 20 ]
    --fusion-min-dist <int>            [ default: 10000000 ]
    --fusion-read-mismatches <int>     [ default: 2 ]
    --fusion-multireads <int>          [ default: 2 ]
    --fusion-multipairs <int>          [ default: 2 ]
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir <dirname>                [ default: <output_dir>/tmp ]
    -z/--zpacker <program>             [ default: gzip ]
    -X/--unmapped-fifo                 [use mkfifo to compress more temporary
                                        files for color space reads]

Advanced Options:
    --report-secondary-alignments
    --genome-read-mismatches <int>     [ default: 2 ]
    -N/--read-mismatches <int>         [ default: 2 ]
    --segment-mismatches <int>         [ default: 2 ]
    --segment-length <int>             [ default: 25 ]
    --bowtie-n                         [ default: bowtie -v ]
    --min-closure-exon <int>           [ default: 100 ]
    --min-closure-intron <int>         [ default: 50 ]
    --max-closure-intron <int>         [ default: 5000 ]
    --min-coverage-intron <int>        [ default: 50 ]
    --max-coverage-intron <int>        [ default: 20000 ]
    --min-segment-intron <int>         [ default: 50 ]
    --max-segment-intron <int>         [ default: 500000 ]
    --no-sort-bam                      (Output BAM is not coordinate-sorted)
    --no-convert-bam                   (Do not convert to bam format.
                                        Output is <output_dir>/accepted_hits.sam.
                                        Implies --no-sort-bam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N <int>                       [ default: 0 ]
    --b2-L <int>                       [ default: 20 ]
    --b2-i <func>                      [ default: S,1,1.25 ]
    --b2-n-ceil <func>                 [ default: L,0,0.15 ]
    --b2-gbar <int>                    [ default: 4 ]

  Scoring options
    --b2-mp <int>,<int>                [ default: 6,2 ]
    --b2-np <int>                      [ default: 1 ]
    --b2-rdg <int>,<int>               [ default: 5,3 ]
    --b2-rfg <int>,<int>               [ default: 5,3 ]
    --b2-score-min <func>              [ default: L,-0.6,-0.6 ]

  Effort options
    --b2-D <int>                       [ default: 15 ]
    --b2-R <int>                       [ default: 2 ]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id <string>                   (read group ID)
    --rg-sample <string>               (sample ID)
    --rg-library <string>              (library ID)
    --rg-description <string>          (descriptive string, no tabs allowed)
    --rg-platform-unit <string>        (e.g. Illumina lane ID)
    --rg-center <string>               (sequencing center name)
    --rg-date <string>                 (ISO 8601 date of the sequencing run)
    --rg-platform <string>             (Sequencing platform descriptor)
'''

# -F/--min-isoform-fraction <float> [ default: 0.15 ]
# (obsolete)

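# Example invocation (illustrative file names; paired-end reads mapped against
# a Bowtie index with a known-transcript GTF):
#   tophat -p 4 -G genes.gtf -o tophat_out hg19_idx reads_1.fq reads_2.fq
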
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
tophat_log = None    # main log file handle
tophat_logger = None # main logging object
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"
gtf_juncs = None # file name with junctions extracted from the given GFF file

# mapping types:

_reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1,5)

def init_logger(log_fname):
    global tophat_logger
    tophat_logger = logging.getLogger('project')
    formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
    # level = logging.__dict__.get(options.loglevel.upper(),logging.DEBUG)
    tophat_logger.setLevel(logging.DEBUG)

    hstream = logging.StreamHandler(sys.stderr)
    hstream.setFormatter(formatter)
    tophat_logger.addHandler(hstream)
    #
    # Output logging information to file
    if os.path.isfile(log_fname):
        os.remove(log_fname)
    global tophat_log
    logfh = logging.FileHandler(log_fname)
    logfh.setFormatter(formatter)
    tophat_logger.addHandler(logfh)
    tophat_log = logfh.stream

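# Example (assumed usage; the main driver would call this once, after the
# output directory exists):
#   init_logger(logging_dir + "tophat.log")
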
# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class BowtieOutputFiles:
    def __init__(self,
                 mappings=None,
                 unmapped_reads=None,
                 multihit_reads=None):
        self.mappings = mappings
        self.unmapped_reads = unmapped_reads
        self.multihit_reads = multihit_reads

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_threads,
                     keep_tmp):
            self.num_threads = num_threads
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_threads = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_threads > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p' + str(self.num_threads))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_threads > 1:
                cmdline.extend(['-p' + str(self.num_threads)])
            return cmdline

        def check(self):
            if self.num_threads < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program " + self.zipper)

    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")

            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False

    class Bowtie2Params:
        def __init__(self):
            self.very_fast = False
            self.fast = False
            self.sensitive = False
            self.very_sensitive = False

            self.N = 0
            self.L = 20
            self.i = "S,1,1.25"
            self.n_ceil = "L,0,0.15"
            self.gbar = 4

            self.mp = "6,2"
            self.np = 1
            self.rdg = "5,3"
            self.rfg = "5,3"
            self.score_min = "L,-0.6,-0.6"

            self.D = 15
            self.R = 2

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--b2-very-fast":
                    self.very_fast = True
                if option == "--b2-fast":
                    self.fast = True
                if option == "--b2-sensitive":
                    self.sensitive = True
                if option == "--b2-very-sensitive":
                    self.very_sensitive = True

                if option == "--b2-N":
                    self.N = int(value)
                if option == "--b2-L":
                    self.L = int(value)
                if option == "--b2-i":
                    self.i = value
                if option == "--b2-n-ceil":
                    self.n_ceil = value
                if option == "--b2-gbar":
                    self.gbar = int(value)

                if option == "--b2-mp":
                    self.mp = value
                if option == "--b2-np":
                    self.np = int(value)
                if option == "--b2-rdg":
                    self.rdg = value
                if option == "--b2-rfg":
                    self.rfg = value
                if option == "--b2-score-min":
                    self.score_min = value

                if option == "--b2-D":
                    self.D = int(value)
                if option == "--b2-R":
                    self.R = int(value)

        def check(self):
            more_than_once = False
            if self.very_fast:
                if self.fast or self.sensitive or self.very_sensitive:
                    more_than_once = True
            else:
                if self.fast:
                    if self.sensitive or self.very_sensitive:
                        more_than_once = True
                else:
                    if self.sensitive and self.very_sensitive:
                        more_than_once = True

            if more_than_once:
                die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

            if self.N not in [0, 1]:
                die("Error: arg to --b2-N must be either 0 or 1")

            function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
            function_match = re.search(function_re, self.i)

            if not function_match:
                die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")

            function_match = re.search(function_re, self.n_ceil)
            if not function_match:
                die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")

            function_match = re.search(function_re, self.score_min)
            if not function_match:
                die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

            pair_re = r'^\d+,\d+$'
            pair_match = re.search(pair_re, self.mp)
            if not pair_match:
                die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")

            pair_match = re.search(pair_re, self.rdg)
            if not pair_match:
                die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-rdg 5,3)")

            pair_match = re.search(pair_re, self.rfg)
            if not pair_match:
                die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-rfg 5,3)")

    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]

        self.read_params = self.ReadParams(False,   # solexa_quals
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           "",      # library type (e.g. "fr-unstranded")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,     # bowtie_threads (num_threads)
                                               False) # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()

        self.bowtie2_params = self.Bowtie2Params()

        self.bowtie2 = True
        self.gff_annotation = None
        self.transcriptome_only = False
        self.transcriptome_index = None
        self.transcriptome_outdir = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_novel_fusions = True
        self.find_GFF_juncs = True
        self.max_hits = 20
        self.t_max_hits = 60
        self.max_seg_hits = 40
        self.prefilter_multi = False
        self.genome_read_mismatches = 2
        self.max_read_mismatches = 2
        self.t_mismatches = 1
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.coverage_search = None
        self.closure_search = False
        self.microexon_search = False
        self.butterfly_search = None
        self.report_secondary_alignments = False

        self.keep_fasta_order = False
        self.partial_mapping = False

        self.fusion_search = False
        self.fusion_anchor_length = 20
        self.fusion_min_dist = 10000000
        self.fusion_read_mismatches = 2
        self.fusion_multireads = 2
        self.fusion_multipairs = 2

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()
        if self.segment_length < 10:
            die("Error: arg to --segment-length must be at least 10")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")
        if self.t_mismatches < 0 or self.t_mismatches > 3:
            die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")

        if self.read_params.color:
            if self.bowtie2:
                th_log("Warning: bowtie2 in colorspace is not yet supported; --bowtie1 option assumed.")
                self.bowtie2 = False
            if self.fusion_search:
                die("Error: fusion-search in colorspace is not yet supported")
            if self.butterfly_search:
                die("Error: butterfly-search in colorspace is not yet supported")

        self.bowtie2_params.check()

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: " + ', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length (" + str(self.max_insertion_length) +
                ") can not be equal to or greater than the segment length (" + str(self.segment_length) + ")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length (" + str(self.max_deletion_length) +
                ") can not be equal to or greater than the min intron length (" + str(self.splice_constraints.min_intron_length) + ")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--max-seg-multihits", str(self.max_seg_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--max-mismatches", str(self.max_read_mismatches),
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        if not self.bowtie2:
            cmd.extend(["--bowtie1"])

        if self.fusion_search:
            cmd.extend(["--fusion-search",
                        "--fusion-anchor-length", str(self.fusion_anchor_length),
                        "--fusion-min-dist", str(self.fusion_min_dist),
                        "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                        "--fusion-multireads", str(self.fusion_multireads),
                        "--fusion-multipairs", str(self.fusion_multipairs)])

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if gtf_juncs:
            cmd.extend(["--gtf-juncs", gtf_juncs])
        if self.closure_search == False:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "bowtie1",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "transcriptome-only",
                                        "transcriptome-max-hits=",
                                        "transcriptome-index=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "allow-fusions",
                                        "fusion-search",
                                        "fusion-anchor-length=",
                                        "fusion-min-dist=",
                                        "fusion-read-mismatches=",
                                        "fusion-multireads=",
                                        "fusion-multipairs=",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "prefilter-multihits",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "genome-read-mismatches=",
                                        "read-mismatches=",
                                        "max-read-mismatches=",
                                        "transcriptome-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam",
                                        "report-secondary-alignments",
                                        "keep-fasta-order",
                                        "allow-partial-mapping",
                                        "b2-very-fast",
                                        "b2-fast",
                                        "b2-sensitive",
                                        "b2-very-sensitive",
                                        "b2-N=",
                                        "b2-L=",
                                        "b2-i=",
                                        "b2-n-ceil=",
                                        "b2-gbar=",
                                        "b2-ma=",
                                        "b2-mp=",
                                        "b2-np=",
                                        "b2-rdg=",
                                        "b2-rfg=",
                                        "b2-score-min=",
                                        "b2-D=",
                                        "b2-R="])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        self.bowtie2_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option == "--bowtie1":
                self.bowtie2 = False
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
                self.max_seg_hits = max(10, self.max_hits * 2)
            if option in ("-x", "--transcriptome-max-hits"):
                self.t_max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-T", "--transcriptome-only"):
                self.transcriptome_only = True
            if option == "--transcriptome-index":
                self.transcriptome_index = value
            if option in ("-M", "--prefilter-multihits"):
                self.prefilter_multi = True
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--fusion-search":
                self.fusion_search = True
            if option == "--fusion-anchor-length":
                self.fusion_anchor_length = int(value)
            if option == "--fusion-min-dist":
                self.fusion_min_dist = int(value)
            if option == "--fusion-read-mismatches":
                self.fusion_read_mismatches = int(value)
            if option == "--fusion-multireads":
                self.fusion_multireads = int(value)
            if option == "--fusion-multipairs":
                self.fusion_multipairs = int(value)
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--genome-read-mismatches":
                self.genome_read_mismatches = int(value)
            if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
                self.max_read_mismatches = int(value)
            if option in ("-n", "--transcriptome-mismatches"):
                self.t_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option == "--report-secondary-alignments":
                self.report_secondary_alignments = True
            if option == "--keep-fasta-order":
                self.keep_fasta_order = True
            if option == "--allow-partial-mapping":
                self.partial_mapping = True
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"
        if self.transcriptome_only:
            self.find_novel_juncs = False
            self.find_novel_indels = False
        if custom_out_dir:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


# check if a file exists and has non-zero (or minimum) size
def fileExists(filepath, minfsize=1):
    if os.path.exists(filepath) and os.path.getsize(filepath) >= minfsize:
        return True
    else:
        return False


def getFileDir(filepath):
    # if a full path is given, returns the path including the trailing /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > 0:
            return fbase
        else:
            return fname

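# Examples (illustrative): getFileBaseName("reads_1.fq.gz") returns "reads_1"
# (both the compression extension and the read-file extension are stripped),
# and getFileBaseName("genome.fa") returns "genome".
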
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# The TopHat logging formatter
def th_log(out_str):
    #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
    if tophat_logger:
        tophat_logger.info(out_str)

def th_logp(out_str=""):
    print >> sys.stderr, out_str
    if tophat_log:
        print >> tophat_log, out_str

def die(msg=None):
    if msg is not None:
        th_logp(msg)
    sys.exit(1)

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    #th_log("Preparing output location "+output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

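# Example (assumed usage; this mirrors how the decompression pipes are started
# elsewhere in this file, e.g. in ZReader):
#   proc = subprocess.Popen(["gzip", "-cd"], preexec_fn=subprocess_setup,
#                           stdin=open("reads.fq.gz", "rb"),
#                           stdout=subprocess.PIPE)
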
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix, is_bowtie2):
    th_log("Checking for Bowtie index files")
    idxext = "ebwt"
    bowtie_ver = ""
    if is_bowtie2:
        idxext = "bt2"
        bowtie_ver = "2 "

    idx_fwd_1 = idx_prefix + ".1." + idxext
    idx_fwd_2 = idx_prefix + ".2." + idxext
    idx_rev_1 = idx_prefix + ".rev.1." + idxext
    idx_rev_2 = idx_prefix + ".rev.2." + idxext

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bwtidxerr = "Error: Could not find Bowtie " + bowtie_ver + "index files (" + idx_prefix + ".*." + idxext + ")"
        bwtidx_env = os.environ.get("BOWTIE_INDEXES")

        if bwtidx_env == None:
            die(bwtidxerr)
        if os.path.exists(bwtidx_env + idx_fwd_1) and \
           os.path.exists(bwtidx_env + idx_fwd_2) and \
           os.path.exists(bwtidx_env + idx_rev_1) and \
           os.path.exists(bwtidx_env + idx_rev_2):
            return
        else:
            die(bwtidxerr)

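# Example: for an index prefix "idx/hg19", Bowtie 1 expects idx/hg19.1.ebwt,
# idx/hg19.2.ebwt, idx/hg19.rev.1.ebwt and idx/hg19.rev.2.ebwt, while Bowtie 2
# expects the same four names with the .bt2 extension.
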
# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str + "Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str + "Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix, is_bowtie2):
    check_bowtie_index(idx_prefix, is_bowtie2)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a list containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version(is_bowtie2):
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]

            if is_bowtie2:
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]]
                # the 4th component, if present, is the beta number (e.g. "beta5")
                if len(ver_numbers) > 3:
                    bowtie_version.append(int(ver_numbers[3][4:]))
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(params, idx_prefix):
    try:
        temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
        temp_sam_header_file = open(temp_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path]

        read_params = params.read_params
        if not params.bowtie2:
            bowtie_header_cmd += ["--sam"]

        if read_params.color:
            bowtie_header_cmd.append('-C')

        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=temp_sam_header_file,
                        stderr=open('/dev/null'))

        temp_sam_header_file.close()
        temp_sam_header_file = open(temp_sam_header_filename, "r")

        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        preamble = []
        sq_dict_lines = []

        for line in temp_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                continue
            else:
                preamble.append(line)

        print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> bowtie_sam_header_file, rg_str

        if not params.keep_fasta_order:
            sq_dict_lines.sort(lambda x,y: cmp(x[0], y[0]))

        for [name, line] in sq_dict_lines:
            print >> bowtie_sam_header_file, line
        print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        bowtie_sam_header_file.close()
        temp_sam_header_file.close()
        return bowtie_sam_header_filename

    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

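# Example of the header this builds (illustrative values; fields are
# tab-separated, and the @RG line appears only when --rg-id and --rg-sample
# were given):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:lane1  SM:sample1
#   @SQ  SN:chr1  LN:...
#   @PG  ID:TopHat  VN:...  CL:...
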
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(params):
    bowtie_req = ""
    if params.bowtie2:
        bowtie_req = "2"
    log_msg = "Checking for Bowtie" + bowtie_req
    th_log(log_msg)

    bowtie_bin = "bowtie" + bowtie_req

    global bowtie_path
    bowtie_path = prog_path(bowtie_bin)
    bowtie_version = get_bowtie_version(params.bowtie2)
    if params.bowtie2 and bowtie_version == None:
        # try to fall back on bowtie 1
        params.bowtie2 = False
        bowtie_version = get_bowtie_version(params.bowtie2)
    if bowtie_version == None:
        die("Error: Bowtie not found on this system.")
    # version lists are [major, minor, patch, beta] and compare lexicographically
    if params.bowtie2:
        if bowtie_version < [2, 0, 0, 5]:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    else:
        if bowtie_version < [0, 12, 3, 0]:
            die("Error: TopHat requires Bowtie 0.12.3 or later")
    th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))


class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine file type
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow a maximum of 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if self.lastline and self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline and self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returns a tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")

            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for " + seqid + ")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
                # loop until + found
            seq_len = len(seqstr)
            # at least one line of quality data should follow
            qstrlen = 0
            # now read next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of seq string
                # loop until qv has the same length as seq

            if self.isColor:
                if qstrlen == seq_len:
                    # qual string may have a dummy qv at the beginning, which should be stripped
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen != seq_len - 1:
                    raise ValueError("Length mismatch between sequence and quality strings " + \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen:
                    raise ValueError("Length mismatch between sequence and quality strings " + \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file " + self.fname + ":\n " + str(err))
        # return the record
        self.numrecords += 1
        ##--discard the primer base [NO]
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returns a tuple: (seqID, sequence_string, seq_len, None)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to the readline function
        line = fline() # this will use the buffered line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # more sequence lines may follow, until the '>' of the next record:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                # loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file " + self.fname + "\n " + str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor: # -- discard primer base
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            th_logp("Warning: FastxReader called ungetLine() with no prior line!")
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader
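
# Example (assumed usage; this mirrors how check_reads_format() samples reads):
#   zf = ZReader("reads_1.fq.gz", params.system_params)
#   freader = FastxReader(zf.file, params.read_params.color, zf.fname)
#   seqid, seqstr, seq_len, qstr = freader.nextRecord()
#   zf.close()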

def fa_write(fhandle, seq_id, seq):
    """
    Write to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    # ceiling division, so no empty trailing line is written when
    # len(seq) is an exact multiple of line_len
    for i in xrange((len(seq) + line_len - 1) // line_len):
        start = i * line_len
        #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq)
        if (i+1) * line_len < len(seq):
            end = (i+1) * line_len
        else:
            end = len(seq)
        fhandle.write(seq[start:end] + "\n")

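# Example (illustrative):
#   fa = open("test.fa", "w")
#   fa_write(fa, "test_seq", "ACGT" * 40)  # writes a 160bp record wrapped at 60 chars
#   fa.close()
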
class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        s = filename.lower()
        if s.endswith(".bam"):
            # daehwan - want to output mapped reads as well
            pipecmd = [prog_path("bam2fastx"), "--all", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd = ['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd = ['bzip2']
                if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
                if len(pipecmd) > 0:
                    if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: # not guessing, but must still check if it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd = [sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)

            if pipecmd:
                pipecmd += ['-cd']
        if pipecmd:
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True)
            except Exception:
                die("Error: could not open pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

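# Example (assumed usage; ZReader picks a decompressor from the file name):
#   zreads = ZReader("reads_1.fq.bz2", params.system_params)
#   first = zreads.file.readline()  # reads through the bzip2 -cd pipe
#   zreads.close()
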
class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stderr=tophat_log, stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None

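# Example (assumed usage; output is compressed only when -z/--zpacker is active):
#   zw = ZWriter(tmp_dir + "kept_reads.fq.z", params.system_params)
#   zw.file.write("@read1\nACGT\n+\nIIII\n")
#   zw.close()
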
# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                th_logp("Warning: found a read < 20bp in " + f_name)
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    #if seed_len != None:
    #    seed_len = max(seed_len, max_seed_len)
    #else:
    #    seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    th_logp("\tformat:\t\t %s" % fileformat)
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        th_logp("\tquality scale:\t %s" % quality_scale)
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    # NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.library_type,
                                   #seed_len,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

1689 def log_tail(logfile, lines=1):
1690 f=open(logfile, "r")
1691 f.seek(0, 2)
1692 fbytes= f.tell()
1693 size=lines
1694 block=-1
1695 while size > 0 and fbytes+block*1024 > 0:
1696 if (fbytes+block*1024 > 0):
1697 ##Seek back once more, if possible
1698 f.seek( block*1024, 2 )
1699 else:
1700 #Seek to the beginning
1701 f.seek(0, 0)
1702 data= f.read( 1024 )
1703 linesFound= data.count('\n')
1704 size -= linesFound
1705 block -= 1
1706 if (fbytes + block*1024 > 0):
1707 f.seek(block*1024, 2)
1708 else:
1709 f.seek(0,0)
1710 f.readline() # find a newline
1711 lastBlocks= list( f.readlines() )
1712 f.close()
1713 return "".join(lastBlocks[-lines:]) # lines from readlines() already keep their '\n'
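# Example: log_tail(logging_dir + "run.log", 5) returns approximately the
# last 5 lines of run.log, scanning the file backwards in 1KB blocks.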
1714
1715 # Format a timedelta as a pretty string.
1716 # (runs longer than a day are prefixed with the day count)
1717 def formatTD(td):
1718 days = td.days
1719 hours = td.seconds // 3600
1720 minutes = (td.seconds % 3600) // 60
1721 seconds = td.seconds % 60
1722
1723 if days > 0:
1724 return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
1725 else:
1726 return '%02d:%02d:%02d' % (hours, minutes, seconds)
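# e.g. for a timedelta spanning 1 day, 1h 2m 5s this returns '1 days 01:02:05'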
1727
1728 class PrepReadsInfo:
1729 def __init__(self, fname, side):
1730 try:
1731 f=open(fname,"r")
1732 self.min_len=int(f.readline().split("=")[-1])
1733 self.max_len=int(f.readline().split("=")[-1])
1734 self.in_count=int(f.readline().split("=")[-1])
1735 self.out_count=int(f.readline().split("=")[-1])
1736 if (self.out_count==0) or (self.max_len<16):
1737 raise Exception()
1738 except Exception, e:
1739 die(fail_str+"Error retrieving prep_reads info.")
1740 th_logp("\t%5s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count))
1741 if self.min_len < 20:
1742 th_logp("Warning: short reads (<20bp) will make TopHat quite slow and use a large amount of memory, because they are likely to map to too many places")
1743
1744
1745 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
1746 # generate the prep_reads command line
1747 filter_cmd = [prog_path("prep_reads")]
1748 filter_cmd.extend(params.cmd())
1749
1750 if params.read_params.reads_format == "fastq":
1751 filter_cmd += ["--fastq"]
1752 elif params.read_params.reads_format == "fasta":
1753 filter_cmd += ["--fasta"]
1754 if hits_to_filter:
1755 filter_cmd += ["--flt-hits="+hits_to_filter]
1756 if aux_file:
1757 filter_cmd += ["--aux-outfile="+aux_file]
1758 if index_file:
1759 filter_cmd += ["--index-outfile="+index_file]
1760 if filter_reads:
1761 filter_cmd += ["--flt-reads="+filter_reads]
1762 filter_cmd.append(reads_list)
1763 if quals_list:
1764 filter_cmd.append(quals_list)
1765 return filter_cmd
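# For illustration (file names hypothetical), a call like
#   prep_reads_cmd(params, "reads_1.fq", aux_file="left_kept_reads.info")
# yields roughly:
#   ['/path/to/prep_reads', <params.cmd() flags...>, '--fastq',
#    '--aux-outfile=left_kept_reads.info', 'reads_1.fq']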
1766
1767 # Calls the prep_reads executable, which prepares an internal read library.
1768 # The read library features reads with monotonically increasing integer IDs.
1769 # prep_reads also filters out very low complexity or garbage reads as well as
1770 # polyA reads.
1771 #--> returns the tuple (internal_read_library_file, read_info)
1772 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1773 reads_suffix = ".fq"
1774
1775 # for parallelization, we don't compress the read files
1776 do_use_zpacker = use_zpacker
1777 if params.system_params.num_threads > 1:
1778 do_use_zpacker = False
1779
1780 if do_use_zpacker: reads_suffix += ".z"
1781
1782 output_name=side+"_kept_reads"
1783 kept_reads_filename = tmp_dir + output_name + reads_suffix
1784
1785 if os.path.exists(kept_reads_filename):
1786 os.remove(kept_reads_filename)
1787 kept_reads = open(kept_reads_filename, "wb")
1788 log_fname=logging_dir + "prep_reads.log"
1789 filter_log = open(log_fname,"w")
1790
1791 info_file=output_dir+output_name+".info"
1792 index_file=kept_reads_filename+".index"
1793
1794 filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
1795 shell_cmd = ' '.join(filter_cmd)
1796 #finally, add the compression pipe
1797 zip_cmd=[]
1798 if do_use_zpacker:
1799 zip_cmd=[ params.system_params.zipper ]
1800 zip_cmd.extend(params.system_params.zipper_opts)
1801 zip_cmd.extend(['-c','-'])
1802 shell_cmd +=' | '+' '.join(zip_cmd)
1803 shell_cmd += ' >' +kept_reads_filename
1804 retcode=0
1805 try:
1806 print >> run_log, shell_cmd
1807 if do_use_zpacker:
1808 filter_proc = subprocess.Popen(filter_cmd,
1809 stdout=subprocess.PIPE,
1810 stderr=filter_log)
1811 zip_proc=subprocess.Popen(zip_cmd,
1812 preexec_fn=subprocess_setup,
1813 stdin=filter_proc.stdout,
1814 stderr=tophat_log, stdout=kept_reads)
1815 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1816 zip_proc.communicate()
1817 retcode=filter_proc.poll()
1818 if retcode==0:
1819 retcode=zip_proc.poll()
1820 else:
1821 retcode = subprocess.call(filter_cmd,
1822 stdout=kept_reads,
1823 stderr=filter_log)
1824 if retcode:
1825 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1826
1827 except OSError, o:
1828 errmsg=fail_str+str(o)
1829 die(errmsg+"\n"+log_tail(log_fname))
1830
1831 kept_reads.close()
1832 return kept_reads_filename, PrepReadsInfo(info_file, side)
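# When the zpacker is active (i.e. single-threaded runs), the pipeline logged
# to run.log is equivalent to, e.g. with gzip as the zipper:
#   prep_reads [options] reads.fq | gzip -c - > <tmp_dir>/left_kept_reads.fq.z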
1833
1834
1835 # Call bowtie
1836 def bowtie(params,
1837 bwt_idx_prefix,
1838 reads_list,
1839 reads_format,
1840 num_mismatches,
1841 mapped_reads,
1842 unmapped_reads,
1843 extra_output = "",
1844 mapping_type = _reads_vs_G,
1845 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1846 start_time = datetime.now()
1847 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1848 reads_file=reads_list[0]
1849 readfile_basename=getFileBaseName(reads_file)
1850
1851 g_mapping, t_mapping, seg_mapping = False, False, False
1852 if mapping_type == _reads_vs_T:
1853 t_mapping = True
1854 elif mapping_type >= _segs_vs_G:
1855 seg_mapping = True
1856 else:
1857 g_mapping = True
1858
1859 bowtie_str = "Bowtie"
1860 if params.bowtie2:
1861 bowtie_str += "2"
1862
1863 if seg_mapping:
1864 if not params.bowtie2:
1865 backup_bowtie_alignment_option = params.bowtie_alignment_option
1866 params.bowtie_alignment_option = "-v"
1867
1868 if t_mapping:
1869 th_log("Mapping %s against transcriptome %s with %s %s" % (readfile_basename,
1870 bwt_idx_name, bowtie_str, extra_output))
1871 else:
1872 th_log("Mapping %s against %s with %s %s" % (readfile_basename,
1873 bwt_idx_name, bowtie_str, extra_output))
1874 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1875 #bwt_mapped=mapped_reads
1876 unmapped_reads_out=None
1877 if unmapped_reads:
1878 unmapped_reads_out=unmapped_reads+".fq"
1879 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1880 if use_FIFO:
1881 global unmapped_reads_fifo
1882 unmapped_reads_fifo=unmapped_reads+".fifo"
1883 if os.path.exists(unmapped_reads_fifo):
1884 os.remove(unmapped_reads_fifo)
1885 try:
1886 os.mkfifo(unmapped_reads_fifo)
1887 except OSError, o:
1888 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1889 unmapped_reads_out+=".z"
1890
1891 # Launch Bowtie
1892 try:
1893 bowtie_cmd = [bowtie_path]
1894 if reads_format == "fastq":
1895 bowtie_cmd += ["-q"]
1896 elif reads_format == "fasta":
1897 bowtie_cmd += ["-f"]
1898
1899 if params.read_params.color:
1900 bowtie_cmd += ["-C", "--col-keepends"]
1901 if unmapped_reads:
1902 bowtie_cmd += ["--un", unmapped_reads_out]
1903
1904 unzip_cmd=None
1905 bam_input=False
1906 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1907 bam_input=True
1908 unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
1909 if reads_format:
1910 unzip_cmd.append("--" + reads_format)
1911 unzip_cmd+=[reads_list[0]]
1912
1913 if use_zpacker and (unzip_cmd is None):
1914 unzip_cmd=[ params.system_params.zipper ]
1915 unzip_cmd.extend(params.system_params.zipper_opts)
1916 unzip_cmd+=['-cd']
1917
1918 fifo_pid=None
1919 if use_FIFO:
1920 unm_zipcmd=[ params.system_params.zipper ]
1921 unm_zipcmd.extend(params.system_params.zipper_opts)
1922 unm_zipcmd+=['-c']
1923 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1924 fifo_pid=os.fork()
1925 if fifo_pid==0:
1926 def on_sig_exit(sig, func=None):
1927 os._exit(os.EX_OK)
1928 signal.signal(signal.SIGTERM, on_sig_exit)
1929 subprocess.call(unm_zipcmd,
1930 stdin=open(unmapped_reads_fifo, "r"),
1931 stderr=tophat_log,
1932 stdout=open(unmapped_reads_out, "wb"))
1933 os._exit(os.EX_OK)
1934
1935 fix_map_cmd = [prog_path('fix_map_ordering')]
1936
1937 if params.bowtie2:
1938 if t_mapping or g_mapping:
1939 max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1940 max_penalty, min_penalty = int(max_penalty), int(min_penalty)
1941 min_score = (max_penalty - 1) * num_mismatches
1942 fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1943
1944 samzip_cmd=None
1945
1946 # daehwan - just use bam format for multi-threading.
1947 if unmapped_reads or params.system_params.num_threads >= 1:
1948 #mapping on the genome
1949 #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1950 mapped_reads += ".bam"
1951
1952 temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1953 fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1954 if not params.read_params.color and unmapped_reads:
1955 unmapped_reads_out = unmapped_reads+".bam"
1956 fix_map_cmd += [unmapped_reads_out]
1957 else:
1958 #mapping on segment_juncs (spliced potential junctions)
1959 # don't use SAM headers, could be slow -- better use compressed SAM directly
1960 mapped_reads += ".sam"
1961 fix_map_cmd += ["-"]
1962 if use_zpacker:
1963 fix_map_cmd += ["-"]
1964 samzip_cmd=[ params.system_params.zipper ]
1965 samzip_cmd.extend(params.system_params.zipper_opts)
1966 samzip_cmd += ["-c"]
1967 mapped_reads += ".z"
1968 else:
1969 fix_map_cmd += [mapped_reads]
1970
1971 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1972
1973 if t_mapping:
1974 max_hits = params.t_max_hits
1975 elif seg_mapping:
1976 max_hits = params.max_seg_hits
1977 else:
1978 max_hits = params.max_hits
1979
1980 if num_mismatches > 3:
1981 num_mismatches = 3
1982
1983 if params.bowtie2:
1984 if seg_mapping:
1985 # since bowtie2 does not suppress reads that map to many places,
1986 # we suppress those in segment_juncs and long_spanning_reads.
1987 bowtie_cmd += ["-k", str(max_hits + 1)]
1988 else:
1989 bowtie_cmd += ["-k", str(max_hits)]
1990
1991 bowtie2_params = params.bowtie2_params
1992 if seg_mapping:
1993 # after intensive testing,
1994 # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
1995 # but a chance for further improvements remains.
1996 bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1997 bowtie_cmd += ["-i", "C,2,0"]
1998 bowtie_cmd += ["-L", "20"]
1999 # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
2000 else:
2001 bowtie2_preset = ""
2002 if bowtie2_params.very_fast:
2003 bowtie2_preset = "--very-fast"
2004 elif bowtie2_params.fast:
2005 bowtie2_preset = "--fast"
2006 elif bowtie2_params.sensitive:
2007 bowtie2_preset = "--sensitive"
2008 elif bowtie2_params.very_sensitive:
2009 bowtie2_preset = "--very-sensitive"
2010
2011 if bowtie2_preset != "":
2012 bowtie_cmd += [bowtie2_preset]
2013 else:
2014 bowtie_cmd += ["-D", str(bowtie2_params.D),
2015 "-R", str(bowtie2_params.R),
2016 "-N", str(bowtie2_params.N),
2017 "-L", str(bowtie2_params.L),
2018 "-i", bowtie2_params.i]
2019
2020 # "--n-ceil" is not correctly parsed in Bowtie2;
2021 # I (daehwan) have already talked to Ben, who will fix the problem.
2022 bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
2023 "--gbar", str(bowtie2_params.gbar),
2024 "--mp", bowtie2_params.mp,
2025 "--np", str(bowtie2_params.np),
2026 "--rdg", bowtie2_params.rdg,
2027 "--rfg", bowtie2_params.rfg,
2028 "--score-min", bowtie2_params.score_min]
2029
2030 else:
2031 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
2032 "-k", str(max_hits),
2033 "-m", str(max_hits),
2034 "-S"]
2035
2036 bowtie_cmd += ["-p", str(params.system_params.num_threads)]
2037
2038 if params.bowtie2: #always use headerless SAM file
2039 bowtie_cmd += ["--sam-no-hd"]
2040 else:
2041 bowtie_cmd += ["--sam-nohead"]
2042
2043 if not params.bowtie2:
2044 if multihits_out:
2045 bowtie_cmd += ["--max", multihits_out]
2046 else:
2047 bowtie_cmd += ["--max", "/dev/null"]
2048
2049 if params.bowtie2:
2050 bowtie_cmd += ["-x"]
2051
2052 bowtie_cmd += [ bwt_idx_prefix ]
2053 bowtie_proc=None
2054 shellcmd=""
2055 unzip_proc=None
2056 samzip_proc=None
2057
2058 if multihits_out:
2059 #special prefilter bowtie run: we use prep_reads on the fly
2060 #in order to get multi-mapped reads to exclude later
2061 prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2062 preplog_fname=logging_dir + "prep_reads_prefilter.log"
2063 prepfilter_log = open(preplog_fname,"w")
2064 unzip_proc = subprocess.Popen(prep_cmd,
2065 stdout=subprocess.PIPE,
2066 stderr=prepfilter_log)
2067 shellcmd=' '.join(prep_cmd) + "|"
2068 else:
2069 z_input=use_zpacker and reads_file.endswith(".z")
2070 if z_input:
2071 unzip_proc = subprocess.Popen(unzip_cmd,
2072 stdin=open(reads_file, "rb"),
2073 stderr=tophat_log, stdout=subprocess.PIPE)
2074 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2075 else:
2076 #must be uncompressed fastq input (unmapped reads from a previous run)
2077 #or a BAM file with unmapped reads
2078 if bam_input:
2079 unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE)
2080 shellcmd=' '.join(unzip_cmd) + "|"
2081 else:
2082 bowtie_cmd += [reads_file]
2083 bowtie_proc = subprocess.Popen(bowtie_cmd,
2084 stdout=subprocess.PIPE,
2085 stderr=bwt_log)
2086 if unzip_proc:
2087 #input is compressed OR prep_reads is used as a filter
2088 bowtie_cmd += ['-']
2089 bowtie_proc = subprocess.Popen(bowtie_cmd,
2090 stdin=unzip_proc.stdout,
2091 stdout=subprocess.PIPE,
2092 stderr=bwt_log)
2093 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2094
2095 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2096 if samzip_cmd:
2097 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2098 fix_order_proc = subprocess.Popen(fix_map_cmd,
2099 stdin=bowtie_proc.stdout,
2100 stderr=tophat_log,
2101 stdout=subprocess.PIPE)
2102 bowtie_proc.stdout.close() # close parent's copy of the pipe (see http://bugs.python.org/issue7678)
2103 samzip_proc = subprocess.Popen(samzip_cmd,
2104 preexec_fn=subprocess_setup,
2105 stdin=fix_order_proc.stdout,
2106 stderr=tophat_log,
2107 stdout=open(mapped_reads, "wb"))
2108 fix_order_proc.stdout.close()
2109 else:
2110 #write BAM output directly
2111 fix_order_proc = subprocess.Popen(fix_map_cmd,
2112 stdin=bowtie_proc.stdout,
2113 stderr=tophat_log)
2114 # stdout=open(mapped_reads, "w"))
2115 bowtie_proc.stdout.close()
2116
2117 # shellcmd += " > " + mapped_reads
2118 print >> run_log, shellcmd
2119 if samzip_proc:
2120 samzip_proc.communicate()
2121 else:
2122 fix_order_proc.communicate()
2123 if use_FIFO:
2124 if fifo_pid and not os.path.exists(unmapped_reads_out):
2125 try:
2126 os.kill(fifo_pid, signal.SIGTERM)
2127 except:
2128 pass
2129
2130 except OSError, o:
2131 die(fail_str+"Error: "+str(o))
2132
2133 # Success
2134 #finish_time = datetime.now()
2135 #duration = finish_time - start_time
2136 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2137 if use_FIFO:
2138 try:
2139 os.remove(unmapped_reads_fifo)
2140 except:
2141 pass
2142 if multihits_out and not os.path.exists(multihits_out):
2143 open(multihits_out, "w").close()
2144
2145 if seg_mapping:
2146 if not params.bowtie2:
2147 params.bowtie_alignment_option = backup_bowtie_alignment_option
2148
2149 return (mapped_reads, unmapped_reads_out)
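# Illustrative genome-mapping pipeline assembled above (as echoed to run.log),
# assuming gzip-compressed reads and bowtie2; most flags abbreviated:
#   gzip -cd < left_kept_reads.fq.z | bowtie2 [...] -x <bwt_idx> - \
#     | fix_map_ordering --sam-header <hdr.sam> - left_kept_reads.mapped.bam [...]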
2150
2151
2152 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2153 def get_gtf_juncs(gff_annotation):
2154 th_log("Reading known junctions from GTF file")
2155 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2156
2157 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2158
2159 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2160 gtf_juncs_out = open(gtf_juncs_out_name, "w")
2161
2162 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2163 try:
2164 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2165 retcode = subprocess.call(gtf_juncs_cmd,
2166 stderr=gtf_juncs_log,
2167 stdout=gtf_juncs_out)
2168 # gtf_juncs returned an error
2169 if retcode == 1:
2170 th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2171 return (False, gtf_juncs_out_name)
2172 elif retcode != 0:
2173 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2174
2175 # gtf_juncs not found
2176 except OSError, o:
2177 errmsg=fail_str+str(o)+"\n"
2178 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2179 errmsg+="Error: gtf_juncs not found on this system"
2180 die(errmsg)
2181 return (True, gtf_juncs_out_name)
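# The resulting .juncs file lists one junction per line, in the same format
# accepted by -j/--raw-juncs:
#   <chrom> <left> <right> <+/->
# where <left>/<right> are zero-based coordinates of the last base of the
# left exon and the first base of the right exon.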
2182
2183 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
2184 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2185 th_log("Indexing splices")
2186 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2187
2188 #user_splices_out_prefix = output_dir + "user_splices_idx"
2189
2190 if is_bowtie2:
2191 bowtie_build_cmd = [prog_path("bowtie2-build")]
2192 else:
2193 bowtie_build_cmd = [prog_path("bowtie-build")]
2194
2195 if color:
2196 bowtie_build_cmd += ["-C"]
2197
2198 bowtie_build_cmd += [external_splice_prefix + ".fa",
2199 external_splice_prefix]
2200 try:
2201 print >> run_log, " ".join(bowtie_build_cmd)
2202 retcode = subprocess.call(bowtie_build_cmd,
2203 stdout=bowtie_build_log)
2204
2205 if retcode != 0:
2206 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2207 except OSError, o:
2208 errmsg=fail_str+str(o)+"\n"
2209 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2210 errmsg+="Error: bowtie-build not found on this system"
2211 die(errmsg)
2212 return external_splice_prefix
2213
2214 # Build a splice index from a .juncs file, suitable for use with specified read
2215 # (or read segment) lengths
2216 def build_juncs_index(is_bowtie2,
2217 min_anchor_length,
2218 max_seg_len,
2219 juncs_prefix,
2220 external_juncs,
2221 external_insertions,
2222 external_deletions,
2223 external_fusions,
2224 reference_fasta,
2225 color):
2226 th_log("Retrieving sequences for splices")
2227
2228 juncs_file_list = ",".join(external_juncs)
2229 insertions_file_list = ",".join(external_insertions)
2230 deletions_file_list = ",".join(external_deletions)
2231 fusions_file_list = ",".join(external_fusions)
2232
2233 # do not use insertions and deletions in case of Bowtie2
2234 if is_bowtie2:
2235 insertions_file_list = "/dev/null"
2236 deletions_file_list = "/dev/null"
2237
2238 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2239
2240 external_splices_out_prefix = tmp_dir + juncs_prefix
2241 external_splices_out_name = external_splices_out_prefix + ".fa"
2242
2243 external_splices_out = open(external_splices_out_name, "w")
2244 # juncs_db_cmd = [bin_dir + "juncs_db",
2245 juncs_db_cmd = [prog_path("juncs_db"),
2246 str(min_anchor_length),
2247 str(max_seg_len),
2248 juncs_file_list,
2249 insertions_file_list,
2250 deletions_file_list,
2251 fusions_file_list,
2252 reference_fasta]
2253 try:
2254 print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name
2255 retcode = subprocess.call(juncs_db_cmd,
2256 stderr=juncs_db_log,
2257 stdout=external_splices_out)
2258
2259 if retcode != 0:
2260 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2261 # juncs_db not found
2262 except OSError, o:
2263 errmsg=fail_str+str(o)+"\n"
2264 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2265 errmsg+="Error: juncs_db not found on this system"
2266 die(errmsg)
2267
2268 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2269 return external_splices_out_prefix
2270
2271 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2272 """ Build a bowtie index from a FASTA file.
2273
2274 Arguments:
2275 - `fasta_fname`: File path to FASTA file.
2276 - `out_dir`: Output directory to place index in. (includes os.sep)
2277
2278 Returns:
2279 - The path to the Bowtie index.
2280 """
2281 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2282
2283 if is_bowtie2:
2284 bowtie_idx_cmd = [prog_path("bowtie2-build")]
2285 else:
2286 bowtie_idx_cmd = [prog_path("bowtie-build")]
2287
2288 if color:
2289 bowtie_idx_cmd += ["-C"]
2290
2291 bowtie_idx_cmd += [fasta_fname,
2292 bwt_idx_path]
2293 try:
2294 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2295 print >> run_log, " ".join(bowtie_idx_cmd)
2296 retcode = subprocess.call(bowtie_idx_cmd,
2297 stdout=open(os.devnull, "w"),
2298 stderr=open(os.devnull, "w"))
2299 if retcode != 0:
2300 die(fail_str + "Error: Couldn't build bowtie index with err = "
2301 + str(retcode))
2302 except OSError, o:
2303 errmsg=fail_str+str(o)+"\n"
2304 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2305 errmsg+="Error: bowtie-build not found on this system"
2306 die(errmsg)
2307
2308 return bwt_idx_path
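# e.g. build_idx_from_fa(True, tmp_dir + "genes.fa", tmp_dir, False) runs
# bowtie2-build and returns tmp_dir + "genes" (the created index files are
# genes.*.bt2)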
2309
2310 # Print out the sam header, embedding the user's specified library properties.
2311 # FIXME: also needs SQ dictionary lines
2312 def write_sam_header(read_params, sam_file):
2313 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2314 if read_params.read_group_id and read_params.sample_id:
2315 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2316 read_params.sample_id)
2317 if read_params.library_id:
2318 rg_str += "\tLB:%s" % read_params.library_id
2319 if read_params.description:
2320 rg_str += "\tDS:%s" % read_params.description
2321 if read_params.seq_platform_unit:
2322 rg_str += "\tPU:%s" % read_params.seq_platform_unit
2323 if read_params.seq_center:
2324 rg_str += "\tCN:%s" % read_params.seq_center
2325 if read_params.mate_inner_dist:
2326 rg_str += "\tPI:%s" % read_params.mate_inner_dist
2327 if read_params.seq_run_date:
2328 rg_str += "\tDT:%s" % read_params.seq_run_date
2329 if read_params.seq_platform:
2330 rg_str += "\tPL:%s" % read_params.seq_platform
2331
2332 print >> sam_file, rg_str
2333 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
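# For instance, with a read group id and sample set, the header written is:
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:<read_group_id>  SM:<sample_id>  [plus any LB/DS/PU/CN/PI/DT/PL tags]
#   @PG  ID:TopHat  VN:<version>  CL:<command line>
# (fields are tab-separated in the actual output)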
2334
2335 # Write final TopHat output, via tophat_reports and wiggles
2336 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2337 th_log("Reporting output tracks")
2338 left_maps, right_maps = mappings
2339 left_reads, right_reads = readfiles
2340 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2341 left_maps = ','.join(left_maps)
2342
2343 if len(right_maps) > 0:
2344 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2345 right_maps = ','.join(right_maps)
2346 log_fname = logging_dir + "reports.log"
2347 report_log = open(log_fname, "w")
2348 junctions = output_dir + "junctions.bed"
2349 insertions = output_dir + "insertions.bed"
2350 deletions = output_dir + "deletions.bed"
2351 accepted_hits = output_dir + "accepted_hits"
2352 report_cmdpath = prog_path("tophat_reports")
2353 fusions = output_dir + "fusions.out"
2354 report_cmd = [report_cmdpath]
2355
2356 alignments_output_filename = tmp_dir + "accepted_hits"
2357
2358 report_cmd.extend(params.cmd())
2359 report_cmd += ["--sam-header", sam_header_filename]
2360 if params.report_secondary_alignments:
2361 report_cmd += ["--report-secondary-alignments"]
2362
2363 report_cmd.extend(["--samtools="+samtools_path])
2364 report_cmd.extend([ref_fasta,
2365 junctions,
2366 insertions,
2367 deletions,
2368 fusions,
2369 alignments_output_filename,
2370 left_maps,
2371 left_reads])
2372
2373 if len(right_maps) > 0 and right_reads:
2374 report_cmd.append(right_maps)
2375 report_cmd.append(right_reads)
2376
2377 # -- tophat_reports now produces (uncompressed) BAM stream,
2378 try:
2379 if params.report_params.convert_bam:
2380 if params.report_params.sort_bam:
2381 print >> run_log, " ".join(report_cmd)
2382 report_proc=subprocess.call(report_cmd,
2383 preexec_fn=subprocess_setup,
2384 stderr=report_log)
2385
2386 bam_parts = []
2387 for i in range(params.system_params.num_threads):
2388 bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2389 if os.path.exists(bam_part_filename):
2390 bam_parts.append(bam_part_filename)
2391 else:
2392 break
2393
2394 num_bam_parts = len(bam_parts)
2395 pids = [0 for i in range(num_bam_parts)]
2396 sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2397 #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2398 #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2399 for i in range(num_bam_parts):
2400 bamsort_cmd = [samtools_path,
2401 "sort",
2402 bam_parts[i],
2403 sorted_bam_parts[i]]
2404
2405 sorted_bam_parts[i] += ".bam"
2406 print >> run_log, " ".join(bamsort_cmd)
2407 pid = os.fork()
2408 if pid == 0:
2409 subprocess.call(bamsort_cmd,
2410 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2411 os._exit(os.EX_OK)
2412 else:
2413 pids[i] = pid
2414
2415 for i in range(len(pids)):
2416 if pids[i] > 0:
2417 os.waitpid(pids[i], 0)
2418 pids[i] = 0
2419
2420 # for bam_part in bam_parts:
2421 # os.remove(bam_part)
2422
2423 if num_bam_parts > 1:
2424 bammerge_cmd = [samtools_path,
2425 "merge",
2426 "-h",
2427 sam_header_filename,
2428 "%s.bam" % accepted_hits]
2429 bammerge_cmd += sorted_bam_parts
2430
2431 print >> run_log, " ".join(bammerge_cmd)
2432 subprocess.call(bammerge_cmd,
2433 stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2434
2435 for sorted_bam_part in sorted_bam_parts:
2436 os.remove(sorted_bam_part)
2437 else:
2438 os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2439
2440 else:
2441 print >> run_log, " ".join(report_cmd)
2442 report_proc=subprocess.call(report_cmd,
2443 preexec_fn=subprocess_setup,
2444 stdout=open(accepted_hits,"wb"),
2445 stderr=report_log)
2446 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2447 else:
2448 print >> run_log, " ".join(report_cmd)
2449 report_proc=subprocess.call(report_cmd,
2450 preexec_fn=subprocess_setup,
2451 stdout=open(accepted_hits,"wb"),
2452 stderr=report_log)
2453 tmp_sam = output_dir + "accepted_hits.sam"
2454
2455 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2456 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2457 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2458 tmp_sam_file = open(tmp_sam, "w")
2459 ret = subprocess.call(bam_to_sam_cmd,
2460 stdout=tmp_sam_file,
2461 stderr=bam_to_sam_log)
2462 tmp_sam_file.close()
2463 os.remove(accepted_hits)
2464
2465 except OSError, o:
2466 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2467
2468 return junctions
2469
2470
2471 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2472 # for each segment. This function needs to be fixed to support mixed read length
2473 # inputs.
2474 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
2475 i = num_files_prev + 1
2476 while i <= num_files:
2477 segfname=prefix+("_seg%d" % i)+extension
2478 out_segf.append(ZWriter(segfname,params.system_params))
2479 i += 1
2480
2481 def split_reads(reads_filename,
2482 prefix,
2483 fasta,
2484 params,
2485 segment_length):
2486 #reads_file = open(reads_filename)
2487 zreads = ZReader(reads_filename, params.system_params, False)
2488 out_segfiles = []
2489
2490 if fasta:
2491 extension = ".fa"
2492 else:
2493 extension = ".fq"
2494 if use_zpacker: extension += ".z"
2495 def convert_color_to_bp(color_seq):
2496 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2497 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2498 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2499 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2500 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2501 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2502
2503 base = color_seq[0]
2504 bp_seq = base
2505 for ch in color_seq[1:]:
2506 base = decode_dic[base+ch]
2507 bp_seq += base
2508 return bp_seq
2509
2510 def convert_bp_to_color(bp_seq):
2511 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2512 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2513 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2514 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2515 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2516 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2517 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2518 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2519 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2520
2521 base = bp_seq[0]
2522 color_seq = base
2523 for ch in bp_seq[1:]:
2524 color_seq += encode_dic[base + ch]
2525 base = ch
2526
2527 return color_seq
2528
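# Round-trip example for the two helpers above:
#   convert_color_to_bp("A0123") == "AACTA" and
#   convert_bp_to_color("AACTA") == "A0123"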
2529 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2530 if color:
2531 color_offset = 1
2532 read_seq_temp = convert_color_to_bp(read_seq)
2533
2534 seg_num = 1
2535 while seg_num + 1 < len(offsets):
2536 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2537 return
2538 seg_num += 1
2539 else:
2540 color_offset = 0
2541
2542 seg_num = 0
2543 last_seq_offset = 0
2544 while seg_num + 1 < len(offsets):
2545 f = out_segf[seg_num].file
2546 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2547 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2548 if color:
2549 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2550 else:
2551 print >> f, seg_seq
2552 if not fasta:
2553 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2554 print >> f, "+"
2555 print >> f, seg_qual
2556 seg_num += 1
2557 last_seq_offset = offsets[seg_num]
2558
2559 line_state = 0
2560 read_name = ""
2561 read_seq = ""
2562 read_quals = ""
2563 num_segments = 0
2564 offsets = []
2565 for line in zreads.file:
2566 if line.strip() == "":
2567 continue
2568 if line_state == 0:
2569 read_name = line.strip()
2570 elif line_state == 1:
2571 read_seq = line.strip()
2572
2573 read_length = len(read_seq)
2574 tmp_num_segments = read_length / segment_length
2575 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2576
2577 # Bowtie's minimum read length here is 20bp, so if the last segment
2578 # is between 20 and segment_length bp long, go ahead and write it out
2579 if read_length % segment_length >= min(segment_length - 2, 20):
2580 offsets.append(read_length)
2581 tmp_num_segments += 1
2582 else:
2583 offsets[-1] = read_length
2584
2585 if tmp_num_segments == 1:
2586 offsets = [0, read_length]
2587
2588 if tmp_num_segments > num_segments:
2589 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2590 num_segments = tmp_num_segments
2591
2592 if fasta:
2593 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2594 elif line_state == 2:
2595 line = line.strip()
2596 else:
2597 read_quals = line.strip()
2598 if not fasta:
2599 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2600
2601 line_state += 1
2602 if fasta:
2603 line_state %= 2
2604 else:
2605 line_state %= 4
2606 zreads.close()
2607 out_fnames=[]
2608 for zf in out_segfiles:
2609 zf.close()
2610 out_fnames.append(zf.fname)
2611 #return [o.fname for o in out_segfiles]
2612 return out_fnames
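# Example: a 75bp FASTQ read split with segment_length=25 yields 3 segments,
# written to <prefix>_seg1..3<extension> and renamed "<read_name>|0:0:3",
# "<read_name>|25:1:3", "<read_name>|50:2:3" (offset:segment_index:total).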
2613
2614 # Find possible splice junctions using the "closure search" strategy, and report
2615 # them in closures.juncs. Calls the executable closure_juncs
2616 def junctions_from_closures(params,
2617 sam_header_filename,
2618 left_maps,
2619 right_maps,
2620 ref_fasta):
2621 th_log("Searching for junctions via mate-pair closures")
2622
2623
2624 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2625 #if len(maps) == 0:
2626 # return None
2627 slash = left_maps[0].rfind('/')
2628 juncs_out = ""
2629 if slash != -1:
2630 juncs_out += left_maps[0][:slash+1]
2631 fusions_out = juncs_out
2632
2633 juncs_out += "closure.juncs"
2634 fusions_out += "closure.fusions"
2635
2636 juncs_log = open(logging_dir + "closure.log", "w")
2637 juncs_cmdpath=prog_path("closure_juncs")
2638 juncs_cmd = [juncs_cmdpath]
2639
2640 left_maps = ','.join(left_maps)
2641 right_maps = ','.join(right_maps)
2642
2643 juncs_cmd.extend(params.cmd())
2644 juncs_cmd.extend(["--sam-header", sam_header_filename,
2645 juncs_out,
2646 fusions_out,
2647 ref_fasta,
2648 left_maps,
2649 right_maps])
2650 try:
2651 print >> run_log, ' '.join(juncs_cmd)
2652 retcode = subprocess.call(juncs_cmd,
2653 stderr=juncs_log)
2654
2655 # closure_juncs returned an error
2656 if retcode != 0:
2657 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2658 # closure_juncs not found
2659 except OSError, o:
2660 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2661 th_logp(fail_str + "Error: closure_juncs not found on this system")
2662 die(str(o))
2663 return [juncs_out]
2664
2665 # Find possible junctions by examining coverage and split segments in the initial
2666 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2667 # segment.insertions, and segment.deletions. Calls the executable
2668 # segment_juncs
2669 def junctions_from_segments(params,
2670 sam_header_filename,
2671 left_reads,
2672 left_reads_map,
2673 left_seg_maps,
2674 right_reads,
2675 right_reads_map,
2676 right_seg_maps,
2677 unmapped_reads,
2678 reads_format,
2679 ref_fasta):
2680 if left_reads_map != left_seg_maps[0]:
2681 th_log("Searching for junctions via segment mapping")
2682 out_path=getFileDir(left_seg_maps[0])
2683 juncs_out=out_path+"segment.juncs"
2684 insertions_out=out_path+"segment.insertions"
2685 deletions_out =out_path+"segment.deletions"
2686 fusions_out = out_path+"segment.fusions"
2687
2688 left_maps = ','.join(left_seg_maps)
2689 log_fname = logging_dir + "segment_juncs.log"
2690 segj_log = open(log_fname, "w")
2691 segj_cmd = [prog_path("segment_juncs")]
2692
2693 segj_cmd.extend(params.cmd())
2694 segj_cmd.extend(["--sam-header", sam_header_filename,
2695 "--ium-reads", ",".join(unmapped_reads),
2696 ref_fasta,
2697 juncs_out,
2698 insertions_out,
2699 deletions_out,
2700 fusions_out,
2701 left_reads,
2702 left_reads_map,
2703 left_maps])
2704 if right_seg_maps:
2705 right_maps = ','.join(right_seg_maps)
2706 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2707 try:
2708 print >> run_log, " ".join(segj_cmd)
2709 retcode = subprocess.call(segj_cmd,
2710 preexec_fn=subprocess_setup,
2711 stderr=segj_log)
2712
2713 # segment_juncs returned an error
2714 if retcode != 0:
2715 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2716
2717 # segment_juncs not found
2718 except OSError, o:
2719 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2720 th_logp(fail_str + "Error: segment_juncs not found on this system")
2721 die(str(o))
2722
2723 return [juncs_out, insertions_out, deletions_out, fusions_out]
2724
2725 # Joins mapped segments into full-length read alignments via the executable
2726 # long_spanning_reads
2727 def join_mapped_segments(params,
2728 sam_header_filename,
2729 reads,
2730 ref_fasta,
2731 possible_juncs,
2732 possible_insertions,
2733 possible_deletions,
2734 possible_fusions,
2735 contig_seg_maps,
2736 spliced_seg_maps,
2737 alignments_out_name):
2738 rn=""
2739 if len(contig_seg_maps)>1:
2740 th_log("Joining segment hits")
2741 rn=".segs"
2742 else:
2743 th_log("Processing bowtie hits")
2744 contig_seg_maps = ','.join(contig_seg_maps)
2745
2746 possible_juncs = ','.join(possible_juncs)
2747 possible_insertions = ",".join(possible_insertions)
2748 possible_deletions = ",".join(possible_deletions)
2749 possible_fusions = ",".join(possible_fusions)
2750
2751 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2752 align_log = open(log_fname, "w")
2753 align_cmd = [prog_path("long_spanning_reads")]
2754
2755 align_cmd.extend(params.cmd())
2756 align_cmd += ["--sam-header", sam_header_filename]
2757
2758 b2_params = params.bowtie2_params
2759 max_penalty, min_penalty = b2_params.mp.split(',')
2760 align_cmd += ["--bowtie2-max-penalty", max_penalty,
2761 "--bowtie2-min-penalty", min_penalty]
2762
2763 align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2764
2765 read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2766 align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2767 "--bowtie2-read-gap-cont", read_gap_cont]
2768
2769 ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2770 align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2771 "--bowtie2-ref-gap-cont", ref_gap_cont]
2772
2773 align_cmd.append(ref_fasta)
2774 align_cmd.extend([reads,
2775 possible_juncs,
2776 possible_insertions,
2777 possible_deletions,
2778 possible_fusions,
2779 alignments_out_name,
2780 contig_seg_maps])
2781
2782 if spliced_seg_maps:
2783 spliced_seg_maps = ','.join(spliced_seg_maps)
2784 align_cmd.append(spliced_seg_maps)
2785
2786 try:
2787 print >> run_log, " ".join(align_cmd)
2788 join_proc=subprocess.call(align_cmd,
2789 stderr=align_log)
2790
2791 except OSError, o:
2792 die(fail_str+"Error: "+str(o))
2793
2794 # This class collects spliced and unspliced alignments for each of the
2795 # left and right read files provided by the user.
2796 class Maps:
2797 def __init__(self,
2798 unspliced_sam,
2799 seg_maps,
2800 unmapped_segs,
2801 segs):
2802 self.unspliced_sam = unspliced_sam
2803 self.seg_maps = seg_maps
2804 self.unmapped_segs = unmapped_segs
2805 self.segs = segs
2806
2807 # Map2GTF stuff
2808 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2809 """Convert transcriptome-coordinate alignments to genomic coordinates (map2gtf).
2810 
2811 Arguments:
2812 - `params`: TopHat parameters
2813 - `gtf_fname`: File name pointing to the annotation.
2814 - `reads`: The reads to convert coords (in Bowtie format).
2815 - `out_fname`: The file name pointing to the output.
2816 """
2817 m2g_cmd = [prog_path("map2gtf")]
2818 m2g_cmd.extend(params.cmd())
2819 m2g_cmd += ["--sam-header", sam_header_filename]
2820 m2g_cmd.append(gtf_fname)
2821 m2g_cmd.append(reads) #could be BAM file
2822 m2g_cmd.append(out_fname)
2823 fbasename = getFileBaseName(reads)
2824 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2825 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2826
2827 try:
2828 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2829 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2830 ret = subprocess.call(m2g_cmd,
2831 stdout=open(m2g_log, "w"),
2832 stderr=open(m2g_err, "w"))
2833 if ret != 0:
2834 die(fail_str + " Error: map2gtf returned an error")
2835 except OSError, o:
2836 err_msg = fail_str + str(o)
2837 die(err_msg + "\n")
2838
2839
2840 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2841 """ Make the transcriptome from a GTF.
2842 
2843 Arguments:
2844 - `trans_gtf`: GTF/GFF file with the transcript annotation.
2845 - `genome`: reference genome FASTA file.
2846 - `out_fname`: output FASTA file for the transcript sequences.
2847 """
2848 # build the gtf_to_fasta command line
2849 g2f_cmd = [prog_path("gtf_to_fasta")]
2850 g2f_cmd.extend(params.cmd())
2851 g2f_cmd.append(trans_gtf)
2852 g2f_cmd.append(genome)
2853 g2f_cmd.append(out_fname)
2854
2855 g2f_log = logging_dir + "g2f.out"
2856 g2f_err = logging_dir + "g2f.err"
2857
2858 try:
2859 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2860 ret = subprocess.call(g2f_cmd,
2861 stdout = open(g2f_log, "w"),
2862 stderr = open(g2f_err, "w"))
2863 if ret != 0:
2864 die(fail_str + " Error: gtf_to_fasta returned an error.")
2865 except OSError, o:
2866 err_msg = fail_str + str(o)
2867 die(err_msg + "\n")
2868
2869 def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
2870 """ Main GTF mapping function
2871
2872 Arguments:
2873 - `params`: The TopHat parameters.
2874 - `ref_fasta`: The reference genome.
2875 - `left_reads`: A list of reads.
2876 - `right_reads`: A list of reads (empty if single-end).
2877
2878 """
2879 test_input_file(params.gff_annotation)
2880
2881 # th_log("Reading in GTF file: " + params.gff_annotation)
2882 # transcripts = gtf_to_transcripts(params.gff_annotation)
2883
2884 gtf_name = getFileBaseName(params.gff_annotation)
2885 m2g_bwt_idx = None
2886 t_out_dir = tmp_dir
2887 if params.transcriptome_index and not params.transcriptome_outdir:
2888 m2g_bwt_idx = params.transcriptome_index
2889 th_log("Using pre-built transcriptome index...")
2890 else:
2891 th_log("Creating transcriptome data files...")
2892 if params.transcriptome_outdir:
2893 t_out_dir=params.transcriptome_outdir+"/"
2894 m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
2895 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2896 m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
2897
2898 get_index_sam_header(params, m2g_bwt_idx)
2899
2900 mapped_gtf_list = []
2901 unmapped_gtf_list = []
2902 # do the initial mapping in GTF coordinates
2903 for reads in [left_reads, right_reads]:
2904 if reads == None or os.path.getsize(reads) < 25:
2905 continue
2906 fbasename = getFileBaseName(reads)
2907 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
2908 #if use_zpacker:
2909 # mapped_gtf_out+=".z"
2910
2911 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
2912 #if use_BWT_FIFO:
2913 # unmapped_gtf += ".z"
2914
2915 (mapped_gtf_map, unmapped) = bowtie(params,
2916 m2g_bwt_idx,
2917 [reads],
2918 "fastq",
2919 params.t_mismatches,
2920 mapped_gtf_out,
2921 unmapped_gtf,
2922 "", _reads_vs_T)
2923 mapped_gtf_list.append(mapped_gtf_map)
2924 unmapped_gtf_list.append(unmapped)
2925
2926 sam_gtf_list = []
2927 for reads in mapped_gtf_list:
2928 fbasename = getFileBaseName(reads)
2929 sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2930 m2g_convert_coords(params,
2931 genome_sam_header_filename,
2932 params.gff_annotation,
2933 reads,
2934 sam_out_fname)
2935 sam_gtf_list.append(sam_out_fname)
2936
2937 if len(sam_gtf_list) < 2:
2938 sam_gtf_list.append(None)
2939 unmapped_gtf_list.append(None)
2940
2941 return (sam_gtf_list, unmapped_gtf_list)
2942 # end Map2GTF
2943
2944 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
2945 ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
2946 ##
2947 #this is accomplished by a special prep_reads usage (triggered by --flt-hits)
2948
2949 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
2950 out_mappings,
2951 target_reads,
2952 params.preflt_data[ri].mappings)
2953 um_reads = open(out_unmapped, "wb")
2954 log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
2955 filter_log = open(log_fname,"w")
2956
2957 shell_cmd = " ".join(prep_cmd)
2958 #add the compression pipe
2959 zip_cmd=[]
2960 if use_zpacker:
2961 zip_cmd=[ params.system_params.zipper ]
2962 zip_cmd.extend(params.system_params.zipper_opts)
2963 zip_cmd.extend(['-c','-'])
2964 shell_cmd +=' | '+' '.join(zip_cmd)
2965 shell_cmd += ' >' + out_unmapped
2966 retcode=0
2967 try:
2968 print >> run_log, shell_cmd
2969 if use_zpacker:
2970 prep_proc = subprocess.Popen(prep_cmd,
2971 stdout=subprocess.PIPE,
2972 stderr=filter_log)
2973 zip_proc = subprocess.Popen(zip_cmd,
2974 preexec_fn=subprocess_setup,
2975 stdin=prep_proc.stdout,
2976 stderr=tophat_log, stdout=um_reads)
2977 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
2978 zip_proc.communicate()
2979 retcode=prep_proc.poll()
2980 if retcode==0:
2981 retcode=zip_proc.poll()
2982 else:
2983 retcode = subprocess.call(prep_cmd,
2984 stdout=um_reads,
2985 stderr=filter_log)
2986 if retcode:
2987 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2988
2989 except OSError, o:
2990 errmsg=fail_str+str(o)
2991 die(errmsg+"\n"+log_tail(log_fname))
2992 um_reads.close()
2993
2994 return (out_mappings, out_unmapped)
2995
2996
2997 # The main alignment routine of TopHat. This function executes most of the
2998 # workflow producing a set of candidate alignments for each cDNA fragment in a
2999 # pair of SAM alignment files (for paired end reads).
3000 def spliced_alignment(params,
3001 bwt_idx_prefix,
3002 sam_header_filename,
3003 ref_fasta,
3004 read_len,
3005 segment_len,
3006 prepared_reads,
3007 user_supplied_junctions,
3008 user_supplied_insertions,
3009 user_supplied_deletions):
3010
3011 possible_juncs = []
3012 possible_juncs.extend(user_supplied_junctions)
3013
3014 possible_insertions = []
3015 possible_insertions.extend(user_supplied_insertions)
3016 possible_deletions = []
3017 possible_deletions.extend(user_supplied_deletions)
3018 possible_fusions = []
3019
3020 left_reads, right_reads = prepared_reads
3021
3022 maps = [[], []] # maps[0] = left reads mapping data, maps[1] = right reads mapping data
3023
3024 #single_segments = False
3025
3026 # Before anything, map the reads using Map2GTF (if using annotation)
3027 m2g_maps = [ None, None ] # left, right
3028 initial_reads = [ left_reads, right_reads ]
3029
3030 if params.gff_annotation:
3031 (mapped_gtf_list, unmapped_gtf_list) = \
3032 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
3033
3034 m2g_left_maps, m2g_right_maps = mapped_gtf_list
3035 m2g_maps = [m2g_left_maps, m2g_right_maps]
3036 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
3037 # The case where the user doesn't want to map to anything other
3038 # than the transcriptome OR we have no unmapped reads
3039 maps[0] = [m2g_left_maps]
3040 if right_reads:
3041 maps[1] = [m2g_right_maps]
3042 return maps
3043 # Feed the unmapped reads into spliced_alignment()
3044 initial_reads = unmapped_gtf_list[:]
3045 th_log("Resuming TopHat pipeline with unmapped reads")
3046
3047 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
3048 #to be extracted into segment_juncs.fa
3049
3050 num_segs = int(read_len / segment_len)
3051 if (read_len % segment_len) >= min(segment_len-2, 20):
3052 #remainder is shorter but long enough to become a new segment
3053 num_segs += 1
3054 else:
3055 # the last segment is longer
3056 if num_segs>1: max_seg_len += (read_len % segment_len)
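# e.g. read_len=75, segment_len=25 gives num_segs=3 (no remainder);
# read_len=60 leaves a 10bp remainder (<20), so num_segs stays 2 and
# max_seg_len grows to 35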
3057
3058 if num_segs <= 1:
3059 th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3060
3061 # Using the num_segs value returned by check_reads(),
3062 # decide which junction discovery strategy to use
3063 if num_segs < 3:
3064 #if params.butterfly_search != False:
3065 # params.butterfly_search = True
3066 if params.coverage_search != False:
3067 params.coverage_search = True
3068 if num_segs == 1:
3069 segment_len = read_len
3070 else: #num_segs >= 3:
3071 # if we have at least three segments, just use split segment search,
3072 # which is the most sensitive and specific, fastest, and lightest-weight.
3073 # so unless specifically requested, disable the other junction searches
3074 if params.closure_search != True:
3075 params.closure_search = False
3076 if params.coverage_search != True:
3077 params.coverage_search = False
3078 if params.butterfly_search != True:
3079 params.butterfly_search = False
3080
3081 # Perform the first part of the TopHat work flow on the left and right
3082 # reads of paired ends separately - we'll use the pairing information later
3083 have_left_IUM=False
3084 for ri in (0,1):
3085 reads=initial_reads[ri]
3086 if reads == None or not fileExists(reads,25):
3087 continue
3088 fbasename=getFileBaseName(reads)
3089 unspliced_out = tmp_dir + fbasename + ".mapped"
3090 unspliced_sam = None
3091 unmapped_reads = None
3092 #if use_zpacker: unspliced_out+=".z"
3093 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3094 if params.prefilter_multi:
3095 #unmapped_unspliced += ".z"
3096 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3097 else:
3098 # Perform the initial Bowtie mapping of the full length reads
3099 (unspliced_sam, unmapped_reads) = bowtie(params,
3100 bwt_idx_prefix,
3101 [reads],
3102 "fastq",
3103 params.genome_read_mismatches,
3104 unspliced_out,
3105 unmapped_unspliced,
3106 "",
3107 _reads_vs_G)
3108
3109 # daehwan - check this out
3110 # params.bowtie2 = False
3111 # check_bowtie(params.bowtie2)
3112
3113 seg_maps = []
3114 unmapped_segs = []
3115 segs = []
3116 have_IUM = fileExists(unmapped_reads,25)
3117 if ri==0 and have_IUM:
3118 have_left_IUM = True
3119 if num_segs > 1 and have_IUM:
3120 # split up the IUM reads into segments
3121 # unmapped_reads can be in BAM format
3122 read_segments = split_reads(unmapped_reads,
3123 tmp_dir + fbasename,
3124 False,
3125 params,
3126 segment_len)
3127
3128 # Map each segment file independently with Bowtie
3129 for i in range(len(read_segments)):
3130 seg = read_segments[i]
3131 fbasename=getFileBaseName(seg)
3132 seg_out = tmp_dir + fbasename
3133 #if use_zpacker: seg_out += ".z"
3134 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3135 if use_BWT_FIFO:
3136 unmapped_seg += ".z"
3137 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3138 (seg_map, unmapped) = bowtie(params,
3139 bwt_idx_prefix,
3140 [seg],
3141 "fastq",
3142 params.segment_mismatches,
3143 seg_out,
3144 unmapped_seg,
3145 extra_output,
3146 _segs_vs_G)
3147 seg_maps.append(seg_map)
3148 unmapped_segs.append(unmapped)
3149 segs.append(seg)
3150
3151 # Collect the segment maps for left and right reads together
3152 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3153 else:
3154 # if there's only one segment, just collect the initial map as the only
3155 # map to be used downstream for coverage-based junction discovery
3156 read_segments = [reads]
3157 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3158
3159 # XXX: At this point if using M2G, have three sets of reads:
3160 # mapped to transcriptome, mapped to genome, and unmapped (potentially
3161 # spliced or poly-A tails) - hp
3162
3163 # Unless the user asked not to discover new junctions, start that process
3164 # here
3165 left_reads_map = maps[0].unspliced_sam
3166 left_seg_maps = maps[0].seg_maps
3167 unmapped_reads = maps[0].unmapped_segs
3168 if right_reads:
3169 right_reads_map = maps[1].unspliced_sam
3170 right_seg_maps = maps[1].seg_maps
3171 unmapped_reads.extend(maps[1].unmapped_segs)
3172 else:
3173 right_reads_map = None
3174 right_seg_maps = None
3175
3176 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3177 # Call segment_juncs to infer a list of possible splice junctions from
3178 # the regions of the genome covered in the initial and segment maps
3179 #if params.find_novel_juncs:
3180 #TODO: in m2g case, we might want to pass the m2g mappings as well,
3181 # or perhaps the GTF file directly
3182 # -> this could improve alternative junction detection?
3183 juncs = junctions_from_segments(params,
3184 sam_header_filename,
3185 left_reads,
3186 left_reads_map,
3187 left_seg_maps,
3188 right_reads,
3189 right_reads_map,
3190 right_seg_maps,
3191 unmapped_reads,
3192 "fastq",
3193 ref_fasta)
3194 if os.path.getsize(juncs[0]) != 0:
3195 possible_juncs.append(juncs[0])
3196 if params.find_novel_indels:
3197 if os.path.getsize(juncs[1]) != 0:
3198 possible_insertions.append(juncs[1])
3199 if os.path.getsize(juncs[2]) != 0:
3200 possible_deletions.append(juncs[2])
3201 if params.find_novel_fusions:
3202 if os.path.getsize(juncs[3]) != 0:
3203 possible_fusions.append(juncs[3])
3204 # Optionally, and for paired reads only, use a closure search to
3205 # discover additional junctions
3206 if params.closure_search and left_reads and right_reads:
3207 juncs = junctions_from_closures(params,
3208 sam_header_filename,
3209 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
3210 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
3211 ref_fasta)
3212 if os.path.getsize(juncs[0]) != 0:
3213 possible_juncs.extend(juncs)
3214
3215 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3216 spliced_seg_maps = None
3217 junc_idx_prefix = None
3218 else:
3219 junc_idx_prefix = "segment_juncs"
3220 if len(possible_insertions) == 0:
3221 possible_insertions.append(os.devnull)
3222 # print >> sys.stderr, "Warning: insertions database is empty!"
3223 if len(possible_deletions) == 0:
3224 possible_deletions.append(os.devnull)
3225 # print >> sys.stderr, "Warning: deletions database is empty!"
3226 if len(possible_juncs) == 0:
3227 possible_juncs.append(os.devnull)
3228 th_logp("Warning: junction database is empty!")
3229 if len(possible_fusions) == 0:
3230 possible_fusions.append(os.devnull)
3231 if junc_idx_prefix:
3232 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3233 3,
3234 max_seg_len,
3235 junc_idx_prefix,
3236 possible_juncs,
3237 possible_insertions,
3238 possible_deletions,
3239 possible_fusions,
3240 ref_fasta,
3241 params.read_params.color)
3242
3243 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3244
3245 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3246 # index with Bowtie
3247 # for reads in [left_reads, right_reads]:
3248 for ri in (0,1):
3249 reads = initial_reads[ri]
3250 spliced_seg_maps = []
3251
3252 if reads == None or os.path.getsize(reads)<25:
3253 continue
3254
3255 m2g_map = m2g_maps[ri]
3256
3257 if junc_idx_prefix :
3258 i = 0
3259 for seg in maps[ri].segs:
3260 #search each segment
3261 fsegname = getFileBaseName(seg)
3262 seg_out = tmp_dir + fsegname + ".to_spliced"
3263 #if use_zpacker: seg_out += ".z"
3264 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3265 (seg_map, unmapped) = bowtie(params,
3266 tmp_dir + junc_idx_prefix,
3267 [seg],
3268 "fastq",
3269 params.segment_mismatches,
3270 seg_out,
3271 None,
3272 extra_output,
3273 _segs_vs_J)
3274 spliced_seg_maps.append(seg_map)
3275 i += 1
3276
3277 # Join the contiguous and spliced segment hits into full length
3278 # read alignments
3279 rfname=getFileBaseName(reads)
3280 rfdir=getFileDir(reads)
3281 mapped_reads = rfdir + rfname + ".candidates.bam"
3282 # -- spliced mappings built from all segment mappings vs genome and junc_db
3283 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3284 #unspl_bwtfile = maps[ri].unspliced_bwt
3285 unspl_samfile = maps[ri].unspliced_sam
3286 join_mapped_segments(params,
3287 sam_header_filename,
3288 reads,
3289 ref_fasta,
3290 possible_juncs,
3291 possible_insertions,
3292 possible_deletions,
3293 possible_fusions,
3294 maps[ri].seg_maps,
3295 spliced_seg_maps,
3296 mapped_reads)
3297
3298 if num_segs > 1 or m2g_map:
3299 # Merge the spliced and unspliced full length alignments into
3300 # a single SAM file.
3301 # The individual SAM files are all already sorted in
3302 # increasing read ID order.
3303 # NOTE: We also should be able to address bug #134 here, by replacing
3304 # contiguous alignments that poke into an intron by a small amount by
3305 # the correct spliced alignment.
3306
3307 try:
3308 merge_cmd = [prog_path("bam_merge"),
3309 "--index-outfile", merged_map + ".index",
3310 "--sam-header", sam_header_filename,
3311 merged_map]
3312
3313 if num_segs > 1:
3314 merge_cmd += [ maps[ri].unspliced_sam ]
3315 if m2g_map:
3316 merge_cmd += [ m2g_map ]
3317
3318 if os.path.exists(mapped_reads):
3319 merge_cmd += [ mapped_reads ]
3320 else:
3321 for bam_i in range(0, params.system_params.num_threads):
3322 temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3323 if os.path.exists(temp_bam):
3324 merge_cmd += [ temp_bam ]
3325 else:
3326 break
3327
3328 print >> run_log, " ".join(merge_cmd)
3329 ret = subprocess.call( merge_cmd,
3330 stderr=open(logging_dir + "bam_merge.log", "w") )
3331 if ret != 0:
3332 die(fail_str+"Error executing: "+" ".join(merge_cmd))
3333 except OSError, o:
3334 die(fail_str+"Error: "+str(o))
3335 maps[ri] = [merged_map]
3336
3337 if not params.system_params.keep_tmp:
3338 if os.path.exists(mapped_reads): os.remove(mapped_reads) # may be absent if per-thread pieces were merged instead
3339 else:
3340 # no segments or transcriptome mappings, so no merge is needed
3341 # because join_mapped_segments() produced the final BAM already
3342 os.rename(mapped_reads, merged_map)
3343 maps[ri] = [merged_map]
3344 if not params.system_params.keep_tmp:
3345 os.remove(unspl_samfile)
3346
3347 return maps
3348
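# --- Illustration only (not called by the pipeline) ---
# bam_merge relies on every input file being sorted by read ID, so it can
# do a simple k-way merge. Below is a minimal sketch of that invariant in
# pure Python, assuming each record is a (read_id, alignment) tuple and
# each input iterator is already sorted; merge_by_read_id is a
# hypothetical name, not part of TopHat:
def merge_by_read_id(*sorted_streams):
    import heapq
    # heapq.merge() lazily interleaves pre-sorted iterators, mirroring
    # what the external bam_merge tool does with the BAM files above
    return heapq.merge(*sorted_streams)
# e.g. list(merge_by_read_id([(1, "a"), (3, "c")], [(2, "b")]))
#      == [(1, "a"), (2, "b"), (3, "c")]
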
3349 # rough equivalent of the 'which' command for locating external programs
3350 # (the script's own directory is tried first, then each entry in the PATH envvar)
3351 def which(program):
3352 def is_executable(fpath):
3353 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
3354 fpath, fname = os.path.split(program)
3355 if fpath:
3356 if is_executable(program):
3357 return program
3358 else:
3359 progpath = os.path.join(bin_dir, program)
3360 if is_executable(progpath):
3361 return progpath
3362 for path in os.environ["PATH"].split(os.pathsep):
3363 progpath = os.path.join(path, program)
3364 if is_executable(progpath):
3365 return progpath
3366 return None
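
# Example (hypothetical result): which("bowtie2") returns a path such as
# "/usr/local/bin/bowtie2" if the program sits next to this script or on
# $PATH, and None otherwise -- callers must handle the None case.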
3367
3368 def prog_path(program):
3369 progpath = which(program)
3370 if progpath is None:
3371 die("Error locating program: "+program)
3372 return progpath
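
# A convenience that could be built on which(): verify every external
# dependency up front so a missing tool fails the run early rather than
# mid-pipeline. This is a hypothetical sketch (not called anywhere in
# this script); the program names in the example are only illustrative.
def check_external_programs(programs):
    missing = [p for p in programs if which(p) is None]
    if missing:
        die("Error locating program(s): " + ", ".join(missing))
# e.g. check_external_programs(["bowtie2", "samtools", "bam_merge"])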
3373
3374 # FIXME: this should get set during the make dist autotools phase of the build
3375 def get_version():
3376 return "2.0.0"
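
# One way to address the FIXME above: let the autotools build generate a
# tiny module (hypothetically tophat_version.py, defining VERSION) and
# fall back to the hard-coded string when running from an unbuilt source
# tree. A sketch under that assumption:
def get_version_built():
    try:
        from tophat_version import VERSION  # would be written at build time
        return VERSION
    except ImportError:
        return "2.0.0"  # fallback for source-tree runs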
3377
3378 def mlog(msg):
3379 print >> sys.stderr, "[DBGLOG]:"+msg
3380
3381 def test_input_file(filename):
3382 try:
3383 open(filename, "r").close()
3384 except IOError:
3385 die("Error: cannot open file %s" % filename)
3386 return
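
# A stricter variant of test_input_file(), sketched for illustration only
# (the pipeline does not call it): besides being openable, the file must
# be non-empty, mirroring the getsize() checks applied to the GTF/GFF
# inputs in main() below.
def test_nonempty_input_file(filename):
    try:
        open(filename, "r").close()
    except IOError:
        die("Error: cannot open file %s" % filename)
    if os.path.getsize(filename) == 0:
        die("Error: file %s is empty" % filename)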
3387
3388 def main(argv=None):
3389 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
3390
3391 # Initialize default parameter values
3392 params = TopHatParams()
3393 try:
3394 if argv is None:
3395 argv = sys.argv
3396 args = params.parse_options(argv)
3397 params.check()
3398
3399 bwt_idx_prefix = args[0]
3400 left_reads_list = args[1]
3401 left_quals_list, right_quals_list = [], []
3402 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
3403 if params.read_params.mate_inner_dist is None:
3404 params.read_params.mate_inner_dist = 50
3405 #die("Error: you must set the mean inner distance between mates with -r")
3406
3407 right_reads_list = args[2]
3408 if params.read_params.quals:
3409 left_quals_list = args[3]
3410 right_quals_list = args[4]
3411 else:
3412 right_reads_list = None
3413 if params.read_params.quals:
3414 left_quals_list = args[2]
3415
3416 start_time = datetime.now()
3417 prepare_output_dir()
3418 init_logger(logging_dir + "tophat.log")
3419
3420 th_logp()
3421 th_log("Beginning TopHat run (v"+get_version()+")")
3422 th_logp("-----------------------------------------------")
3423
3424 global run_log
3425 run_log = open(logging_dir + "run.log", "w", 0)
3426 global run_cmd
3427 run_cmd = " ".join(argv)
3428 print >> run_log, run_cmd
3429
3430 check_bowtie(params)
3431 check_samtools()
3432
3433 # Validate all the input files, check all prereqs before committing
3434 # to the run
3435 if params.gff_annotation:
3436 if not os.path.exists(params.gff_annotation):
3437 die("Error: cannot find transcript file %s" % params.gff_annotation)
3438 if os.path.getsize(params.gff_annotation)<10:
3439 die("Error: invalid transcript file %s" % params.gff_annotation)
3440
3441 if params.transcriptome_index:
3442 if params.gff_annotation:
3443 #gff file given, so transcriptome data will be written there
3444 gff_basename = getFileBaseName(params.gff_annotation)
3445 #just in case, check whether the transcriptome files are already there (-G/--GTF given again by mistake)
3446 tpath, tname = os.path.split(params.transcriptome_index)
3447 new_subdir=False
3448 if tpath in (".", "./") or not tpath:
3449 if not os.path.exists(params.transcriptome_index):
3450 os.makedirs(params.transcriptome_index)
3451 new_subdir=True
3452 if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
3453 params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
3454 gff_out=params.transcriptome_index+".gff"
3455 if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
3456 #generate the transcriptome data files
3457 tpath, tname = os.path.split(params.transcriptome_index)
3458 params.transcriptome_outdir=tpath
3459 t_gff=params.transcriptome_index+".gff"
3460 if params.transcriptome_outdir:
3461 #will create the transcriptome data files
3462 if not os.path.exists(params.transcriptome_outdir):
3463 os.makedirs(params.transcriptome_outdir)
3464 copy(params.gff_annotation, t_gff)
3465 else:
3466 #try to use existing transcriptome data files
3467
3468 if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
3469 die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3470 check_bowtie_index(params.transcriptome_index, params.bowtie2)
3471 params.gff_annotation=t_gff
3472 #end @ transcriptome_index given
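# (Illustration: whatever prefix --transcriptome-index resolves to, the
# block above leaves params.gff_annotation pointing at <prefix>.gff and
# expects/creates the matching Bowtie index files under that same prefix.)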
3473
3474 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)
3475
3476 th_log("Generating SAM header for "+bwt_idx_prefix)
3477 sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
3478 #if not params.skip_check_reads:
3479 reads_list = left_reads_list
3480 if right_reads_list:
3481 reads_list = reads_list + "," + right_reads_list
3482 params.read_params = check_reads_format(params, reads_list)
3483
3484 user_supplied_juncs = []
3485 user_supplied_insertions = []
3486 user_supplied_deletions = []
3487 user_supplied_fusions = []
3488 global gtf_juncs
3489 if params.gff_annotation and params.find_GFF_juncs:
3490 test_input_file(params.gff_annotation)
3491 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3492 ##-- we shouldn't need these junctions in user_supplied_juncs anymore, because map2gtf now does a much better job,
3493 ## but we still need them loaded in gtf_juncs for later splice verification
3494 if found_juncs:
3495 ## and not params.gff_annotation:
3496 user_supplied_juncs.append(gtf_juncs)
3497 #else:
3498 # gtf_juncs = None
3499 if params.raw_junctions:
3500 test_input_file(params.raw_junctions)
3501 user_supplied_juncs.append(params.raw_junctions)
3502
3503 if params.raw_insertions:
3504 test_input_file(params.raw_insertions)
3505 user_supplied_insertions.append(params.raw_insertions)
3506
3507 if params.raw_deletions:
3508 test_input_file(params.raw_deletions)
3509 user_supplied_deletions.append(params.raw_deletions)
3510
3511 global unmapped_reads_fifo
3512 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
3513
3514 # Now start the time-consuming stuff
3515 if params.prefilter_multi:
3516 sides=("left","right")
3517 read_lists=(left_reads_list, right_reads_list)
3518 for ri in (0,1):
3519 reads_list=read_lists[ri]
3520 if not reads_list:
3521 continue
3522 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
3523 side_imap = tmp_dir + sides[ri]+"_im"
3524 #if use_zpacker: side_imap+=".z"
3525 side_ium = tmp_dir + sides[ri]+"_ium.fq"
3526 if use_BWT_FIFO:
3527 side_ium += ".z"
3528 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
3529 rdlist=reads_list.split(',')
3530 bwt=bowtie(params, bwt_idx_prefix, rdlist,
3531 params.read_params.reads_format,
3532 params.genome_read_mismatches,
3533 side_imap, side_ium,
3534 "", _reads_vs_G, params.preflt_data[ri].multihit_reads)
3535 params.preflt_data[ri].mappings = bwt[0] # initial mappings
3536 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
3537
3538 th_log("Preparing reads")
3539 left_kept_reads, left_reads_info = prep_reads(params,
3540 left_reads_list,
3541 left_quals_list,
3542 "left",
3543 params.preflt_data[0].multihit_reads)
3544 min_read_len=left_reads_info.min_len
3545 max_read_len=left_reads_info.max_len
3546 if right_reads_list:
3547 right_kept_reads, right_reads_info = prep_reads(params,
3548 right_reads_list,
3549 right_quals_list,
3550 "right",
3551 params.preflt_data[1].multihit_reads)
3552 min_read_len=min(right_reads_info.min_len, min_read_len)
3553 max_read_len=max(right_reads_info.max_len, max_read_len)
3554 else:
3555 right_kept_reads = None
3556 seed_len=params.read_params.seed_length
3557 if seed_len: #if the read length was explicitly given
3558 seed_len = max(seed_len, min_read_len)
3559 #it can't be smaller than the minimum read length observed
3560 else:
3561 seed_len = max_read_len
3562 params.read_params.seed_length=seed_len
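# (Worked example: with --seed-length 20 but a shortest kept read of 25bp,
# the seed is raised to 25; with no explicit value it simply becomes the
# longest observed read length.)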
3563 # turn off integer-quals
3564 if params.read_params.integer_quals:
3565 params.read_params.integer_quals = False
3566
3567 input_reads = [left_kept_reads, right_kept_reads]
3568 mappings = spliced_alignment(params,
3569 bwt_idx_prefix,
3570 sam_header_filename,
3571 ref_fasta,
3572 params.read_params.seed_length,
3573 params.segment_length,
3574 input_reads,
3575 user_supplied_juncs,
3576 user_supplied_insertions,
3577 user_supplied_deletions)
3578
3579 compile_reports(params,
3580 sam_header_filename,
3581 ref_fasta,
3582 mappings,
3583 input_reads,
3584 params.gff_annotation)
3585
3586 if not params.system_params.keep_tmp:
3587 for m in mappings[0]:
3588 os.remove(m)
3589 if left_kept_reads:
3590 os.remove(left_kept_reads)
3591 for m in mappings[1]:
3592 os.remove(m)
3593 if right_kept_reads:
3594 os.remove(right_kept_reads)
3595 tmp_files = os.listdir(tmp_dir)
3596 for t in tmp_files:
3597 os.remove(tmp_dir+t)
3598 os.rmdir(tmp_dir)
3599
3600 finish_time = datetime.now()
3601 duration = finish_time - start_time
3602 th_logp("-----------------------------------------------")
3603 th_log("Run complete: %s elapsed" % formatTD(duration))
3604
3605 except Usage, err:
3606 th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
3607 th_logp(" for detailed help see http://tophat.cbcb.umd.edu/manual.html")
3608 return 2
3609
3610
3611 if __name__ == "__main__":
3612 sys.exit(main())
