root/gclib/tophat_cpp/tophat
Revision: 164
Committed: Thu Feb 9 05:30:41 2012 UTC by gpertea
File size: 149805 byte(s)
Log Message:
wip

Line File contents
1 #!/usr/bin/env python
2 # encoding: utf-8
3
4 """
5 tophat.py
6
7 Created by Cole Trapnell on 2008-12-25.
8 Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 """
11 import sys
12 try:
13 import psyco
14 psyco.full()
15 except ImportError:
16 pass
17
18 import getopt
19 import subprocess
20 import errno
21 import os
22 import warnings
23 import re
24 import signal
25 from datetime import datetime, date, time
26 from shutil import copy
27 import logging
28
29 use_message = '''
30 TopHat maps short sequences from spliced transcripts to whole genomes.
31
32 Usage:
33 tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
34 [quals1,[quals2,...]] [quals1[,quals2,...]]
35
36 Options:
37 -v/--version
38 -o/--output-dir <string> [ default: ./tophat_out ]
39 --bowtie1 [ default: bowtie2 ]
40 -a/--min-anchor-length <int> [ default: 8 ]
41 -m/--splice-mismatches <0-2> [ default: 0 ]
42 -i/--min-intron-length <int> [ default: 50 ]
43 -I/--max-intron-length <int> [ default: 500000 ]
44 -g/--max-multihits <int> [ default: 20 ]
45 -x/--transcriptome-max-hits <int> [ default: 60 ]
46 -n/--transcriptome-mismatches <int> [ default: 1 ]
47 -M/--prefilter-multihits ( for -G/--GTF option, enable
48 an initial bowtie search
49 against the genome )
50 --max-insertion-length <int> [ default: 3 ]
51 --max-deletion-length <int> [ default: 3 ]
52 --solexa-quals
53 --solexa1.3-quals (same as phred64-quals)
54 --phred64-quals (same as solexa1.3-quals)
55 -Q/--quals
56 --integer-quals
57 -C/--color (Solid - color space)
58 --color-out
59 --library-type <string> (fr-unstranded, fr-firststrand,
60 fr-secondstrand)
61 -p/--num-threads <int> [ default: 1 ]
62 -G/--GTF <filename> (GTF/GFF with known transcripts)
63 --transcriptome-index <bwtidx> (transcriptome bowtie index)
64 -T/--transcriptome-only (map only to the transcriptome)
65 -j/--raw-juncs <filename>
66 --insertions <filename>
67 --deletions <filename>
68 -r/--mate-inner-dist <int>
69 --mate-std-dev <int> [ default: 20 ]
70 --no-novel-juncs
71 --no-novel-indels
72 --fusion-search
73 --fusion-anchor-length <int> [ default: 20 ]
74 --fusion-min-dist <int> [ default: 10000000 ]
75 --fusion-read-mismatches <int> [ default: 2 ]
76 --fusion-multireads <int> [ default: 2 ]
77 --fusion-multipairs <int> [ default: 2 ]
78 --no-gtf-juncs
79 --no-coverage-search
80 --coverage-search
81 --no-closure-search
82 --closure-search
83 --microexon-search
84 --butterfly-search
85 --no-butterfly-search
86 --keep-tmp
87 --tmp-dir <dirname> [ default: <output_dir>/tmp ]
88 -z/--zpacker <program> [ default: gzip ]
89 -X/--unmapped-fifo [use mkfifo to compress more temporary
90 files for color space reads]
91
92 Advanced Options:
93 --report-secondary-alignments
94 --genome-read-mismatches <int> [ default: 2 ]
95 -N/--read-mismatches <int> [ default: 2 ]
96 --segment-mismatches <int> [ default: 2 ]
97 --segment-length <int> [ default: 25 ]
98 --bowtie-n [ default: bowtie -v ]
99 --min-closure-exon <int> [ default: 100 ]
100 --min-closure-intron <int> [ default: 50 ]
101 --max-closure-intron <int> [ default: 5000 ]
102 --min-coverage-intron <int> [ default: 50 ]
103 --max-coverage-intron <int> [ default: 20000 ]
104 --min-segment-intron <int> [ default: 50 ]
105 --max-segment-intron <int> [ default: 500000 ]
106 --no-sort-bam (Output BAM is not coordinate-sorted)
107 --no-convert-bam (Do not convert to bam format.
108 Output is <output_dir>/accepted_hits.sam.
109 Implies --no-sort-bam)
110 --keep-fasta-order
111 --allow-partial-mapping
112
113 Bowtie2 related options:
114 Preset options in --end-to-end mode (local alignment is not used in TopHat2)
115 --b2-very-fast
116 --b2-fast
117 --b2-sensitive
118 --b2-very-sensitive
119
120 Alignment options
121 --b2-N <int> [ default: 0 ]
122 --b2-L <int> [ default: 20 ]
123 --b2-i <func> [ default: S,1,1.25 ]
124 --b2-n-ceil <func> [ default: L,0,0.15 ]
125 --b2-gbar <int> [ default: 4 ]
126
127 Scoring options
128 --b2-mp <int>,<int> [ default: 6,2 ]
129 --b2-np <int> [ default: 1 ]
130 --b2-rdg <int>,<int> [ default: 5,3 ]
131 --b2-rfg <int>,<int> [ default: 5,3 ]
132 --b2-score-min <func> [ default: L,-0.6,-0.6 ]
133
134 Effort options
135 --b2-D <int> [ default: 15 ]
136 --b2-R <int> [ default: 2 ]
137
138
139 SAM Header Options (for embedding sequencing run metadata in output):
140 --rg-id <string> (read group ID)
141 --rg-sample <string> (sample ID)
142 --rg-library <string> (library ID)
143 --rg-description <string> (descriptive string, no tabs allowed)
144 --rg-platform-unit <string> (e.g. Illumina lane ID)
145 --rg-center <string> (sequencing center name)
146 --rg-date <string> (ISO 8601 date of the sequencing run)
147 --rg-platform <string> (Sequencing platform descriptor)
148 '''
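# Example invocations for the usage text above (index and read file names
# are illustrative, not files shipped with TopHat):
#
#   tophat -p 4 -o th_out bowtie_idx/hg19 reads.fq
#   tophat -p 4 -r 50 --mate-std-dev 20 -o th_out bowtie_idx/hg19 reads_1.fq reads_2.fq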
149
150 # -F/--min-isoform-fraction <float> [ default: 0.15 ]
151 # (obsolete)
152
153 class Usage(Exception):
154 def __init__(self, msg):
155 self.msg = msg
156
157 output_dir = "./tophat_out/"
158 logging_dir = output_dir + "logs/"
159 run_log = None
160 tophat_log = None #main log file handle
161 tophat_logger = None # main logging object
162 run_cmd = None
163 tmp_dir = output_dir + "tmp/"
164 bin_dir = sys.path[0] + "/"
165 use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
166
167 use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format
168
169 use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
170 # enabled by the -X/--unmapped-fifo option (unless -z0)
171 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
172 # unmapped reads into a compressed file
173
174 samtools_path = None
175 bowtie_path = None
176 fail_str = "\t[FAILED]\n"
177 gtf_juncs = None #file name with junctions extracted from given GFF file
178
179
180
181 def init_logger(log_fname):
182 global tophat_logger
183 tophat_logger = logging.getLogger('project')
184 formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
185 # level = logging.__dict__.get(options.loglevel.upper(),logging.DEBUG)
186 tophat_logger.setLevel(logging.DEBUG)
187
188 hstream = logging.StreamHandler(sys.stderr)
189 hstream.setFormatter(formatter)
190 tophat_logger.addHandler(hstream)
191 #
192 # Output logging information to file
193 if os.path.isfile(log_fname):
194 os.remove(log_fname)
195 global tophat_log
196 logfh = logging.FileHandler(log_fname)
197 logfh.setFormatter(formatter)
198 tophat_logger.addHandler(logfh)
199 tophat_log=logfh.stream
200
201
202 # TopHatParams captures all of the runtime parameters used by TopHat, and many
203 # of these are passed as command line options to executables run by the pipeline.
204
205 # This class and its nested classes also do options parsing through parse_options()
206 # and option validation via the member function check()
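# A sketch of the intended call sequence (an assumption based on the methods
# named above; the actual driver code lives in main(), later in the full script):
#
#   params = TopHatParams()
#   extra_args = params.parse_options(sys.argv)  # leftover: <bowtie_index> <reads...>
#   params.check()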
207
208 class BowtieOutputFiles:
209 def __init__(self,
210 mappings=None,
211 unmapped_reads=None,
212 multihit_reads=None):
213 self.mappings=mappings
214 self.unmapped_reads=unmapped_reads
215 self.multihit_reads=multihit_reads
216
217 class TopHatParams:
218
219 # SpliceConstraints is a group of runtime parameters that specify what
220 # constraints to put on junctions discovered by the program. These constraints
221 # are used to filter out spurious/false positive junctions.
222
223 class SpliceConstraints:
224 def __init__(self,
225 min_anchor_length,
226 min_intron_length,
227 max_intron_length,
228 splice_mismatches,
229 min_isoform_fraction):
230 self.min_anchor_length = min_anchor_length
231 self.min_intron_length = min_intron_length
232 self.max_intron_length = max_intron_length
233 self.splice_mismatches = splice_mismatches
234 self.min_isoform_fraction = min_isoform_fraction
235
236 def parse_options(self, opts):
237 for option, value in opts:
238 if option in ("-m", "--splice-mismatches"):
239 self.splice_mismatches = int(value)
240 elif option in ("-a", "--min-anchor"):
241 self.min_anchor_length = int(value)
242 elif option in ("-F", "--min-isoform-fraction"):
243 self.min_isoform_fraction = float(value)
244 elif option in ("-i", "--min-intron-length"):
245 self.min_intron_length = int(value)
246 elif option in ("-I", "--max-intron-length"):
247 self.max_intron_length = int(value)
248
249 def check(self):
250 if self.splice_mismatches not in [0,1,2]:
251 die("Error: arg to --splice-mismatches must be 0, 1, or 2")
252 if self.min_anchor_length < 4:
253 die("Error: arg to --min-anchor-len must be greater than 4")
254 if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
255 die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
256 if self.min_intron_length <= 0:
257 die("Error: arg to --min-intron-length must be greater than 0")
258 if self.max_intron_length <= 0:
259 die("Error: arg to --max-intron-length must be greater than 0")
260
261 # SystemParams is a group of runtime parameters that determine how to handle
262 # temporary files produced during a run and how many threads to use for threaded
263 # stages of the pipeline (e.g. Bowtie)
264
265 class SystemParams:
266 def __init__(self,
267 num_threads,
268 keep_tmp):
269 self.num_threads = num_threads
270 self.keep_tmp = keep_tmp
271 self.zipper = "gzip"
272 self.zipper_opts= []
273
274 def parse_options(self, opts):
275 global use_zpacker
276 global use_BWT_FIFO
277 for option, value in opts:
278 if option in ("-p", "--num-threads"):
279 self.num_threads = int(value)
280 elif option == "--keep-tmp":
281 self.keep_tmp = True
282 elif option in ("-z","--zpacker"):
283 if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
284 value=""
285 self.zipper = value
286 #if not self.zipper:
287 # self.zipper='gzip'
288 elif option in ("-X", "--unmapped-fifo"):
289 use_BWT_FIFO=True
290 if self.zipper:
291 use_zpacker=True
292 if self.num_threads>1 and not self.zipper_opts:
293 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
294 self.zipper_opts.append('-p'+str(self.num_threads))
295 else:
296 use_zpacker=False
297 if use_BWT_FIFO: use_BWT_FIFO=False
298 def cmd(self):
299 cmdline=[]
300 if self.zipper:
301 cmdline.extend(['-z',self.zipper])
302 if self.num_threads>1:
303 cmdline.extend(['-p'+str(self.num_threads)])
304 return cmdline
305
306 def check(self):
307 if self.num_threads<1 :
308 die("Error: arg to --num-threads must be greater than 0")
309 if self.zipper:
310 xzip=which(self.zipper)
311 if not xzip:
312 die("Error: cannot find compression program "+self.zipper)
313
314 # ReadParams is a group of runtime parameters that specify various properties
315 # of the user's reads (e.g. which quality scale they are on, how long the
316 # fragments are, etc).
317 class ReadParams:
318 def __init__(self,
319 solexa_quals,
320 phred64_quals,
321 quals,
322 integer_quals,
323 color,
324 library_type,
325 seed_length,
326 reads_format,
327 mate_inner_dist,
328 mate_inner_dist_std_dev,
329 read_group_id,
330 sample_id,
331 library_id,
332 description,
333 seq_platform_unit,
334 seq_center,
335 seq_run_date,
336 seq_platform):
337 self.solexa_quals = solexa_quals
338 self.phred64_quals = phred64_quals
339 self.quals = quals
340 self.integer_quals = integer_quals
341 self.color = color
342 self.library_type = library_type
343 self.seed_length = seed_length
344 self.reads_format = reads_format
345 self.mate_inner_dist = mate_inner_dist
346 self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
347 self.read_group_id = read_group_id
348 self.sample_id = sample_id
349 self.library_id = library_id
350 self.description = description
351 self.seq_platform_unit = seq_platform_unit
352 self.seq_center = seq_center
353 self.seq_run_date = seq_run_date
354 self.seq_platform = seq_platform
355
356 def parse_options(self, opts):
357 for option, value in opts:
358 if option == "--solexa-quals":
359 self.solexa_quals = True
360 elif option in ("--solexa1.3-quals", "--phred64-quals"):
361 self.phred64_quals = True
362 elif option in ("-Q", "--quals"):
363 self.quals = True
364 elif option == "--integer-quals":
365 self.integer_quals = True
366 elif option in ("-C", "--color"):
367 self.color = True
368 elif option == "--library-type":
369 self.library_type = value
370 elif option in ("-s", "--seed-length"):
371 self.seed_length = int(value)
372 elif option in ("-r", "--mate-inner-dist"):
373 self.mate_inner_dist = int(value)
374 elif option == "--mate-std-dev":
375 self.mate_inner_dist_std_dev = int(value)
376 elif option == "--rg-id":
377 self.read_group_id = value
378 elif option == "--rg-sample":
379 self.sample_id = value
380 elif option == "--rg-library":
381 self.library_id = value
382 elif option == "--rg-description":
383 self.description = value
384 elif option == "--rg-platform-unit":
385 self.seq_platform_unit = value
386 elif option == "--rg-center":
387 self.seq_center = value
388 elif option == "--rg-date":
389 self.seq_run_date = value
390 elif option == "--rg-platform":
391 self.seq_platform = value
392
393 def check(self):
394 if self.seed_length and self.seed_length < 20:
395 die("Error: arg to --seed-length must be at least 20")
396
397 if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
398 die("Error: arg to --mate-std-dev must at least 0")
399 if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
400 die("Error: --rg-id and --rg-sample must be specified or omitted together")
401
402 # SearchParams is a group of runtime parameters that specify how TopHat will
403 # search for splice junctions
404
405 class SearchParams:
406 def __init__(self,
407 min_closure_exon,
408 min_closure_intron,
409 max_closure_intron,
410 min_coverage_intron,
411 max_coverage_intron,
412 min_segment_intron,
413 max_segment_intron):
414
415 self.min_closure_exon_length = min_closure_exon
416 self.min_closure_intron_length = min_closure_intron
417 self.max_closure_intron_length = max_closure_intron
418 self.min_coverage_intron_length = min_coverage_intron
419 self.max_coverage_intron_length = max_coverage_intron
420 self.min_segment_intron_length = min_segment_intron
421 self.max_segment_intron_length = max_segment_intron
422
423 def parse_options(self, opts):
424 for option, value in opts:
425 if option == "--min-closure-exon":
426 self.min_closure_exon_length = int(value)
427 if option == "--min-closure-intron":
428 self.min_closure_intron_length = int(value)
429 if option == "--max-closure-intron":
430 self.max_closure_intron_length = int(value)
431 if option == "--min-coverage-intron":
432 self.min_coverage_intron_length = int(value)
433 if option == "--max-coverage-intron":
434 self.max_coverage_intron_length = int(value)
435 if option == "--min-segment-intron":
436 self.min_segment_intron_length = int(value)
437 if option == "--max-segment-intron":
438 self.max_segment_intron_length = int(value)
439
440 def check(self):
441 if self.min_closure_exon_length < 0:
442 die("Error: arg to --min-closure-exon must be at least 0")
443 if self.min_closure_intron_length < 0:
444 die("Error: arg to --min-closure-intron must be at least 0")
445 if self.max_closure_intron_length < 0:
446 die("Error: arg to --max-closure-intron must be at least 0")
447 if self.min_coverage_intron_length < 0:
448 die("Error: arg to --min-coverage-intron must be at least 0")
449 if self.max_coverage_intron_length < 0:
450 die("Error: arg to --max-coverage-intron must be at least 0")
451 if self.min_segment_intron_length < 0:
452 die("Error: arg to --min-segment-intron must be at least 0")
453 if self.max_segment_intron_length < 0:
454 die("Error: arg to --max-segment-intron must be at least 0")
455
456 class ReportParams:
457 def __init__(self):
458 self.sort_bam = True
459 self.convert_bam = True
460
461 def parse_options(self, opts):
462 for option, value in opts:
463 if option == "--no-sort-bam":
464 self.sort_bam = False
465 if option == "--no-convert-bam":
466 self.convert_bam = False
467 self.sort_bam = False
468
469 class Bowtie2Params:
470 def __init__(self):
471 self.very_fast = False
472 self.fast = False
473 self.sensitive = False
474 self.very_sensitive = False
475
476 self.N = 0
477 self.L = 20
478 self.i = "S,1,1.25"
479 self.n_ceil = "L,0,0.15"
480 self.gbar = 4
481
482 self.mp = "6,2"
483 self.np = 1
484 self.rdg = "5,3"
485 self.rfg = "5,3"
486 self.score_min = "L,-0.6,-0.6"
487
488 self.D = 15
489 self.R = 2
490
491 def parse_options(self, opts):
492 for option, value in opts:
493 if option == "--b2-very-fast":
494 self.very_fast = True
495 if option == "--b2-fast":
496 self.fast = True
497 if option == "--b2-sensitive":
498 self.sensitive = True
499 if option == "--b2-very-sensitive":
500 self.very_sensitive = True
501
502 if option == "--b2-N":
503 self.N = int(value)
504 if option == "--b2-L":
505 self.L = int(value)
506 if option == "--b2-i":
507 self.i = value
508 if option == "--b2-n-ceil":
509 self.n_ceil = value
510 if option == "--b2-gbar":
511 self.gbar = int(value)
512
513 if option == "--b2-mp":
514 self.mp = value
515 if option == "--b2-np":
516 self.np = int(value)
517 if option == "--b2-rdg":
518 self.rdg = value
519 if option == "--b2-rfg":
520 self.rfg = value
521 if option == "--b2-score-min":
522 self.score_min = value
523
524 if option == "--b2-D":
525 self.D = int(value)
526 if option == "--b2-R":
527 self.R = int(value)
528
529 def check(self):
530 more_than_once = False
531 if self.very_fast:
532 if self.fast or self.sensitive or self.very_sensitive:
533 more_than_once = True
534 else:
535 if self.fast:
536 if self.sensitive or self.very_sensitive:
537 more_than_once = True
538 else:
539 if self.sensitive and self.very_sensitive:
540 more_than_once = True
541
542 if more_than_once:
543 die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")
544
545 if self.N not in [0, 1]:
546 die("Error: arg to --b2-N must be either 0 or 1")
547
548 function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
549 function_match = re.search(function_re, self.i)
550
551 if not function_match:
552 die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
553
554 function_match = re.search(function_re, self.n_ceil)
555 if not function_match:
556 die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
557
558 function_match = re.search(function_re, self.score_min)
559 if not function_match:
560 die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")
561
562 pair_re = r'^\d+,\d+$'
563 pair_match = re.search(pair_re, self.mp)
564 if not pair_match:
565 die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
566
567 pair_match = re.search(pair_re, self.rdg)
568 if not pair_match:
569 die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")
570
571 pair_match = re.search(pair_re, self.rfg)
572 if not pair_match:
573 die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
574
575
576 def __init__(self):
577 self.splice_constraints = self.SpliceConstraints(8, # min_anchor
578 50, # min_intron
579 500000, # max_intron
580 0, # splice_mismatches
581 0.15) # min_isoform_frac
582
583 self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]
584
585 self.read_params = self.ReadParams(False, # solexa_scale
586 False,
587 False, # quals
588 None, # integer quals
589 False, # SOLiD - color space
590 "", # library type (e.g. "illumina-stranded-pair-end")
591 None, # seed_length
592 "fastq", # quality_format
593 None, # mate inner distance
594 20, # mate inner dist std dev
595 None, # read group id
596 None, # sample id
597 None, # library id
598 None, # description
599 None, # platform unit (i.e. lane)
600 None, # sequencing center
601 None, # run date
602 None) # sequencing platform
603
604 self.system_params = self.SystemParams(1, # bowtie_threads (num_threads)
605 False) # keep_tmp
606
607 self.search_params = self.SearchParams(100, # min_closure_exon_length
608 50, # min_closure_intron_length
609 5000, # max_closure_intron_length
610 50, # min_coverage_intron_length
611 20000, # max_coverage_intron_length
612 50, # min_segment_intron_length
613 500000) # max_segment_intron_length
614
615 self.report_params = self.ReportParams()
616
617 self.bowtie2_params = self.Bowtie2Params()
618
619 self.bowtie2 = True
620 self.gff_annotation = None
621 self.transcriptome_only = False
622 self.transcriptome_index = None
623 self.transcriptome_outdir = None
624 self.raw_junctions = None
625 self.find_novel_juncs = True
626 self.find_novel_indels = True
627 self.find_novel_fusions = True
628 self.find_GFF_juncs = True
629 self.max_hits = 20
630 self.t_max_hits = 60
631 self.max_seg_hits = 40
632 self.prefilter_multi = False
633 self.genome_read_mismatches = 2
634 self.max_read_mismatches = 2
635 self.t_mismatches = 1
636 self.segment_length = 25
637 self.segment_mismatches = 2
638 self.bowtie_alignment_option = "-v"
639 self.max_insertion_length = 3
640 self.max_deletion_length = 3
641 self.raw_insertions = None
642 self.raw_deletions = None
643 self.coverage_search = None
644 self.closure_search = False
645 self.microexon_search = False
646 self.butterfly_search = None
647 self.report_secondary_alignments = False
648
649 self.keep_fasta_order = False
650 self.partial_mapping = False
651
652 self.fusion_search = False
653 self.fusion_anchor_length = 20
654 self.fusion_min_dist = 10000000
655 self.fusion_read_mismatches = 2
656 self.fusion_multireads = 2
657 self.fusion_multipairs = 2
658
659 def check(self):
660 self.splice_constraints.check()
661 self.read_params.check()
662 self.system_params.check()
663 if self.segment_length < 10:
664 die("Error: arg to --segment-length must be at least 10")
665 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
666 die("Error: arg to --segment-mismatches must be in [0, 3]")
667 if self.t_mismatches < 0 or self.t_mismatches > 3:
668 die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")
669
670 if self.read_params.color:
671 if self.bowtie2:
672 th_log("Warning: bowtie2 in colorspace is not yet supported; --bowtie1 option assumed.")
673 self.bowtie2=False
674 if self.fusion_search:
675 die("Error: fusion-search in colorspace is not yet supported")
676 if self.butterfly_search:
677 die("Error: butterfly-search in colorspace is not yet supported")
678
679 self.bowtie2_params.check()
680
681 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
682
683 if self.read_params.library_type and self.read_params.library_type not in library_types:
684 die("Error: library-type should be one of: "+', '.join(library_types))
685
686 self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
687 self.search_params.max_closure_intron_length)
688
689 self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
690 self.search_params.max_segment_intron_length)
691
692 self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
693 self.search_params.max_coverage_intron_length)
694
695 if self.max_insertion_length >= self.segment_length:
696 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be equal to or greater than the segment length ("+str(self.segment_length)+")")
697
698 if self.max_insertion_length < 0:
699 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be less than 0")
700
701 if self.max_deletion_length >= self.splice_constraints.min_intron_length:
702 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")
703
704 if self.max_deletion_length < 0:
705 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be less than 0")
706
707 def cmd(self):
708 cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
709 "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
710 "--min-report-intron", str(self.splice_constraints.min_intron_length),
711 "--max-report-intron", str(self.splice_constraints.max_intron_length),
712 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
713 "--output-dir", output_dir,
714 "--max-multihits", str(self.max_hits),
715 "--max-seg-multihits", str(self.max_seg_hits),
716 "--segment-length", str(self.segment_length),
717 "--segment-mismatches", str(self.segment_mismatches),
718 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
719 "--min-closure-intron", str(self.search_params.min_closure_intron_length),
720 "--max-closure-intron", str(self.search_params.max_closure_intron_length),
721 "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
722 "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
723 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
724 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
725 "--max-mismatches", str(self.max_read_mismatches),
726 "--max-insertion-length", str(self.max_insertion_length),
727 "--max-deletion-length", str(self.max_deletion_length)]
728
729 if not self.bowtie2:
730 cmd.extend(["--bowtie1"])
731
732 if self.fusion_search:
733 cmd.extend(["--fusion-search",
734 "--fusion-anchor-length", str(self.fusion_anchor_length),
735 "--fusion-min-dist", str(self.fusion_min_dist),
736 "--fusion-read-mismatches", str(self.fusion_read_mismatches),
737 "--fusion-multireads", str(self.fusion_multireads),
738 "--fusion-multipairs", str(self.fusion_multipairs)])
739
740 cmd.extend(self.system_params.cmd())
741
742 if self.read_params.mate_inner_dist != None:
743 cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
744 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
745 if self.gff_annotation != None:
746 cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
747 if gtf_juncs:
748 cmd.extend(["--gtf-juncs", gtf_juncs])
749 if self.closure_search == False:
750 cmd.append("--no-closure-search")
751 if not self.coverage_search:
752 cmd.append("--no-coverage-search")
753 if not self.microexon_search:
754 cmd.append("--no-microexon-search")
755 if self.butterfly_search:
756 cmd.append("--butterfly-search")
757 if self.read_params.solexa_quals:
758 cmd.append("--solexa-quals")
759 if self.read_params.quals:
760 cmd.append("--quals")
761 if self.read_params.integer_quals:
762 cmd.append("--integer-quals")
763 if self.read_params.color:
764 cmd.append("--color")
765 if self.read_params.library_type:
766 cmd.extend(["--library-type", self.read_params.library_type])
767 if self.read_params.read_group_id:
768 cmd.extend(["--rg-id", self.read_params.read_group_id])
769 if self.read_params.phred64_quals:
770 cmd.append("--phred64-quals")
771 return cmd
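# With the defaults assigned in __init__ above, cmd() yields a list starting:
#   ['--min-anchor', '8', '--splice-mismatches', '0',
#    '--min-report-intron', '50', '--max-report-intron', '500000', ...]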
772
773 # This is the master options parsing routine, which calls parse_options for
774 # the delegate classes (e.g. SpliceConstraints) that handle certain groups
775 # of options.
776 def parse_options(self, argv):
777 try:
778 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
779 ["version",
780 "help",
781 "output-dir=",
782 "bowtie1",
783 "solexa-quals",
784 "solexa1.3-quals",
785 "phred64-quals",
786 "quals",
787 "integer-quals",
788 "color",
789 "library-type=",
790 "num-threads=",
791 "splice-mismatches=",
792 "max-multihits=",
793 "min-isoform-fraction=",
794 "min-anchor-length=",
795 "min-intron-length=",
796 "max-intron-length=",
797 "GTF=",
798 "transcriptome-only",
799 "transcriptome-max-hits=",
800 "transcriptome-index=",
801 "raw-juncs=",
802 "no-novel-juncs",
803 "allow-fusions",
804 "fusion-search",
805 "fusion-anchor-length=",
806 "fusion-min-dist=",
807 "fusion-read-mismatches=",
808 "fusion-multireads=",
809 "fusion-multipairs=",
810 "no-novel-indels",
811 "no-gtf-juncs",
812 "mate-inner-dist=",
813 "mate-std-dev=",
814 "no-closure-search",
815 "no-coverage-search",
816 "closure-search",
817 "coverage-search",
818 "prefilter-multihits",
819 "microexon-search",
820 "min-closure-exon=",
821 "min-closure-intron=",
822 "max-closure-intron=",
823 "min-coverage-intron=",
824 "max-coverage-intron=",
825 "min-segment-intron=",
826 "max-segment-intron=",
827 "seed-length=",
828 "genome-read-mismatches=",
829 "read-mismatches=",
830 "max-read-mismatches=",
831 "transcriptome-mismatches=",
832 "segment-length=",
833 "segment-mismatches=",
834 "bowtie-n",
835 "butterfly-search",
836 "no-butterfly-search",
837 "keep-tmp",
838 "rg-id=",
839 "rg-sample=",
840 "rg-library=",
841 "rg-description=",
842 "rg-platform-unit=",
843 "rg-center=",
844 "rg-date=",
845 "rg-platform=",
846 "tmp-dir=",
847 "zpacker=",
848 "unmapped-fifo",
849 "max-insertion-length=",
850 "max-deletion-length=",
851 "insertions=",
852 "deletions=",
853 "no-sort-bam",
854 "no-convert-bam",
855 "report-secondary-alignments",
856 "keep-fasta-order",
857 "allow-partial-mapping",
858 "b2-very-fast",
859 "b2-fast",
860 "b2-sensitive",
861 "b2-very-sensitive",
862 "b2-N=",
863 "b2-L=",
864 "b2-i=",
865 "b2-n-ceil=",
866 "b2-gbar=",
867 "b2-ma=",
868 "b2-mp=",
869 "b2-np=",
870 "b2-rdg=",
871 "b2-rfg=",
872 "b2-score-min=",
873 "b2-D=",
874 "b2-R="])
875 except getopt.error, msg:
876 raise Usage(msg)
877
878 self.splice_constraints.parse_options(opts)
879 self.system_params.parse_options(opts)
880 self.read_params.parse_options(opts)
881 self.search_params.parse_options(opts)
882 self.report_params.parse_options(opts)
883 self.bowtie2_params.parse_options(opts)
884 global use_BWT_FIFO
885 global use_BAM_Unmapped
886 if not self.read_params.color:
887 use_BWT_FIFO=False
888 use_BAM_Unmapped=True
889 global output_dir
890 global logging_dir
891 global tmp_dir
892
893 custom_tmp_dir = None
894 custom_out_dir = None
895 # option processing
896 for option, value in opts:
897 if option in ("-v", "--version"):
898 print "TopHat v%s" % (get_version())
899 sys.exit(0)
900 if option in ("-h", "--help"):
901 raise Usage(use_message)
902 if option == "--bowtie1":
903 self.bowtie2 = False
904 if option in ("-g", "--max-multihits"):
905 self.max_hits = int(value)
906 self.max_seg_hits = max(10, self.max_hits * 2)
907 if option in ("-x", "--transcriptome-max-hits"):
908 self.t_max_hits = int(value)
909 if option in ("-G", "--GTF"):
910 self.gff_annotation = value
911 if option in ("-T", "--transcriptome"):
912 self.transcriptome_only = True
913 if option == "--transcriptome-index":
914 self.transcriptome_index = value
915 if option in("-M", "--prefilter-multihits"):
916 self.prefilter_multi = True
917 if option in ("-j", "--raw-juncs"):
918 self.raw_junctions = value
919 if option == "--no-novel-juncs":
920 self.find_novel_juncs = False
921 if option == "--no-novel-indels":
922 self.find_novel_indels = False
923 if option == "--fusion-search":
924 self.fusion_search = True
925 if option == "--fusion-anchor-length":
926 self.fusion_anchor_length = int(value)
927 if option == "--fusion-min-dist":
928 self.fusion_min_dist = int(value)
929 if option == "--fusion-read-mismatches":
930 self.fusion_read_mismatches = int(value)
931 if option == "--fusion-multireads":
932 self.fusion_multireads = int(value)
933 if option == "--fusion-multipairs":
934 self.fusion_multipairs = int(value)
935 if option == "--no-gtf-juncs":
936 self.find_GFF_juncs = False
937 if option == "--no-coverage-search":
938 self.coverage_search = False
939 if option == "--coverage-search":
940 self.coverage_search = True
941 if option == "--no-closure-search":
942 self.closure_search = False
943 if option == "--closure-search":
944 self.closure_search = True
945 if option == "--microexon-search":
946 self.microexon_search = True
947 if option == "--butterfly-search":
948 self.butterfly_search = True
949 if option == "--no-butterfly-search":
950 self.butterfly_search = False
951 if option == "--genome-read-mismatches":
952 self.genome_read_mismatches = int(value)
953 if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
954 self.max_read_mismatches = int(value)
955 if option in ("-n", "--transcriptome-mismatches"):
956 self.t_mismatches = int(value)
957 if option == "--segment-length":
958 self.segment_length = int(value)
959 if option == "--segment-mismatches":
960 self.segment_mismatches = int(value)
961 if option == "--bowtie-n":
962 self.bowtie_alignment_option = "-n"
963 if option == "--max-insertion-length":
964 self.max_insertion_length = int(value)
965 if option == "--max-deletion-length":
966 self.max_deletion_length = int(value)
967 if option == "--insertions":
968 self.raw_insertions = value
969 if option == "--deletions":
970 self.raw_deletions = value
971 if option == "--report-secondary-alignments":
972 self.report_secondary_alignments = True
973 if option == "--keep-fasta-order":
974 self.keep_fasta_order = True
975 if option == "--allow-partial-mapping":
976 self.partial_mapping = True
977 if option in ("-o", "--output-dir"):
978 custom_out_dir = value + "/"
979 if option == "--tmp-dir":
980 custom_tmp_dir = value + "/"
981 if self.transcriptome_only:
982 self.find_novel_juncs=False
983 self.find_novel_indels=False
984 if custom_out_dir:
985 output_dir = custom_out_dir
986 logging_dir = output_dir + "logs/"
987 tmp_dir = output_dir + "tmp/"
988 sam_header = tmp_dir + "stub_header.sam"
989 if custom_tmp_dir:
990 tmp_dir = custom_tmp_dir
991 sam_header = tmp_dir + "stub_header.sam"
992 if len(args) < 2:
993 raise Usage(use_message)
994 return args
995
996
997 # check if a file exists and has non-zero (or minimum) size
998 def fileExists(filepath, minfsize=1):
999 if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
1000 return True
1001 else:
1002 return False
1003
1004
1005 def getFileDir(filepath):
1006 #if fullpath given, returns path including the ending /
1007 fpath, fname=os.path.split(filepath)
1008 if fpath: fpath+='/'
1009 return fpath
1010
1011 def getFileBaseName(filepath):
1012 fpath, fname=os.path.split(filepath)
1013 fbase, fext =os.path.splitext(fname)
1014 fx=fext.lower()
1015 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
1016 return fbase
1017 elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
1018 fb, fext = os.path.splitext(fbase)
1019 fx=fext.lower()
1020 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
1021 return fb
1022 else:
1023 return fbase
1024 else:
1025 if len(fbase)>0:
1026 return fbase
1027 else:
1028 return fname
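# Examples of the suffix handling above (file names illustrative):
#   getFileBaseName("reads.fq")       -> "reads"
#   getFileBaseName("reads.fastq.gz") -> "reads"  (compression suffix peeled first)
#   getFileBaseName("reads")          -> "reads"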
1029
1030 # Returns the current time in a nice format
1031 def right_now():
1032 curr_time = datetime.now()
1033 return curr_time.strftime("%c")
1034
1035 # TopHat logging helpers
1036 def th_log(out_str):
1037 #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1038 if tophat_logger:
1039 tophat_logger.info(out_str)
1040
1041 def th_logp(out_str=""):
1042 print >> sys.stderr, out_str
1043 if tophat_log:
1044 print >> tophat_log, out_str
1045
1046 def die(msg=None):
1047 if msg is not None:
1048 th_logp(msg)
1049 sys.exit(1)
1050
1051 # Ensures that the output, logging, and temp directories are present. If not,
1052 # they are created
1053 def prepare_output_dir():
1054
1055 #th_log("Preparing output location "+output_dir)
1056 if os.path.exists(output_dir):
1057 pass
1058 else:
1059 os.mkdir(output_dir)
1060
1061 if os.path.exists(logging_dir):
1062 pass
1063 else:
1064 os.mkdir(logging_dir)
1065
1066 if os.path.exists(tmp_dir):
1067 pass
1068 else:
1069 try:
1070 os.makedirs(tmp_dir)
1071 except OSError, o:
1072 die("\nError creating directory %s (%s)" % (tmp_dir, o))
1073
1074
1075 # to be added as preexec_fn for every subprocess.Popen() call:
1076 # see http://bugs.python.org/issue1652
1077 def subprocess_setup():
1078 # Python installs a SIGPIPE handler by default, which causes
1079 # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
1080 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
1081
1082 # Check that the Bowtie index specified by the user is present and all files
1083 # are there.
1084 def check_bowtie_index(idx_prefix, is_bowtie2):
1085 th_log("Checking for Bowtie index files")
1086 idxext="ebwt"
1087 bowtie_ver=""
1088 if is_bowtie2:
1089 idxext="bt2"
1090 bowtie_ver="2 "
1091
1092 idx_fwd_1 = idx_prefix + ".1."+idxext
1093 idx_fwd_2 = idx_prefix + ".2."+idxext
1094 idx_rev_1 = idx_prefix + ".rev.1."+idxext
1095 idx_rev_2 = idx_prefix + ".rev.2."+idxext
1096
1097 if os.path.exists(idx_fwd_1) and \
1098 os.path.exists(idx_fwd_2) and \
1099 os.path.exists(idx_rev_1) and \
1100 os.path.exists(idx_rev_2):
1101 return
1102 else:
1103 bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
1104 bwtidx_env = os.environ.get("BOWTIE_INDEXES")
1105
1106 if bwtidx_env == None:
1107 die(bwtidxerr)
1108 if os.path.exists(bwtidx_env+idx_fwd_1) and \
1109 os.path.exists(bwtidx_env+idx_fwd_2) and \
1110 os.path.exists(bwtidx_env+idx_rev_1) and \
1111 os.path.exists(bwtidx_env+idx_rev_2):
1112 return
1113 else:
1114 die(bwtidxerr)
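# For an index prefix "idx/hg19" with Bowtie 2, the files checked above are:
#   idx/hg19.1.bt2, idx/hg19.2.bt2, idx/hg19.rev.1.bt2, idx/hg19.rev.2.bt2
# (with Bowtie 1 the extension is .ebwt)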
1115
1116 # Reconstructs the multifasta file from which the Bowtie index was created, if
1117 # it's not already there.
1118 def bowtie_idx_to_fa(idx_prefix):
1119 idx_name = idx_prefix.split('/')[-1]
1120 th_log("Reconstituting reference FASTA file from Bowtie index")
1121
1122 try:
1123 tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1124 tmp_fasta_file = open(tmp_fasta_file_name, "w")
1125
1126 inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1127
1128 inspect_cmd = [prog_path("bowtie-inspect"),
1129 idx_prefix]
1130
1131 th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
1132 ret = subprocess.call(inspect_cmd,
1133 stdout=tmp_fasta_file,
1134 stderr=inspect_log)
1135
1136 # Bowtie reported an error
1137 if ret != 0:
1138 die(fail_str+"Error: bowtie-inspect returned an error")
1139
1140 # Bowtie not found
1141 except OSError, o:
1142 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1143 die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1144
1145 return tmp_fasta_file_name
1146
1147 # Checks whether the multifasta file for the genome is present alongside the
1148 # Bowtie index files for it.
1149 def check_fasta(idx_prefix):
1150 th_log("Checking for reference FASTA file")
1151 idx_fasta = idx_prefix + ".fa"
1152 if os.path.exists(idx_fasta):
1153 return idx_fasta
1154 else:
1155 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1156 if bowtie_idx_env_var:
1157 idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
1158 if os.path.exists(idx_fasta):
1159 return idx_fasta
1160
1161 th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
1162 idx_fa = bowtie_idx_to_fa(idx_prefix)
1163 return idx_fa
1164
1165 # Check that both the Bowtie index and the genome's fasta file are present
1166 def check_index(idx_prefix, is_bowtie2):
1167 check_bowtie_index(idx_prefix, is_bowtie2)
1168 ref_fasta_file = check_fasta(idx_prefix)
1169
1170 return (ref_fasta_file, None)
1171
1172 # Retrieve a tuple containing the system's version of Bowtie. Parsed from
1173 # `bowtie --version`
1174 def get_bowtie_version(is_bowtie2):
1175 try:
1176 # Launch Bowtie to capture its version info
1177 proc = subprocess.Popen([bowtie_path, "--version"],
1178 stdout=subprocess.PIPE)
1179
1180 stdout_value = proc.communicate()[0]
1181
1182 bowtie_version = None
1183 bowtie_out = repr(stdout_value)
1184
1185 # Find the version identifier
1186 if is_bowtie2:
1187 version_str = "bowtie2-align version "
1188 else:
1189 version_str = "bowtie version "
1190
1191 ver_str_idx = bowtie_out.find(version_str)
1192 if ver_str_idx != -1:
1193 nl = bowtie_out.find("\\n", ver_str_idx)
1194 version_val = bowtie_out[ver_str_idx + len(version_str):nl]
1195
1196 if is_bowtie2:
1197 ver_numbers = version_val.split('.')
1198 ver_numbers[2:] = ver_numbers[2].split('-')
1199 bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
1200 else:
1201 bowtie_version = [int(x) for x in version_val.split('.')]
1202
1203 if len(bowtie_version) == 3:
1204 bowtie_version.append(0)
1205
1206 return bowtie_version
1207 except OSError, o:
1208 errmsg=fail_str+str(o)+"\n"
1209 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1210 errmsg+="Error: bowtie not found on this system"
1211 die(errmsg)
1212
1213 def get_index_sam_header(params, idx_prefix):
1214 try:
1215 temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1216 temp_sam_header_file = open(temp_sam_header_filename, "w")
1217
1218 bowtie_header_cmd = [bowtie_path]
1219
1220 read_params = params.read_params
1221 if not params.bowtie2:
1222 bowtie_header_cmd += ["--sam"]
1223
1224 if read_params.color:
1225 bowtie_header_cmd.append('-C')
1226
1227 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1228 subprocess.call(bowtie_header_cmd,
1229 stdout=temp_sam_header_file,
1230 stderr=open('/dev/null', 'w'))
1231
1232 temp_sam_header_file.close()
1233 temp_sam_header_file = open(temp_sam_header_filename, "r")
1234
1235 bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1236 bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1237
1238 preamble = []
1239 sq_dict_lines = []
1240
1241 for line in temp_sam_header_file.readlines():
1242 line = line.strip()
1243 if line.find("@SQ") != -1:
1244 # Sequence dictionary record
1245 cols = line.split('\t')
1246 seq_name = None
1247 for col in cols:
1248 fields = col.split(':')
1249 #print fields
1250 if len(fields) > 0 and fields[0] == "SN":
1251 seq_name = fields[1]
1252 if seq_name == None:
1253 die("Error: malformed sequence dictionary in sam header")
1254 sq_dict_lines.append([seq_name,line])
1255 elif line.find("CL") != -1:
1256 continue
1257 else:
1258 preamble.append(line)
1259
1260 print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1261 if read_params.read_group_id and read_params.sample_id:
1262 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1263 read_params.sample_id)
1264 if read_params.library_id:
1265 rg_str += "\tLB:%s" % read_params.library_id
1266 if read_params.description:
1267 rg_str += "\tDS:%s" % read_params.description
1268 if read_params.seq_platform_unit:
1269 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1270 if read_params.seq_center:
1271 rg_str += "\tCN:%s" % read_params.seq_center
1272 if read_params.mate_inner_dist:
1273 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1274 if read_params.seq_run_date:
1275 rg_str += "\tDT:%s" % read_params.seq_run_date
1276 if read_params.seq_platform:
1277 rg_str += "\tPL:%s" % read_params.seq_platform
1278
1279 print >> bowtie_sam_header_file, rg_str
1280
1281 if not params.keep_fasta_order:
1282 sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1283
1284 for [name, line] in sq_dict_lines:
1285 print >> bowtie_sam_header_file, line
1286 print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1287
1288 bowtie_sam_header_file.close()
1289 temp_sam_header_file.close()
1290 return bowtie_sam_header_filename
1291
1292 except OSError, o:
1293 errmsg=fail_str+str(o)+"\n"
1294 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1295 errmsg+="Error: bowtie not found on this system"
1296 die(errmsg)
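# Example of the @RG record assembled above: a run started with
#   --rg-id run1 --rg-sample sample1 --rg-library lib1
# (values illustrative) writes this tab-separated header line:
#   @RG	ID:run1	SM:sample1	LB:lib1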
1297
1298 # Make sure Bowtie is installed and is recent enough to be useful
1299 def check_bowtie(params):
1300 bowtie_req=""
1301 if params.bowtie2:
1302 bowtie_req="2"
1303 log_msg = "Checking for Bowtie" + bowtie_req
1304 th_log(log_msg)
1305
1306 bowtie_bin = "bowtie"+bowtie_req
1307
1308 global bowtie_path
1309 bowtie_path=prog_path(bowtie_bin)
1310 bowtie_version = get_bowtie_version(params.bowtie2)
1311 if params.bowtie2 and bowtie_version == None:
1312 #try to fallback on bowtie 1
1313 params.bowtie2=False
1314 bowtie_version=get_bowtie_version(params.bowtie2)
1315 if bowtie_version == None:
1316 die("Error: Bowtie not found on this system.")
1317 if params.bowtie2:
1318 if bowtie_version < [2, 0, 0, 5]:
1319 die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1320 else:
1321 if bowtie_version < [0, 12, 3, 0]:
1322 die("Error: TopHat requires Bowtie 0.12.3 or later")
1323 th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))
1324
1325
1326 # Retrieve a tuple containing the system's version of samtools. Parsed from
1327 # `samtools`
1328 def get_samtools_version():
1329 try:
1330 # Launch samtools to capture its version info
1331 proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1332 samtools_out = proc.communicate()[1]
1333
1334 # Find the version identifier
1335 version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
1336 samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1337 if version_match.group(4):
1338 samtools_version_arr.append(version_match.group(4))
1339 else:
1340 samtools_version_arr.append(0)
1341
1342 return version_match.group(), samtools_version_arr
1343 except OSError, o:
1344 errmsg=fail_str+str(o)+"\n"
1345 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1346 errmsg+="Error: samtools not found on this system"
1347 die(errmsg)
1348
1349 # Make sure the SAM tools are installed and are recent enough to be useful
1350 def check_samtools():
1351 th_log("Checking for Samtools")
1352 global samtools_path
1353 samtools_path=prog_path("samtools")
1354 samtools_version_str, samtools_version_arr = get_samtools_version()
1355 if samtools_version_str == None:
1356 die("Error: Samtools not found on this system")
1357 elif samtools_version_arr[:3] < [0, 1, 7]:
1358 die("Error: TopHat requires Samtools 0.1.7 or later")
1359 th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))
1360
1361
1362
1363 class FastxReader:
1364 def __init__(self, i_file, is_color=0, fname=''):
1365 self.bufline=None
1366 self.format=None
1367 self.ifile=i_file
1368 self.nextRecord=None
1369 self.eof=None
1370 self.fname=fname
1371 self.lastline=None
1372 self.numrecords=0
1373 self.isColor=0
1374 if is_color : self.isColor=1
1375 # determine file type
1376 #no records processed yet, skip custom header lines if any
1377 hlines=10 # allow maximum 10 header lines
1378 self.lastline=" "
1379 while hlines>0 and self.lastline and self.lastline[0] not in "@>" :
1380 self.lastline=self.ifile.readline()
1381 hlines-=1
1382 if self.lastline and self.lastline[0] == '@':
1383 self.format='fastq'
1384 self.nextRecord=self.nextFastq
1385 elif self.lastline[0] == '>':
1386 self.format='fasta'
1387 self.nextRecord=self.nextFasta
1388 else:
1389 die("Error: cannot determine record type in input file %s" % fname)
1390 self.bufline=self.lastline
1391 self.lastline=None
1392
1393 def nextFastq(self):
1394 # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1395 seqid,seqstr,qstr,seq_len='','','',0
1396 if self.eof: return (seqid, seqstr, seq_len, qstr)
1397 fline=self.getLine #shortcut to save a bit of time
1398 line=fline()
1399
1400 if not line : return (seqid, seqstr, seq_len, qstr)
1401 while len(line.rstrip())==0: # skip empty lines
1402 line=fline()
1403 if not line : return (seqid, seqstr,seq_len, qstr)
1404 try:
1405 if line[0] != "@":
1406 raise ValueError("Records in Fastq files should start with '@' character")
1407
1408 seqid = line[1:].rstrip()
1409 seqstr = fline().rstrip()
1410
1411 #There may now be more sequence lines, or the "+" quality marker line:
1412 while True:
1413 line = fline()
1414 if not line:
1415 raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1416 if line[0] == "+":
1417 # -- sequence string ended
1418 #qtitle = line[1:].rstrip()
1419 #if qtitle and qtitle != seqid:
1420 # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1421 # % (seqid, qtitle))
1422 break
1423 seqstr += line.rstrip() #removes trailing newlines
1424 #loop until + found
1425 seq_len = len(seqstr)
1426 #at least one line of quality data should follow
1427 qstrlen=0
1428 #now read next lines as quality values until seq_len is reached
1429 while True:
1430 line=fline()
1431 if not line : break #end of file
1432 qstr += line.rstrip()
1433 qstrlen=len(qstr)
1434 if qstrlen + self.isColor >= seq_len :
1435 break # qv string has reached the length of seq string
1436 #loop until qv has the same length as seq
1437
1438 if self.isColor:
1439 # and qstrlen==seq_len :
1440 if qstrlen==seq_len:
1441 #qual string may have a dummy qv at the beginning, should be stripped
1442 qstr = qstr[1:]
1443 qstrlen -= 1
1444 if qstrlen!=seq_len-1:
1445 raise ValueError("Length mismatch between sequence and quality strings "+ \
1446 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1447 else:
1448 if seq_len != qstrlen :
1449 raise ValueError("Length mismatch between sequence and quality strings "+ \
1450 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1451 except ValueError, err:
1452 die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1453 #return the record
1454 self.numrecords+=1
1455 ##--discard the primer base [NO]
1456 if self.isColor :
1457 seq_len-=1
1458 seqstr = seqstr[1:]
1459 return (seqid, seqstr, seq_len, qstr)
1460
1461 def nextFasta(self):
1462 # returning tuple: (seqID, sequence_string, seq_len)
1463 seqid,seqstr,seq_len='','',0
1464 fline=self.getLine # shortcut to readline function of f
1465 line=fline() # this will use the buffer line if it's there
1466 if not line : return (seqid, seqstr, seq_len, None)
1467 while len(line.rstrip())==0: # skip empty lines
1468 line=fline()
1469 if not line : return (seqid, seqstr, seq_len, None)
1470 try:
1471 if line[0] != ">":
1472 raise ValueError("Records in Fasta files must start with '>' character")
1473 seqid = line[1:].split()[0]
1474 #more sequence lines, or the ">" quality marker line:
1475 while True:
1476 line = fline()
1477 if not line: break
1478 if line[0] == '>':
1479 #next sequence starts here
1480 self.ungetLine()
1481 break
1482 seqstr += line.rstrip()
1483 #loop until '>' found
1484 seq_len = len(seqstr)
1485 if seq_len < 3:
1486 raise ValueError("Read %s too short (%i)." \
1487 % (seqid, seq_len))
1488 except ValueError, err:
1489 die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1490 #return the record and continue
1491 self.numrecords+=1
1492 if self.isColor : # -- discard primer base
1493 seq_len-=1
1494 seqstr=seqstr[1:]
1495 return (seqid, seqstr, seq_len, None)
1496
1497 def getLine(self):
1498 if self.bufline: #return previously buffered line
1499 r=self.bufline
1500 self.bufline=None
1501 return r
1502 else: #read a new line from stream and return it
1503 if self.eof: return None
1504 self.lastline=self.ifile.readline()
1505 if not self.lastline:
1506 self.eof=1
1507 return None
1508 return self.lastline
1509 def ungetLine(self):
1510 if self.lastline is None:
1511 th_logp("Warning: FastxReader called ungetLine() with no prior line!")
1512 self.bufline=self.lastline
1513 self.lastline=None
1514 #< class FastxReader
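# Minimal usage sketch for FastxReader (file name hypothetical; this mirrors
# the pattern check_reads_format() uses below):
#   zf = ZReader("reads.fq.gz", params.system_params)
#   freader = FastxReader(zf.file, is_color=0, fname=zf.fname)
#   seqid, seqstr, seq_len, qstr = freader.nextRecord()
#   zf.close()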
1515
1516 def fa_write(fhandle, seq_id, seq):
1517 """
1518 Write to a file in the FASTA format.
1519
1520 Arguments:
1521 - `fhandle`: A file handle open for writing
1522 - `seq_id`: The sequence id string for this sequence
1523 - `seq`: An unformatted string of the sequence to write
1524 """
1525 line_len = 60
1526 fhandle.write(">" + seq_id + "\n")
1527 for i in xrange((len(seq) + line_len - 1) // line_len):
1528 start = i * line_len
1529 #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq)
1530 if (i+1) * line_len < len(seq):
1531 end = (i+1) * line_len
1532 else:
1533 end = len(seq)
1534 fhandle.write( seq[ start:end ] + "\n")
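# e.g. fa_write(fh, "chr1", "ACGT" * 20) writes a ">chr1" header line followed
# by the 80-base sequence wrapped at 60 characters per line.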
1535
1536 class ZReader:
1537 def __init__(self, filename, sysparams, guess=True):
1538 self.fname=filename
1539 self.file=None
1540 self.fsrc=None
1541 self.popen=None
1542 pipecmd=[]
1543 s=filename.lower()
1544 if s.endswith(".bam"):
1545 # daehwan - want to output mapped reads as well
1546 pipecmd=[prog_path("bam2fastx"), "--all", "-"]
1547 else:
1548 if guess:
1549 if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1550 pipecmd=['gzip']
1551 else:
1552 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1553 pipecmd=['bzip2']
1554 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1555 die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
1556 if len(pipecmd)>0:
1557 if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1558 pipecmd[0]=sysparams.zipper
1559 pipecmd.extend(sysparams.zipper_opts)
1560 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1561 pipecmd[0]=sysparams.zipper
1562 pipecmd.extend(sysparams.zipper_opts)
1563 else: #not guessing, but must still check if it's a compressed file
1564 if use_zpacker and filename.endswith(".z"):
1565 pipecmd=[sysparams.zipper]
1566 pipecmd.extend(sysparams.zipper_opts)
1567
1568 if pipecmd:
1569 pipecmd+=['-cd']
1570 if pipecmd:
1571 try:
1572 self.fsrc=open(self.fname, 'rb')
1573 self.popen=subprocess.Popen(pipecmd,
1574 preexec_fn=subprocess_setup,
1575 stdin=self.fsrc,
1576 stdout=subprocess.PIPE, close_fds=True)
1577 except Exception:
1578 die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1579 self.file=self.popen.stdout
1580 else:
1581 self.file=open(filename)
1582 def close(self):
1583 if self.fsrc: self.fsrc.close()
1584 self.file.close()
1585 if self.popen:
1586 self.popen.wait()
1587 self.popen=None
1588
1589 class ZWriter:
1590 def __init__(self, filename, sysparams):
1591 self.fname=filename
1592 if use_zpacker:
1593 pipecmd=[sysparams.zipper,"-cf", "-"]
1594 self.ftarget=open(filename, "wb")
1595 try:
1596 self.popen=subprocess.Popen(pipecmd,
1597 preexec_fn=subprocess_setup,
1598 stdin=subprocess.PIPE,
1599 stdout=self.ftarget, close_fds=True)
1600 except Exception:
1601 die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
1602 self.file=self.popen.stdin # client writes to this end of the pipe
1603 else: #no compression
1604 self.file=open(filename, "w")
1605 self.ftarget=None
1606 self.popen=None
1607 def close(self):
1608 self.file.close()
1609 if self.ftarget: self.ftarget.close()
1610 if self.popen:
1611 self.popen.wait() #! required to actually flush the pipes (eek!)
1612 self.popen=None
1613
1614 # check_reads_format() examines the first few records in the user files
1615 # to determine the file format
1616 def check_reads_format(params, reads_files):
1617 #seed_len = params.read_params.seed_length
1618 fileformat = params.read_params.reads_format
1619
1620 observed_formats = set([])
1621 # observed_scales = set([])
1622 min_seed_len = 99999
1623 max_seed_len = 0
1624 files = reads_files.split(',')
1625
1626 for f_name in files:
1627 #try:
1628 zf = ZReader(f_name, params.system_params)
1629 #except IOError:
1630 # die("Error: could not open file "+f_name)
1631 freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1632 toread=4 #just sample the first 4 reads
1633 while toread>0:
1634 seqid, seqstr, seq_len, qstr = freader.nextRecord()
1635 if not seqid: break
1636 toread-=1
1637 if seq_len < 20:
1638 th_logp("Warning: found a read < 20bp in "+f_name)
1639 else:
1640 min_seed_len = min(seq_len, min_seed_len)
1641 max_seed_len = max(seq_len, max_seed_len)
1642 zf.close()
1643 observed_formats.add(freader.format)
1644 if len(observed_formats) > 1:
1645 die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1646 fileformat=list(observed_formats)[0]
1647 #if seed_len != None:
1648 # seed_len = max(seed_len, max_seed_len)
1649 #else:
1650 # seed_len = max_seed_len
1651 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1652 th_logp("\tformat:\t\t %s" % fileformat)
1653 if fileformat == "fastq":
1654 quality_scale = "phred33 (default)"
1655 if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1656 quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1657 elif params.read_params.phred64_quals:
1658 quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1659 th_logp("\tquality scale:\t %s" % quality_scale)
1660 elif fileformat == "fasta":
1661 if params.read_params.color:
1662 params.read_params.integer_quals = True
1663
1664 #print seed_len, format, solexa_scale
1665 #NOTE: seed_len will be re-evaluated later by prep_reads
1666 return TopHatParams.ReadParams(params.read_params.solexa_quals,
1667 params.read_params.phred64_quals,
1668 params.read_params.quals,
1669 params.read_params.integer_quals,
1670 params.read_params.color,
1671 params.read_params.library_type,
1672 #seed_len,
1673 params.read_params.seed_length,
1674 fileformat,
1675 params.read_params.mate_inner_dist,
1676 params.read_params.mate_inner_dist_std_dev,
1677 params.read_params.read_group_id,
1678 params.read_params.sample_id,
1679 params.read_params.library_id,
1680 params.read_params.description,
1681 params.read_params.seq_platform_unit,
1682 params.read_params.seq_center,
1683 params.read_params.seq_run_date,
1684 params.read_params.seq_platform)
1685
1686 def log_tail(logfile, lines=1):
1687 f=open(logfile, "r")
1688 f.seek(0, 2)
1689 fbytes= f.tell()
1690 size=lines
1691 block=-1
1692 while size > 0 and fbytes+block*1024 > 0:
1693 if (fbytes+block*1024 > 0):
1694 ##Seek back once more, if possible
1695 f.seek( block*1024, 2 )
1696 else:
1697 #Seek to the beginning
1698 f.seek(0, 0)
1699 data= f.read( 1024 )
1700 linesFound= data.count('\n')
1701 size -= linesFound
1702 block -= 1
1703 if (fbytes + block*1024 > 0):
1704 f.seek(block*1024, 2)
1705 else:
1706 f.seek(0,0)
1707 f.readline() # find a newline
1708 lastBlocks= list( f.readlines() )
1709 f.close()
1710 return "\n".join(lastBlocks[-lines:])
1711
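# For reference, a simpler (though less memory-frugal on huge logs) equivalent
# of log_tail() using collections.deque; this sketch is illustrative only:
def _log_tail_simple(logfile, lines=1):
    from collections import deque
    f = open(logfile, "r")
    last = deque(f, maxlen=lines) # keep only the final `lines` lines
    f.close()
    return "".join(last)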
1712 # Format a timedelta as a pretty string.
1713 # (Spans of a day or more include the number of days.)
1714 def formatTD(td):
1715 days = td.days
1716 hours = td.seconds // 3600
1717 minutes = (td.seconds % 3600) // 60
1718 seconds = td.seconds % 60
1719
1720 if days > 0:
1721 return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
1722 else:
1723 return '%02d:%02d:%02d' % (hours, minutes, seconds)
1724
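# Worked example: a 3723-second span formats as '01:02:03', while a span of
# 1 day, 2 hours, 3 minutes and 4 seconds formats as '1 days 02:03:04'.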
1725 class PrepReadsInfo:
1726 def __init__(self, fname, side):
1727 try:
1728 f=open(fname,"r")
1729 self.min_len=int(f.readline().split("=")[-1])
1730 self.max_len=int(f.readline().split("=")[-1])
1731 self.in_count=int(f.readline().split("=")[-1])
1732 self.out_count=int(f.readline().split("=")[-1])
1733 if (self.out_count==0) or (self.max_len<16):
1734 raise Exception()
1735 except Exception, e:
1736 die(fail_str+"Error retrieving prep_reads info.")
1737 th_logp("\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count))
1738 if self.min_len < 20:
1739 th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places")
1740
1741
1742 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
1743 # generate the prep_reads command-line arguments
1744 filter_cmd = [prog_path("prep_reads")]
1745 filter_cmd.extend(params.cmd())
1746
1747 if params.read_params.reads_format == "fastq":
1748 filter_cmd += ["--fastq"]
1749 elif params.read_params.reads_format == "fasta":
1750 filter_cmd += ["--fasta"]
1751 if hits_to_filter:
1752 filter_cmd += ["--flt-hits="+hits_to_filter]
1753 if aux_file:
1754 filter_cmd += ["--aux-outfile="+aux_file]
1755 if index_file:
1756 filter_cmd += ["--index-outfile="+index_file]
1757 if filter_reads:
1758 filter_cmd += ["--flt-reads="+filter_reads]
1759 filter_cmd.append(reads_list)
1760 if quals_list:
1761 filter_cmd.append(quals_list)
1762 return filter_cmd
1763
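# For a typical FASTQ run the argument list built above serializes to
# something like (paths are hypothetical):
#   prep_reads <params.cmd() flags> --fastq \
#     --aux-outfile=tophat_out/left_kept_reads.info \
#     --index-outfile=tmp/left_kept_reads.fq.index reads_1.fq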
1764 # Calls the prep_reads executable, which prepares an internal read library.
1765 # The read library features reads with monotonically increasing integer IDs.
1766 # prep_reads also filters out very low-complexity or garbage reads, as well as
1767 # polyA reads.
1768 #--> returns the tuple (internal_read_library_file, read_info)
1769 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1770 reads_suffix = ".fq"
1771
1772 # for parallelization, we don't compress the read files
1773 do_use_zpacker = use_zpacker
1774 if params.system_params.num_threads > 1:
1775 do_use_zpacker = False
1776
1777 if do_use_zpacker: reads_suffix += ".z"
1778
1779 output_name=side+"_kept_reads"
1780 kept_reads_filename = tmp_dir + output_name + reads_suffix
1781
1782 if os.path.exists(kept_reads_filename):
1783 os.remove(kept_reads_filename)
1784 kept_reads = open(kept_reads_filename, "wb")
1785 log_fname=logging_dir + "prep_reads.log"
1786 filter_log = open(log_fname,"w")
1787
1788 info_file=output_dir+output_name+".info"
1789 index_file=kept_reads_filename+".index"
1790
1791 filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
1792 shell_cmd = ' '.join(filter_cmd)
1793 #finally, add the compression pipe
1794 zip_cmd=[]
1795 if do_use_zpacker:
1796 zip_cmd=[ params.system_params.zipper ]
1797 zip_cmd.extend(params.system_params.zipper_opts)
1798 zip_cmd.extend(['-c','-'])
1799 shell_cmd +=' | '+' '.join(zip_cmd)
1800 shell_cmd += ' >' +kept_reads_filename
1801 retcode=0
1802 try:
1803 print >> run_log, shell_cmd
1804 if do_use_zpacker:
1805 filter_proc = subprocess.Popen(filter_cmd,
1806 stdout=subprocess.PIPE,
1807 stderr=filter_log)
1808 zip_proc=subprocess.Popen(zip_cmd,
1809 preexec_fn=subprocess_setup,
1810 stdin=filter_proc.stdout,
1811 stdout=kept_reads)
1812 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1813 zip_proc.communicate()
1814 retcode=filter_proc.poll()
1815 if retcode==0:
1816 retcode=zip_proc.poll()
1817 else:
1818 retcode = subprocess.call(filter_cmd,
1819 stdout=kept_reads,
1820 stderr=filter_log)
1821 if retcode:
1822 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1823
1824 except OSError, o:
1825 errmsg=fail_str+str(o)
1826 die(errmsg+"\n"+log_tail(log_fname))
1827
1828 kept_reads.close()
1829 return kept_reads_filename, PrepReadsInfo(info_file, side)
1830
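# With the zpacker active, prep_reads() above effectively runs the pipeline
#   prep_reads ... reads_1.fq | gzip -c - > tmp/left_kept_reads.fq.z
# (paths hypothetical; stderr goes to prep_reads.log in the logging
# directory). With multiple threads, compression is skipped and prep_reads
# writes the kept-reads file directly.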
1831
1832 # Call bowtie
1833 def bowtie(params,
1834 bwt_idx_prefix,
1835 reads_list,
1836 reads_format,
1837 num_mismatches,
1838 mapped_reads,
1839 unmapped_reads,
1840 extra_output = "",
1841 mapping_type = "read_against_genome",
1842 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1843 start_time = datetime.now()
1844 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1845 reads_file=reads_list[0]
1846 readfile_basename=getFileBaseName(reads_file)
1847
1848 g_mapping, t_mapping, seg_mapping = False, False, False
1849 if mapping_type == "read_against_transcriptome":
1850 t_mapping = True
1851 elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
1852 seg_mapping = True
1853 else:
1854 g_mapping = True
1855
1856 bowtie_str = "Bowtie"
1857 if params.bowtie2:
1858 bowtie_str += "2"
1859
1860 if seg_mapping:
1861 if not params.bowtie2:
1862 backup_bowtie_alignment_option = params.bowtie_alignment_option
1863 params.bowtie_alignment_option = "-v"
1864
1865 if t_mapping:
1866 th_log("Mapping %s against transcriptome %s with %s %s" % (readfile_basename,
1867 bwt_idx_name, bowtie_str, extra_output))
1868 else:
1869 th_log("Mapping %s against %s with %s %s" % (readfile_basename,
1870 bwt_idx_name, bowtie_str, extra_output))
1871 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1872 #bwt_mapped=mapped_reads
1873 unmapped_reads_out=None
1874 if unmapped_reads:
1875 unmapped_reads_out=unmapped_reads+".fq"
1876 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1877 if use_FIFO:
1878 global unmapped_reads_fifo
1879 unmapped_reads_fifo=unmapped_reads+".fifo"
1880 if os.path.exists(unmapped_reads_fifo):
1881 os.remove(unmapped_reads_fifo)
1882 try:
1883 os.mkfifo(unmapped_reads_fifo)
1884 except OSError, o:
1885 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1886 unmapped_reads_out+=".z"
1887
1888 # Launch Bowtie
1889 try:
1890 bowtie_cmd = [bowtie_path]
1891 if reads_format == "fastq":
1892 bowtie_cmd += ["-q"]
1893 elif reads_format == "fasta":
1894 bowtie_cmd += ["-f"]
1895
1896 if params.read_params.color:
1897 bowtie_cmd += ["-C", "--col-keepends"]
1898 if unmapped_reads:
1899 bowtie_cmd += ["--un", unmapped_reads_out]
1900
1901 unzip_cmd=None
1902 bam_input=False
1903 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1904 bam_input=True
1905 unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
1906 if reads_format:
1907 unzip_cmd.append("--" + reads_format)
1908 unzip_cmd+=[reads_list[0]]
1909
1910 if use_zpacker and (unzip_cmd is None):
1911 unzip_cmd=[ params.system_params.zipper ]
1912 unzip_cmd.extend(params.system_params.zipper_opts)
1913 unzip_cmd+=['-cd']
1914
1915 fifo_pid=None
1916 if use_FIFO:
1917 unm_zipcmd=[ params.system_params.zipper ]
1918 unm_zipcmd.extend(params.system_params.zipper_opts)
1919 unm_zipcmd+=['-c']
1920 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1921 fifo_pid=os.fork()
1922 if fifo_pid==0:
1923 def on_sig_exit(sig, func=None):
1924 os._exit(os.EX_OK)
1925 signal.signal(signal.SIGTERM, on_sig_exit)
1926 subprocess.call(unm_zipcmd,
1927 stdin=open(unmapped_reads_fifo, "r"),
1928 stdout=open(unmapped_reads_out, "wb"))
1929 os._exit(os.EX_OK)
1930
1931 fix_map_cmd = [prog_path('fix_map_ordering')]
1932
1933 if params.bowtie2:
1934 if t_mapping or g_mapping:
1935 max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1936 max_penalty, min_penalty = int(max_penalty), int(min_penalty)
1937 min_score = (max_penalty - 1) * num_mismatches
1938 fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1939
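# Worked example of the score arithmetic above: with bowtie2's default
# --mp 6,2 and num_mismatches = 2, min_score = (6 - 1) * 2 = 10, so
# fix_map_ordering is invoked with "--bowtie2-min-score 10".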
1940 samzip_cmd=None
1941
1942 # daehwan - just use bam format for multi-threading.
1943 if unmapped_reads or params.system_params.num_threads >= 1:
1944 #mapping on the genome
1945 #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
1946 mapped_reads += ".bam"
1947
1948 temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1949 fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1950 if not params.read_params.color and unmapped_reads:
1951 unmapped_reads_out = unmapped_reads+".bam"
1952 fix_map_cmd += [unmapped_reads_out]
1953 else:
1954 #mapping on segment_juncs (spliced potential junctions)
1955 # don't use SAM headers, could be slow -- better use compressed SAM directly
1956 mapped_reads += ".sam"
1957 fix_map_cmd += ["-"]
1958 if use_zpacker:
1959 fix_map_cmd += ["-"]
1960 samzip_cmd=[ params.system_params.zipper ]
1961 samzip_cmd.extend(params.system_params.zipper_opts)
1962 samzip_cmd += ["-c"]
1963 mapped_reads += ".z"
1964 else:
1965 fix_map_cmd += [mapped_reads]
1966
1967 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1968
1969 if t_mapping:
1970 max_hits = params.t_max_hits
1971 elif seg_mapping:
1972 max_hits = params.max_seg_hits
1973 else:
1974 max_hits = params.max_hits
1975
1976 if num_mismatches > 3:
1977 num_mismatches = 3
1978
1979 if params.bowtie2:
1980 if seg_mapping:
1981 # since bowtie2 does not suppress reads that map to many places,
1982 # we suppress those in segment_juncs and long_spanning_reads.
1983 bowtie_cmd += ["-k", str(max_hits + 1)]
1984 else:
1985 bowtie_cmd += ["-k", str(max_hits)]
1986
1987 bowtie2_params = params.bowtie2_params
1988 if seg_mapping:
1989 # after intensive testing,
1990 # the following parameters seem to run faster than Bowtie1 while being just
1991 # as sensitive, though there may be room for further improvement.
1992 bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1993 bowtie_cmd += ["-i", "C,2,0"]
1994 bowtie_cmd += ["-L", "20"]
1995 # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
1996 else:
1997 bowtie2_preset = ""
1998 if bowtie2_params.very_fast:
1999 bowtie2_preset = "--very-fast"
2000 elif bowtie2_params.fast:
2001 bowtie2_preset = "--fast"
2002 elif bowtie2_params.sensitive:
2003 bowtie2_preset = "--sensitive"
2004 elif bowtie2_params.very_sensitive:
2005 bowtie2_preset = "--very-sensitive"
2006
2007 if bowtie2_preset != "":
2008 bowtie_cmd += [bowtie2_preset]
2009 else:
2010 bowtie_cmd += ["-D", str(bowtie2_params.D),
2011 "-R", str(bowtie2_params.R),
2012 "-N", str(bowtie2_params.N),
2013 "-L", str(bowtie2_params.L),
2014 "-i", bowtie2_params.i]
2015
2016 # "--n-ceil" is not correctly parsed in Bowtie2,
2017 # I (daehwan) already talked to Ben who will fix the problem.
2018 bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
2019 "--gbar", str(bowtie2_params.gbar),
2020 "--mp", bowtie2_params.mp,
2021 "--np", str(bowtie2_params.np),
2022 "--rdg", bowtie2_params.rdg,
2023 "--rfg", bowtie2_params.rfg,
2024 "--score-min", bowtie2_params.score_min]
2025
2026 else:
2027 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
2028 "-k", str(max_hits),
2029 "-m", str(max_hits),
2030 "-S"]
2031
2032 bowtie_cmd += ["-p", str(params.system_params.num_threads)]
2033
2034 if params.bowtie2: #always use headerless SAM file
2035 bowtie_cmd += ["--sam-no-hd"]
2036 else:
2037 bowtie_cmd += ["--sam-nohead"]
2038
2039 if not params.bowtie2:
2040 if multihits_out:
2041 bowtie_cmd += ["--max", multihits_out]
2042 else:
2043 bowtie_cmd += ["--max", "/dev/null"]
2044
2045 if params.bowtie2:
2046 bowtie_cmd += ["-x"]
2047
2048 bowtie_cmd += [ bwt_idx_prefix ]
2049 bowtie_proc=None
2050 shellcmd=""
2051 unzip_proc=None
2052 samzip_proc=None
2053
2054 if multihits_out:
2055 #special prefilter bowtie run: we use prep_reads on the fly
2056 #in order to get multi-mapped reads to exclude later
2057 prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2058 preplog_fname=logging_dir + "prep_reads_prefilter.log"
2059 prepfilter_log = open(preplog_fname,"w")
2060 unzip_proc = subprocess.Popen(prep_cmd,
2061 stdout=subprocess.PIPE,
2062 stderr=prepfilter_log)
2063 shellcmd=' '.join(prep_cmd) + "|"
2064 else:
2065 z_input=use_zpacker and reads_file.endswith(".z")
2066 if z_input:
2067 unzip_proc = subprocess.Popen(unzip_cmd,
2068 stdin=open(reads_file, "rb"),
2069 stdout=subprocess.PIPE)
2070 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2071 else:
2072 #must be uncompressed fastq input (unmapped reads from a previous run)
2073 #or a BAM file with unmapped reads
2074 if bam_input:
2075 unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
2076 shellcmd=' '.join(unzip_cmd) + "|"
2077 else:
2078 bowtie_cmd += [reads_file]
2079 bowtie_proc = subprocess.Popen(bowtie_cmd,
2080 stdout=subprocess.PIPE,
2081 stderr=bwt_log)
2082 if unzip_proc:
2083 #input is compressed OR prep_reads is used as a filter
2084 bowtie_cmd += ['-']
2085 bowtie_proc = subprocess.Popen(bowtie_cmd,
2086 stdin=unzip_proc.stdout,
2087 stdout=subprocess.PIPE,
2088 stderr=bwt_log)
2089 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2090
2091 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2092 if samzip_cmd:
2093 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2094 fix_order_proc = subprocess.Popen(fix_map_cmd,
2095 stdin=bowtie_proc.stdout,
2096 stdout=subprocess.PIPE)
2097 bowtie_proc.stdout.close() # needed?
2098 samzip_proc = subprocess.Popen(samzip_cmd,
2099 preexec_fn=subprocess_setup,
2100 stdin=fix_order_proc.stdout,
2101 stdout=open(mapped_reads, "wb"))
2102 fix_order_proc.stdout.close()
2103 else:
2104 #write BAM output directly
2105 fix_order_proc = subprocess.Popen(fix_map_cmd,
2106 stdin=bowtie_proc.stdout)
2107 # stdout=open(mapped_reads, "w"))
2108 bowtie_proc.stdout.close()
2109
2110 # shellcmd += " > " + mapped_reads
2111 print >> run_log, shellcmd
2112 if samzip_proc:
2113 samzip_proc.communicate()
2114 else:
2115 fix_order_proc.communicate()
2116 if use_FIFO:
2117 if fifo_pid and not os.path.exists(unmapped_reads_out):
2118 try:
2119 os.kill(fifo_pid, signal.SIGTERM)
2120 except:
2121 pass
2122
2123 except OSError, o:
2124 die(fail_str+"Error: "+str(o))
2125
2126 # Success
2127 #finish_time = datetime.now()
2128 #duration = finish_time - start_time
2129 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2130 if use_FIFO:
2131 try:
2132 os.remove(unmapped_reads_fifo)
2133 except:
2134 pass
2135 if multihits_out and not os.path.exists(multihits_out):
2136 open(multihits_out, "w").close()
2137
2138 if seg_mapping:
2139 if not params.bowtie2:
2140 params.bowtie_alignment_option = backup_bowtie_alignment_option
2141
2142 return (mapped_reads, unmapped_reads_out)
2143
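# Putting the pieces above together: for a compressed-input genome mapping,
# the shellcmd string echoed to run.log describes a pipeline of the form
# (paths hypothetical):
#   gzip -cd < left_kept_reads.fq.z | bowtie2 ... -x genome_idx - \
#     | fix_map_ordering --sam-header genome_idx.bwt.samheader.sam - \
#       left_kept_reads.mapped.bam left_kept_reads_unmapped.bam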
2144
2145 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2146 def get_gtf_juncs(gff_annotation):
2147 th_log("Reading known junctions from GTF file")
2148 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2149
2150 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2151
2152 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2153 gtf_juncs_out = open(gtf_juncs_out_name, "w")
2154
2155 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2156 try:
2157 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2158 retcode = subprocess.call(gtf_juncs_cmd,
2159 stderr=gtf_juncs_log,
2160 stdout=gtf_juncs_out)
2161 # gtf_juncs returned an error
2162 if retcode == 1:
2163 th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2164 return (False, gtf_juncs_out_name)
2165 elif retcode != 0:
2166 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2167
2168 # gtf_juncs not found
2169 except OSError, o:
2170 errmsg=fail_str+str(o)+"\n"
2171 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2172 errmsg+="Error: gtf_juncs not found on this system"
2173 die(errmsg)
2174 return (True, gtf_juncs_out_name)
2175
2176 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
2177 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2178 th_log("Indexing splices")
2179 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2180
2181 #user_splices_out_prefix = output_dir + "user_splices_idx"
2182
2183 if is_bowtie2:
2184 bowtie_build_cmd = [prog_path("bowtie2-build")]
2185 else:
2186 bowtie_build_cmd = [prog_path("bowtie-build")]
2187
2188 if color:
2189 bowtie_build_cmd += ["-C"]
2190
2191 bowtie_build_cmd += [external_splice_prefix + ".fa",
2192 external_splice_prefix]
2193 try:
2194 print >> run_log, " ".join(bowtie_build_cmd)
2195 retcode = subprocess.call(bowtie_build_cmd,
2196 stdout=bowtie_build_log)
2197
2198 if retcode != 0:
2199 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2200 except OSError, o:
2201 errmsg=fail_str+str(o)+"\n"
2202 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2203 errmsg+="Error: bowtie-build not found on this system"
2204 die(errmsg)
2205 return external_splice_prefix
2206
2207 # Build a splice index from a .juncs file, suitable for use with specified read
2208 # (or read segment) lengths
2209 def build_juncs_index(is_bowtie2,
2210 min_anchor_length,
2211 max_seg_len,
2212 juncs_prefix,
2213 external_juncs,
2214 external_insertions,
2215 external_deletions,
2216 external_fusions,
2217 reference_fasta,
2218 color):
2219 th_log("Retrieving sequences for splices")
2220
2221 juncs_file_list = ",".join(external_juncs)
2222 insertions_file_list = ",".join(external_insertions)
2223 deletions_file_list = ",".join(external_deletions)
2224 fusions_file_list = ",".join(external_fusions)
2225
2226 # do not use insertions and deletions in case of Bowtie2
2227 if is_bowtie2:
2228 insertions_file_list = "/dev/null"
2229 deletions_file_list = "/dev/null"
2230
2231 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2232
2233 external_splices_out_prefix = tmp_dir + juncs_prefix
2234 external_splices_out_name = external_splices_out_prefix + ".fa"
2235
2236 external_splices_out = open(external_splices_out_name, "w")
2237 # juncs_db_cmd = [bin_dir + "juncs_db",
2238 juncs_db_cmd = [prog_path("juncs_db"),
2239 str(min_anchor_length),
2240 str(max_seg_len),
2241 juncs_file_list,
2242 insertions_file_list,
2243 deletions_file_list,
2244 fusions_file_list,
2245 reference_fasta]
2246 try:
2247 print >> run_log, " ".join(juncs_db_cmd)
2248 retcode = subprocess.call(juncs_db_cmd,
2249 stderr=juncs_db_log,
2250 stdout=external_splices_out)
2251
2252 if retcode != 0:
2253 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2254 # juncs_db not found
2255 except OSError, o:
2256 errmsg=fail_str+str(o)+"\n"
2257 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2258 errmsg+="Error: juncs_db not found on this system"
2259 die(errmsg)
2260
2261 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2262 return external_splices_out_prefix
2263
2264 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2265 """ Build a bowtie index from a FASTA file.
2266
2267 Arguments:
2268 - `fasta_fname`: File path to FASTA file.
2269 - `out_dir`: Output directory to place index in. (includes os.sep)
2270
2271 Returns:
2272 - The path to the Bowtie index.
2273 """
2274 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2275
2276 if is_bowtie2:
2277 bowtie_idx_cmd = [prog_path("bowtie2-build")]
2278 else:
2279 bowtie_idx_cmd = [prog_path("bowtie-build")]
2280
2281 if color:
2282 bowtie_idx_cmd += ["-C"]
2283
2284 bowtie_idx_cmd += [fasta_fname,
2285 bwt_idx_path]
2286 try:
2287 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2288 print >> run_log, " ".join(bowtie_idx_cmd)
2289 retcode = subprocess.call(bowtie_idx_cmd,
2290 stdout=open(os.devnull, "w"),
2291 stderr=open(os.devnull, "w"))
2292 if retcode != 0:
2293 die(fail_str + "Error: Couldn't build bowtie index with err = "
2294 + str(retcode))
2295 except OSError, o:
2296 errmsg=fail_str+str(o)+"\n"
2297 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2298 errmsg+="Error: bowtie-build not found on this system"
2299 die(errmsg)
2300
2301 return bwt_idx_path
2302
2303 # Print out the sam header, embedding the user's specified library properties.
2304 # FIXME: also needs SQ dictionary lines
2305 def write_sam_header(read_params, sam_file):
2306 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2307 if read_params.read_group_id and read_params.sample_id:
2308 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2309 read_params.sample_id)
2310 if read_params.library_id:
2311 rg_str += "\tLB:%s" % read_params.library_id
2312 if read_params.description:
2313 rg_str += "\tDS:%s" % read_params.description
2314 if read_params.seq_platform_unit:
2315 rg_str += "\tPU:%s" % read_params.seq_platform_unit
2316 if read_params.seq_center:
2317 rg_str += "\tCN:%s" % read_params.seq_center
2318 if read_params.mate_inner_dist:
2319 rg_str += "\tPI:%s" % read_params.mate_inner_dist
2320 if read_params.seq_run_date:
2321 rg_str += "\tDT:%s" % read_params.seq_run_date
2322 if read_params.seq_platform:
2323 rg_str += "\tPL:%s" % read_params.seq_platform
2324
2325 print >> sam_file, rg_str
2326 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
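# For example (values hypothetical), a run supplying a read group ID "L1",
# sample "S1" and library "lib1" would produce a header like:
#   @HD VN:1.0 SO:coordinate
#   @RG ID:L1 SM:S1 LB:lib1
#   @PG ID:TopHat VN:2.0.0 CL:tophat -o tophat_out genome_idx reads_1.fq
# (fields are tab-separated in the actual output).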
2327
2328 # Write final TopHat output, via tophat_reports and wiggles
2329 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2330 th_log("Reporting output tracks")
2331 left_maps, right_maps = mappings
2332 left_reads, right_reads = readfiles
2333 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2334 left_maps = ','.join(left_maps)
2335
2336 if len(right_maps) > 0:
2337 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2338 right_maps = ','.join(right_maps)
2339 log_fname = logging_dir + "reports.log"
2340 report_log = open(log_fname, "w")
2341 junctions = output_dir + "junctions.bed"
2342 insertions = output_dir + "insertions.bed"
2343 deletions = output_dir + "deletions.bed"
2344 accepted_hits = output_dir + "accepted_hits"
2345 report_cmdpath = prog_path("tophat_reports")
2346 fusions = output_dir + "fusions.out"
2347 report_cmd = [report_cmdpath]
2348
2349 alignments_output_filename = tmp_dir + "accepted_hits"
2350
2351 report_cmd.extend(params.cmd())
2352 report_cmd += ["--sam-header", sam_header_filename]
2353 if params.report_secondary_alignments:
2354 report_cmd += ["--report-secondary-alignments"]
2355
2356 report_cmd.extend(["--samtools="+samtools_path])
2357 report_cmd.extend([ref_fasta,
2358 junctions,
2359 insertions,
2360 deletions,
2361 fusions,
2362 alignments_output_filename,
2363 left_maps,
2364 left_reads])
2365
2366 if len(right_maps) > 0 and right_reads:
2367 report_cmd.append(right_maps)
2368 report_cmd.append(right_reads)
2369
2370 # -- tophat_reports now produces (uncompressed) BAM stream,
2371 try:
2372 if params.report_params.convert_bam:
2373 if params.report_params.sort_bam:
2374 print >> run_log, " ".join(report_cmd)
2375 report_proc=subprocess.call(report_cmd,
2376 preexec_fn=subprocess_setup,
2377 stderr=report_log)
2378
2379 bam_parts = []
2380 for i in range(params.system_params.num_threads):
2381 bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2382 if os.path.exists(bam_part_filename):
2383 bam_parts.append(bam_part_filename)
2384 else:
2385 break
2386
2387 num_bam_parts = len(bam_parts)
2388 pids = [0 for i in range(num_bam_parts)]
2389 sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2390 #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2391 #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2392 for i in range(num_bam_parts):
2393 bamsort_cmd = [samtools_path,
2394 "sort",
2395 bam_parts[i],
2396 sorted_bam_parts[i]]
2397
2398 sorted_bam_parts[i] += ".bam"
2399 print >> run_log, " ".join(bamsort_cmd)
2400 pid = os.fork()
2401 if pid == 0:
2402 subprocess.call(bamsort_cmd,
2403 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2404 os._exit(os.EX_OK)
2405 else:
2406 pids[i] = pid
2407
2408 for i in range(len(pids)):
2409 if pids[i] > 0:
2410 os.waitpid(pids[i], 0)
2411 pids[i] = 0
2412
2413 # for bam_part in bam_parts:
2414 # os.remove(bam_part)
2415
2416 if num_bam_parts > 1:
2417 bammerge_cmd = [samtools_path,
2418 "merge",
2419 "-h",
2420 sam_header_filename,
2421 "%s.bam" % accepted_hits]
2422 bammerge_cmd += sorted_bam_parts
2423
2424 print >> run_log, " ".join(bammerge_cmd)
2425 subprocess.call(bammerge_cmd,
2426 stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2427
2428 for sorted_bam_part in sorted_bam_parts:
2429 os.remove(sorted_bam_part)
2430 else:
2431 os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2432
2433 else:
2434 print >> run_log, " ".join(report_cmd)
2435 report_proc=subprocess.call(report_cmd,
2436 preexec_fn=subprocess_setup,
2437 stdout=open(accepted_hits,"wb"),
2438 stderr=report_log)
2439 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2440 else:
2441 print >> run_log, " ".join(report_cmd)
2442 report_proc=subprocess.call(report_cmd,
2443 preexec_fn=subprocess_setup,
2444 stdout=open(accepted_hits,"wb"),
2445 stderr=report_log)
2446 tmp_sam = output_dir + "accepted_hits.sam"
2447
2448 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2449 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2450 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2451 tmp_sam_file = open(tmp_sam, "w")
2452 ret = subprocess.call(bam_to_sam_cmd,
2453 stdout=tmp_sam_file,
2454 stderr=bam_to_sam_log)
2455 tmp_sam_file.close()
2456 os.remove(accepted_hits)
2457
2458 except OSError, o:
2459 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2460
2461 return junctions
2462
2463
2464 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2465 # for each segment. This function needs to be fixed to support mixed read-length
2466 # inputs.
2467 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
2468 i = num_files_prev + 1
2469 while i <= num_files:
2470 segfname=prefix+("_seg%d" % i)+extension
2471 out_segf.append(ZWriter(segfname,params.system_params))
2472 i += 1
2473
2474 def split_reads(reads_filename,
2475 prefix,
2476 fasta,
2477 params,
2478 segment_length):
2479 #reads_file = open(reads_filename)
2480 zreads = ZReader(reads_filename, params.system_params, False)
2481 out_segfiles = []
2482
2483 if fasta:
2484 extension = ".fa"
2485 else:
2486 extension = ".fq"
2487 if use_zpacker: extension += ".z"
2488 def convert_color_to_bp(color_seq):
2489 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2490 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2491 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2492 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2493 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2494 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2495
2496 base = color_seq[0]
2497 bp_seq = base
2498 for ch in color_seq[1:]:
2499 base = decode_dic[base+ch]
2500 bp_seq += base
2501 return bp_seq
2502
2503 def convert_bp_to_color(bp_seq):
2504 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2505 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2506 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2507 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2508 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2509 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2510 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2511 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2512 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2513
2514 base = bp_seq[0]
2515 color_seq = base
2516 for ch in bp_seq[1:]:
2517 color_seq += encode_dic[base + ch]
2518 base = ch
2519
2520 return color_seq
2521
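# Worked example of the dibase round-trip above: convert_bp_to_color("ATTGGC")
# yields "A30103" (the primer base followed by one color digit per adjacent
# base pair), and convert_color_to_bp("A30103") decodes back to "ATTGGC".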
2522 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2523 if color:
2524 color_offset = 1
2525 read_seq_temp = convert_color_to_bp(read_seq)
2526
2527 seg_num = 1
2528 while seg_num + 1 < len(offsets):
2529 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2530 return
2531 seg_num += 1
2532 else:
2533 color_offset = 0
2534
2535 seg_num = 0
2536 last_seq_offset = 0
2537 while seg_num + 1 < len(offsets):
2538 f = out_segf[seg_num].file
2539 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2540 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2541 if color:
2542 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2543 else:
2544 print >> f, seg_seq
2545 if not fasta:
2546 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2547 print >> f, "+"
2548 print >> f, seg_qual
2549 seg_num += 1
2550 last_seq_offset = offsets[seg_num]
2551
2552 line_state = 0
2553 read_name = ""
2554 read_seq = ""
2555 read_quals = ""
2556 num_segments = 0
2557 offsets = []
2558 for line in zreads.file:
2559 if line.strip() == "":
2560 continue
2561 if line_state == 0:
2562 read_name = line.strip()
2563 elif line_state == 1:
2564 read_seq = line.strip()
2565
2566 read_length = len(read_seq)
2567 tmp_num_segments = read_length / segment_length
2568 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2569
2570 # Bowtie's minimum read length here is 20bp, so if the last segment
2571 # is between 20 and segment_length bp long, go ahead and write it out
2572 if read_length % segment_length >= min(segment_length - 2, 20):
2573 offsets.append(read_length)
2574 tmp_num_segments += 1
2575 else:
2576 offsets[-1] = read_length
2577
2578 if tmp_num_segments == 1:
2579 offsets = [0, read_length]
2580
2581 if tmp_num_segments > num_segments:
2582 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2583 num_segments = tmp_num_segments
2584
2585 if fasta:
2586 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2587 elif line_state == 2:
2588 line = line.strip()
2589 else:
2590 read_quals = line.strip()
2591 if not fasta:
2592 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2593
2594 line_state += 1
2595 if fasta:
2596 line_state %= 2
2597 else:
2598 line_state %= 4
2599 zreads.close()
2600 out_fnames=[]
2601 for zf in out_segfiles:
2602 zf.close()
2603 out_fnames.append(zf.fname)
2604 #return [o.fname for o in out_segfiles]
2605 return out_fnames
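# Worked example of the offset arithmetic above: with segment_length=25, a
# 75bp read yields offsets [0, 25, 50, 75] (three 25bp segments), while a
# 70bp read leaves a 20bp remainder that passes the min(23, 20) threshold,
# giving offsets [0, 25, 50, 70] and a short 20bp third segment.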
2606
2607 # Find possible splice junctions using the "closure search" strategy, and report
2608 # them in closure.juncs. Calls the executable closure_juncs.
2609 def junctions_from_closures(params,
2610 sam_header_filename,
2611 left_maps,
2612 right_maps,
2613 ref_fasta):
2614 th_log("Searching for junctions via mate-pair closures")
2615
2616
2617 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2618 #if len(maps) == 0:
2619 # return None
2620 slash = left_maps[0].rfind('/')
2621 juncs_out = ""
2622 if slash != -1:
2623 juncs_out += left_maps[0][:slash+1]
2624 fusions_out = juncs_out
2625
2626 juncs_out += "closure.juncs"
2627 fusions_out += "closure.fusions"
2628
2629 juncs_log = open(logging_dir + "closure.log", "w")
2630 juncs_cmdpath=prog_path("closure_juncs")
2631 juncs_cmd = [juncs_cmdpath]
2632
2633 left_maps = ','.join(left_maps)
2634 right_maps = ','.join(right_maps)
2635
2636 juncs_cmd.extend(params.cmd())
2637 juncs_cmd.extend(["--sam-header", sam_header_filename,
2638 juncs_out,
2639 fusions_out,
2640 ref_fasta,
2641 left_maps,
2642 right_maps])
2643 try:
2644 print >> run_log, ' '.join(juncs_cmd)
2645 retcode = subprocess.call(juncs_cmd,
2646 stderr=juncs_log)
2647
2648 # closure_juncs returned an error
2649 if retcode != 0:
2650 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2651 # closure_juncs not found
2652 except OSError, o:
2653 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2654 th_logp(fail_str + "Error: closure_juncs not found on this system")
2655 die(str(o))
2656 return [juncs_out]
2657
2658 # Find possible junctions by examining coverage and split segments in the initial
2659 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2660 # segment.insertions, and segment.deletions. Calls the executable
2661 # segment_juncs
2662 def junctions_from_segments(params,
2663 sam_header_filename,
2664 left_reads,
2665 left_reads_map,
2666 left_seg_maps,
2667 right_reads,
2668 right_reads_map,
2669 right_seg_maps,
2670 unmapped_reads,
2671 reads_format,
2672 ref_fasta):
2673 if left_reads_map != left_seg_maps[0]:
2674 th_log("Searching for junctions via segment mapping")
2675 out_path=getFileDir(left_seg_maps[0])
2676 juncs_out=out_path+"segment.juncs"
2677 insertions_out=out_path+"segment.insertions"
2678 deletions_out =out_path+"segment.deletions"
2679 fusions_out = out_path+"segment.fusions"
2680
2681 left_maps = ','.join(left_seg_maps)
2682 log_fname = logging_dir + "segment_juncs.log"
2683 segj_log = open(log_fname, "w")
2684 segj_cmd = [prog_path("segment_juncs")]
2685
2686 segj_cmd.extend(params.cmd())
2687 segj_cmd.extend(["--sam-header", sam_header_filename,
2688 "--ium-reads", ",".join(unmapped_reads),
2689 ref_fasta,
2690 juncs_out,
2691 insertions_out,
2692 deletions_out,
2693 fusions_out,
2694 left_reads,
2695 left_reads_map,
2696 left_maps])
2697 if right_seg_maps:
2698 right_maps = ','.join(right_seg_maps)
2699 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2700 try:
2701 print >> run_log, " ".join(segj_cmd)
2702 retcode = subprocess.call(segj_cmd,
2703 preexec_fn=subprocess_setup,
2704 stderr=segj_log)
2705
2706 # segment_juncs returned an error
2707 if retcode != 0:
2708 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2709
2710 # segment_juncs not found
2711 except OSError, o:
2712 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2713 th_logp(fail_str + "Error: segment_juncs not found on this system")
2714 die(str(o))
2715
2716 return [juncs_out, insertions_out, deletions_out, fusions_out]
2717
2718 # Joins mapped segments into full-length read alignments via the executable
2719 # long_spanning_reads
2720 def join_mapped_segments(params,
2721 sam_header_filename,
2722 reads,
2723 ref_fasta,
2724 possible_juncs,
2725 possible_insertions,
2726 possible_deletions,
2727 possible_fusions,
2728 contig_seg_maps,
2729 spliced_seg_maps,
2730 alignments_out_name):
2731 rn=""
2732 if len(contig_seg_maps)>1:
2733 th_log("Joining segment hits")
2734 rn=".segs"
2735 else:
2736 th_log("Processing bowtie hits")
2737 contig_seg_maps = ','.join(contig_seg_maps)
2738
2739 possible_juncs = ','.join(possible_juncs)
2740 possible_insertions = ",".join(possible_insertions)
2741 possible_deletions = ",".join(possible_deletions)
2742 possible_fusions = ",".join(possible_fusions)
2743
2744 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2745 align_log = open(log_fname, "w")
2746 align_cmd = [prog_path("long_spanning_reads")]
2747
2748 align_cmd.extend(params.cmd())
2749 align_cmd += ["--sam-header", sam_header_filename]
2750
2751 b2_params = params.bowtie2_params
2752 max_penalty, min_penalty = b2_params.mp.split(',')
2753 align_cmd += ["--bowtie2-max-penalty", max_penalty,
2754 "--bowtie2-min-penalty", min_penalty]
2755
2756 align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2757
2758 read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2759 align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2760 "--bowtie2-read-gap-cont", read_gap_cont]
2761
2762 ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2763 align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2764 "--bowtie2-ref-gap-cont", ref_gap_cont]
2765
2766 align_cmd.append(ref_fasta)
2767 align_cmd.extend([reads,
2768 possible_juncs,
2769 possible_insertions,
2770 possible_deletions,
2771 possible_fusions,
2772 alignments_out_name,
2773 contig_seg_maps])
2774
2775 if spliced_seg_maps:
2776 spliced_seg_maps = ','.join(spliced_seg_maps)
2777 align_cmd.append(spliced_seg_maps)
2778
2779 try:
2780 print >> run_log, " ".join(align_cmd)
2781 join_proc=subprocess.call(align_cmd,
2782 stderr=align_log)
2783
2784 except OSError, o:
2785 die(fail_str+"Error: "+str(o))
2786
2787 # This class collects spliced and unspliced alignments for each of the
2788 # left and right read files provided by the user.
2789 class Maps:
2790 def __init__(self,
2791 unspliced_sam,
2792 seg_maps,
2793 unmapped_segs,
2794 segs):
2795 self.unspliced_sam = unspliced_sam
2796 self.seg_maps = seg_maps
2797 self.unmapped_segs = unmapped_segs
2798 self.segs = segs
2799
2800 # Map2GTF stuff
2801 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2802 """ajjkljlks
2803
2804 Arguments:
2805 - `params`: TopHat parameters
2806 - `gtf_fname`: File name pointing to the annotation.
2807 - `reads`: The alignments to convert (may be a BAM file).
2808 - `out_fname`: The file name pointing to the output.
2809 """
2810 m2g_cmd = [prog_path("map2gtf")]
2811 m2g_cmd.extend(params.cmd())
2812 m2g_cmd += ["--sam-header", sam_header_filename]
2813 m2g_cmd.append(gtf_fname)
2814 m2g_cmd.append(reads) #could be BAM file
2815 m2g_cmd.append(out_fname)
2816 fbasename = getFileBaseName(reads)
2817 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2818 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2819
2820 try:
2821 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2822 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2823 ret = subprocess.call(m2g_cmd,
2824 stdout=open(m2g_log, "w"),
2825 stderr=open(m2g_err, "w"))
2826 if ret != 0:
2827 die(fail_str + " Error: map2gtf returned an error")
2828 except OSError, o:
2829 err_msg = fail_str + str(o)
2830 die(err_msg + "\n")
2831
2832
2833 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2834 """ Make the transcriptome from a GTF.
2835
2836 Arguments:
2837 - `trans_gtf`: GTF/GFF file with the transcript annotations.
2838 - `genome`: reference genome FASTA file.
2839 - `out_fname`: output FASTA file for the transcript sequences.
2840 """
2841 # build and run the gtf_to_fasta command
2842 g2f_cmd = [prog_path("gtf_to_fasta")]
2843 g2f_cmd.extend(params.cmd())
2844 g2f_cmd.append(trans_gtf)
2845 g2f_cmd.append(genome)
2846 g2f_cmd.append(out_fname)
2847
2848 g2f_log = logging_dir + "g2f.out"
2849 g2f_err = logging_dir + "g2f.err"
2850
2851 try:
2852 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2853 ret = subprocess.call(g2f_cmd,
2854 stdout = open(g2f_log, "w"),
2855 stderr = open(g2f_err, "w"))
2856 if ret != 0:
2857 die(fail_str + " Error: gtf_to_fasta returned an error.")
2858 except OSError, o:
2859 err_msg = fail_str + str(o)
2860 die(err_msg + "\n")
2861
2862 def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
2863 """ Main GTF mapping function
2864
2865 Arguments:
2866 - `params`: The TopHat parameters.
2867 - `ref_fasta`: The reference genome.
2868 - `left_reads`: file with the left reads.
2869 - `right_reads`: file with the right reads (None if single-end).
2870
2871 """
2872 test_input_file(params.gff_annotation)
2873
2874 # th_log("Reading in GTF file: " + params.gff_annotation)
2875 # transcripts = gtf_to_transcripts(params.gff_annotation)
2876
2877 gtf_name = getFileBaseName(params.gff_annotation)
2878 m2g_bwt_idx = None
2879 t_out_dir = tmp_dir
2880 if params.transcriptome_index and not params.transcriptome_outdir:
2881 m2g_bwt_idx = params.transcriptome_index
2882 th_log("Using pre-built transcriptome index..")
2883 else:
2884 th_log("Creating transcriptome data files..")
2885 if params.transcriptome_outdir:
2886 t_out_dir=params.transcriptome_outdir+"/"
2887 m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
2888 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2889 m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
2890
2891 get_index_sam_header(params, m2g_bwt_idx)
2892
2893 mapped_gtf_list = []
2894 unmapped_gtf_list = []
2895 # do the initial mapping in GTF coordinates
2896 for reads in [left_reads, right_reads]:
2897 if reads == None or os.path.getsize(reads) < 25:
2898 continue
2899 fbasename = getFileBaseName(reads)
2900 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
2901 #if use_zpacker:
2902 # mapped_gtf_out+=".z"
2903
2904 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
2905 #if use_BWT_FIFO:
2906 # unmapped_gtf += ".z"
2907
2908 (mapped_gtf_map, unmapped) = bowtie(params,
2909 m2g_bwt_idx,
2910 [reads],
2911 "fastq",
2912 params.t_mismatches,
2913 mapped_gtf_out,
2914 unmapped_gtf,
2915 "", "read_against_transcriptome")
2916 mapped_gtf_list.append(mapped_gtf_map)
2917 unmapped_gtf_list.append(unmapped)
2918
2919 sam_gtf_list = []
2920 for reads in mapped_gtf_list:
2921 fbasename = getFileBaseName(reads)
2922 sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2923 m2g_convert_coords(params,
2924 genome_sam_header_filename,
2925 params.gff_annotation,
2926 reads,
2927 sam_out_fname)
2928 sam_gtf_list.append(sam_out_fname)
2929
2930 if len(sam_gtf_list) < 2:
2931 sam_gtf_list.append(None)
2932 unmapped_gtf_list.append(None)
2933
2934 return (sam_gtf_list, unmapped_gtf_list)
2935 # end Map2GTF
2936
2937 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
2938 ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
2939 ##
2940 #this is accomplished by a special prep_reads usage (triggered by --flt-hits)
2941
2942 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
2943 out_mappings,
2944 target_reads,
2945 params.preflt_data[ri].mappings)
2946 um_reads = open(out_unmapped, "wb")
2947 log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
2948 filter_log = open(log_fname,"w")
2949
2950 shell_cmd = " ".join(prep_cmd)
2951 #add the compression pipe
2952 zip_cmd=[]
2953 if use_zpacker:
2954 zip_cmd=[ params.system_params.zipper ]
2955 zip_cmd.extend(params.system_params.zipper_opts)
2956 zip_cmd.extend(['-c','-'])
2957 shell_cmd +=' | '+' '.join(zip_cmd)
2958 shell_cmd += ' >' + out_unmapped
2959 retcode=0
2960 try:
2961 print >> run_log, shell_cmd
2962 if use_zpacker:
2963 prep_proc = subprocess.Popen(prep_cmd,
2964 stdout=subprocess.PIPE,
2965 stderr=filter_log)
2966 zip_proc = subprocess.Popen(zip_cmd,
2967 preexec_fn=subprocess_setup,
2968 stdin=prep_proc.stdout,
2969 stdout=um_reads)
2970 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
2971 zip_proc.communicate()
2972 retcode=prep_proc.poll()
2973 if retcode==0:
2974 retcode=zip_proc.poll()
2975 else:
2976 retcode = subprocess.call(prep_cmd,
2977 stdout=um_reads,
2978 stderr=filter_log)
2979 if retcode:
2980 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2981
2982 except OSError, o:
2983 errmsg=fail_str+str(o)
2984 die(errmsg+"\n"+log_tail(log_fname))
2985 um_reads.close()
2986
2987 return (out_mappings, out_unmapped)
2988
2989
2990 # The main alignment routine of TopHat. This function executes most of the
2991 # workflow, producing a set of candidate alignments for each cDNA fragment in a
2992 # pair of SAM alignment files (for paired end reads).
2993 def spliced_alignment(params,
2994 bwt_idx_prefix,
2995 sam_header_filename,
2996 ref_fasta,
2997 read_len,
2998 segment_len,
2999 prepared_reads,
3000 user_supplied_junctions,
3001 user_supplied_insertions,
3002 user_supplied_deletions):
3003
3004 possible_juncs = []
3005 possible_juncs.extend(user_supplied_junctions)
3006
3007 possible_insertions = []
3008 possible_insertions.extend(user_supplied_insertions)
3009 possible_deletions = []
3010 possible_deletions.extend(user_supplied_deletions)
3011 possible_fusions = []
3012
3013 left_reads, right_reads = prepared_reads
3014
3015 maps = [[], []] # maps[0] = left reads mapping data, maps[1] = right reads mapping data
3016
3017 #single_segments = False
3018
3019 # Before anything, map the reads using Map2GTF (if using annotation)
3020 m2g_maps = [ None, None ] # left, right
3021 initial_reads = [ left_reads, right_reads ]
3022
3023 if params.gff_annotation:
3024 (mapped_gtf_list, unmapped_gtf_list) = \
3025 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
3026
3027 m2g_left_maps, m2g_right_maps = mapped_gtf_list
3028 m2g_maps = [m2g_left_maps, m2g_right_maps]
3029 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
3030 # The case where the user doesn't want to map to anything other
3031 # than the transcriptome OR we have no unmapped reads
3032 maps[0] = [m2g_left_maps]
3033 if right_reads:
3034 maps[1] = [m2g_right_maps]
3035 return maps
3036 # Feed the unmapped reads into spliced_alignment()
3037 initial_reads = unmapped_gtf_list[:]
3038 th_log("Resuming TopHat pipeline with unmapped reads")
3039
3040 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
3041 #to be extracted into segment_juncs.fa
3042
3043 num_segs = int(read_len / segment_len)
3044 if (read_len % segment_len) >= min(segment_len-2, 20):
3045 #remainder is shorter but long enough to become a new segment
3046 num_segs += 1
3047 else:
3048 # the last segment is longer
3049 if num_segs>1: max_seg_len += (read_len % segment_len)
3050
3051 if num_segs <= 1:
3052 th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3053
3054 # Using the num_segs value returned by check_reads(),
3055 # decide which junction discovery strategy to use
3056 if num_segs < 3:
3057 #if params.butterfly_search != False:
3058 # params.butterfly_search = True
3059 if params.coverage_search != False:
3060 params.coverage_search = True
3061 if num_segs == 1:
3062 segment_len = read_len
3063 else: #num_segs >= 3:
3064 # if we have at least three segments, just use split segment search,
3065 # which is the most sensitive and specific, fastest, and lightest-weight.
3066 # so unless specifically requested, disable the other junction searches
3067 if params.closure_search != True:
3068 params.closure_search = False
3069 if params.coverage_search != True:
3070 params.coverage_search = False
3071 if params.butterfly_search != True:
3072 params.butterfly_search = False
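    # Worked example: 75bp reads with segment_len=25 give num_segs=3, so only
    # the split-segment search runs by default; 50bp reads give num_segs=2,
    # which enables coverage search (unless it was explicitly disabled).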
3073
3074 # Perform the first part of the TopHat work flow on the left and right
3075 # reads of paired ends separately - we'll use the pairing information later
3076 have_left_IUM=False
3077 for ri in (0,1):
3078 reads=initial_reads[ri]
3079 if reads == None or not fileExists(reads,25):
3080 continue
3081 fbasename=getFileBaseName(reads)
3082 unspliced_out = tmp_dir + fbasename + ".mapped"
3083 unspliced_sam = None
3084 unmapped_reads = None
3085 #if use_zpacker: unspliced_out+=".z"
3086 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3087 if params.prefilter_multi:
3088 #unmapped_unspliced += ".z"
3089 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3090 else:
3091 # Perform the initial Bowtie mapping of the full length reads
3092 (unspliced_sam, unmapped_reads) = bowtie(params,
3093 bwt_idx_prefix,
3094 [reads],
3095 "fastq",
3096 params.genome_read_mismatches,
3097 unspliced_out,
3098 unmapped_unspliced,
3099 "",
3100 "read_against_genome")
3101
3102 # daehwan - check this out
3103 # params.bowtie2 = False
3104 # check_bowtie(params.bowtie2)
3105
3106 seg_maps = []
3107 unmapped_segs = []
3108 segs = []
3109 have_IUM = fileExists(unmapped_reads,25)
3110 if ri==0 and have_IUM:
3111 have_left_IUM = True
3112 if num_segs > 1 and have_IUM:
3113 # split up the IUM reads into segments
3114 # unmapped_reads can be in BAM format
3115 read_segments = split_reads(unmapped_reads,
3116 tmp_dir + fbasename,
3117 False,
3118 params,
3119 segment_len)
3120
3121 # Map each segment file independently with Bowtie
3122 for i in range(len(read_segments)):
3123 seg = read_segments[i]
3124 fbasename=getFileBaseName(seg)
3125 seg_out = tmp_dir + fbasename + ".bwtout"
3126 if use_zpacker: seg_out += ".z"
3127 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3128 if use_BWT_FIFO:
3129 unmapped_seg += ".z"
3130 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3131 (seg_map, unmapped) = bowtie(params,
3132 bwt_idx_prefix,
3133 [seg],
3134 "fastq",
3135 params.segment_mismatches,
3136 seg_out,
3137 unmapped_seg,
3138 extra_output,
3139 "seg_against_genome")
3140 seg_maps.append(seg_map)
3141 unmapped_segs.append(unmapped)
3142 segs.append(seg)
3143
3144 # Collect the segment maps for left and right reads together
3145 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3146 else:
3147 # if there's only one segment, just collect the initial map as the only
3148 # map to be used downstream for coverage-based junction discovery
3149 read_segments = [reads]
3150 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3151
3152 # XXX: At this point if using M2G, have three sets of reads:
3153 # mapped to transcriptome, mapped to genome, and unmapped (potentially
3154 # spliced or poly-A tails) - hp
3155
3156 # Unless the user asked not to discover new junctions, start that process
3157 # here
3158 left_reads_map = maps[0].unspliced_sam
3159 left_seg_maps = maps[0].seg_maps
3160 unmapped_reads = maps[0].unmapped_segs
3161 if right_reads:
3162 right_reads_map = maps[1].unspliced_sam
3163 right_seg_maps = maps[1].seg_maps
3164 unmapped_reads.extend(maps[1].unmapped_segs)
3165 else:
3166 right_reads_map = None
3167 right_seg_maps = None
3168
3169 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3170 # Call segment_juncs to infer a list of possible splice junctions from
3171 # the regions of the genome covered in the initial and segment maps
3172 #if params.find_novel_juncs:
3173 #TODO: in m2g case, we might want to pass the m2g mappings as well,
3174 # or perhaps the GTF file directly
3175 # -> this could improve alternative junction detection?
3176 juncs = junctions_from_segments(params,
3177 sam_header_filename,
3178 left_reads,
3179 left_reads_map,
3180 left_seg_maps,
3181 right_reads,
3182 right_reads_map,
3183 right_seg_maps,
3184 unmapped_reads,
3185 "fastq",
3186 ref_fasta)
3187 if os.path.getsize(juncs[0]) != 0:
3188 possible_juncs.append(juncs[0])
3189 if params.find_novel_indels:
3190 if os.path.getsize(juncs[1]) != 0:
3191 possible_insertions.append(juncs[1])
3192 if os.path.getsize(juncs[2]) != 0:
3193 possible_deletions.append(juncs[2])
3194 if params.find_novel_fusions:
3195 if os.path.getsize(juncs[3]) != 0:
3196 possible_fusions.append(juncs[3])
3197 # Optionally, and for paired reads only, use a closure search to
3198 # discover additional junctions
3199 if params.closure_search and left_reads and right_reads:
3200 juncs = junctions_from_closures(params,
3201 sam_header_filename,
3202 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
3203 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
3204 ref_fasta)
3205 if os.path.getsize(juncs[0]) != 0:
3206 possible_juncs.extend(juncs)
3207
3208 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3209 spliced_seg_maps = None
3210 junc_idx_prefix = None
3211 else:
3212 junc_idx_prefix = "segment_juncs"
3213 if len(possible_insertions) == 0:
3214 possible_insertions.append(os.devnull)
3215 # print >> sys.stderr, "Warning: insertions database is empty!"
3216 if len(possible_deletions) == 0:
3217 possible_deletions.append(os.devnull)
3218 # print >> sys.stderr, "Warning: deletions database is empty!"
3219 if len(possible_juncs) == 0:
3220 possible_juncs.append(os.devnull)
3221 th_logp("Warning: junction database is empty!")
3222 if len(possible_fusions) == 0:
3223 possible_fusions.append(os.devnull)
3224 if junc_idx_prefix:
3225 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3226 3,
3227 max_seg_len,
3228 junc_idx_prefix,
3229 possible_juncs,
3230 possible_insertions,
3231 possible_deletions,
3232 possible_fusions,
3233 ref_fasta,
3234 params.read_params.color)
3235
3236 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3237
3238 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3239 # index with Bowtie
3240 # for reads in [left_reads, right_reads]:
3241 for ri in (0,1):
3242 reads = initial_reads[ri]
3243 spliced_seg_maps = []
3244
3245 if reads == None or os.path.getsize(reads)<25:
3246 continue
3247
3248 m2g_map = m2g_maps[ri]
3249
3250 if junc_idx_prefix :
3251 i = 0
3252 for seg in maps[ri].segs:
3253 #search each segment
3254 fsegname = getFileBaseName(seg)
3255 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
3256 if use_zpacker: seg_out += ".z"
3257 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3258 (seg_map, unmapped) = bowtie(params,
3259 tmp_dir + junc_idx_prefix,
3260 [seg],
3261 "fastq",
3262 params.segment_mismatches,
3263 seg_out,
3264 None,
3265 extra_output,
3266 "seg_against_junctions")
3267 spliced_seg_maps.append(seg_map)
3268 i += 1
3269
3270 # Join the contigous and spliced segment hits into full length
3271 # read alignments
3272 rfname=getFileBaseName(reads)
3273 rfdir=getFileDir(reads)
3274 mapped_reads = rfdir + rfname + ".candidates.bam"
3275 # -- spliced mappings built from all segment mappings vs genome and junc_db
3276 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3277 #unspl_bwtfile = maps[ri].unspliced_bwt
3278 unspl_samfile = maps[ri].unspliced_sam
3279 join_mapped_segments(params,
3280 sam_header_filename,
3281 reads,
3282 ref_fasta,
3283 possible_juncs,
3284 possible_insertions,
3285 possible_deletions,
3286 possible_fusions,
3287 maps[ri].seg_maps,
3288 spliced_seg_maps,
3289 mapped_reads)
3290
3291 if num_segs > 1 or m2g_map:
3292 # Merge the spliced and unspliced full length alignments into
3293 # a single SAM file.
3294 # The individual SAM files are all already sorted in
3295 # increasing read ID order.
3296 # NOTE: We also should be able to address bug #134 here, by replacing
3297 # contiguous alignments that poke into an intron by a small amount by
3298 # the correct spliced alignment.
3299
3300 try:
3301 merge_cmd = [prog_path("bam_merge"),
3302 "--index-outfile", merged_map + ".index",
3303 "--sam-header", sam_header_filename,
3304 merged_map]
3305
3306 if num_segs > 1:
3307 merge_cmd += [ maps[ri].unspliced_sam ]
3308 if m2g_map:
3309 merge_cmd += [ m2g_map ]
3310
3311 if os.path.exists(mapped_reads):
3312 merge_cmd += [ mapped_reads ]
3313 else:
3314 for bam_i in range(0, params.system_params.num_threads):
3315 temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3316 if os.path.exists(temp_bam):
3317 merge_cmd += [ temp_bam ]
3318 else:
3319 break
3320
3321 print >> run_log, " ".join(merge_cmd)
3322 ret = subprocess.call( merge_cmd,
3323 stderr=open(logging_dir + "sam_merge.log", "w") )
3324 if ret != 0:
3325 die(fail_str+"Error executing: "+" ".join(merge_cmd))
3326 except OSError, o:
3327 die(fail_str+"Error: "+str(o))
3328 maps[ri] = [merged_map]
3329
3330 if not params.system_params.keep_tmp:
3331 os.remove(mapped_reads)
3332 else:
3333 # no segments or transcriptome mappings, so no merge is needed
3334 # because join_mapped_segments() produced the final BAM already
3335 os.rename(mapped_reads, merged_map)
3336 maps[ri] = [merged_map]
3337 if not params.system_params.keep_tmp:
3338 os.remove(unspl_samfile)
3339
3340 return maps
3341
3342 # rough equivalent of the 'which' command to find external programs
3343 # (current script path is tested first, then PATH envvar)
3344 def which(program):
3345 def is_executable(fpath):
3346 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
3347 fpath, fname = os.path.split(program)
3348 if fpath:
3349 if is_executable(program):
3350 return program
3351 else:
3352 progpath = os.path.join(bin_dir, program)
3353 if is_executable(progpath):
3354 return progpath
3355 for path in os.environ["PATH"].split(os.pathsep):
3356 progpath = os.path.join(path, program)
3357 if is_executable(progpath):
3358 return progpath
3359 return None
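# e.g. which("bowtie2") might return "/usr/local/bin/bowtie2" (a hypothetical
# install path), or None when the program cannot be found anywhere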
3360
3361 def prog_path(program):
3362 progpath=which(program)
3363     if progpath is None:
3364 die("Error locating program: "+program)
3365 return progpath
3366
3367 # FIXME: this should get set during the make dist autotools phase of the build
3368 def get_version():
3369 return "2.0.0"
3370
3371 def mlog(msg):
3372 print >> sys.stderr, "[DBGLOG]:"+msg
3373
3374 def test_input_file(filename):
3375     try:
3376         open(filename, "r").close()
3377     except IOError, o:
3378         die("Error: cannot open file %s" % filename)
3379     return
3380
3381 def main(argv=None):
3382 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
3383
3384 # Initialize default parameter values
3385 params = TopHatParams()
3386 try:
3387 if argv is None:
3388 argv = sys.argv
3389 args = params.parse_options(argv)
3390 params.check()
3391
3392 bwt_idx_prefix = args[0]
3393 left_reads_list = args[1]
3394 left_quals_list, right_quals_list = [], []
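        # positional arguments are <bowtie_index> <reads1> [reads2] [quals1 [quals2]];
        # paired-end input is inferred from how many arguments remain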
3395 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
3396             if params.read_params.mate_inner_dist is None:
3397 params.read_params.mate_inner_dist = 50
3398 #die("Error: you must set the mean inner distance between mates with -r")
3399
3400 right_reads_list = args[2]
3401 if params.read_params.quals:
3402 left_quals_list = args[3]
3403 right_quals_list = args[4]
3404 else:
3405 right_reads_list = None
3406 if params.read_params.quals:
3407 left_quals_list = args[2]
3408
3409 start_time = datetime.now()
3410 prepare_output_dir()
3411 init_logger(logging_dir + "tophat.log")
3412
3413 th_logp()
3414 th_log("Beginning TopHat run (v"+get_version()+")")
3415 th_logp("-----------------------------------------------")
3416
3417 global run_log
3418 run_log = open(logging_dir + "run.log", "w", 0)
3419 global run_cmd
3420 run_cmd = " ".join(argv)
3421 print >> run_log, run_cmd
3422
3423 check_bowtie(params)
3424 check_samtools()
3425
3426 # Validate all the input files, check all prereqs before committing
3427 # to the run
3428 if params.gff_annotation:
3429 if not os.path.exists(params.gff_annotation):
3430 die("Error: cannot find transcript file %s" % params.gff_annotation)
3431 if os.path.getsize(params.gff_annotation)<10:
3432 die("Error: invalid transcript file %s" % params.gff_annotation)
3433
3434 if params.transcriptome_index:
3435 if params.gff_annotation:
3436 #gff file given, so transcriptome data will be written there
3437 gff_basename = getFileBaseName(params.gff_annotation)
3438 #just in case, check if it's not already there (-G/--GTF given again by mistake)
3439 tpath, tname = os.path.split(params.transcriptome_index)
3440             new_subdir = False
3441             if tpath in (".", "./") or not tpath:
3442 if not os.path.exists(params.transcriptome_index):
3443 os.makedirs(params.transcriptome_index)
3444 new_subdir=True
3445 if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
3446 params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
3447 gff_out=params.transcriptome_index+".gff"
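                # comparing file sizes is a cheap heuristic to detect whether
                # this annotation was already copied next to the index files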
3448 if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
3449 #generate the transcriptome data files
3450 tpath, tname = os.path.split(params.transcriptome_index)
3451 params.transcriptome_outdir=tpath
3452 t_gff=params.transcriptome_index+".gff"
3453 if params.transcriptome_outdir:
3454 #will create the transcriptome data files
3455 if not os.path.exists(params.transcriptome_outdir):
3456 os.makedirs(params.transcriptome_outdir)
3457 copy(params.gff_annotation, t_gff)
3458 else:
3459 #try to use existing transcriptome data files
3460
3461 if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
3462 die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3463 check_bowtie_index(params.transcriptome_index, params.bowtie2)
3464 params.gff_annotation=t_gff
3465 #end @ transcriptome_index given
3466
3467 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)
3468
3469 th_log("Generating SAM header for "+bwt_idx_prefix)
3470 sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
3471 #if not params.skip_check_reads:
3472 reads_list = left_reads_list
3473 if right_reads_list:
3474 reads_list = reads_list + "," + right_reads_list
3475 params.read_params = check_reads_format(params, reads_list)
3476
3477 user_supplied_juncs = []
3478 user_supplied_insertions = []
3479 user_supplied_deletions = []
3480 user_supplied_fusions = []
3481 global gtf_juncs
3482 if params.gff_annotation and params.find_GFF_juncs:
3483 test_input_file(params.gff_annotation)
3484 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3485 ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
3486 ## but we still need them loaded in gtf_juncs for later splice verification
3487 if found_juncs:
3488 ## and not params.gff_annotation:
3489 user_supplied_juncs.append(gtf_juncs)
3490 #else:
3491 # gtf_juncs = None
3492 if params.raw_junctions:
3493 test_input_file(params.raw_junctions)
3494 user_supplied_juncs.append(params.raw_junctions)
3495
3496 if params.raw_insertions:
3497 test_input_file(params.raw_insertions)
3498 user_supplied_insertions.append(params.raw_insertions)
3499
3500 if params.raw_deletions:
3501 test_input_file(params.raw_deletions)
3502 user_supplied_deletions.append(params.raw_deletions)
3503
3504 global unmapped_reads_fifo
3505 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
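        # (this named pipe, used with -X/--unmapped-fifo, presumably lets the
        # unmapped reads be compressed on the fly rather than written raw)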
3506
3507         # Now start the time-consuming stuff
3508 if params.prefilter_multi:
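            # -M/--prefilter-multihits: do an initial genome alignment of the
            # raw reads so multi-mapped reads can be set aside up front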
3509 sides=("left","right")
3510 read_lists=(left_reads_list, right_reads_list)
3511 for ri in (0,1):
3512 reads_list=read_lists[ri]
3513 if not reads_list:
3514 continue
3515 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
3516 side_imap = tmp_dir + sides[ri]+"_im.bwtout"
3517 if use_zpacker:
3518 side_imap+=".z"
3519 side_ium = tmp_dir + sides[ri]+"_ium.fq"
3520 if use_BWT_FIFO:
3521 side_ium += ".z"
3522 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
3523 rdlist=reads_list.split(',')
3524 bwt=bowtie(params, bwt_idx_prefix, rdlist,
3525 params.read_params.reads_format,
3526 params.genome_read_mismatches,
3527 side_imap, side_ium,
3528 "", "read_against_genome", params.preflt_data[ri].multihit_reads)
3529 params.preflt_data[ri].mappings = bwt[0] # initial mappings
3530 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
3531
3532 th_log("Preparing reads")
3533 left_kept_reads, left_reads_info = prep_reads(params,
3534 left_reads_list,
3535 left_quals_list,
3536 "left",
3537 params.preflt_data[0].multihit_reads)
3538 min_read_len=left_reads_info.min_len
3539 max_read_len=left_reads_info.max_len
3540 if right_reads_list:
3541 right_kept_reads, right_reads_info = prep_reads(params,
3542 right_reads_list,
3543 right_quals_list,
3544 "right",
3545 params.preflt_data[1].multihit_reads)
3546 min_read_len=min(right_reads_info.min_len, min_read_len)
3547 max_read_len=max(right_reads_info.max_len, max_read_len)
3548 else:
3549 right_kept_reads = None
3550 seed_len=params.read_params.seed_length
3551 if seed_len: #if read len was explicitly given
3552 seed_len = max(seed_len, min_read_len)
3553 #can't be smaller than minimum length observed
3554 else:
3555 seed_len = max_read_len
3556 params.read_params.seed_length=seed_len
3557 # turn off integer-quals
3558 if params.read_params.integer_quals:
3559 params.read_params.integer_quals = False
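            # (prep_reads has presumably already re-encoded integer quality
            # values as standard ASCII quals, so leaving the flag on would
            # make downstream mappers misread them)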
3560
3561 input_reads = [left_kept_reads, right_kept_reads]
3562 mappings = spliced_alignment(params,
3563 bwt_idx_prefix,
3564 sam_header_filename,
3565 ref_fasta,
3566 params.read_params.seed_length,
3567 params.segment_length,
3568 input_reads,
3569 user_supplied_juncs,
3570 user_supplied_insertions,
3571 user_supplied_deletions)
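        # mappings[0] and mappings[1] are the per-side lists of merged BAM
        # files returned by spliced_alignment() above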
3572
3573 compile_reports(params,
3574 sam_header_filename,
3575 ref_fasta,
3576 mappings,
3577 input_reads,
3578 params.gff_annotation)
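        # compile_reports produces the final output files (e.g. the accepted
        # hits BAM and the junction/indel tracks) in the output directory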
3579
3580 if not params.system_params.keep_tmp:
3581 for m in mappings[0]:
3582 os.remove(m)
3583 if left_kept_reads:
3584 os.remove(left_kept_reads)
3585 for m in mappings[1]:
3586 os.remove(m)
3587 if right_kept_reads:
3588 os.remove(right_kept_reads)
3589 tmp_files = os.listdir(tmp_dir)
3590 for t in tmp_files:
3591 os.remove(tmp_dir+t)
3592 os.rmdir(tmp_dir)
3593
3594 finish_time = datetime.now()
3595 duration = finish_time - start_time
3596 th_logp("-----------------------------------------------")
3597 th_log("Run complete: %s elapsed" % formatTD(duration))
3598
3599 except Usage, err:
3600 th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
3601 th_logp(" for detailed help see http://tophat.cbcb.umd.edu/manual.html")
3602 return 2
3603
3604
3605 if __name__ == "__main__":
3606 sys.exit(main())
