root/gclib/tophat_cpp/tophat
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC by gpertea
File size: 148993 byte(s)
Log Message:
wip

Line File contents
1 #!/usr/bin/env python
2
3 # encoding: utf-8
4 """
5 tophat.py
6
7 Created by Cole Trapnell on 2008-12-25.
8 Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 """
11 import sys
12 try:
13 import psyco
14 psyco.full()
15 except ImportError:
16 pass
17
18 import getopt
19 import subprocess
20 import errno
21 import os
22 import warnings
23 import re
24 import signal
25 from datetime import datetime, date, time
26 from shutil import copy
27
28 use_message = '''
29 TopHat maps short sequences from spliced transcripts to whole genomes.
30
31 Usage:
32 tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
33 [quals1[,quals2,...]] [quals1[,quals2,...]]
34
35 Options:
36 -v/--version
37 -o/--output-dir <string> [ default: ./tophat_out ]
38 --bowtie1 [ default: bowtie2 ]
39 -a/--min-anchor <int> [ default: 8 ]
40 -m/--splice-mismatches <0-2> [ default: 0 ]
41 -i/--min-intron-length <int> [ default: 50 ]
42 -I/--max-intron-length <int> [ default: 500000 ]
43 -g/--max-multihits <int> [ default: 20 ]
44 -x/--transcriptome-max-hits <int> [ default: 60 ]
45 -n/--transcriptome-mismatches <int> [ default: 1 ]
46 -M/--prefilter-multihits ( for -G/--GTF option, enable
47 an initial bowtie search
48 against the genome )
49 --max-insertion-length <int> [ default: 3 ]
50 --max-deletion-length <int> [ default: 3 ]
51 --solexa-quals
52 --solexa1.3-quals (same as phred64-quals)
53 --phred64-quals (same as solexa1.3-quals)
54 -Q/--quals
55 --integer-quals
56 -C/--color (Solid - color space)
57 --color-out
58 --library-type <string> (fr-unstranded, fr-firststrand,
59 fr-secondstrand)
60 -p/--num-threads <int> [ default: 1 ]
61 -G/--GTF <filename> (GTF/GFF with known transcripts)
62 --transcriptome-index <bwtidx> (transcriptome bowtie index)
63 -T/--transcriptome-only (map only to the transcriptome)
64 -j/--raw-juncs <filename>
65 --insertions <filename>
66 --deletions <filename>
67 -r/--mate-inner-dist <int>
68 --mate-std-dev <int> [ default: 20 ]
69 --no-novel-juncs
70 --no-novel-indels
71 --fusion-search
72 --fusion-anchor-length <int> [ default: 20 ]
73 --fusion-min-dist <int> [ default: 10000000 ]
74 --fusion-read-mismatches <int> [ default: 2 ]
75 --fusion-multireads <int> [ default: 2 ]
76 --fusion-multipairs <int> [ default: 2 ]
77 --no-gtf-juncs
78 --no-coverage-search
79 --coverage-search
80 --no-closure-search
81 --closure-search
82 --microexon-search
83 --butterfly-search
84 --no-butterfly-search
85 --keep-tmp
86 --tmp-dir <dirname> [ default: <output_dir>/tmp ]
87 -z/--zpacker <program> [ default: gzip ]
88 -X/--unmapped-fifo [use mkfifo to compress more temporary
89 files for color space reads]
90
91 Advanced Options:
92 --report-secondary-alignments
93 --genome-read-mismatches <int> [ default: 2 ]
94 -N/--read-mismatches <int> [ default: 2 ]
95 --segment-mismatches <int> [ default: 2 ]
96 --segment-length <int> [ default: 25 ]
97 --bowtie-n [ default: bowtie -v ]
98 --min-closure-exon <int> [ default: 100 ]
99 --min-closure-intron <int> [ default: 50 ]
100 --max-closure-intron <int> [ default: 5000 ]
101 --min-coverage-intron <int> [ default: 50 ]
102 --max-coverage-intron <int> [ default: 20000 ]
103 --min-segment-intron <int> [ default: 50 ]
104 --max-segment-intron <int> [ default: 500000 ]
105 --no-sort-bam (Output BAM is not coordinate-sorted)
106 --no-convert-bam (Do not convert to bam format.
107 Output is <output_dir>/accepted_hits.sam.
108 Implies --no-sort-bam)
109 --keep-fasta-order
110 --allow-partial-mapping
111
112 Bowtie2 related options:
113 Preset options in --end-to-end mode (local alignment is not used in TopHat2)
114 --b2-very-fast
115 --b2-fast
116 --b2-sensitive
117 --b2-very-sensitive
118
119 Alignment options
120 --b2-N <int> [ default: 0 ]
121 --b2-L <int> [ default: 20 ]
122 --b2-i <func> [ default: S,1,1.25 ]
123 --b2-n-ceil <func> [ default: L,0,0.15 ]
124 --b2-gbar <int> [ default: 4 ]
125
126 Scoring options
127 --b2-mp <int>,<int> [ default: 6,2 ]
128 --b2-np <int> [ default: 1 ]
129 --b2-rdg <int>,<int> [ default: 5,3 ]
130 --b2-rfg <int>,<int> [ default: 5,3 ]
131 --b2-score-min <func> [ default: L,-0.6,-0.6 ]
132
133 Effort options
134 --b2-D <int> [ default: 15 ]
135 --b2-R <int> [ default: 2 ]
136
137
138 SAM Header Options (for embedding sequencing run metadata in output):
139 --rg-id <string> (read group ID)
140 --rg-sample <string> (sample ID)
141 --rg-library <string> (library ID)
142 --rg-description <string> (descriptive string, no tabs allowed)
143 --rg-platform-unit <string> (e.g. Illumina lane ID)
144 --rg-center <string> (sequencing center name)
145 --rg-date <string> (ISO 8601 date of the sequencing run)
146 --rg-platform <string> (Sequencing platform descriptor)
147 '''
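# Example invocation (editor's sketch; the index and read file names below are
# hypothetical):
#   tophat -p 4 -G genes.gtf -o tophat_out hg19_bt2_index reads_1.fq reads_2.fq
# maps the paired-end reads in reads_1.fq/reads_2.fq against the hg19_bt2_index
# Bowtie index, using 4 threads and the known transcripts from genes.gtf.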
148
149 # -F/--min-isoform-fraction <float> [ default: 0.15 ]
150 # (obsolete)
151
152 class Usage(Exception):
153 def __init__(self, msg):
154 self.msg = msg
155
156 output_dir = "./tophat_out/"
157 logging_dir = output_dir + "logs/"
158 run_log = None
159 run_cmd = None
160 tmp_dir = output_dir + "tmp/"
161 bin_dir = sys.path[0] + "/"
162 use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
163
164 use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format
165
166 use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
# enabled by the -X/--unmapped-fifo option (unless -z0)
168 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
169 # unmapped reads into a compressed file
170
171 samtools_path = None
172 bowtie_path = None
173 fail_str = "\t[FAILED]\n"
174 gtf_juncs = None #file name with junctions extracted from given GFF file
175
176 # TopHatParams captures all of the runtime parameters used by TopHat, and many
177 # of these are passed as command line options to executables run by the pipeline.
178
179 # This class and its nested classes also handle option parsing through parse_options()
180 # and option validation via the member function check()
181
182 class BowtieOutputFiles:
183 def __init__(self,
184 mappings=None,
185 unmapped_reads=None,
186 multihit_reads=None):
187 self.mappings=mappings
188 self.unmapped_reads=unmapped_reads
189 self.multihit_reads=multihit_reads
190
191 class TopHatParams:
192
193 # SpliceConstraints is a group of runtime parameters that specify what
194 # constraints to put on junctions discovered by the program. These constraints
195 # are used to filter out spurious/false positive junctions.
196
197 class SpliceConstraints:
198 def __init__(self,
199 min_anchor_length,
200 min_intron_length,
201 max_intron_length,
202 splice_mismatches,
203 min_isoform_fraction):
204 self.min_anchor_length = min_anchor_length
205 self.min_intron_length = min_intron_length
206 self.max_intron_length = max_intron_length
207 self.splice_mismatches = splice_mismatches
208 self.min_isoform_fraction = min_isoform_fraction
209
210 def parse_options(self, opts):
211 for option, value in opts:
212 if option in ("-m", "--splice-mismatches"):
213 self.splice_mismatches = int(value)
214 elif option in ("-a", "--min-anchor"):
215 self.min_anchor_length = int(value)
216 elif option in ("-F", "--min-isoform-fraction"):
217 self.min_isoform_fraction = float(value)
218 elif option in ("-i", "--min-intron-length"):
219 self.min_intron_length = int(value)
220 elif option in ("-I", "--max-intron-length"):
221 self.max_intron_length = int(value)
222
223 def check(self):
224 if self.splice_mismatches not in [0,1,2]:
225 die("Error: arg to --splice-mismatches must be 0, 1, or 2")
226 if self.min_anchor_length < 4:
227 die("Error: arg to --min-anchor-len must be greater than 4")
228 if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
229 die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
230 if self.min_intron_length <= 0:
231 die("Error: arg to --min-intron-length must be greater than 0")
232 if self.max_intron_length <= 0:
233 die("Error: arg to --max-intron-length must be greater than 0")
234
235 # SystemParams is a group of runtime parameters that determine how to handle
236 # temporary files produced during a run and how many threads to use for threaded
237 # stages of the pipeline (e.g. Bowtie)
238
239 class SystemParams:
240 def __init__(self,
241 num_threads,
242 keep_tmp):
243 self.num_threads = num_threads
244 self.keep_tmp = keep_tmp
245 self.zipper = "gzip"
246 self.zipper_opts= []
247
248 def parse_options(self, opts):
249 global use_zpacker
250 global use_BWT_FIFO
251 for option, value in opts:
252 if option in ("-p", "--num-threads"):
253 self.num_threads = int(value)
254 elif option == "--keep-tmp":
255 self.keep_tmp = True
256 elif option in ("-z","--zpacker"):
257 if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
258 value=""
259 self.zipper = value
260 #if not self.zipper:
261 # self.zipper='gzip'
262 elif option in ("-X", "--unmapped-fifo"):
263 use_BWT_FIFO=True
264 if self.zipper:
265 use_zpacker=True
266 if self.num_threads>1 and not self.zipper_opts:
267 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
268 self.zipper_opts.append('-p'+str(self.num_threads))
269 else:
270 use_zpacker=False
271 if use_BWT_FIFO: use_BWT_FIFO=False
272 def cmd(self):
273 cmdline=[]
274 if self.zipper:
275 cmdline.extend(['-z',self.zipper])
276 if self.num_threads>1:
277 cmdline.extend(['-p'+str(self.num_threads)])
278 return cmdline
279
280 def check(self):
281 if self.num_threads<1 :
282 die("Error: arg to --num-threads must be greater than 0")
283 if self.zipper:
284 xzip=which(self.zipper)
285 if not xzip:
286 die("Error: cannot find compression program "+self.zipper)
287
288 # ReadParams is a group of runtime parameters that specify various properties
289 # of the user's reads (e.g. which quality scale they are on, how long the
290 # fragments are, etc).
291 class ReadParams:
292 def __init__(self,
293 solexa_quals,
294 phred64_quals,
295 quals,
296 integer_quals,
297 color,
298 library_type,
299 seed_length,
300 reads_format,
301 mate_inner_dist,
302 mate_inner_dist_std_dev,
303 read_group_id,
304 sample_id,
305 library_id,
306 description,
307 seq_platform_unit,
308 seq_center,
309 seq_run_date,
310 seq_platform):
311 self.solexa_quals = solexa_quals
312 self.phred64_quals = phred64_quals
313 self.quals = quals
314 self.integer_quals = integer_quals
315 self.color = color
316 self.library_type = library_type
317 self.seed_length = seed_length
318 self.reads_format = reads_format
319 self.mate_inner_dist = mate_inner_dist
320 self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
321 self.read_group_id = read_group_id
322 self.sample_id = sample_id
323 self.library_id = library_id
324 self.description = description
325 self.seq_platform_unit = seq_platform_unit
326 self.seq_center = seq_center
327 self.seq_run_date = seq_run_date
328 self.seq_platform = seq_platform
329
330 def parse_options(self, opts):
331 for option, value in opts:
332 if option == "--solexa-quals":
333 self.solexa_quals = True
334 elif option in ("--solexa1.3-quals", "--phred64-quals"):
335 self.phred64_quals = True
336 elif option in ("-Q", "--quals"):
337 self.quals = True
338 elif option == "--integer-quals":
339 self.integer_quals = True
340 elif option in ("-C", "--color"):
341 self.color = True
342 elif option == "--library-type":
343 self.library_type = value
344 elif option in ("-s", "--seed-length"):
345 self.seed_length = int(value)
346 elif option in ("-r", "--mate-inner-dist"):
347 self.mate_inner_dist = int(value)
348 elif option == "--mate-std-dev":
349 self.mate_inner_dist_std_dev = int(value)
350 elif option == "--rg-id":
351 self.read_group_id = value
352 elif option == "--rg-sample":
353 self.sample_id = value
354 elif option == "--rg-library":
355 self.library_id = value
356 elif option == "--rg-description":
357 self.description = value
358 elif option == "--rg-platform-unit":
359 self.seq_platform_unit = value
360 elif option == "--rg-center":
361 self.seq_center = value
362 elif option == "--rg-date":
363 self.seq_run_date = value
364 elif option == "--rg-platform":
365 self.seq_platform = value
366
367 def check(self):
368 if self.seed_length and self.seed_length < 20:
369 die("Error: arg to --seed-length must be at least 20")
370
371 if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
372 die("Error: arg to --mate-std-dev must at least 0")
373 if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
374 die("Error: --rg-id and --rg-sample must be specified or omitted together")
375
376 # SearchParams is a group of runtime parameters that specify how TopHat will
377 # search for splice junctions
378
379 class SearchParams:
380 def __init__(self,
381 min_closure_exon,
382 min_closure_intron,
383 max_closure_intron,
384 min_coverage_intron,
385 max_coverage_intron,
386 min_segment_intron,
387 max_segment_intron):
388
389 self.min_closure_exon_length = min_closure_exon
390 self.min_closure_intron_length = min_closure_intron
391 self.max_closure_intron_length = max_closure_intron
392 self.min_coverage_intron_length = min_coverage_intron
393 self.max_coverage_intron_length = max_coverage_intron
394 self.min_segment_intron_length = min_segment_intron
395 self.max_segment_intron_length = max_segment_intron
396
397 def parse_options(self, opts):
398 for option, value in opts:
399 if option == "--min-closure-exon":
400 self.min_closure_exon_length = int(value)
401 if option == "--min-closure-intron":
402 self.min_closure_intron_length = int(value)
403 if option == "--max-closure-intron":
404 self.max_closure_intron_length = int(value)
405 if option == "--min-coverage-intron":
406 self.min_coverage_intron_length = int(value)
407 if option == "--max-coverage-intron":
408 self.max_coverage_intron_length = int(value)
409 if option == "--min-segment-intron":
410 self.min_segment_intron_length = int(value)
411 if option == "--max-segment-intron":
412 self.max_segment_intron_length = int(value)
413
414 def check(self):
415 if self.min_closure_exon_length < 0:
416 die("Error: arg to --min-closure-exon must be at least 0")
417 if self.min_closure_intron_length < 0:
418 die("Error: arg to --min-closure-intron must be at least 0")
419 if self.max_closure_intron_length < 0:
420 die("Error: arg to --max-closure-intron must be at least 0")
421 if self.min_coverage_intron_length < 0:
422 die("Error: arg to --min-coverage-intron must be at least 0")
423 if self.max_coverage_intron_length < 0:
424 die("Error: arg to --max-coverage-intron must be at least 0")
425 if self.min_segment_intron_length < 0:
426 die("Error: arg to --min-segment-intron must be at least 0")
427 if self.max_segment_intron_length < 0:
428 die("Error: arg to --max-segment-intron must be at least 0")
429
430 class ReportParams:
431 def __init__(self):
432 self.sort_bam = True
433 self.convert_bam = True
434
435 def parse_options(self, opts):
436 for option, value in opts:
437 if option == "--no-sort-bam":
438 self.sort_bam = False
439 if option == "--no-convert-bam":
440 self.convert_bam = False
441 self.sort_bam = False
442
443 class Bowtie2Params:
444 def __init__(self):
445 self.very_fast = False
446 self.fast = False
447 self.sensitive = False
448 self.very_sensitive = False
449
450 self.N = 0
451 self.L = 20
452 self.i = "S,1,1.25"
453 self.n_ceil = "L,0,0.15"
454 self.gbar = 4
455
456 self.mp = "6,2"
457 self.np = 1
458 self.rdg = "5,3"
459 self.rfg = "5,3"
460 self.score_min = "L,-0.6,-0.6"
461
462 self.D = 15
463 self.R = 2
464
465 def parse_options(self, opts):
466 for option, value in opts:
467 if option == "--b2-very-fast":
468 self.very_fast = True
469 if option == "--b2-fast":
470 self.fast = True
471 if option == "--b2-sensitive":
472 self.sensitive = True
473 if option == "--b2-very-sensitive":
474 self.very_sensitive = True
475
476 if option == "--b2-N":
477 self.N = int(value)
478 if option == "--b2-L":
479 self.L = int(value)
480 if option == "--b2-i":
481 self.i = value
482 if option == "--b2-n-ceil":
483 self.n_ceil = value
484 if option == "--b2-gbar":
485 self.gbar = int(value)
486
487 if option == "--b2-mp":
488 self.mp = value
489 if option == "--b2-np":
490 self.np = int(value)
491 if option == "--b2-rdg":
492 self.rdg = value
493 if option == "--b2-rfg":
494 self.rfg = value
495 if option == "--b2-score-min":
496 self.score_min = value
497
498 if option == "--b2-D":
499 self.D = int(value)
500 if option == "--b2-R":
501 self.R = int(value)
502
503 def check(self):
504 more_than_once = False
505 if self.very_fast:
506 if self.fast or self.sensitive or self.very_sensitive:
507 more_than_once = True
508 else:
509 if self.fast:
510 if self.sensitive or self.very_sensitive:
511 more_than_once = True
512 else:
513 if self.sensitive and self.very_sensitive:
514 more_than_once = True
515
516 if more_than_once:
517 die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")
518
519 if self.N not in [0, 1]:
520 die("Error: arg to --b2-N must be either 0 or 1")
521
522 function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
523 function_match = re.search(function_re, self.i)
524
525 if not function_match:
526 die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
527
528 function_match = re.search(function_re, self.n_ceil)
529 if not function_match:
530 die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
531
532 function_match = re.search(function_re, self.score_min)
533 if not function_match:
534 die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")
535
536 pair_re = r'^\d+,\d+$'
537 pair_match = re.search(pair_re, self.mp)
538 if not pair_match:
539 die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
540
541 pair_match = re.search(pair_re, self.rdg)
542 if not pair_match:
543 die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")
544
545 pair_match = re.search(pair_re, self.rfg)
546 if not pair_match:
547 die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
548
549
550 def __init__(self):
551 self.splice_constraints = self.SpliceConstraints(8, # min_anchor
552 50, # min_intron
553 500000, # max_intron
554 0, # splice_mismatches
555 0.15) # min_isoform_frac
556
557 self.preflt_data = [ BowtieOutputFiles(), BowtieOutputFiles() ]
558
559 self.read_params = self.ReadParams(False, # solexa_scale
560 False,
561 False, # quals
562 None, # integer quals
563 False, # SOLiD - color space
564 "", # library type (e.g. "illumina-stranded-pair-end")
565 None, # seed_length
566 "fastq", # quality_format
567 None, # mate inner distance
568 20, # mate inner dist std dev
569 None, # read group id
570 None, # sample id
571 None, # library id
572 None, # description
573 None, # platform unit (i.e. lane)
574 None, # sequencing center
575 None, # run date
576 None) # sequencing platform
577
578 self.system_params = self.SystemParams(1, # bowtie_threads (num_threads)
579 False) # keep_tmp
580
581 self.search_params = self.SearchParams(100, # min_closure_exon_length
582 50, # min_closure_intron_length
583 5000, # max_closure_intron_length
584 50, # min_coverage_intron_length
585 20000, # max_coverage_intron_length
586 50, # min_segment_intron_length
587 500000) # max_segment_intron_length
588
589 self.report_params = self.ReportParams()
590
591 self.bowtie2_params = self.Bowtie2Params()
592
593 self.bowtie2 = True
594 self.gff_annotation = None
595 self.transcriptome_only = False
596 self.transcriptome_index = None
597 self.transcriptome_outdir = None
598 self.raw_junctions = None
599 self.find_novel_juncs = True
600 self.find_novel_indels = True
601 self.find_novel_fusions = True
602 self.find_GFF_juncs = True
603 self.max_hits = 20
604 self.t_max_hits = 60
605 self.max_seg_hits = 40
606 self.prefilter_multi = False
607 self.genome_read_mismatches = 2
608 self.max_read_mismatches = 2
609 self.t_mismatches = 1
610 self.segment_length = 25
611 self.segment_mismatches = 2
612 self.bowtie_alignment_option = "-v"
613 self.max_insertion_length = 3
614 self.max_deletion_length = 3
615 self.raw_insertions = None
616 self.raw_deletions = None
617 self.coverage_search = None
618 self.closure_search = False
619 self.microexon_search = False
620 self.butterfly_search = None
621 self.report_secondary_alignments = False
622
623 self.keep_fasta_order = False
624 self.partial_mapping = False
625
626 self.fusion_search = False
627 self.fusion_anchor_length = 20
628 self.fusion_min_dist = 10000000
629 self.fusion_read_mismatches = 2
630 self.fusion_multireads = 2
631 self.fusion_multipairs = 2
632
633 def check(self):
634 self.splice_constraints.check()
635 self.read_params.check()
636 self.system_params.check()
637 self.bowtie2_params.check()
638 if self.segment_length < 10:
639 die("Error: arg to --segment-length must be at least 10")
640 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
641 die("Error: arg to --segment-mismatches must be in [0, 3]")
642 if self.t_mismatches < 0 or self.t_mismatches > 3:
643 die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")
644
645 if self.read_params.color:
646 if self.bowtie2:
647 die("Error: bowtie2 in colorspace is not yet supported\nuse --bowtie1 option")
648 if self.fusion_search:
649 die("Error: fusion-search in colorspace is not yet supported")
650 if self.butterfly_search:
651 die("Error: butterfly-search in colorspace is not yet supported")
652
653 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
654
655 if self.read_params.library_type and self.read_params.library_type not in library_types:
656 die("Error: library-type should be one of: "+', '.join(library_types))
657
658 self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
659 self.search_params.max_closure_intron_length)
660
661 self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
662 self.search_params.max_segment_intron_length)
663
664 self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
665 self.search_params.max_coverage_intron_length)
666
667 if self.max_insertion_length >= self.segment_length:
668 die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")
669
670 if self.max_insertion_length < 0:
671 die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")
672
673 if self.max_deletion_length >= self.splice_constraints.min_intron_length:
674 die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")
675
676 if self.max_deletion_length < 0:
677 die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")
678
679 def cmd(self):
680 cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
681 "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
682 "--min-report-intron", str(self.splice_constraints.min_intron_length),
683 "--max-report-intron", str(self.splice_constraints.max_intron_length),
684 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
685 "--output-dir", output_dir,
686 "--max-multihits", str(self.max_hits),
687 "--max-seg-multihits", str(self.max_seg_hits),
688 "--segment-length", str(self.segment_length),
689 "--segment-mismatches", str(self.segment_mismatches),
690 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
691 "--min-closure-intron", str(self.search_params.min_closure_intron_length),
692 "--max-closure-intron", str(self.search_params.max_closure_intron_length),
693 "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
694 "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
695 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
696 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
697 "--max-mismatches", str(self.max_read_mismatches),
698 "--max-insertion-length", str(self.max_insertion_length),
699 "--max-deletion-length", str(self.max_deletion_length)]
700
701 if not self.bowtie2:
702 cmd.extend(["--bowtie1"])
703
704 if self.fusion_search:
705 cmd.extend(["--fusion-search",
706 "--fusion-anchor-length", str(self.fusion_anchor_length),
707 "--fusion-min-dist", str(self.fusion_min_dist),
708 "--fusion-read-mismatches", str(self.fusion_read_mismatches),
709 "--fusion-multireads", str(self.fusion_multireads),
710 "--fusion-multipairs", str(self.fusion_multipairs)])
711
712 cmd.extend(self.system_params.cmd())
713
714 if self.read_params.mate_inner_dist != None:
715 cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
716 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
717 if self.gff_annotation != None:
718 cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
719 if gtf_juncs:
720 cmd.extend(["--gtf-juncs", gtf_juncs])
721 if self.closure_search == False:
722 cmd.append("--no-closure-search")
723 if not self.coverage_search:
724 cmd.append("--no-coverage-search")
725 if not self.microexon_search:
726 cmd.append("--no-microexon-search")
727 if self.butterfly_search:
728 cmd.append("--butterfly-search")
729 if self.read_params.solexa_quals:
730 cmd.append("--solexa-quals")
731 if self.read_params.quals:
732 cmd.append("--quals")
733 if self.read_params.integer_quals:
734 cmd.append("--integer-quals")
735 if self.read_params.color:
736 cmd.append("--color")
737 if self.read_params.library_type:
738 cmd.extend(["--library-type", self.read_params.library_type])
739 if self.read_params.read_group_id:
740 cmd.extend(["--rg-id", self.read_params.read_group_id])
741 if self.read_params.phred64_quals:
742 cmd.append("--phred64-quals")
743 return cmd
744
745 # This is the master options parsing routine, which calls parse_options for
746 # the delegate classes (e.g. SpliceConstraints) that handle certain groups
747 # of options.
748 def parse_options(self, argv):
749 try:
750 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
751 ["version",
752 "help",
753 "output-dir=",
754 "bowtie1",
755 "solexa-quals",
756 "solexa1.3-quals",
757 "phred64-quals",
758 "quals",
759 "integer-quals",
760 "color",
761 "library-type=",
762 "num-threads=",
763 "splice-mismatches=",
764 "max-multihits=",
765 "min-isoform-fraction=",
766 "min-anchor-length=",
767 "min-intron-length=",
768 "max-intron-length=",
769 "GTF=",
770 "transcriptome-only",
771 "transcriptome-max-hits=",
772 "transcriptome-index=",
773 "raw-juncs=",
774 "no-novel-juncs",
775 "allow-fusions",
776 "fusion-search",
777 "fusion-anchor-length=",
778 "fusion-min-dist=",
779 "fusion-read-mismatches=",
780 "fusion-multireads=",
781 "fusion-multipairs=",
782 "no-novel-indels",
783 "no-gtf-juncs",
784 "mate-inner-dist=",
785 "mate-std-dev=",
786 "no-closure-search",
787 "no-coverage-search",
788 "closure-search",
789 "coverage-search",
790 "prefilter-multihits",
791 "microexon-search",
792 "min-closure-exon=",
793 "min-closure-intron=",
794 "max-closure-intron=",
795 "min-coverage-intron=",
796 "max-coverage-intron=",
797 "min-segment-intron=",
798 "max-segment-intron=",
799 "seed-length=",
800 "genome-read-mismatches=",
801 "read-mismatches=",
802 "max-read-mismatches=",
803 "transcriptome-mismatches=",
804 "segment-length=",
805 "segment-mismatches=",
806 "bowtie-n",
807 "butterfly-search",
808 "no-butterfly-search",
809 "keep-tmp",
810 "rg-id=",
811 "rg-sample=",
812 "rg-library=",
813 "rg-description=",
814 "rg-platform-unit=",
815 "rg-center=",
816 "rg-date=",
817 "rg-platform=",
818 "tmp-dir=",
819 "zpacker=",
820 "unmapped-fifo",
821 "max-insertion-length=",
822 "max-deletion-length=",
823 "insertions=",
824 "deletions=",
825 "no-sort-bam",
826 "no-convert-bam",
827 "report-secondary-alignments",
828 "keep-fasta-order",
829 "allow-partial-mapping",
830 "b2-very-fast",
831 "b2-fast",
832 "b2-sensitive",
833 "b2-very-sensitive",
834 "b2-N=",
835 "b2-L=",
836 "b2-i=",
837 "b2-n-ceil=",
838 "b2-gbar=",
839 "b2-ma=",
840 "b2-mp=",
841 "b2-np=",
842 "b2-rdg=",
843 "b2-rfg=",
844 "b2-score-min=",
845 "b2-D=",
846 "b2-R="])
847 except getopt.error, msg:
848 raise Usage(msg)
849
850 self.splice_constraints.parse_options(opts)
851 self.system_params.parse_options(opts)
852 self.read_params.parse_options(opts)
853 self.search_params.parse_options(opts)
854 self.report_params.parse_options(opts)
855 self.bowtie2_params.parse_options(opts)
856 global use_BWT_FIFO
857 global use_BAM_Unmapped
858 if not self.read_params.color:
859 use_BWT_FIFO=False
860 use_BAM_Unmapped=True
861 global output_dir
862 global logging_dir
863 global tmp_dir
864
865 custom_tmp_dir = None
866 custom_out_dir = None
867 # option processing
868 for option, value in opts:
869 if option in ("-v", "--version"):
870 print "TopHat v%s" % (get_version())
871 sys.exit(0)
872 if option in ("-h", "--help"):
873 raise Usage(use_message)
874 if option == "--bowtie1":
875 self.bowtie2 = False
876 if option in ("-g", "--max-multihits"):
877 self.max_hits = int(value)
878 self.max_seg_hits = max(10, self.max_hits * 2)
879 if option in ("-x", "--transcriptome-max-hits"):
880 self.t_max_hits = int(value)
881 if option in ("-G", "--GTF"):
882 self.gff_annotation = value
883 if option in ("-T", "--transcriptome"):
884 self.transcriptome_only = True
885 if option == "--transcriptome-index":
886 self.transcriptome_index = value
887 if option in("-M", "--prefilter-multihits"):
888 self.prefilter_multi = True
889 if option in ("-j", "--raw-juncs"):
890 self.raw_junctions = value
891 if option == "--no-novel-juncs":
892 self.find_novel_juncs = False
893 if option == "--no-novel-indels":
894 self.find_novel_indels = False
895 if option == "--fusion-search":
896 self.fusion_search = True
897 if option == "--fusion-anchor-length":
898 self.fusion_anchor_length = int(value)
899 if option == "--fusion-min-dist":
900 self.fusion_min_dist = int(value)
901 if option == "--fusion-read-mismatches":
902 self.fusion_read_mismatches = int(value)
903 if option == "--fusion-multireads":
904 self.fusion_multireads = int(value)
905 if option == "--fusion-multipairs":
906 self.fusion_multipairs = int(value)
907 if option == "--no-gtf-juncs":
908 self.find_GFF_juncs = False
909 if option == "--no-coverage-search":
910 self.coverage_search = False
911 if option == "--coverage-search":
912 self.coverage_search = True
913 if option == "--no-closure-search":
914 self.closure_search = False
915 if option == "--closure-search":
916 self.closure_search = True
917 if option == "--microexon-search":
918 self.microexon_search = True
919 if option == "--butterfly-search":
920 self.butterfly_search = True
921 if option == "--no-butterfly-search":
922 self.butterfly_search = False
923 if option == "--genome-read-mismatches":
924 self.genome_read_mismatches = int(value)
925 if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
926 self.max_read_mismatches = int(value)
927 if option in ("-n", "--transcriptome-mismatches"):
928 self.t_mismatches = int(value)
929 if option == "--segment-length":
930 self.segment_length = int(value)
931 if option == "--segment-mismatches":
932 self.segment_mismatches = int(value)
933 if option == "--bowtie-n":
934 self.bowtie_alignment_option = "-n"
935 if option == "--max-insertion-length":
936 self.max_insertion_length = int(value)
937 if option == "--max-deletion-length":
938 self.max_deletion_length = int(value)
939 if option == "--insertions":
940 self.raw_insertions = value
941 if option == "--deletions":
942 self.raw_deletions = value
943 if option == "--report-secondary-alignments":
944 self.report_secondary_alignments = True
945 if option == "--keep-fasta-order":
946 self.keep_fasta_order = True
947 if option == "--allow-partial-mapping":
948 self.partial_mapping = True
949 if option in ("-o", "--output-dir"):
950 custom_out_dir = value + "/"
951 if option == "--tmp-dir":
952 custom_tmp_dir = value + "/"
953 if self.transcriptome_only:
954 self.find_novel_juncs=False
955 self.find_novel_indels=False
956 if custom_out_dir:
957 output_dir = custom_out_dir
958 logging_dir = output_dir + "logs/"
959 tmp_dir = output_dir + "tmp/"
960 sam_header = tmp_dir + "stub_header.sam"
961 if custom_tmp_dir:
962 tmp_dir = custom_tmp_dir
963 sam_header = tmp_dir + "stub_header.sam"
964 if len(args) < 2:
965 raise Usage(use_message)
966 return args
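# Editor's note: a minimal sketch of the intended calling sequence for this
# class (hypothetical argv; the actual driver code appears later in this file):
#   >>> params = TopHatParams()
#   >>> args = params.parse_options(["tophat", "-p", "4", "bt2_idx", "reads_1.fq"])
#   >>> params.check()
# args now holds the non-option arguments: ["bt2_idx", "reads_1.fq"]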
967
968
969 # check if a file exists and has non-zero (or minimum) size
970 def fileExists(filepath, minfsize=1):
971 if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
972 return True
973 else:
974 return False
975
976
977 def getFileDir(filepath):
978 #if fullpath given, returns path including the ending /
979 fpath, fname=os.path.split(filepath)
980 if fpath: fpath+='/'
981 return fpath
982
983 def getFileBaseName(filepath):
984 fpath, fname=os.path.split(filepath)
985 fbase, fext =os.path.splitext(fname)
986 fx=fext.lower()
987 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
988 return fbase
989 elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
990 fb, fext = os.path.splitext(fbase)
991 fx=fext.lower()
992 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
993 return fb
994 else:
995 return fbase
996 else:
997 if len(fbase)>0:
998 return fbase
999 else:
1000 return fname
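# Editor's note -- expected behavior, doctest-style (hypothetical file names):
#   >>> getFileBaseName("sample_1.fq.gz")
#   'sample_1'
#   >>> getFileBaseName("reads.fasta")
#   'reads'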
1001
1002 # Returns the current time in a nice format
1003 def right_now():
1004 curr_time = datetime.now()
1005 return curr_time.strftime("%c")
1006
1007 # The TopHat logging formatter
1008 def th_log(out_str):
1009 print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1010
1011 # Ensures that the output, logging, and temp directories are present. If not,
1012 # they are created
1013 def prepare_output_dir():
1014
1015 th_log("Preparing output location "+output_dir)
1016 if os.path.exists(output_dir):
1017 pass
1018 else:
1019 os.mkdir(output_dir)
1020
1021 if os.path.exists(logging_dir):
1022 pass
1023 else:
1024 os.mkdir(logging_dir)
1025
1026 if os.path.exists(tmp_dir):
1027 pass
1028 else:
1029 try:
1030 os.makedirs(tmp_dir)
1031 except OSError, o:
1032 die("\nError creating directory %s (%s)" % (tmp_dir, o))
1033
1034
1035 # to be added as preexec_fn for every subprocess.Popen() call:
1036 # see http://bugs.python.org/issue1652
1037 def subprocess_setup():
1038 # Python installs a SIGPIPE handler by default, which causes
1039 # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
1040 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
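# Editor's sketch of how this hook is meant to be used (see ZReader below):
#   subprocess.Popen(["gzip", "-cd"], stdin=open("reads.fq.gz", "rb"),
#                    stdout=subprocess.PIPE, preexec_fn=subprocess_setup)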
1041
1042 # Check that the Bowtie index specified by the user is present and all files
1043 # are there.
1044 def check_bowtie_index(idx_prefix):
1045 th_log("Checking for Bowtie index files")
1046
1047 idx_fwd_1 = idx_prefix + ".1.ebwt"
1048 idx_fwd_2 = idx_prefix + ".2.ebwt"
1049 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1050 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1051
1052 if os.path.exists(idx_fwd_1) and \
1053 os.path.exists(idx_fwd_2) and \
1054 os.path.exists(idx_rev_1) and \
1055 os.path.exists(idx_rev_2):
1056 return
1057 else:
1058 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1059 if bowtie_idx_env_var == None:
1060 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1061 idx_prefix = bowtie_idx_env_var + idx_prefix
1062
1063 idx_fwd_1 = idx_prefix + ".1.ebwt"
1064 idx_fwd_2 = idx_prefix + ".2.ebwt"
1065 idx_rev_1 = idx_prefix + ".rev.1.ebwt"
1066 idx_rev_2 = idx_prefix + ".rev.2.ebwt"
1067
1068 if os.path.exists(idx_fwd_1) and \
1069 os.path.exists(idx_fwd_2) and \
1070 os.path.exists(idx_rev_1) and \
1071 os.path.exists(idx_rev_2):
1072 return
1073 else:
1074 die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
1075
1076 # Reconstructs the multifasta file from which the Bowtie index was created, if
1077 # it's not already there.
1078 def bowtie_idx_to_fa(idx_prefix):
1079 idx_name = idx_prefix.split('/')[-1]
1080 th_log("Reconstituting reference FASTA file from Bowtie index")
1081
1082 try:
1083 tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1084 tmp_fasta_file = open(tmp_fasta_file_name, "w")
1085
1086 inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1087
1088 inspect_cmd = [prog_path("bowtie-inspect"),
1089 idx_prefix]
1090
1091 print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
1092 ret = subprocess.call(inspect_cmd,
1093 stdout=tmp_fasta_file,
1094 stderr=inspect_log)
1095
1096 # Bowtie reported an error
1097 if ret != 0:
1098 die(fail_str+"Error: bowtie-inspect returned an error")
1099
1100 # Bowtie not found
1101 except OSError, o:
1102 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1103 die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1104
1105 return tmp_fasta_file_name
1106
1107 # Checks whether the multifasta file for the genome is present alongside the
1108 # Bowtie index files for it.
1109 def check_fasta(idx_prefix):
1110 th_log("Checking for reference FASTA file")
1111 idx_fasta = idx_prefix + ".fa"
1112 if os.path.exists(idx_fasta):
1113 return idx_fasta
1114 else:
1115 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1116 if bowtie_idx_env_var:
1117 idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
1118 if os.path.exists(idx_fasta):
1119 return idx_fasta
1120
1121 print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
1122 idx_fa = bowtie_idx_to_fa(idx_prefix)
1123 return idx_fa
1124
1125 # Check that both the Bowtie index and the genome's fasta file are present
1126 def check_index(idx_prefix):
1127 check_bowtie_index(idx_prefix)
1128 ref_fasta_file = check_fasta(idx_prefix)
1129
1130 return (ref_fasta_file, None)
1131
1132 # Retrieve a tuple containing the system's version of Bowtie. Parsed from
1133 # `bowtie --version`
1134 def get_bowtie_version(is_bowtie2):
1135 try:
1136 # Launch Bowtie to capture its version info
1137 proc = subprocess.Popen([bowtie_path, "--version"],
1138 stdout=subprocess.PIPE)
1139
1140 stdout_value = proc.communicate()[0]
1141
1142 bowtie_version = None
1143 bowtie_out = repr(stdout_value)
1144
1145 # Find the version identifier
1146 if is_bowtie2:
1147 version_str = "bowtie2-align version "
1148 else:
1149 version_str = "bowtie version "
1150
1151 ver_str_idx = bowtie_out.find(version_str)
1152 if ver_str_idx != -1:
1153 nl = bowtie_out.find("\\n", ver_str_idx)
1154 version_val = bowtie_out[ver_str_idx + len(version_str):nl]
1155
1156 if is_bowtie2:
1157 ver_numbers = version_val.split('.')
1158 ver_numbers[2:] = ver_numbers[2].split('-') + ['beta0'] # pad in case there is no "-betaN" suffix
1159 bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
1160 else:
1161 bowtie_version = [int(x) for x in version_val.split('.')]
1162
1163 if len(bowtie_version) == 3:
1164 bowtie_version.append(0)
1165
1166 return bowtie_version
1167 except OSError, o:
1168 errmsg=fail_str+str(o)+"\n"
1169 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1170 errmsg+="Error: bowtie not found on this system"
1171 die(errmsg)
1172
1173 def get_index_sam_header(params, idx_prefix):
1174 try:
1175 temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1176 temp_sam_header_file = open(temp_sam_header_filename, "w")
1177
1178 bowtie_header_cmd = [bowtie_path]
1179
1180 read_params = params.read_params
1181 if not params.bowtie2:
1182 bowtie_header_cmd += ["--sam"]
1183
1184 if read_params.color:
1185 bowtie_header_cmd.append('-C')
1186
1187 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1188 subprocess.call(bowtie_header_cmd,
1189 stdout=temp_sam_header_file,
1190 stderr=open('/dev/null', 'w'))
1191
1192 temp_sam_header_file.close()
1193 temp_sam_header_file = open(temp_sam_header_filename, "r")
1194
1195 bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1196 bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1197
1198 preamble = []
1199 sq_dict_lines = []
1200
1201 for line in temp_sam_header_file.readlines():
1202 line = line.strip()
1203 if line.find("@SQ") != -1:
1204 # Sequence dictionary record
1205 cols = line.split('\t')
1206 seq_name = None
1207 for col in cols:
1208 fields = col.split(':')
1209 #print fields
1210 if len(fields) > 0 and fields[0] == "SN":
1211 seq_name = fields[1]
1212 if seq_name == None:
1213 die("Error: malformed sequence dictionary in sam header")
1214 sq_dict_lines.append([seq_name,line])
1215 elif line.find("CL"):
1216 continue
1217 else:
1218 preamble.append(line)
1219
1220 print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1221 if read_params.read_group_id and read_params.sample_id:
1222 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1223 read_params.sample_id)
1224 if read_params.library_id:
1225 rg_str += "\tLB:%s" % read_params.library_id
1226 if read_params.description:
1227 rg_str += "\tDS:%s" % read_params.description
1228 if read_params.seq_platform_unit:
1229 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1230 if read_params.seq_center:
1231 rg_str += "\tCN:%s" % read_params.seq_center
1232 if read_params.mate_inner_dist:
1233 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1234 if read_params.seq_run_date:
1235 rg_str += "\tDT:%s" % read_params.seq_run_date
1236 if read_params.seq_platform:
1237 rg_str += "\tPL:%s" % read_params.seq_platform
1238
1239 print >> bowtie_sam_header_file, rg_str
1240
1241 if not params.keep_fasta_order:
1242 sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1243
1244 for [name, line] in sq_dict_lines:
1245 print >> bowtie_sam_header_file, line
1246 print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1247
1248 bowtie_sam_header_file.close()
1249 temp_sam_header_file.close()
1250 return bowtie_sam_header_filename
1251
1252 except OSError, o:
1253 errmsg=fail_str+str(o)+"\n"
1254 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1255 errmsg+="Error: bowtie not found on this system"
1256 die(errmsg)
1257
1258 # Make sure Bowtie is installed and is recent enough to be useful
1259 def check_bowtie(is_bowtie2):
1260 log_msg = "Checking for Bowtie"
1261 if is_bowtie2:
1262 log_msg += "2"
1263 th_log(log_msg)
1264
1265 bowtie_bin = "bowtie"
1266 if is_bowtie2:
1267 bowtie_bin += "2"
1268
1269 global bowtie_path
1270 bowtie_path=prog_path(bowtie_bin)
1271 bowtie_version = get_bowtie_version(is_bowtie2)
1272 if bowtie_version == None:
1273 die("Error: Bowtie not found on this system")
1274 elif is_bowtie2:
1275 if bowtie_version < [2, 0, 0, 5]:
1276 die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1277 elif not is_bowtie2:
1278 if bowtie_version < [0, 12, 3, 0]:
1279 die("Error: TopHat requires Bowtie 0.12.3 or later")
1280 print >> sys.stderr, "\t version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])
1281
1282
1283 # Retrieve a tuple containing the system's version of samtools. Parsed from
1284 # `samtools`
1285 def get_samtools_version():
1286 try:
1287 # Launch samtools to capture its version info
1288 proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1289 samtools_out = proc.communicate()[1]
1290
1291 # Find the version identifier
1292 version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
1293 samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1294 if version_match.group(4):
1295 samtools_version_arr.append(version_match.group(4))
1296 else:
1297 samtools_version_arr.append(0)
1298
1299 return version_match.group(), samtools_version_arr
1300 except OSError, o:
1301 errmsg=fail_str+str(o)+"\n"
1302 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1303 errmsg+="Error: samtools not found on this system"
1304 die(errmsg)
1305
1306 # Make sure the SAM tools are installed and are recent enough to be useful
1307 def check_samtools():
1308 th_log("Checking for Samtools")
1309 global samtools_path
1310 samtools_path=prog_path("samtools")
1311 samtools_version_str, samtools_version_arr = get_samtools_version()
1312 if samtools_version_str == None:
1313 die("Error: Samtools not found on this system")
1314 elif samtools_version_arr[:3] < [0, 1, 7]:
1315 die("Error: TopHat requires Samtools 0.1.7 or later")
1316 print >> sys.stderr, "\tSamtools %s" % samtools_version_str
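# Editor's note: with the list-wise comparison above, version components are
# compared lexicographically, e.g.:
#   >>> [0, 1, 18] < [0, 1, 7]
#   False
#   >>> [0, 1, 6] < [0, 1, 7]
#   True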
1317
1318
1319 class FastxReader:
1320 def __init__(self, i_file, is_color=0, fname=''):
1321 self.bufline=None
1322 self.format=None
1323 self.ifile=i_file
1324 self.nextRecord=None
1325 self.eof=None
1326 self.fname=fname
1327 self.lastline=None
1328 self.numrecords=0
1329 self.isColor=0
1330 if is_color : self.isColor=1
1331 # determine file type
1332 #no records processed yet, skip custom header lines if any
1333 hlines=10 # allow maximum 10 header lines
1334 self.lastline=" "
1335 while hlines>0 and self.lastline and self.lastline[0] not in "@>" :
1336 self.lastline=self.ifile.readline()
1337 hlines-=1
1338 if self.lastline and self.lastline[0] == '@':
1339 self.format='fastq'
1340 self.nextRecord=self.nextFastq
1341 elif self.lastline and self.lastline[0] == '>':
1342 self.format='fasta'
1343 self.nextRecord=self.nextFasta
1344 else:
1345 die("Error: cannot determine record type in input file %s" % fname)
1346 self.bufline=self.lastline
1347 self.lastline=None
1348
1349 def nextFastq(self):
1350 # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1351 seqid,seqstr,qstr,seq_len='','','',0
1352 if self.eof: return (seqid, seqstr, seq_len, qstr)
1353 fline=self.getLine #shortcut to save a bit of time
1354 line=fline()
1355
1356 if not line : return (seqid, seqstr, seq_len, qstr)
1357 while len(line.rstrip())==0: # skip empty lines
1358 line=fline()
1359 if not line : return (seqid, seqstr,seq_len, qstr)
1360 try:
1361 if line[0] != "@":
1362 raise ValueError("Records in Fastq files should start with '@' character")
1363
1364 seqid = line[1:].rstrip()
1365 seqstr = fline().rstrip()
1366
1367 #There may now be more sequence lines, or the "+" quality marker line:
1368 while True:
1369 line = fline()
1370 if not line:
1371 raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1372 if line[0] == "+":
1373 # -- sequence string ended
1374 #qtitle = line[1:].rstrip()
1375 #if qtitle and qtitle != seqid:
1376 # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1377 # % (seqid, qtitle))
1378 break
1379 seqstr += line.rstrip() #removes trailing newlines
1380 #loop until + found
1381 seq_len = len(seqstr)
1382 #at least one line of quality data should follow
1383 qstrlen=0
1384 #now read next lines as quality values until seq_len is reached
1385 while True:
1386 line=fline()
1387 if not line : break #end of file
1388 qstr += line.rstrip()
1389 qstrlen=len(qstr)
1390 if qstrlen + self.isColor >= seq_len :
1391 break # qv string has reached the length of seq string
1392 #loop until qv has the same length as seq
1393
1394 if self.isColor:
1395 # and qstrlen==seq_len :
1396 if qstrlen==seq_len:
1397 #qual string may have a dummy qv at the beginning, should be stripped
1398 qstr = qstr[1:]
1399 qstrlen -= 1
1400 if qstrlen!=seq_len-1:
1401 raise ValueError("Length mismatch between sequence and quality strings "+ \
1402 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1403 else:
1404 if seq_len != qstrlen :
1405 raise ValueError("Length mismatch between sequence and quality strings "+ \
1406 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1407 except ValueError, err:
1408 die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1409 #return the record
1410 self.numrecords+=1
1411 ##--discard the primer base of colorspace reads
1412 if self.isColor :
1413 seq_len-=1
1414 seqstr = seqstr[1:]
1415 return (seqid, seqstr, seq_len, qstr)
1416
1417 def nextFasta(self):
1418 # returning tuple: (seqID, sequence_string, seq_len)
1419 seqid,seqstr,seq_len='','',0
1420 fline=self.getLine # shortcut to readline function of f
1421 line=fline() # this will use the buffer line if it's there
1422 if not line : return (seqid, seqstr, seq_len, None)
1423 while len(line.rstrip())==0: # skip empty lines
1424 line=fline()
1425 if not line : return (seqid, seqstr, seq_len, None)
1426 try:
1427 if line[0] != ">":
1428 raise ValueError("Records in Fasta files must start with '>' character")
1429 seqid = line[1:].split()[0]
1430 #more sequence lines, or the ">" quality marker line:
1431 while True:
1432 line = fline()
1433 if not line: break
1434 if line[0] == '>':
1435 #next sequence starts here
1436 self.ungetLine()
1437 break
1438 seqstr += line.rstrip()
1439 #loop until '>' found
1440 seq_len = len(seqstr)
1441 if seq_len < 3:
1442 raise ValueError("Read %s too short (%i)." \
1443 % (seqid, seq_len))
1444 except ValueError, err:
1445 die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1446 #return the record and continue
1447 self.numrecords+=1
1448 if self.isColor : # -- discard primer base
1449 seq_len-=1
1450 seqstr=seqstr[1:]
1451 return (seqid, seqstr, seq_len, None)
1452
1453 def getLine(self):
1454 if self.bufline: #return previously buffered line
1455 r=self.bufline
1456 self.bufline=None
1457 return r
1458 else: #read a new line from stream and return it
1459 if self.eof: return None
1460 self.lastline=self.ifile.readline()
1461 if not self.lastline:
1462 self.eof=1
1463 return None
1464 return self.lastline
1465 def ungetLine(self):
1466 if self.lastline is None:
1467 print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
1468 self.bufline=self.lastline
1469 self.lastline=None
1470 #< class FastxReader
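# Editor's sketch of typical FastxReader usage (hypothetical input file):
#   >>> fr = FastxReader(open("reads.fq"), is_color=0, fname="reads.fq")
#   >>> seqid, seqstr, seq_len, qstr = fr.nextRecord()
# nextRecord() is bound to nextFastq or nextFasta depending on the detected
# format, so callers never need to branch on the file type themselves.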
1471
1472 def fa_write(fhandle, seq_id, seq):
1473 """
1474 Write to a file in the FASTA format.
1475
1476 Arguments:
1477 - `fhandle`: A file handle open for writing
1478 - `seq_id`: The sequence id string for this sequence
1479 - `seq`: An unformatted string of the sequence to write
1480 """
1481 line_len = 60
1482 fhandle.write(">" + seq_id + "\n")
1483 for i in xrange((len(seq) + line_len - 1) // line_len):
1484 start = i * line_len
1485 #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq)
1486 if (i+1) * line_len < len(seq):
1487 end = (i+1) * line_len
1488 else:
1489 end = len(seq)
1490 fhandle.write( seq[ start:end ] + "\n")
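# Editor's note -- a short usage sketch (hypothetical handle and sequence):
#   >>> fa_write(open("chr_test.fa", "w"), "chr_test", "ACGT" * 20)
# writes a ">chr_test" header line followed by the 80bp sequence wrapped at
# 60 characters per line.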
1491
1492 class ZReader:
1493 def __init__(self, filename, sysparams, guess=True):
1494 self.fname=filename
1495 self.file=None
1496 self.fsrc=None
1497 self.popen=None
1498 pipecmd=[]
1499 s=filename.lower()
1500 if s.endswith(".bam"):
1501 # daehwan - want to output mapped reads as well
1502 pipecmd=[prog_path("bam2fastx"), "--all", "-"]
1503 else:
1504 if guess:
1505 if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
1506 pipecmd=['gzip']
1507 else:
1508 if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
1509 pipecmd=['bzip2']
1510 if len(pipecmd)>0 and which(pipecmd[0]) is None:
1511 die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
1512 if len(pipecmd)>0:
1513 if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
1514 pipecmd[0]=sysparams.zipper
1515 pipecmd.extend(sysparams.zipper_opts)
1516 elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
1517 pipecmd[0]=sysparams.zipper
1518 pipecmd.extend(sysparams.zipper_opts)
1519 else: #not guessing, but must still check if it's a compressed file
1520 if use_zpacker and filename.endswith(".z"):
1521 pipecmd=[sysparams.zipper]
1522 pipecmd.extend(sysparams.zipper_opts)
1523
1524 if pipecmd:
1525 pipecmd+=['-cd']
1526 if pipecmd:
1527 try:
1528 self.fsrc=open(self.fname, 'rb')
1529 self.popen=subprocess.Popen(pipecmd,
1530 preexec_fn=subprocess_setup,
1531 stdin=self.fsrc,
1532 stdout=subprocess.PIPE, close_fds=True)
1533 except Exception:
1534 die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1535 self.file=self.popen.stdout
1536 else:
1537 self.file=open(filename)
1538 def close(self):
1539 if self.fsrc: self.fsrc.close()
1540 self.file.close()
1541 if self.popen:
1542 self.popen.wait()
1543 self.popen=None
1544
1545 class ZWriter:
1546 def __init__(self, filename, sysparams):
1547 self.fname=filename
1548 if use_zpacker:
1549 pipecmd=[sysparams.zipper,"-cf", "-"]
1550 self.ftarget=open(filename, "wb")
1551 try:
1552 self.popen=subprocess.Popen(pipecmd,
1553 preexec_fn=subprocess_setup,
1554 stdin=subprocess.PIPE,
1555 stdout=self.ftarget, close_fds=True)
1556 except Exception:
1557 die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
1558 self.file=self.popen.stdin # client writes to this end of the pipe
1559 else: #no compression
1560 self.file=open(filename, "w")
1561 self.ftarget=None
1562 self.popen=None
1563 def close(self):
1564 self.file.close()
1565 if self.ftarget: self.ftarget.close()
1566 if self.popen:
1567 self.popen.wait() #! required to actually flush the pipes (eek!)
1568 self.popen=None
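# Editor's sketch pairing the two classes above (hypothetical file names;
# sysparams is a populated TopHatParams.SystemParams instance):
#   zin = ZReader("left_reads.fq.gz", sysparams)
#   zout = ZWriter("left_kept_reads.fq.z", sysparams)
#   for line in zin.file: zout.file.write(line)
#   zin.close(); zout.close()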
1569
1570 # check_reads_format() examines the first few records in the user files
1571 # to determine the file format
1572 def check_reads_format(params, reads_files):
1573 #seed_len = params.read_params.seed_length
1574 fileformat = params.read_params.reads_format
1575
1576 observed_formats = set([])
1577 # observed_scales = set([])
1578 min_seed_len = 99999
1579 max_seed_len = 0
1580 files = reads_files.split(',')
1581
1582 for f_name in files:
1583 #try:
1584 zf = ZReader(f_name, params.system_params)
1585 #except IOError:
1586 # die("Error: could not open file "+f_name)
1587 freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1588 toread=4 #just sample the first 4 reads
1589 while toread>0:
1590 seqid, seqstr, seq_len, qstr = freader.nextRecord()
1591 if not seqid: break
1592 toread-=1
1593 if seq_len < 20:
1594 print >> sys.stderr, "Warning: found a read < 20bp in", f_name
1595 else:
1596 min_seed_len = min(seq_len, min_seed_len)
1597 max_seed_len = max(seq_len, max_seed_len)
1598 zf.close()
1599 observed_formats.add(freader.format)
1600 if len(observed_formats) > 1:
1601 die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1602 fileformat=list(observed_formats)[0]
1603 #if seed_len != None:
1604 # seed_len = max(seed_len, max_seed_len)
1605 #else:
1606 # seed_len = max_seed_len
1607 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1608 print >> sys.stderr, "\tformat:\t\t %s" % fileformat
1609 if fileformat == "fastq":
1610 quality_scale = "phred33 (default)"
1611 if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1612 quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1613 elif params.read_params.phred64_quals:
1614 quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1615 print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
1616 elif fileformat == "fasta":
1617 if params.read_params.color:
1618 params.read_params.integer_quals = True
1619
1620 #print seed_len, format, solexa_scale
1621 #NOTE: seed_len will be re-evaluated later by prep_reads
1622 return TopHatParams.ReadParams(params.read_params.solexa_quals,
1623 params.read_params.phred64_quals,
1624 params.read_params.quals,
1625 params.read_params.integer_quals,
1626 params.read_params.color,
1627 params.read_params.library_type,
1628 #seed_len,
1629 params.read_params.seed_length,
1630 fileformat,
1631 params.read_params.mate_inner_dist,
1632 params.read_params.mate_inner_dist_std_dev,
1633 params.read_params.read_group_id,
1634 params.read_params.sample_id,
1635 params.read_params.library_id,
1636 params.read_params.description,
1637 params.read_params.seq_platform_unit,
1638 params.read_params.seq_center,
1639 params.read_params.seq_run_date,
1640 params.read_params.seq_platform)
1641
1642 def log_tail(logfile, lines=1):
1643 f=open(logfile, "r")
1644 f.seek(0, 2)
1645 fbytes= f.tell()
1646 size=lines
1647 block=-1
1648 while size > 0 and fbytes+block*1024 > 0:
1649 if (fbytes+block*1024 > 0):
1650 ##Seek back once more, if possible
1651 f.seek( block*1024, 2 )
1652 else:
1653 #Seek to the beginning
1654 f.seek(0, 0)
1655 data= f.read( 1024 )
1656 linesFound= data.count('\n')
1657 size -= linesFound
1658 block -= 1
1659 if (fbytes + block*1024 > 0):
1660 f.seek(block*1024, 2)
1661 else:
1662 f.seek(0,0)
1663 f.readline() # skip the partial line at the block boundary
1664 lastBlocks= list( f.readlines() )
1665 f.close()
1666 return "\n".join(lastBlocks[-lines:])
1667
1668 # Format a timedelta as a pretty HH:MM:SS string.
1669 # A day count is prepended when the interval spans one or more days.
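# e.g. formatTD(timedelta(seconds=3725)) -> "01:02:05"
#      formatTD(timedelta(days=2, seconds=60)) -> "2 days 00:01:00"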
1670 def formatTD(td):
1671 days = td.days
1672 hours = td.seconds // 3600
1673 minutes = (td.seconds % 3600) // 60
1674 seconds = td.seconds % 60
1675
1676 if days > 0:
1677 return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
1678 else:
1679 return '%02d:%02d:%02d' % (hours, minutes, seconds)
1680
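# Parses the small report file written by prep_reads (the aux_file argument
# of prep_reads_cmd below): four key=value lines read positionally as
# min_len, max_len, in_count and out_count for the prepared reads.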
1681 class PrepReadsInfo:
1682 def __init__(self, fname, side):
1683 try:
1684 f=open(fname,"r")
1685 self.min_len=int(f.readline().split("=")[-1])
1686 self.max_len=int(f.readline().split("=")[-1])
1687 self.in_count=int(f.readline().split("=")[-1])
1688 self.out_count=int(f.readline().split("=")[-1])
1689 if (self.out_count==0) or (self.max_len<16):
1690 raise Exception()
1691 except Exception, e:
1692 die(fail_str+"Error retrieving prep_reads info.")
1693 print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
1694 if self.min_len < 20:
1695 print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to map to too many places"
1696
1697
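# A typical command list built below (hypothetical file names) serializes to:
#   prep_reads [param args...] --fastq --aux-outfile=left_kept_reads.info \
#       --index-outfile=left_kept_reads.fq.index left.fq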
1698 def prep_reads_cmd(params, reads_list, quals_list=None, aux_file=None, index_file=None, filter_reads=None, hits_to_filter=None):
1699 # generate the prep_reads command-line arguments
1700 filter_cmd = [prog_path("prep_reads")]
1701 filter_cmd.extend(params.cmd())
1702
1703 if params.read_params.reads_format == "fastq":
1704 filter_cmd += ["--fastq"]
1705 elif params.read_params.reads_format == "fasta":
1706 filter_cmd += ["--fasta"]
1707 if hits_to_filter:
1708 filter_cmd += ["--flt-hits="+hits_to_filter]
1709 if aux_file:
1710 filter_cmd += ["--aux-outfile="+aux_file]
1711 if index_file:
1712 filter_cmd += ["--index-outfile="+index_file]
1713 if filter_reads:
1714 filter_cmd += ["--flt-reads="+filter_reads]
1715 filter_cmd.append(reads_list)
1716 if quals_list:
1717 filter_cmd.append(quals_list)
1718 return filter_cmd
1719
1720 # Calls the prep_reads executable, which prepares an internal read library.
1721 # The read library features reads with monotonically increasing integer IDs.
1722 # prep_reads also filters out very low-complexity or garbage reads, as well as
1723 # poly-A reads.
1724 #--> returns the tuple (internal_read_library_file, read_info)
1725 def prep_reads(params, reads_list, quals_list, side="left", prefilter_reads=None):
1726 reads_suffix = ".fq"
1727
1728 # for parallelization, we don't compress the read files
1729 do_use_zpacker = use_zpacker
1730 if params.system_params.num_threads > 1:
1731 do_use_zpacker = False
1732
1733 if do_use_zpacker: reads_suffix += ".z"
1734
1735 output_name=side+"_kept_reads"
1736 kept_reads_filename = tmp_dir + output_name + reads_suffix
1737
1738 if os.path.exists(kept_reads_filename):
1739 os.remove(kept_reads_filename)
1740 kept_reads = open(kept_reads_filename, "wb")
1741 log_fname=logging_dir + "prep_reads.log"
1742 filter_log = open(log_fname,"w")
1743
1744 info_file=output_dir+output_name+".info"
1745 index_file=kept_reads_filename+".index"
1746
1747 filter_cmd=prep_reads_cmd(params, reads_list, quals_list, info_file, index_file, prefilter_reads)
1748 shell_cmd = ' '.join(filter_cmd)
1749 #finally, add the compression pipe
1750 zip_cmd=[]
1751 if do_use_zpacker:
1752 zip_cmd=[ params.system_params.zipper ]
1753 zip_cmd.extend(params.system_params.zipper_opts)
1754 zip_cmd.extend(['-c','-'])
1755 shell_cmd +=' | '+' '.join(zip_cmd)
1756 shell_cmd += ' >' +kept_reads_filename
1757 retcode=0
1758 try:
1759 print >> run_log, shell_cmd
1760 if do_use_zpacker:
1761 filter_proc = subprocess.Popen(filter_cmd,
1762 stdout=subprocess.PIPE,
1763 stderr=filter_log)
1764 zip_proc=subprocess.Popen(zip_cmd,
1765 preexec_fn=subprocess_setup,
1766 stdin=filter_proc.stdout,
1767 stdout=kept_reads)
1768 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
1769 zip_proc.communicate()
1770 retcode=filter_proc.poll()
1771 if retcode==0:
1772 retcode=zip_proc.poll()
1773 else:
1774 retcode = subprocess.call(filter_cmd,
1775 stdout=kept_reads,
1776 stderr=filter_log)
1777 if retcode:
1778 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
1779
1780 except OSError, o:
1781 errmsg=fail_str+str(o)
1782 die(errmsg+"\n"+log_tail(log_fname))
1783
1784 kept_reads.close()
1785 return kept_reads_filename, PrepReadsInfo(info_file, side)
1786
1787
1788 # Call bowtie
1789 def bowtie(params,
1790 bwt_idx_prefix,
1791 reads_list,
1792 reads_format,
1793 num_mismatches,
1794 mapped_reads,
1795 unmapped_reads,
1796 extra_output = "",
1797 mapping_type = "read_against_genome",
1798 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
1799 start_time = datetime.now()
1800 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
1801 reads_file=reads_list[0]
1802 readfile_basename=getFileBaseName(reads_file)
1803
1804 g_mapping, t_mapping, seg_mapping = False, False, False
1805 if mapping_type == "read_against_transcriptome":
1806 t_mapping = True
1807 elif mapping_type == "seg_against_genome" or mapping_type == "seg_against_junctions":
1808 seg_mapping = True
1809 else:
1810 g_mapping = True
1811
1812 bowtie_str = "Bowtie"
1813 if params.bowtie2:
1814 bowtie_str += "2"
1815
1816 if seg_mapping:
1817 if not params.bowtie2:
1818 backup_bowtie_alignment_option = params.bowtie_alignment_option
1819 params.bowtie_alignment_option = "-v"
1820
1821 if t_mapping:
1822 print >> sys.stderr, "[%s] Mapping %s against transcriptome %s with %s %s" % (start_time.strftime("%c"),
1823 readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1824 else:
1825 print >> sys.stderr, "[%s] Mapping %s against %s with %s %s" % (start_time.strftime("%c"),
1826 readfile_basename, bwt_idx_name, bowtie_str, extra_output)
1827 bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
1828 #bwt_mapped=mapped_reads
1829 unmapped_reads_out=None
1830 if unmapped_reads:
1831 unmapped_reads_out=unmapped_reads+".fq"
1832 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
1833 if use_FIFO:
1834 global unmapped_reads_fifo
1835 unmapped_reads_fifo=unmapped_reads+".fifo"
1836 if os.path.exists(unmapped_reads_fifo):
1837 os.remove(unmapped_reads_fifo)
1838 try:
1839 os.mkfifo(unmapped_reads_fifo)
1840 except OSError, o:
1841 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
1842 unmapped_reads_out+=".z"
1843
1844 # Launch Bowtie
1845 try:
1846 bowtie_cmd = [bowtie_path]
1847 if reads_format == "fastq":
1848 bowtie_cmd += ["-q"]
1849 elif reads_format == "fasta":
1850 bowtie_cmd += ["-f"]
1851
1852 if params.read_params.color:
1853 bowtie_cmd += ["-C", "--col-keepends"]
1854 if unmapped_reads:
1855 bowtie_cmd += ["--un", unmapped_reads_out]
1856
1857 unzip_cmd=None
1858 bam_input=False
1859 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
1860 bam_input=True
1861 unzip_cmd=[ prog_path('bam2fastx') ]
1862 if reads_format:
1863 unzip_cmd.append("--" + reads_format)
1864 unzip_cmd+=[reads_list[0]]
1865
1866 if use_zpacker and (unzip_cmd is None):
1867 unzip_cmd=[ params.system_params.zipper ]
1868 unzip_cmd.extend(params.system_params.zipper_opts)
1869 unzip_cmd+=['-cd']
1870
1871 fifo_pid=None
1872 if use_FIFO:
1873 unm_zipcmd=[ params.system_params.zipper ]
1874 unm_zipcmd.extend(params.system_params.zipper_opts)
1875 unm_zipcmd+=['-c']
1876 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
1877 fifo_pid=os.fork()
1878 if fifo_pid==0:
1879 def on_sig_exit(sig, func=None):
1880 os._exit(os.EX_OK)
1881 signal.signal(signal.SIGTERM, on_sig_exit)
1882 subprocess.call(unm_zipcmd,
1883 stdin=open(unmapped_reads_fifo, "r"),
1884 stdout=open(unmapped_reads_out, "wb"))
1885 os._exit(os.EX_OK)
1886
1887 fix_map_cmd = [prog_path('fix_map_ordering')]
1888
1889 if params.bowtie2:
1890 if t_mapping or g_mapping:
1891 max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
1892 max_penalty, min_penalty = int(max_penalty), int(min_penalty)
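# derive the score cutoff handed to fix_map_ordering below: each mismatch
# costs at most max_penalty, so e.g. with bowtie2's default --mp of 6,2 and
# num_mismatches=2 the threshold works out to (6-1)*2 = 10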
1893 min_score = (max_penalty - 1) * num_mismatches
1894 fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
1895
1896 samzip_cmd=None
1897
1898 # daehwan - just use bam format for multi-threading.
1899 if unmapped_reads or params.system_params.num_threads >= 1:
1900 # mapping on the genome
1901 # fix_map_ordering will write the BAM file directly, and unmapped_reads as a BAM file too
1902 mapped_reads += ".bam"
1903
1904 temp_sam_header = tmp_dir + bwt_idx_prefix.split('/')[-1] + ".bwt.samheader.sam"
1905 fix_map_cmd += ["--sam-header", temp_sam_header, "-", mapped_reads]
1906 if not params.read_params.color and unmapped_reads:
1907 unmapped_reads_out = unmapped_reads+".bam"
1908 fix_map_cmd += [unmapped_reads_out]
1909 else:
1910 # mapping on segment_juncs (potential spliced junctions)
1911 # don't use SAM headers here, which could be slow -- better to use compressed SAM directly
1912 mapped_reads += ".sam"
1913 fix_map_cmd += ["-"]
1914 if use_zpacker:
1915 fix_map_cmd += ["-"]
1916 samzip_cmd=[ params.system_params.zipper ]
1917 samzip_cmd.extend(params.system_params.zipper_opts)
1918 samzip_cmd += ["-c"]
1919 mapped_reads += ".z"
1920 else:
1921 fix_map_cmd += [mapped_reads]
1922
1923 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
1924
1925 if t_mapping:
1926 max_hits = params.t_max_hits
1927 elif seg_mapping:
1928 max_hits = params.max_seg_hits
1929 else:
1930 max_hits = params.max_hits
1931
1932 if num_mismatches > 3:
1933 num_mismatches = 3
1934
1935 if params.bowtie2:
1936 if seg_mapping:
1937 # since bowtie2 does not suppress reads that map to many places,
1938 # we suppress those in segment_juncs and long_spanning_reads.
1939 bowtie_cmd += ["-k", str(max_hits + 1)]
1940 else:
1941 bowtie_cmd += ["-k", str(max_hits)]
1942
1943 bowtie2_params = params.bowtie2_params
1944 if seg_mapping:
1945 # after intensive testing,
1946 # the following parameters seem to run faster than Bowtie1 while remaining just as sensitive,
1947 # though there is room for further improvement.
1948 bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
1949 bowtie_cmd += ["-i", "C,2,0"]
1950 bowtie_cmd += ["-L", "20"]
1951 # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
1952 else:
1953 bowtie2_preset = ""
1954 if bowtie2_params.very_fast:
1955 bowtie2_preset = "--very-fast"
1956 elif bowtie2_params.fast:
1957 bowtie2_preset = "--fast"
1958 elif bowtie2_params.sensitive:
1959 bowtie2_preset = "--sensitive"
1960 elif bowtie2_params.very_sensitive:
1961 bowtie2_preset = "--very-sensitive"
1962
1963 if bowtie2_preset != "":
1964 bowtie_cmd += [bowtie2_preset]
1965 else:
1966 bowtie_cmd += ["-D", str(bowtie2_params.D),
1967 "-R", str(bowtie2_params.R),
1968 "-N", str(bowtie2_params.N),
1969 "-L", str(bowtie2_params.L),
1970 "-i", bowtie2_params.i]
1971
1972 # "--n-ceil" is not correctly parsed in Bowtie2,
1973 # I (daehwan) already talked to Ben who will fix the problem.
1974 bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
1975 "--gbar", str(bowtie2_params.gbar),
1976 "--mp", bowtie2_params.mp,
1977 "--np", str(bowtie2_params.np),
1978 "--rdg", bowtie2_params.rdg,
1979 "--rfg", bowtie2_params.rfg,
1980 "--score-min", bowtie2_params.score_min]
1981
1982 else:
1983 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
1984 "-k", str(max_hits),
1985 "-m", str(max_hits),
1986 "-S"]
1987
1988 bowtie_cmd += ["-p", str(params.system_params.num_threads)]
1989
1990 if params.bowtie2: #always use headerless SAM file
1991 bowtie_cmd += ["--sam-no-hd"]
1992 else:
1993 bowtie_cmd += ["--sam-nohead"]
1994
1995 if not params.bowtie2:
1996 if multihits_out:
1997 bowtie_cmd += ["--max", multihits_out]
1998 else:
1999 bowtie_cmd += ["--max", "/dev/null"]
2000
2001 if params.bowtie2:
2002 bowtie_cmd += ["-x"]
2003
2004 bowtie_cmd += [ bwt_idx_prefix ]
2005 bowtie_proc=None
2006 shellcmd=""
2007 unzip_proc=None
2008 samzip_proc=None
2009
2010 if multihits_out:
2011 #special prefilter bowtie run: we use prep_reads on the fly
2012 #in order to get multi-mapped reads to exclude later
2013 prep_cmd = prep_reads_cmd(params, ",".join(reads_list))
2014 preplog_fname=logging_dir + "prep_reads_prefilter.log"
2015 prepfilter_log = open(preplog_fname,"w")
2016 unzip_proc = subprocess.Popen(prep_cmd,
2017 stdout=subprocess.PIPE,
2018 stderr=prepfilter_log)
2019 shellcmd=' '.join(prep_cmd) + "|"
2020 else:
2021 z_input=use_zpacker and reads_file.endswith(".z")
2022 if z_input:
2023 unzip_proc = subprocess.Popen(unzip_cmd,
2024 stdin=open(reads_file, "rb"),
2025 stdout=subprocess.PIPE)
2026 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2027 else:
2028 #must be uncompressed fastq input (unmapped reads from a previous run)
2029 #or a BAM file with unmapped reads
2030 if bam_input:
2031 unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
2032 shellcmd=' '.join(unzip_cmd) + "|"
2033 else:
2034 bowtie_cmd += [reads_file]
2035 bowtie_proc = subprocess.Popen(bowtie_cmd,
2036 stdout=subprocess.PIPE,
2037 stderr=bwt_log)
2038 if unzip_proc:
2039 #input is compressed OR prep_reads is used as a filter
2040 bowtie_cmd += ['-']
2041 bowtie_proc = subprocess.Popen(bowtie_cmd,
2042 stdin=unzip_proc.stdout,
2043 stdout=subprocess.PIPE,
2044 stderr=bwt_log)
2045 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2046
2047 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2048 if samzip_cmd:
2049 shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
2050 fix_order_proc = subprocess.Popen(fix_map_cmd,
2051 stdin=bowtie_proc.stdout,
2052 stdout=subprocess.PIPE)
2053 bowtie_proc.stdout.close() # needed?
2054 samzip_proc = subprocess.Popen(samzip_cmd,
2055 preexec_fn=subprocess_setup,
2056 stdin=fix_order_proc.stdout,
2057 stdout=open(mapped_reads, "wb"))
2058 fix_order_proc.stdout.close()
2059 else:
2060 #write BAM output directly
2061 fix_order_proc = subprocess.Popen(fix_map_cmd,
2062 stdin=bowtie_proc.stdout)
2063 # stdout=open(mapped_reads, "w"))
2064 bowtie_proc.stdout.close()
2065
2066 # shellcmd += " > " + mapped_reads
2067 print >> run_log, shellcmd
2068 if samzip_proc:
2069 samzip_proc.communicate()
2070 else:
2071 fix_order_proc.communicate()
2072 if use_FIFO:
2073 if fifo_pid and not os.path.exists(unmapped_reads_out):
2074 try:
2075 os.kill(fifo_pid, signal.SIGTERM)
2076 except:
2077 pass
2078
2079 except OSError, o:
2080 die(fail_str+"Error: "+str(o))
2081
2082 # Success
2083 #finish_time = datetime.now()
2084 #duration = finish_time - start_time
2085 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2086 if use_FIFO:
2087 try:
2088 os.remove(unmapped_reads_fifo)
2089 except:
2090 pass
2091 if multihits_out and not os.path.exists(multihits_out):
2092 open(multihits_out, "w").close()
2093
2094 if seg_mapping:
2095 if not params.bowtie2:
2096 params.bowtie_alignment_option = backup_bowtie_alignment_option
2097
2098 return (mapped_reads, unmapped_reads_out)
2099
2100
2101 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2102 def get_gtf_juncs(gff_annotation):
2103 th_log("Reading known junctions from GTF file")
2104 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2105
2106 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2107
2108 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2109 gtf_juncs_out = open(gtf_juncs_out_name, "w")
2110
2111 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2112 try:
2113 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2114 retcode = subprocess.call(gtf_juncs_cmd,
2115 stderr=gtf_juncs_log,
2116 stdout=gtf_juncs_out)
2117 # gtf_juncs returned an error
2118 if retcode == 1:
2119 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
2120 return (False, gtf_juncs_out_name)
2121 elif retcode != 0:
2122 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2123
2124 # gtf_juncs not found
2125 except OSError, o:
2126 errmsg=fail_str+str(o)+"\n"
2127 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2128 errmsg+="Error: gtf_juncs not found on this system"
2129 die(errmsg)
2130 return (True, gtf_juncs_out_name)
2131
2132 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
2133 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2134 th_log("Indexing splices")
2135 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2136
2137 #user_splices_out_prefix = output_dir + "user_splices_idx"
2138
2139 if is_bowtie2:
2140 bowtie_build_cmd = [prog_path("bowtie2-build")]
2141 else:
2142 bowtie_build_cmd = [prog_path("bowtie-build")]
2143
2144 if color:
2145 bowtie_build_cmd += ["-C"]
2146
2147 bowtie_build_cmd += [external_splice_prefix + ".fa",
2148 external_splice_prefix]
2149 try:
2150 print >> run_log, " ".join(bowtie_build_cmd)
2151 retcode = subprocess.call(bowtie_build_cmd,
2152 stdout=bowtie_build_log)
2153
2154 if retcode != 0:
2155 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2156 except OSError, o:
2157 errmsg=fail_str+str(o)+"\n"
2158 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2159 errmsg+="Error: bowtie-build not found on this system"
2160 die(errmsg)
2161 return external_splice_prefix
2162
2163 # Build a splice index from a .juncs file, suitable for use with specified read
2164 # (or read segment) lengths
2165 def build_juncs_index(is_bowtie2,
2166 min_anchor_length,
2167 max_seg_len,
2168 juncs_prefix,
2169 external_juncs,
2170 external_insertions,
2171 external_deletions,
2172 external_fusions,
2173 reference_fasta,
2174 color):
2175 th_log("Retrieving sequences for splices")
2176
2177 juncs_file_list = ",".join(external_juncs)
2178 insertions_file_list = ",".join(external_insertions)
2179 deletions_file_list = ",".join(external_deletions)
2180 fusions_file_list = ",".join(external_fusions)
2181
2182 # do not use insertions and deletions in the case of Bowtie2
2183 if is_bowtie2:
2184 insertions_file_list = "/dev/null"
2185 deletions_file_list = "/dev/null"
2186
2187 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2188
2189 external_splices_out_prefix = tmp_dir + juncs_prefix
2190 external_splices_out_name = external_splices_out_prefix + ".fa"
2191
2192 external_splices_out = open(external_splices_out_name, "w")
2193 # juncs_db_cmd = [bin_dir + "juncs_db",
2194 juncs_db_cmd = [prog_path("juncs_db"),
2195 str(min_anchor_length),
2196 str(max_seg_len),
2197 juncs_file_list,
2198 insertions_file_list,
2199 deletions_file_list,
2200 fusions_file_list,
2201 reference_fasta]
2202 try:
2203 print >> run_log, " ".join(juncs_db_cmd)
2204 retcode = subprocess.call(juncs_db_cmd,
2205 stderr=juncs_db_log,
2206 stdout=external_splices_out)
2207
2208 if retcode != 0:
2209 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2210 # juncs_db not found
2211 except OSError, o:
2212 errmsg=fail_str+str(o)+"\n"
2213 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2214 errmsg+="Error: juncs_db not found on this system"
2215 die(errmsg)
2216
2217 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2218 return external_splices_out_prefix
2219
2220 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2221 """ Build a bowtie index from a FASTA file.
2222
2223 Arguments:
2224 - `fasta_fname`: File path to FASTA file.
2225 - `out_dir`: Output directory to place index in. (includes os.sep)
2226
2227 Returns:
2228 - The path to the Bowtie index.
2229 """
2230 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2231
2232 if is_bowtie2:
2233 bowtie_idx_cmd = [prog_path("bowtie2-build")]
2234 else:
2235 bowtie_idx_cmd = [prog_path("bowtie-build")]
2236
2237 if color:
2238 bowtie_idx_cmd += ["-C"]
2239
2240 bowtie_idx_cmd += [fasta_fname,
2241 bwt_idx_path]
2242 try:
2243 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2244 print >> run_log, " ".join(bowtie_idx_cmd)
2245 retcode = subprocess.call(bowtie_idx_cmd,
2246 stdout=open(os.devnull, "w"),
2247 stderr=open(os.devnull, "w"))
2248 if retcode != 0:
2249 die(fail_str + "Error: Couldn't build bowtie index with err = "
2250 + str(retcode))
2251 except OSError, o:
2252 errmsg=fail_str+str(o)+"\n"
2253 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2254 errmsg+="Error: bowtie-build not found on this system"
2255 die(errmsg)
2256
2257 return bwt_idx_path
2258
2259 # Print out the sam header, embedding the user's specified library properties.
2260 # FIXME: also needs SQ dictionary lines
2261 def write_sam_header(read_params, sam_file):
2262 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2263 if read_params.read_group_id and read_params.sample_id:
2264 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2265 read_params.sample_id)
2266 if read_params.library_id:
2267 rg_str += "\tLB:%s" % read_params.library_id
2268 if read_params.description:
2269 rg_str += "\tDS:%s" % read_params.description
2270 if read_params.seq_platform_unit:
2271 rg_str += "\tPU:%s" % read_params.seq_platform_unit
2272 if read_params.seq_center:
2273 rg_str += "\tCN:%s" % read_params.seq_center
2274 if read_params.mate_inner_dist:
2275 rg_str += "\tPI:%s" % read_params.mate_inner_dist
2276 if read_params.seq_run_date:
2277 rg_str += "\tDT:%s" % read_params.seq_run_date
2278 if read_params.seq_platform:
2279 rg_str += "\tPL:%s" % read_params.seq_platform
2280
2281 print >> sam_file, rg_str
2282 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
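# With full read-group info set, the emitted header would look like
# (tab-separated, hypothetical values):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:rg1  SM:sample1  LB:lib1  PI:50
#   @PG  ID:TopHat  VN:2.0.0  CL:tophat ...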
2283
2284 # Write final TopHat output, via tophat_reports and wiggles
2285 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2286 th_log("Reporting output tracks")
2287 left_maps, right_maps = mappings
2288 left_reads, right_reads = readfiles
2289 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2290 left_maps = ','.join(left_maps)
2291
2292 if len(right_maps) > 0:
2293 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2294 right_maps = ','.join(right_maps)
2295 log_fname = logging_dir + "reports.log"
2296 report_log = open(log_fname, "w")
2297 junctions = output_dir + "junctions.bed"
2298 insertions = output_dir + "insertions.bed"
2299 deletions = output_dir + "deletions.bed"
2300 accepted_hits = output_dir + "accepted_hits"
2301 report_cmdpath = prog_path("tophat_reports")
2302 fusions = output_dir + "fusions.out"
2303 report_cmd = [report_cmdpath]
2304
2305 alignments_output_filename = tmp_dir + "accepted_hits"
2306
2307 report_cmd.extend(params.cmd())
2308 report_cmd += ["--sam-header", sam_header_filename]
2309 if params.report_secondary_alignments:
2310 report_cmd += ["--report-secondary-alignments"]
2311
2312 report_cmd.extend(["--samtools="+samtools_path])
2313 report_cmd.extend([ref_fasta,
2314 junctions,
2315 insertions,
2316 deletions,
2317 fusions,
2318 alignments_output_filename,
2319 left_maps,
2320 left_reads])
2321
2322 if len(right_maps) > 0 and right_reads:
2323 report_cmd.append(right_maps)
2324 report_cmd.append(right_reads)
2325
2326 # -- tophat_reports now produces an (uncompressed) BAM stream
2327 try:
2328 if params.report_params.convert_bam:
2329 if params.report_params.sort_bam:
2330 print >> run_log, " ".join(report_cmd)
2331 report_proc=subprocess.call(report_cmd,
2332 preexec_fn=subprocess_setup,
2333 stderr=report_log)
2334
2335 bam_parts = []
2336 for i in range(params.system_params.num_threads):
2337 bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2338 if os.path.exists(bam_part_filename):
2339 bam_parts.append(bam_part_filename)
2340 else:
2341 break
2342
2343 num_bam_parts = len(bam_parts)
2344 pids = [0 for i in range(num_bam_parts)]
2345 sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2346 #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2347 #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2348 for i in range(num_bam_parts):
2349 bamsort_cmd = [samtools_path,
2350 "sort",
2351 bam_parts[i],
2352 sorted_bam_parts[i]]
2353
2354 sorted_bam_parts[i] += ".bam"
2355 print >> run_log, " ".join(bamsort_cmd)
2356 pid = os.fork()
2357 if pid == 0:
2358 subprocess.call(bamsort_cmd,
2359 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2360 os._exit(os.EX_OK)
2361 else:
2362 pids[i] = pid
2363
2364 for i in range(len(pids)):
2365 if pids[i] > 0:
2366 os.waitpid(pids[i], 0)
2367 pids[i] = 0
2368
2369 # for bam_part in bam_parts:
2370 # os.remove(bam_part)
2371
2372 if num_bam_parts > 1:
2373 bammerge_cmd = [samtools_path,
2374 "merge",
2375 "-h",
2376 sam_header_filename,
2377 "%s.bam" % accepted_hits]
2378 bammerge_cmd += sorted_bam_parts
2379
2380 print >> run_log, " ".join(bammerge_cmd)
2381 subprocess.call(bammerge_cmd,
2382 stderr=open(logging_dir + "reports.samtools_merge.log", "w"))
2383
2384 for sorted_bam_part in sorted_bam_parts:
2385 os.remove(sorted_bam_part)
2386 else:
2387 os.rename(sorted_bam_parts[0], output_dir + "accepted_hits.bam")
2388
2389 else:
2390 print >> run_log, " ".join(report_cmd)
2391 report_proc=subprocess.call(report_cmd,
2392 preexec_fn=subprocess_setup,
2393 stdout=open(accepted_hits,"wb"),
2394 stderr=report_log)
2395 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
2396 else:
2397 print >> run_log, " ".join(report_cmd)
2398 report_proc=subprocess.call(report_cmd,
2399 preexec_fn=subprocess_setup,
2400 stdout=open(accepted_hits,"wb"),
2401 stderr=report_log)
2402 tmp_sam = output_dir + "accepted_hits.sam"
2403
2404 bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
2405 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
2406 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
2407 tmp_sam_file = open(tmp_sam, "w")
2408 ret = subprocess.call(bam_to_sam_cmd,
2409 stdout=tmp_sam_file,
2410 stderr=bam_to_sam_log)
2411 tmp_sam_file.close()
2412 os.remove(accepted_hits)
2413
2414 except OSError, o:
2415 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2416
2417 return junctions
2418
2419
2420 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
2421 # for each segment. This function needs to be fixed to support mixed read-length
2422 # inputs.
2423 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
2424 i = num_files_prev + 1
2425 while i <= num_files:
2426 segfname=prefix+("_seg%d" % i)+extension
2427 out_segf.append(ZWriter(segfname,params.system_params))
2428 i += 1
2429
2430 def split_reads(reads_filename,
2431 prefix,
2432 fasta,
2433 params,
2434 segment_length):
2435 #reads_file = open(reads_filename)
2436 zreads = ZReader(reads_filename, params.system_params, False)
2437 out_segfiles = []
2438
2439 if fasta:
2440 extension = ".fa"
2441 else:
2442 extension = ".fq"
2443 if use_zpacker: extension += ".z"
2444 def convert_color_to_bp(color_seq):
2445 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2446 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2447 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2448 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2449 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2450 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2451
2452 base = color_seq[0]
2453 bp_seq = base
2454 for ch in color_seq[1:]:
2455 base = decode_dic[base+ch]
2456 bp_seq += base
2457 return bp_seq
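# e.g. convert_color_to_bp("A0123") == "AACTA": starting from the primer
# base, each color digit encodes the transition from the previous base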
2458
2459 def convert_bp_to_color(bp_seq):
2460 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2461 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2462 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2463 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2464 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2465 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2466 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2467 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2468 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2469
2470 base = bp_seq[0]
2471 color_seq = base
2472 for ch in bp_seq[1:]:
2473 color_seq += encode_dic[base + ch]
2474 base = ch
2475
2476 return color_seq
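# e.g. convert_bp_to_color("AACTA") == "A0123" -- the inverse of
# convert_color_to_bp() above (identical bases -> 0, complement pairs -> 3)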
2477
2478 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2479 if color:
2480 color_offset = 1
2481 read_seq_temp = convert_color_to_bp(read_seq)
2482
2483 seg_num = 1
2484 while seg_num + 1 < len(offsets):
2485 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2486 return
2487 seg_num += 1
2488 else:
2489 color_offset = 0
2490
2491 seg_num = 0
2492 last_seq_offset = 0
2493 while seg_num + 1 < len(offsets):
2494 f = out_segf[seg_num].file
2495 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2496 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2497 if color:
2498 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2499 else:
2500 print >> f, seg_seq
2501 if not fasta:
2502 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2503 print >> f, "+"
2504 print >> f, seg_qual
2505 seg_num += 1
2506 last_seq_offset = offsets[seg_num]
2507
2508 line_state = 0
2509 read_name = ""
2510 read_seq = ""
2511 read_quals = ""
2512 num_segments = 0
2513 offsets = []
2514 for line in zreads.file:
2515 if line.strip() == "":
2516 continue
2517 if line_state == 0:
2518 read_name = line.strip()
2519 elif line_state == 1:
2520 read_seq = line.strip()
2521
2522 read_length = len(read_seq)
2523 tmp_num_segments = read_length / segment_length
2524 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2525
2526 # Bowtie's minimum read length here is 20bp, so if the last segment
2527 # is between 20 and segment_length bp long, go ahead and write it out
2528 if read_length % segment_length >= min(segment_length - 2, 20):
2529 offsets.append(read_length)
2530 tmp_num_segments += 1
2531 else:
2532 offsets[-1] = read_length
2533
2534 if tmp_num_segments == 1:
2535 offsets = [0, read_length]
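# worked example: a 60bp read with segment_length=25 yields offsets
# [0, 25, 60] (the 10bp remainder is merged into the last segment), while
# a 70bp read yields [0, 25, 50, 70] (its 20bp remainder is long enough
# to stand as a segment of its own)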
2536
2537 if tmp_num_segments > num_segments:
2538 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2539 num_segments = tmp_num_segments
2540
2541 if fasta:
2542 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2543 elif line_state == 2:
2544 line = line.strip()
2545 else:
2546 read_quals = line.strip()
2547 if not fasta:
2548 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2549
2550 line_state += 1
2551 if fasta:
2552 line_state %= 2
2553 else:
2554 line_state %= 4
2555 zreads.close()
2556 out_fnames=[]
2557 for zf in out_segfiles:
2558 zf.close()
2559 out_fnames.append(zf.fname)
2560 #return [o.fname for o in out_segfiles]
2561 return out_fnames
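# Typical use (hypothetical names): split_reads("left_um.fq", tmp_dir+"left",
# False, params, 25) writes tmp_dir+"left_seg1.fq", tmp_dir+"left_seg2.fq", ...
# and returns the list of segment file names.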
2562
2563 # Find possible splice junctions using the "closure search" strategy, and report
2564 # them in closure.juncs. Calls the closure_juncs executable.
2565 def junctions_from_closures(params,
2566 sam_header_filename,
2567 left_maps,
2568 right_maps,
2569 ref_fasta):
2570 th_log("Searching for junctions via mate-pair closures")
2571
2572
2573 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2574 #if len(maps) == 0:
2575 # return None
2576 #print >> sys.stderr, left_maps
2577 slash = left_maps[0].rfind('/')
2578 juncs_out = ""
2579 if slash != -1:
2580 juncs_out += left_maps[0][:slash+1]
2581 fusions_out = juncs_out
2582
2583 juncs_out += "closure.juncs"
2584 fusions_out += "closure.fusions"
2585
2586 juncs_log = open(logging_dir + "closure.log", "w")
2587 juncs_cmdpath=prog_path("closure_juncs")
2588 juncs_cmd = [juncs_cmdpath]
2589
2590 left_maps = ','.join(left_maps)
2591 right_maps = ','.join(right_maps)
2592
2593 juncs_cmd.extend(params.cmd())
2594 juncs_cmd.extend(["--sam-header", sam_header_filename,
2595 juncs_out,
2596 fusions_out,
2597 ref_fasta,
2598 left_maps,
2599 right_maps])
2600 try:
2601 print >> run_log, ' '.join(juncs_cmd)
2602 retcode = subprocess.call(juncs_cmd,
2603 stderr=juncs_log)
2604
2605 # closure_juncs returned an error
2606 if retcode != 0:
2607 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2608 # closure_juncs not found
2609 except OSError, o:
2610 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2611 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2612 die(str(o))
2613 return [juncs_out]
2614
2615 # Find possible junctions by examining coverage and split segments in the initial
2616 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2617 # segment.insertions, and segment.deletions. Calls the executable
2618 # segment_juncs
2619 def junctions_from_segments(params,
2620 sam_header_filename,
2621 left_reads,
2622 left_reads_map,
2623 left_seg_maps,
2624 right_reads,
2625 right_reads_map,
2626 right_seg_maps,
2627 unmapped_reads,
2628 reads_format,
2629 ref_fasta):
2630 if left_reads_map != left_seg_maps[0]:
2631 th_log("Searching for junctions via segment mapping")
2632 out_path=getFileDir(left_seg_maps[0])
2633 juncs_out=out_path+"segment.juncs"
2634 insertions_out=out_path+"segment.insertions"
2635 deletions_out =out_path+"segment.deletions"
2636 fusions_out = out_path+"segment.fusions"
2637
2638 left_maps = ','.join(left_seg_maps)
2639 log_fname = logging_dir + "segment_juncs.log"
2640 segj_log = open(log_fname, "w")
2641 segj_cmd = [prog_path("segment_juncs")]
2642
2643 segj_cmd.extend(params.cmd())
2644 segj_cmd.extend(["--sam-header", sam_header_filename,
2645 "--ium-reads", ",".join(unmapped_reads),
2646 ref_fasta,
2647 juncs_out,
2648 insertions_out,
2649 deletions_out,
2650 fusions_out,
2651 left_reads,
2652 left_reads_map,
2653 left_maps])
2654 if right_seg_maps:
2655 right_maps = ','.join(right_seg_maps)
2656 segj_cmd.extend([right_reads, right_reads_map, right_maps])
2657 try:
2658 print >> run_log, " ".join(segj_cmd)
2659 retcode = subprocess.call(segj_cmd,
2660 preexec_fn=subprocess_setup,
2661 stderr=segj_log)
2662
2663 # segment_juncs returned an error
2664 if retcode != 0:
2665 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
2666
2667 # segment_juncs not found
2668 except OSError, o:
2669 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2670 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2671 die(str(o))
2672
2673 return [juncs_out, insertions_out, deletions_out, fusions_out]
2674
2675 # Joins mapped segments into full-length read alignments via the executable
2676 # long_spanning_reads
2677 def join_mapped_segments(params,
2678 sam_header_filename,
2679 reads,
2680 ref_fasta,
2681 possible_juncs,
2682 possible_insertions,
2683 possible_deletions,
2684 possible_fusions,
2685 contig_seg_maps,
2686 spliced_seg_maps,
2687 alignments_out_name):
2688 rn=""
2689 if len(contig_seg_maps)>1:
2690 th_log("Joining segment hits")
2691 rn=".segs"
2692 else:
2693 th_log("Processing bowtie hits")
2694 contig_seg_maps = ','.join(contig_seg_maps)
2695
2696 possible_juncs = ','.join(possible_juncs)
2697 possible_insertions = ",".join(possible_insertions)
2698 possible_deletions = ",".join(possible_deletions)
2699 possible_fusions = ",".join(possible_fusions)
2700
2701 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
2702 align_log = open(log_fname, "w")
2703 align_cmd = [prog_path("long_spanning_reads")]
2704
2705 align_cmd.extend(params.cmd())
2706 align_cmd += ["--sam-header", sam_header_filename]
2707
2708 b2_params = params.bowtie2_params
2709 max_penalty, min_penalty = b2_params.mp.split(',')
2710 align_cmd += ["--bowtie2-max-penalty", max_penalty,
2711 "--bowtie2-min-penalty", min_penalty]
2712
2713 align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2714
2715 read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2716 align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2717 "--bowtie2-read-gap-cont", read_gap_cont]
2718
2719 ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2720 align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2721 "--bowtie2-ref-gap-cont", ref_gap_cont]
2722
2723 align_cmd.append(ref_fasta)
2724 align_cmd.extend([reads,
2725 possible_juncs,
2726 possible_insertions,
2727 possible_deletions,
2728 possible_fusions,
2729 alignments_out_name,
2730 contig_seg_maps])
2731
2732 if spliced_seg_maps:
2733 spliced_seg_maps = ','.join(spliced_seg_maps)
2734 align_cmd.append(spliced_seg_maps)
2735
2736 try:
2737 print >> run_log, " ".join(align_cmd)
2738 join_proc=subprocess.call(align_cmd,
2739 stderr=align_log)
2740
2741 except OSError, o:
2742 die(fail_str+"Error: "+str(o))
2743
2744 # This class collects spliced and unspliced alignments for each of the
2745 # left and right read files provided by the user.
2746 class Maps:
2747 def __init__(self,
2748 unspliced_sam,
2749 seg_maps,
2750 unmapped_segs,
2751 segs):
2752 self.unspliced_sam = unspliced_sam
2753 self.seg_maps = seg_maps
2754 self.unmapped_segs = unmapped_segs
2755 self.segs = segs
2756
2757 # Map2GTF stuff
2758 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
2759 """ajjkljlks
2760
2761 Arguments:
2762 - `params`: TopHat parameters
2763 - `gtf_fname`: File name pointing to the annotation.
2764 - `reads`: The reads to convert coords (in Bowtie format).
2765 - `out_fname`: The file name pointing to the output.
2766 """
2767 m2g_cmd = [prog_path("map2gtf")]
2768 m2g_cmd.extend(params.cmd())
2769 m2g_cmd += ["--sam-header", sam_header_filename]
2770 m2g_cmd.append(gtf_fname)
2771 m2g_cmd.append(reads) #could be BAM file
2772 m2g_cmd.append(out_fname)
2773 fbasename = getFileBaseName(reads)
2774 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
2775 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
2776
2777 try:
2778 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
2779 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
2780 ret = subprocess.call(m2g_cmd,
2781 stdout=open(m2g_log, "w"),
2782 stderr=open(m2g_err, "w"))
2783 if ret != 0:
2784 die(fail_str + " Error: map2gtf returned an error")
2785 except OSError, o:
2786 err_msg = fail_str + str(o)
2787 die(err_msg + "\n")
2788
2789
2790 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
2791 """ Make the transcriptome from a GTF.
2792
2793 Arguments:
2794 - `trans_gtf`: the GTF/GFF annotation file to extract transcripts from.
2795 - `genome`: the reference genome (FASTA file).
2796 - `out_fname`: the output FASTA file of transcript sequences.
2797 """
2798 # build and run the gtf_to_fasta command
2799 g2f_cmd = [prog_path("gtf_to_fasta")]
2800 g2f_cmd.extend(params.cmd())
2801 g2f_cmd.append(trans_gtf)
2802 g2f_cmd.append(genome)
2803 g2f_cmd.append(out_fname)
2804
2805 g2f_log = logging_dir + "g2f.out"
2806 g2f_err = logging_dir + "g2f.err"
2807
2808 try:
2809 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
2810 ret = subprocess.call(g2f_cmd,
2811 stdout = open(g2f_log, "w"),
2812 stderr = open(g2f_err, "w"))
2813 if ret != 0:
2814 die(fail_str + " Error: gtf_to_fasta returned an error.")
2815 except OSError, o:
2816 err_msg = fail_str + str(o)
2817 die(err_msg + "\n")
2818
2819 def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
2820 """ Main GTF mapping function
2821
2822 Arguments:
2823 - `params`: The TopHat parameters.
2824 - `ref_fasta`: The reference genome.
2825 - `left_reads`: The prepared left reads file.
2826 - `right_reads`: The prepared right reads file (None if single-end).
2827
2828 """
2829 test_input_file(params.gff_annotation)
2830
2831 # th_log("Reading in GTF file: " + params.gff_annotation)
2832 # transcripts = gtf_to_transcripts(params.gff_annotation)
2833
2834 gtf_name = getFileBaseName(params.gff_annotation)
2835 m2g_bwt_idx = None
2836 t_out_dir = tmp_dir
2837 if params.transcriptome_index and not params.transcriptome_outdir:
2838 m2g_bwt_idx = params.transcriptome_index
2839 th_log("Using pre-built transcriptome index..")
2840 else:
2841 th_log("Creating transcriptome data files..")
2842 if params.transcriptome_outdir:
2843 t_out_dir=params.transcriptome_outdir+"/"
2844 m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
2845 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
2846 m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
2847
2848 get_index_sam_header(params, m2g_bwt_idx)
2849
2850 mapped_gtf_list = []
2851 unmapped_gtf_list = []
2852 # do the initial mapping in GTF coordinates
2853 for reads in [left_reads, right_reads]:
2854 if reads == None or os.path.getsize(reads) < 25 :
2855 continue
2856 fbasename = getFileBaseName(reads)
2857 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
2858 #if use_zpacker:
2859 # mapped_gtf_out+=".z"
2860
2861 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
2862 #if use_BWT_FIFO:
2863 # unmapped_gtf += ".z"
2864
2865 (mapped_gtf_map, unmapped) = bowtie(params,
2866 m2g_bwt_idx,
2867 [reads],
2868 "fastq",
2869 params.t_mismatches,
2870 mapped_gtf_out,
2871 unmapped_gtf,
2872 "", "read_against_transcriptome")
2873 mapped_gtf_list.append(mapped_gtf_map)
2874 unmapped_gtf_list.append(unmapped)
2875
2876 sam_gtf_list = []
2877 for reads in mapped_gtf_list:
2878 fbasename = getFileBaseName(reads)
2879 sam_out_fname = tmp_dir + fbasename + "_converted.bam"
2880 m2g_convert_coords(params,
2881 genome_sam_header_filename,
2882 params.gff_annotation,
2883 reads,
2884 sam_out_fname)
2885 sam_gtf_list.append(sam_out_fname)
2886
2887 if len(sam_gtf_list) < 2:
2888 sam_gtf_list.append(None)
2889 unmapped_gtf_list.append(None)
2890
2891 return (sam_gtf_list, unmapped_gtf_list)
2892 # end Map2GTF
2893
2894 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
2895 ## extract mappings and unmapped reads from the prefilter mappings and preflt_ium
2896 ##
2897 # this is accomplished by a special prep_reads usage (triggered by --flt-hits)
2898
2899 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
2900 out_mappings,
2901 target_reads,
2902 params.preflt_data[ri].mappings)
2903 um_reads = open(out_unmapped, "wb")
2904 log_fname=logging_dir + "prep_reads.from_preflt."+str(ri)+".log"
2905 filter_log = open(log_fname,"w")
2906
2907 shell_cmd = " ".join(prep_cmd)
2908 #add the compression pipe
2909 zip_cmd=[]
2910 if use_zpacker:
2911 zip_cmd=[ params.system_params.zipper ]
2912 zip_cmd.extend(params.system_params.zipper_opts)
2913 zip_cmd.extend(['-c','-'])
2914 shell_cmd +=' | '+' '.join(zip_cmd)
2915 shell_cmd += ' >' + out_unmapped
2916 retcode=0
2917 try:
2918 print >> run_log, shell_cmd
2919 if use_zpacker:
2920 prep_proc = subprocess.Popen(prep_cmd,
2921 stdout=subprocess.PIPE,
2922 stderr=filter_log)
2923 zip_proc = subprocess.Popen(zip_cmd,
2924 preexec_fn=subprocess_setup,
2925 stdin=prep_proc.stdout,
2926 stdout=um_reads)
2927 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
2928 zip_proc.communicate()
2929 retcode=prep_proc.poll()
2930 if retcode==0:
2931 retcode=zip_proc.poll()
2932 else:
2933 retcode = subprocess.call(prep_cmd,
2934 stdout=um_reads,
2935 stderr=filter_log)
2936 if retcode:
2937 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2938
2939 except OSError, o:
2940 errmsg=fail_str+str(o)
2941 die(errmsg+"\n"+log_tail(log_fname))
2942 um_reads.close()
2943
2944 return (out_mappings, out_unmapped)
2945
2946
2947 # The main alignment routine of TopHat. This function executes most of the
2948 # workflow, producing a set of candidate alignments for each cDNA fragment in a
2949 # pair of SAM alignment files (for paired-end reads).
2950 def spliced_alignment(params,
2951 bwt_idx_prefix,
2952 sam_header_filename,
2953 ref_fasta,
2954 read_len,
2955 segment_len,
2956 prepared_reads,
2957 user_supplied_junctions,
2958 user_supplied_insertions,
2959 user_supplied_deletions):
2960
2961 possible_juncs = []
2962 possible_juncs.extend(user_supplied_junctions)
2963
2964 possible_insertions = []
2965 possible_insertions.extend(user_supplied_insertions)
2966 possible_deletions = []
2967 possible_deletions.extend(user_supplied_deletions)
2968 possible_fusions = []
2969
2970 left_reads, right_reads = prepared_reads
2971
2972 maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads mapping data
2973
2974 #single_segments = False
2975
2976 # Before anything, map the reads using Map2GTF (if using annotation)
2977 m2g_maps = [ None, None ] # left, right
2978 initial_reads = [ left_reads, right_reads ]
2979
2980 if params.gff_annotation:
2981 (mapped_gtf_list, unmapped_gtf_list) = \
2982 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
2983
2984 m2g_left_maps, m2g_right_maps = mapped_gtf_list
2985 m2g_maps = [m2g_left_maps, m2g_right_maps]
2986 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
2987 # The case where the user doesn't want to map to anything other
2988 # than the transcriptome OR we have no unmapped reads
2989 maps[0] = [m2g_left_maps]
2990 if right_reads:
2991 maps[1] = [m2g_right_maps]
2992 return maps
2993 # Feed the unmapped reads into spliced_alignment()
2994 initial_reads = unmapped_gtf_list[:]
2995 th_log("Resuming TopHat pipeline with unmapped reads")
2996
2997 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
2998 #to be extracted into segment_juncs.fa
2999
3000 num_segs = int(read_len / segment_len)
3001 if (read_len % segment_len) >= min(segment_len-2, 20):
3002 #remainder is shorter but long enough to become a new segment
3003 num_segs += 1
3004 else:
3005 # the last segment is longer
3006 if num_segs>1: max_seg_len += (read_len % segment_len)
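# e.g. read_len=75, segment_len=25 -> num_segs=3 with max_seg_len=25;
# read_len=60, segment_len=25 -> num_segs=2 with max_seg_len extended to 35
# so the junction flanks still cover the longer last segment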
3007
3008 if num_segs <= 1:
3009 print >> sys.stderr, "Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
3010
3011 # Using the num_segs value returned by check_reads(),
3012 # decide which junction discovery strategy to use
3013 if num_segs < 3:
3014 #if params.butterfly_search != False:
3015 # params.butterfly_search = True
3016 if params.coverage_search != False:
3017 params.coverage_search = True
3018 if num_segs == 1:
3019 segment_len = read_len
3020 else: #num_segs >= 3:
3021 # if we have at least three segments, just use split segment search,
3022 # which is the most sensitive, specific, fastest, and lightest-weight,
3023 # so unless specifically requested, disable the other junction searches
3024 if params.closure_search != True:
3025 params.closure_search = False
3026 if params.coverage_search != True:
3027 params.coverage_search = False
3028 if params.butterfly_search != True:
3029 params.butterfly_search = False
3030
3031 # Perform the first part of the TopHat work flow on the left and right
3032 # reads of paired ends separately - we'll use the pairing information later
3033 have_left_IUM=False
3034 for ri in (0,1):
3035 reads=initial_reads[ri]
3036 if reads == None or not fileExists(reads,25):
3037 continue
3038 fbasename=getFileBaseName(reads)
3039 unspliced_out = tmp_dir + fbasename + ".mapped"
3040 unspliced_sam = None
3041 unmapped_reads = None
3042 #if use_zpacker: unspliced_out+=".z"
3043 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3044 if params.prefilter_multi:
3045 #unmapped_unspliced += ".z"
3046 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3047 else:
3048 # Perform the initial Bowtie mapping of the full length reads
3049 (unspliced_sam, unmapped_reads) = bowtie(params,
3050 bwt_idx_prefix,
3051 [reads],
3052 "fastq",
3053 params.genome_read_mismatches,
3054 unspliced_out,
3055 unmapped_unspliced,
3056 "",
3057 "read_against_genome")
3058
3059 # daehwan - check this out
3060 # params.bowtie2 = False
3061 # check_bowtie(params.bowtie2)
3062
3063 seg_maps = []
3064 unmapped_segs = []
3065 segs = []
3066 have_IUM = fileExists(unmapped_reads,25)
3067 if ri==0 and have_IUM:
3068 have_left_IUM = True
3069 if num_segs > 1 and have_IUM:
3070 # split up the IUM reads into segments
3071 # unmapped_reads can be in BAM format
3072 read_segments = split_reads(unmapped_reads,
3073 tmp_dir + fbasename,
3074 False,
3075 params,
3076 segment_len)
3077
3078 # Map each segment file independently with Bowtie
3079 for i in range(len(read_segments)):
3080 seg = read_segments[i]
3081 fbasename=getFileBaseName(seg)
3082 seg_out = tmp_dir + fbasename + ".bwtout"
3083 if use_zpacker: seg_out += ".z"
3084 unmapped_seg = tmp_dir + fbasename + "_unmapped.fq"
3085 if use_BWT_FIFO:
3086 unmapped_seg += ".z"
3087 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3088 (seg_map, unmapped) = bowtie(params,
3089 bwt_idx_prefix,
3090 [seg],
3091 "fastq",
3092 params.segment_mismatches,
3093 seg_out,
3094 unmapped_seg,
3095 extra_output,
3096 "seg_against_genome")
3097 seg_maps.append(seg_map)
3098 unmapped_segs.append(unmapped)
3099 segs.append(seg)
3100
3101 # Collect the segment maps for left and right reads together
3102 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3103 else:
3104 # if there's only one segment, just collect the initial map as the only
3105 # map to be used downstream for coverage-based junction discovery
3106 read_segments = [reads]
3107 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3108
3109 # XXX: At this point, if using M2G, we have three sets of reads:
3110 # mapped to transcriptome, mapped to genome, and unmapped (potentially
3111 # spliced or poly-A tails) - hp
3112
3113 # Unless the user asked not to discover new junctions, start that process
3114 # here
3115 left_reads_map = maps[0].unspliced_sam
3116 left_seg_maps = maps[0].seg_maps
3117 unmapped_reads = maps[0].unmapped_segs
3118 if right_reads:
3119 right_reads_map = maps[1].unspliced_sam
3120 right_seg_maps = maps[1].seg_maps
3121 unmapped_reads.extend(maps[1].unmapped_segs)
3122 else:
3123 right_reads_map = None
3124 right_seg_maps = None
3125
3126 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3127 # Call segment_juncs to infer a list of possible splice junctions from
3128 # the regions of the genome covered in the initial and segment maps
3129 #if params.find_novel_juncs:
3130 #TODO: in m2g case, we might want to pass the m2g mappings as well,
3131 # or perhaps the GTF file directly
3132 # -> this could improve alternative junction detection?
3133 juncs = junctions_from_segments(params,
3134 sam_header_filename,
3135 left_reads,
3136 left_reads_map,
3137 left_seg_maps,
3138 right_reads,
3139 right_reads_map,
3140 right_seg_maps,
3141 unmapped_reads,
3142 "fastq",
3143 ref_fasta)
3144 if os.path.getsize(juncs[0]) != 0:
3145 possible_juncs.append(juncs[0])
3146 if params.find_novel_indels:
3147 if os.path.getsize(juncs[1]) != 0:
3148 possible_insertions.append(juncs[1])
3149 if os.path.getsize(juncs[2]) != 0:
3150 possible_deletions.append(juncs[2])
3151 if params.find_novel_fusions:
3152 if os.path.getsize(juncs[3]) != 0:
3153 possible_fusions.append(juncs[3])
3154 # Optionally, and for paired reads only, use a closure search to
3155 # discover additional junctions
3156 if params.closure_search and left_reads and right_reads:
3157 juncs = junctions_from_closures(params,
3158 sam_header_filename,
3159 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
3160 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
3161 ref_fasta)
3162 if os.path.getsize(juncs[0]) != 0:
3163 possible_juncs.extend(juncs)
3164
3165 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3166 spliced_seg_maps = None
3167 junc_idx_prefix = None
3168 else:
3169 junc_idx_prefix = "segment_juncs"
3170 if len(possible_insertions) == 0:
3171 possible_insertions.append(os.devnull)
3172 # print >> sys.stderr, "Warning: insertions database is empty!"
3173 if len(possible_deletions) == 0:
3174 possible_deletions.append(os.devnull)
3175 # print >> sys.stderr, "Warning: deletions database is empty!"
3176 if len(possible_juncs) == 0:
3177 possible_juncs.append(os.devnull)
3178 print >> sys.stderr, "Warning: junction database is empty!"
3179 if len(possible_fusions) == 0:
3180 possible_fusions.append(os.devnull)
3181 if junc_idx_prefix:
3182 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3183 3,
3184 max_seg_len,
3185 junc_idx_prefix,
3186 possible_juncs,
3187 possible_insertions,
3188 possible_deletions,
3189 possible_fusions,
3190 ref_fasta,
3191 params.read_params.color)
3192
3193 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3194
3195 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3196 # index with Bowtie
3197 # for reads in [left_reads, right_reads]:
3198 for ri in (0,1):
3199 reads = initial_reads[ri]
3200 spliced_seg_maps = []
3201
3202 if reads == None or os.path.getsize(reads)<25:
3203 continue
3204
3205 m2g_map = m2g_maps[ri]
3206
3207 if junc_idx_prefix :
3208 i = 0
3209 for seg in maps[ri].segs:
3210 #search each segment
3211 fsegname = getFileBaseName(seg)
3212 seg_out = tmp_dir + fsegname + ".to_spliced.bwtout"
3213 if use_zpacker: seg_out += ".z"
3214 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3215 (seg_map, unmapped) = bowtie(params,
3216 tmp_dir + junc_idx_prefix,
3217 [seg],
3218 "fastq",
3219 params.segment_mismatches,
3220 seg_out,
3221 None,
3222 extra_output,
3223 "seg_against_junctions")
3224 spliced_seg_maps.append(seg_map)
3225 i += 1
3226
3227 # Join the contiguous and spliced segment hits into full-length
3228 # read alignments
3229 rfname=getFileBaseName(reads)
3230 rfdir=getFileDir(reads)
3231 mapped_reads = rfdir + rfname + ".candidates.bam"
3232 # -- spliced mappings built from all segment mappings vs genome and junc_db
3233 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
3234 #unspl_bwtfile = maps[ri].unspliced_bwt
3235 unspl_samfile = maps[ri].unspliced_sam
3236 join_mapped_segments(params,
3237 sam_header_filename,
3238 reads,
3239 ref_fasta,
3240 possible_juncs,
3241 possible_insertions,
3242 possible_deletions,
3243 possible_fusions,
3244 maps[ri].seg_maps,
3245 spliced_seg_maps,
3246 mapped_reads)
3247
3248 if num_segs > 1 or m2g_map:
3249 # Merge the spliced and unspliced full length alignments into
3250 # a single SAM file.
3251 # The individual SAM files are all already sorted in
3252 # increasing read ID order.
3253 # NOTE: We also should be able to address bug #134 here, by replacing
3254 # contiguous alignments that poke into an intron by a small amount by
3255 # the correct spliced alignment.
3256
3257 try:
3258 merge_cmd = [prog_path("bam_merge"),
3259 "--index-outfile", merged_map + ".index",
3260 "--sam-header", sam_header_filename,
3261 merged_map]
3262
3263 if num_segs > 1:
3264 merge_cmd += [ maps[ri].unspliced_sam ]
3265 if m2g_map:
3266 merge_cmd += [ m2g_map ]
3267
3268 if os.path.exists(mapped_reads):
3269 merge_cmd += [ mapped_reads ]
3270 else:
3271 for bam_i in range(0, params.system_params.num_threads):
3272 temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3273 if os.path.exists(temp_bam):
3274 merge_cmd += [ temp_bam ]
3275 else:
3276 break
3277
3278 print >> run_log, " ".join(merge_cmd)
3279 ret = subprocess.call( merge_cmd,
3280 stderr=open(logging_dir + "sam_merge.log", "w") )
3281 if ret != 0:
3282 die(fail_str+"Error executing: "+" ".join(merge_cmd))
3283 except OSError, o:
3284 die(fail_str+"Error: "+str(o))
3285 maps[ri] = [merged_map]
3286
3287 if not params.system_params.keep_tmp:
3288 os.remove(mapped_reads)
3289 else:
3290 # no segments or transcriptome mappings, so no merge is needed
3291 # because join_mapped_segments() produced the final BAM already
3292 os.rename(mapped_reads, merged_map)
3293 maps[ri] = [merged_map]
3294 if not params.system_params.keep_tmp:
3295 os.remove(unspl_samfile)
3296
3297 return maps
3298
3299 def die(msg=None):
3300 if msg is not None:
3301 print >> sys.stderr, msg
3302 sys.exit(1)
3303
3304 # rough equivalent of the 'which' command to find external programs
3305 # (current script path is tested first, then PATH envvar)
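# e.g. which("samtools") returns an explicit path as-is when executable,
# otherwise tries <bin_dir>/samtools next to this script and then each
# directory on $PATH, returning None if nothing matches.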
3306 def which(program):
3307 def is_executable(fpath):
3308 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
3309 fpath, fname = os.path.split(program)
3310 if fpath:
3311 if is_executable(program):
3312 return program
3313 else:
3314 progpath = os.path.join(bin_dir, program)
3315 if is_executable(progpath):
3316 return progpath
3317 for path in os.environ["PATH"].split(os.pathsep):
3318 progpath = os.path.join(path, program)
3319 if is_executable(progpath):
3320 return progpath
3321 return None
3322
3323 def prog_path(program):
3324 progpath=which(program)
3325 if progpath is None:
3326 die("Error locating program: "+program)
3327 return progpath
3328
3329 # FIXME: this should get set during the make dist autotools phase of the build
3330 def get_version():
3331 return "2.0.0"
3332
3333 def mlog(msg):
3334 print >> sys.stderr, "[DBGLOG]:"+msg
3335
3336 def test_input_file(filename):
3337 try:
3338 open(filename, "r").close()
3339 except IOError, o:
3340 die("Error: cannot open file %s (%s)" % (filename, str(o)))
3341 return
3342
3343 def main(argv=None):
3344 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
3345
3346 # Initialize default parameter values
3347 params = TopHatParams()
3348 try:
3349 if argv is None:
3350 argv = sys.argv
3351 args = params.parse_options(argv)
3352 params.check()
3353
3354 bwt_idx_prefix = args[0]
3355 left_reads_list = args[1]
3356 left_quals_list, right_quals_list = [], []
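# positional arguments beyond <bowtie_index> <left_reads> indicate a
# paired-end run: the right reads come next, and with -Q/--quals the
# quality files follow the read files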
3357 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
3358 if params.read_params.mate_inner_dist is None:
3359 params.read_params.mate_inner_dist = 50
3360 #die("Error: you must set the mean inner distance between mates with -r")
3361
3362 right_reads_list = args[2]
3363 if params.read_params.quals:
3364 left_quals_list = args[3]
3365 right_quals_list = args[4]
3366 else:
3367 right_reads_list = None
3368 if params.read_params.quals:
3369 left_quals_list = args[2]
3370
3371 print >> sys.stderr
3372 th_log("Beginning TopHat run (v"+get_version()+")")
3373 print >> sys.stderr, "-----------------------------------------------"
3374
3375 start_time = datetime.now()
3376 prepare_output_dir()
3377
3378 global run_log
3379 run_log = open(logging_dir + "run.log", "w", 0)
3380 global run_cmd
3381 run_cmd = " ".join(argv)
3382 print >> run_log, run_cmd
3383
3384 # Validate all the input files, check all prereqs before committing
3385 # to the run
3386 if params.gff_annotation:
3387 if not os.path.exists(params.gff_annotation):
3388 die("Error: cannot find transcript file %s" % params.gff_annotation)
3389 if os.path.getsize(params.gff_annotation)<10:
3390 die("Error: invalid transcript file %s" % params.gff_annotation)
3391
3392 if params.transcriptome_index:
3393 if params.gff_annotation:
3394 #gff file given, so transcriptome data will be written there
3395 gff_basename = getFileBaseName(params.gff_annotation)
3396 #just in case, check whether it is already there (e.g. -G/--GTF given again by mistake)
3397 tpath, tname = os.path.split(params.transcriptome_index)
3398 new_subdir=False
3399 if tpath in (".", "./") or not tpath:
3400 if not os.path.exists(params.transcriptome_index):
3401 os.makedirs(params.transcriptome_index)
3402 new_subdir=True
3403 if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
3404 params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
3405 gff_out=params.transcriptome_index+".gff"
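# reuse heuristic: if a .gff copy with the same size as the -G file already
# sits next to the index, assume the transcriptome files are current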
3406 if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
3407 #generate the transcriptome data files
3408 tpath, tname = os.path.split(params.transcriptome_index)
3409 params.transcriptome_outdir=tpath
3410 t_gff=params.transcriptome_index+".gff"
3411 if params.transcriptome_outdir:
3412 #will create the transcriptome data files
3413 if not os.path.exists(params.transcriptome_outdir):
3414 os.makedirs(params.transcriptome_outdir)
3415 copy(params.gff_annotation, t_gff)
3416 else:
3417 #try to use existing transcriptome data files
3418
3419 if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
3420 die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3421 check_bowtie_index(params.transcriptome_index)
3422 params.gff_annotation=t_gff
3423 #end @ transcriptome_index given
3424
3425 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
3426
3427 check_bowtie(params.bowtie2)
3428 check_samtools()
3429 th_log("Generating SAM header for "+bwt_idx_prefix)
3430 sam_header_filename = get_index_sam_header(params, bwt_idx_prefix)
3431 #if not params.skip_check_reads:
3432 reads_list = left_reads_list
3433 if right_reads_list:
3434 reads_list = reads_list + "," + right_reads_list
3435 params.read_params = check_reads_format(params, reads_list)
3436
3437 user_supplied_juncs = []
3438 user_supplied_insertions = []
3439 user_supplied_deletions = []
3440 user_supplied_fusions = []
3441 global gtf_juncs
3442 if params.gff_annotation and params.find_GFF_juncs:
3443 test_input_file(params.gff_annotation)
3444 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3445 ##-- we shouldn't need these junctions in user_supplied_juncs anymore, because map2gtf now does a much better job;
3446 ## but we still need them loaded into gtf_juncs for later splice verification
3447 if found_juncs:
3448 ## and not params.gff_annotation:
3449 user_supplied_juncs.append(gtf_juncs)
3450 #else:
3451 # gtf_juncs = None
3452 if params.raw_junctions:
3453 test_input_file(params.raw_junctions)
3454 user_supplied_juncs.append(params.raw_junctions)
3455
3456 if params.raw_insertions:
3457 test_input_file(params.raw_insertions)
3458 user_supplied_insertions.append(params.raw_insertions)
3459
3460 if params.raw_deletions:
3461 test_input_file(params.raw_deletions)
3462 user_supplied_deletions.append(params.raw_deletions)
3463
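# named FIFO used to stream-compress unmapped reads when -X/--unmapped-fifo
# is in effect; the PID in the name avoids collisions between runs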
3464 global unmapped_reads_fifo
3465 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
3466
3467 # Now start the time consuming stuff
3468 if params.prefilter_multi:
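# -M/--prefilter-multihits: map the raw reads against the whole genome
# first, so multi-mapped reads can be set aside before the spliced
# alignment stages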
3469 sides=("left","right")
3470 read_lists=(left_reads_list, right_reads_list)
3471 for ri in (0,1):
3472 reads_list=read_lists[ri]
3473 if not reads_list:
3474 continue
3475 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped.fq"
3476 side_imap = tmp_dir + sides[ri]+"_im.bwtout"
3477 if use_zpacker:
3478 side_imap+=".z"
3479 side_ium = tmp_dir + sides[ri]+"_ium.fq"
3480 if use_BWT_FIFO:
3481 side_ium += ".z"
3482 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
3483 rdlist=reads_list.split(',')
3484 bwt=bowtie(params, bwt_idx_prefix, rdlist,
3485 params.read_params.reads_format,
3486 params.genome_read_mismatches,
3487 side_imap, side_ium,
3488 "", "read_against_genome", params.preflt_data[ri].multihit_reads)
3489 params.preflt_data[ri].mappings = bwt[0] # initial mappings
3490 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
3491
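# prepare (filter/convert) the reads for alignment; the multi-mapped reads
# collected by the prefilter stage (if enabled) are passed along, presumably
# so they can be excluded from the kept reads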
3492 th_log("Preparing reads")
3493 left_kept_reads, left_reads_info = prep_reads(params,
3494 left_reads_list,
3495 left_quals_list,
3496 "left",
3497 params.preflt_data[0].multihit_reads)
3498 min_read_len=left_reads_info.min_len
3499 max_read_len=left_reads_info.max_len
3500 if right_reads_list:
3501 right_kept_reads, right_reads_info = prep_reads(params,
3502 right_reads_list,
3503 right_quals_list,
3504 "right",
3505 params.preflt_data[1].multihit_reads)
3506 min_read_len=min(right_reads_info.min_len, min_read_len)
3507 max_read_len=max(right_reads_info.max_len, max_read_len)
3508 else:
3509 right_kept_reads = None
3510 seed_len=params.read_params.seed_length
3511 if seed_len: #seed/read length was explicitly given
3512 seed_len = max(seed_len, min_read_len)
3513 #it cannot be smaller than the minimum read length observed
3514 else:
3515 seed_len = max_read_len
3516 params.read_params.seed_length=seed_len
3517 # turn off integer-quals
3518 if params.read_params.integer_quals:
3519 params.read_params.integer_quals = False
3520
3521 input_reads = [left_kept_reads, right_kept_reads]
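# run the core spliced-alignment pipeline; the result is indexed per read
# side (mappings[0] for left reads, mappings[1] for right reads)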
3522 mappings = spliced_alignment(params,
3523 bwt_idx_prefix,
3524 sam_header_filename,
3525 ref_fasta,
3526 params.read_params.seed_length,
3527 params.segment_length,
3528 input_reads,
3529 user_supplied_juncs,
3530 user_supplied_insertions,
3531 user_supplied_deletions)
3532
3533 compile_reports(params,
3534 sam_header_filename,
3535 ref_fasta,
3536 mappings,
3537 input_reads,
3538 params.gff_annotation)
3539
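# unless --keep-tmp was given, remove the intermediate mappings, the
# prepped reads and the whole tmp directory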
3540 if not params.system_params.keep_tmp:
3541 for m in mappings[0]:
3542 os.remove(m)
3543 if left_kept_reads:
3544 os.remove(left_kept_reads)
3545 for m in mappings[1]:
3546 os.remove(m)
3547 if right_kept_reads:
3548 os.remove(right_kept_reads)
3549 tmp_files = os.listdir(tmp_dir)
3550 for t in tmp_files:
3551 os.remove(tmp_dir+t)
3552 os.rmdir(tmp_dir)
3553
3554 finish_time = datetime.now()
3555 duration = finish_time - start_time
3556 print >> sys.stderr,"-----------------------------------------------"
3557 th_log("Run complete: %s elapsed" % formatTD(duration))
3558
3559 except Usage, err:
3560 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
3561 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
3562 return 2
3563
3564
3565 if __name__ == "__main__":
3566 sys.exit(main())

Properties

Name Value
svn:executable *