ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.h
Revision: 168
Committed: Fri Feb 10 16:40:38 2012 UTC (8 years, 1 month ago) by gpertea
File size: 16171 byte(s)
Log Message:
finally fixed bam_merge

Line File contents
1 #ifndef COMMON_H
2 #define COMMON_H
3 /*
4 * common.h
5 * TopHat
6 *
7 * Created by Cole Trapnell on 11/26/08.
8 * Copyright 2008 Cole Trapnell. All rights reserved.
9 *
10 */
11 #include <stdint.h>
12 #include <cassert>
13 #include <cstring>
14 #include <cstdlib>
15 #include <cstdio>
16 #include <string>
17 #include <vector>
18 #include "bam/bam.h"
19 #include "bam/sam.h"
20
21
22 #define VMAXINT32 0xFFFFFFFF
23
24 #ifdef MEM_DEBUG
25 void process_mem_usage(double& vm_usage, double& resident_set);
26 void print_mem_usage();
27 #endif
28
29 extern bool bowtie2;
30 extern int bowtie2_min_score;
31 extern int bowtie2_max_penalty;
32 extern int bowtie2_min_penalty;
33 extern int bowtie2_penalty_for_N;
34 extern int bowtie2_read_gap_open;
35 extern int bowtie2_read_gap_cont;
36 extern int bowtie2_ref_gap_open;
37 extern int bowtie2_ref_gap_cont;
38
39 // daehwan - temporary for parallelization
40 extern bool parallel;
41
42 /*
43 * Maximum allowable length of an
44 * an insertion. Used mainly in
45 * segment_juncs.cpp
46 */
47 extern unsigned int max_insertion_length;
48
49 /*
50 * Maximum allowable length of a
51 * deletion. Used mainly in segment_juncs.cpp
52 * and long_spanning_reads.cpp
53 */
54 extern unsigned int max_deletion_length;
55
56 extern int inner_dist_mean;
57 extern int inner_dist_std_dev;
58 extern int max_mate_inner_dist;
59
60 extern int min_anchor_len;
61 extern int min_report_intron_length;
62 extern int max_report_intron_length;
63
64 extern int min_closure_intron_length;
65 extern int max_closure_intron_length;
66
67 extern int min_coverage_intron_length;
68 extern int max_coverage_intron_length;
69
70 extern int min_segment_intron_length;
71 extern int max_segment_intron_length;
72
73 extern uint32_t min_closure_exon_length;
74 extern int island_extension;
75 extern int num_threads;
76 extern int segment_length; // the read segment length used by the pipeline
77 extern int segment_mismatches;
78 extern int max_read_mismatches;
79
80 extern int max_splice_mismatches;
81
82 enum ReadFormat {FASTA, FASTQ};
83 extern ReadFormat reads_format;
84
85 extern bool verbose;
86 extern unsigned int max_multihits;
87 extern unsigned int max_seg_multihits;
88 extern bool no_closure_search;
89 extern bool no_coverage_search;
90 extern bool no_microexon_search;
91 extern bool butterfly_search;
92
93 extern float min_isoform_fraction;
94
95 extern std::string output_dir;
96 extern std::string gff_file;
97 extern std::string gene_filter;
98
99 extern std::string ium_reads;
100 extern std::string sam_header;
101 extern std::string sam_readgroup_id;
102 extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2)
103 extern std::string samtools_path; //path to samtools executable
104 extern std::string aux_outfile; //auxiliary output file name
105 extern std::string index_outfile; //index output file name
106 extern bool solexa_quals;
107 extern bool phred64_quals;
108 extern bool quals;
109 extern bool integer_quals;
110 extern bool color;
111 extern std::string gtf_juncs;
112
113 extern bool report_secondary_alignments;
114
115 //prep_reads only: --flt-reads <bowtie-fastq_for--max>
116 // filter out reads if their numeric ID is in this fastq file
117 // OR if flt_mappings was given too, filter out reads if their ID
118 // is NOT in this fastq file
119 extern std::string flt_reads;
120
121 //prep_reads special usage: filter out mappings whose read ID
122 //is NOT found in the flt_reads file, and write them into
123 // aux_outfile; also reverses the flt_reads filter itself
124 extern std::string flt_mappings;
125
126 extern bool fusion_search;
127 extern size_t fusion_anchor_length;
128 extern size_t fusion_min_dist;
129 extern size_t fusion_read_mismatches;
130 extern size_t fusion_multireads;
131 extern size_t fusion_multipairs;
132
133 enum eLIBRARY_TYPE
134 {
135 LIBRARY_TYPE_NONE = 0,
136
137 FR_UNSTRANDED,
138 FR_FIRSTSTRAND,
139 FR_SECONDSTRAND,
140
141 FF_UNSTRANDED,
142 FF_FIRSTSTRAND,
143 FF_SECONDSTRAND,
144
145 NUM_LIBRARY_TYPE
146 };
147
148 extern eLIBRARY_TYPE library_type;
149 std::string getFext(const std::string& s); //returns file extension converted to lowercase
150 bool str_endsWith(std::string& str, const char* suffix);
151 void str_appendInt(std::string& str, int v);
152 FILE* fzOpen(std::string& fname, const char* mode);
153
154 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)());
155 int parse_options(int argc, char** argv, void (*print_usage)());
156
157 void err_exit(const char* format,...); // exit with an error
158
159 char* get_token(char** str, const char* delims);
160 std::string guess_packer(const std::string& fname, bool use_all_cpus);
161 std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false);
162
163 void checkSamHeader();
164 void writeSamHeader(FILE* fout);
165
166 class FZPipe {
167 public:
168 FILE* file;
169 std::string filename;
170 std::string pipecmd;
171 bool is_bam;
172 FZPipe(const std::string& fname, bool is_mapping):filename(fname),pipecmd() {
173 //this constructor is only to use FZPipe as a READER
174 //also accepts/recognizes BAM files
175 //for which it only stores the filename, other fields/methods are unused
176 openRead(fname, is_mapping);
177 }
178
179 void openRead(const std::string& fname, bool is_mapping) {
180 filename=fname;
181 pipecmd="";
182 is_bam=false;
183 if (is_mapping && getFext(fname) == "bam") {
184 file=(FILE*)this;
185 is_bam=true;
186 return;
187 }
188 pipecmd=getUnpackCmd(fname); //also bam2fastx
189 this->openRead(fname.c_str(), pipecmd);
190 }
191
192 FZPipe():filename(),pipecmd() {
193 is_bam=false;
194 file=NULL;
195 }
196 FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) {
197 //open as a compressed file reader
198 is_bam=false;
199 file=NULL;
200 this->openRead(fname.c_str(), pipecmd);
201 }
202 void close() {
203 if (file!=NULL) {
204 if (pipecmd.empty()) fclose(file);
205 else pclose(file);
206 file=NULL;
207 }
208 }
209 FILE* openWrite(const char* fname, std::string& popencmd);
210 FILE* openWrite(const char* fname);
211 FILE* openRead(const char* fname, std::string& popencmd);
212
213 FILE* openRead(const char* fname);
214 FILE* openRead(const std::string fname, std::string& popencmd) {
215 return this->openRead(fname.c_str(),popencmd);
216 }
217 FILE* openRead(const std::string fname) {
218 return this->openRead(fname.c_str());
219 }
220 void rewind();
221
222 void seek(int64_t offset)
223 {
224 fseek(this->file, offset, SEEK_SET);
225 }
226 };
227
228 void err_die(const char* format,...);
229 void warn_msg(const char* format,...);
230
231 class GBamRecord {
232 bam1_t* b;
233 // b->data has the following strings concatenated:
234 // qname (including the terminal \0)
235 // +cigar (each event encoded on 32 bits)
236 // +seq (4bit-encoded)
237 // +qual
238 // +aux
239 bool novel;
240
241 char tag[2];
242 uint8_t abuf[512];
243 public:
244 GBamRecord(bam1_t* from_b=NULL) {
245 if (from_b==NULL) {
246 b=bam_init1();
247 novel=true;
248 }
249 else {
250 b=from_b;
251 novel=false;
252 }
253 }
254
255 void clear() {
256 if (novel) {
257 bam_destroy1(b);
258 }
259 novel=true;
260 b=bam_init1();
261 }
262
263 ~GBamRecord() {
264 if (novel) { bam_destroy1(b); }
265 }
266
267 void parse_error(const char* s) {
268 err_die("BAM parsing error: %s\n", s);
269 }
270
271 bam1_t* get_b() { return b; }
272
273 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
274 int32_t isize=0) { //mate info for current record
275 b->core.mtid=mtid;
276 b->core.mpos=m0pos; // should be -1 if '*'
277 b->core.isize=isize; //should be 0 if not available
278 }
279
280 void set_flags(uint16_t flags) {
281 b->core.flag=flags;
282 }
283
284 //creates a new record from 1-based alignment coordinate
285 //quals should be given as Phred33
286 //Warning: pos and mate_pos must be given 1-based!
287 GBamRecord(const char* qname, int32_t gseq_tid,
288 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
289 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
290 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
291 int insert_size, const char* qseq, const char* quals=NULL,
292 const std::vector<std::string>* aux_strings=NULL);
293 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
294 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
295 void add_quals(const char* quals); //quality values string in Phred33 format
296 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
297 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
298 int ori_len = b->data_len;
299 b->data_len += 3 + len;
300 b->l_aux += 3 + len;
301 if (b->m_data < b->data_len) {
302 b->m_data = b->data_len;
303 kroundup32(b->m_data);
304 b->data = (uint8_t*)realloc(b->data, b->m_data);
305 }
306 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
307 b->data[ori_len + 2] = atype;
308 memcpy(b->data + ori_len + 3, data, len);
309 }
310 };
311
312 class GBamWriter {
313 samfile_t* bam_file;
314 bam_header_t* bam_header;
315 FILE* findex;
316 uint64_t wcount;
317 uint64_t idxcount;
318 int64_t idx_last_id;
319 bool external_header;
320 public:
321 void create(const char* fname, bool uncompressed=false) {
322 findex=NULL;
323 wcount=0;
324 idxcount=0;
325 idx_last_id=0;
326 external_header=false;
327 if (bam_header==NULL)
328 err_die("Error: no bam_header for GBamWriter::create()!\n");
329 if (uncompressed) {
330 bam_file=samopen(fname, "wbu", bam_header);
331 }
332 else {
333 bam_file=samopen(fname, "wb", bam_header);
334 }
335 if (bam_file==NULL)
336 err_die("Error: could not create BAM file %s!\n",fname);
337 //do we need to call bam_header_write() ?
338 }
339
340 void create(const char* fname, std::string idxfile) {
341 findex=NULL;
342 wcount=0;
343 idxcount=0;
344 idx_last_id=0;
345 external_header=false;
346 if (bam_header==NULL)
347 err_die("Error: no bam_header for GBamWriter::create()!\n");
348 bam_file=samopen(fname, "wb", bam_header);
349 if (bam_file==NULL)
350 err_die("Error: could not create BAM file %s!\n",fname);
351 if (!idxfile.empty()) {
352 findex = fopen(idxfile.c_str(), "w");
353 if (findex == NULL)
354 err_die("Error: cannot create file %s\n", idxfile.c_str());
355 }
356 }
357
358 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
359 findex=NULL;
360 wcount=0;
361 idxcount=0;
362 idx_last_id=0;
363 external_header=false;
364 bam_header=bh;
365 create(fname, uncompressed);
366 }
367
368 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
369 create(fname, bh, uncompressed);
370 external_header=true;
371 }
372
373 GBamWriter(const char* fname, bam_header_t* bh, std::string idxfile) {
374 bam_header=bh;
375 create(fname, idxfile);
376 external_header=true;
377 }
378
379
380 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
381 tamFile samf_in=sam_open(samfname);
382 if (samf_in==NULL)
383 err_die("Error: could not open SAM file %s\n", samfname);
384 bam_header=sam_header_read(samf_in);
385 if (bam_header==NULL)
386 err_die("Error: could not read SAM header from %s!\n",samfname);
387 sam_close(samf_in);
388 create(fname, uncompressed);
389 }
390
391 GBamWriter(const char* fname, const char* samfname, std::string idxfile) {
392 tamFile samf_in=sam_open(samfname);
393 if (samf_in==NULL)
394 err_die("Error: could not open SAM file %s\n", samfname);
395 bam_header=sam_header_read(samf_in);
396 if (bam_header==NULL)
397 err_die("Error: could not read SAM header from %s!\n",samfname);
398 sam_close(samf_in);
399 create(fname, idxfile);
400 }
401
402 ~GBamWriter() {
403 samclose(bam_file);
404 if (bam_header && !external_header)
405 bam_header_destroy(bam_header);
406 if (findex != NULL)
407 fclose(findex);
408 }
409 bam_header_t* get_header() { return bam_header; }
410 int32_t get_tid(const char *seq_name) {
411 if (bam_header==NULL)
412 err_die("Error: missing SAM header (get_tid())\n");
413 return bam_get_tid(bam_header, seq_name);
414 }
415
416 //just a convenience function for creating a new record, but it's NOT written
417 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
418 GBamRecord* new_record(const char* qname, const char* gseqname,
419 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
420 int32_t gseq_tid=get_tid(gseqname);
421 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
422 if (bam_header->n_targets == 0) {
423 err_die("Error: missing/invalid SAM header\n");
424 } else
425 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
426 gseqname);
427 }
428
429 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
430 }
431
432 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
433 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
434 int insert_size, const char* qseq, const char* quals=NULL,
435 const std::vector<std::string>* aux_strings=NULL) {
436 int32_t gseq_tid=get_tid(gseqname);
437 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
438 if (bam_header->n_targets == 0) {
439 err_die("Error: missing/invalid SAM header\n");
440 } else
441 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
442 gseqname);
443 }
444 int32_t mgseq_tid=-1;
445 if (mgseqname!=NULL) {
446 if (strcmp(mgseqname, "=")==0) {
447 mgseq_tid=gseq_tid;
448 }
449 else {
450 mgseq_tid=get_tid(mgseqname);
451 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
452 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
453 mgseqname);
454 }
455 }
456 }
457 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
458 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
459 }
460
461 void write(GBamRecord* brec) {
462 if (brec!=NULL) {
463 samwrite(this->bam_file,brec->get_b());
464 wcount++;
465 }
466
467 }
468 void write(bam1_t* b, int64_t read_id=0) {
469 /*
470 if (findex && read_id) {
471 if (idxcount >= 1000 && read_id != idx_last_id) {
472 int64_t offset = this->tell();
473 int block_offset = offset & 0xFFFF;
474 //int64_t block_address = (offset >> 16) & 0xFFFFFFFFFFFFLL;
475 // daehwan - this is a bug in bgzf.c in samtools
476 // I'll report this bug with a temporary solution, soon.
477
478 if (block_offset <= 0xF000) {
479 fprintf(findex, "%lu\t%ld\n", read_id, offset);
480 idxcount = 0;
481 }
482 }
483 idx_last_id=read_id;
484 idxcount++;
485 }
486 */
487 int64_t pre_block_addr=0; //offsets after last write()
488 int pre_block_offs=0; //but before this write()
489 int64_t pre_pos=0;
490 bool write_index=false;
491 if (findex && read_id) {
492 if (idxcount >= 1000 && read_id != idx_last_id) {
493 pre_pos = this->tell();
494 pre_block_offs = pre_pos & 0xFFFF;
495 pre_block_addr = (pre_pos >> 16) & 0xFFFFFFFFFFFFLL;
496 write_index=true;
497 }
498 idx_last_id=read_id;
499 idxcount++;
500 }
501
502 samwrite(this->bam_file, b);
503 wcount++;
504 if (write_index) {
505 int64_t offset = this->tell();
506 int post_block_offs = offset & 0xFFFF; //offsets after this write()
507 int64_t post_block_addr = (offset >> 16) & 0xFFFFFFFFFFFFLL;
508 int data_len = b->data_len+BAM_CORE_SIZE;
509 if (post_block_addr != pre_block_addr &&
510 post_block_offs>=data_len)
511 //all data written in this block
512 //WARNING: this check fails for very large BAM records (> 64K)
513 {
514 pre_pos = post_block_addr;
515 }
516 fprintf(findex, "%lu\t%ld\n", read_id, pre_pos);
517 }
518 }
519
520
521 int64_t tell() {
522 return bam_tell(this->bam_file->x.bam);
523 }
524
525
526 int64_t writtenCount() { return wcount; }
527
528 void flush() {
529 bgzf_flush(this->bam_file->x.bam);
530 }
531
532 void seek(int64_t offset) {
533 bam_seek(this->bam_file->x.bam, offset, SEEK_SET);
534 }
535 };
536
537 #endif