ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.h
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 12759 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #ifndef COMMON_H
2 #define COMMON_H
3 /*
4 * common.h
5 * TopHat
6 *
7 * Created by Cole Trapnell on 11/26/08.
8 * Copyright 2008 Cole Trapnell. All rights reserved.
9 *
10 */
11 #include <stdint.h>
12 #include <cassert>
13 #include <cstring>
14 #include <cstdlib>
15 #include <cstdio>
16 #include <string>
17 #include <vector>
18 #include "bam/bam.h"
19 #include "bam/sam.h"
20
21
22 #define VMAXINT32 0xFFFFFFFF
23
24 #ifdef MEM_DEBUG
25 void process_mem_usage(double& vm_usage, double& resident_set);
26 void print_mem_usage();
27 #endif
28
29 extern bool bowtie2;
30 extern int bowtie2_min_score;
31
32 // daehwan - temporary for parallelization
33 extern bool parallel;
34
35 /*
36 * Maximum allowable length of an
37 * an insertion. Used mainly in
38 * segment_juncs.cpp
39 */
40 extern unsigned int max_insertion_length;
41
42 /*
43 * Maximum allowable length of a
44 * deletion. Used mainly in segment_juncs.cpp
45 * and long_spanning_reads.cpp
46 */
47 extern unsigned int max_deletion_length;
48
49 extern int inner_dist_mean;
50 extern int inner_dist_std_dev;
51 extern int max_mate_inner_dist;
52
53 extern int min_anchor_len;
54 extern int min_report_intron_length;
55 extern int max_report_intron_length;
56
57 extern int min_closure_intron_length;
58 extern int max_closure_intron_length;
59
60 extern int min_coverage_intron_length;
61 extern int max_coverage_intron_length;
62
63 extern int min_segment_intron_length;
64 extern int max_segment_intron_length;
65
66 extern uint32_t min_closure_exon_length;
67 extern int island_extension;
68 extern int num_threads;
69 extern int segment_length; // the read segment length used by the pipeline
70 extern int segment_mismatches;
71 extern int max_read_mismatches;
72
73 extern int max_splice_mismatches;
74
75 enum ReadFormat {FASTA, FASTQ};
76 extern ReadFormat reads_format;
77
78 extern bool verbose;
79 extern unsigned int max_multihits;
80 extern bool no_closure_search;
81 extern bool no_coverage_search;
82 extern bool no_microexon_search;
83 extern bool butterfly_search;
84
85 extern float min_isoform_fraction;
86
87 extern std::string output_dir;
88 extern std::string gff_file;
89 extern std::string gene_filter;
90
91 extern std::string ium_reads;
92 extern std::string sam_header;
93 extern std::string sam_readgroup_id;
94 extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2)
95 extern std::string samtools_path; //path to samtools executable
96 extern std::string aux_outfile; //auxiliary output file name
97 extern std::string index_outfile; //index output file name
98 extern bool solexa_quals;
99 extern bool phred64_quals;
100 extern bool quals;
101 extern bool integer_quals;
102 extern bool color;
103 extern bool color_out;
104 extern std::string gtf_juncs;
105
106 //prep_reads only: --flt-reads <bowtie-fastq_for--max>
107 // filter out reads if their numeric ID is in this fastq file
108 // OR if flt_mappings was given too, filter out reads if their ID
109 // is NOT in this fastq file
110 extern std::string flt_reads;
111
112 //prep_reads special usage: filter out mappings whose read ID
113 //is NOT found in the flt_reads file, and write them into
114 // aux_outfile; also reverses the flt_reads filter itself
115 extern std::string flt_mappings;
116
117 extern bool fusion_search;
118 extern size_t fusion_anchor_length;
119 extern size_t fusion_min_dist;
120 extern size_t fusion_read_mismatches;
121 extern size_t fusion_multireads;
122 extern size_t fusion_multipairs;
123
124 enum eLIBRARY_TYPE
125 {
126 LIBRARY_TYPE_NONE = 0,
127
128 FR_UNSTRANDED,
129 FR_FIRSTSTRAND,
130 FR_SECONDSTRAND,
131
132 FF_UNSTRANDED,
133 FF_FIRSTSTRAND,
134 FF_SECONDSTRAND,
135
136 NUM_LIBRARY_TYPE
137 };
138
139 extern eLIBRARY_TYPE library_type;
140 std::string getFext(const std::string& s); //returns file extension converted to lowercase
141 bool str_endsWith(std::string& str, const char* suffix);
142 void str_appendInt(std::string& str, int v);
143 FILE* fzOpen(std::string& fname, const char* mode);
144
145 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)());
146 int parse_options(int argc, char** argv, void (*print_usage)());
147
148 void err_exit(const char* format,...); // exit with an error
149
150 char* get_token(char** str, const char* delims);
151 std::string guess_packer(const std::string& fname, bool use_all_cpus);
152 std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false);
153
154 void checkSamHeader();
155 void writeSamHeader(FILE* fout);
156
157 class FZPipe {
158 public:
159 FILE* file;
160 std::string filename;
161 std::string pipecmd;
162 bool is_bam;
163 FZPipe(const std::string& fname, bool is_mapping):filename(fname),pipecmd() {
164 //this constructor is only to use FZPipe as a READER
165 //also accepts/recognizes BAM files
166 //for which it only stores the filename, other fields/methods are unused
167 openRead(fname, is_mapping);
168 }
169
170 void openRead(const std::string& fname, bool is_mapping) {
171 filename=fname;
172 pipecmd="";
173 is_bam=false;
174 if (is_mapping && getFext(fname) == "bam") {
175 file=(FILE*)this;
176 is_bam=true;
177 return;
178 }
179 pipecmd=getUnpackCmd(fname); //also bam2fastx
180 this->openRead(fname.c_str(), pipecmd);
181 }
182
183 FZPipe():filename(),pipecmd() {
184 is_bam=false;
185 file=NULL;
186 }
187 FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) {
188 //open as a compressed file reader
189 is_bam=false;
190 file=NULL;
191 this->openRead(fname.c_str(), pipecmd);
192 }
193 void close() {
194 if (file!=NULL) {
195 if (pipecmd.empty()) fclose(file);
196 else pclose(file);
197 file=NULL;
198 }
199 }
200 FILE* openWrite(const char* fname, std::string& popencmd);
201 FILE* openWrite(const char* fname);
202 FILE* openRead(const char* fname, std::string& popencmd);
203
204 FILE* openRead(const char* fname);
205 FILE* openRead(const std::string fname, std::string& popencmd) {
206 return this->openRead(fname.c_str(),popencmd);
207 }
208 FILE* openRead(const std::string fname) {
209 return this->openRead(fname.c_str());
210 }
211 void rewind();
212
213 void seek(int64_t offset)
214 {
215 fseek(this->file, offset, SEEK_SET);
216 }
217 };
218
219 void err_die(const char* format,...);
220
221 //uint8_t* realloc_bdata(bam1_t *b, int size);
222 //uint8_t* dupalloc_bdata(bam1_t *b, int size);
223
224 class GBamRecord {
225 bam1_t* b;
226 // b->data has the following strings concatenated:
227 // qname (including the terminal \0)
228 // +cigar (each event encoded on 32 bits)
229 // +seq (4bit-encoded)
230 // +qual
231 // +aux
232 bool novel;
233
234 char tag[2];
235 uint8_t abuf[512];
236 public:
237 GBamRecord(bam1_t* from_b=NULL) {
238 if (from_b==NULL) {
239 b=bam_init1();
240 novel=true;
241 }
242 else {
243 b=from_b;
244 novel=false;
245 }
246 }
247
248 void clear() {
249 if (novel) {
250 bam_destroy1(b);
251 }
252 novel=true;
253 b=bam_init1();
254 }
255
256 ~GBamRecord() {
257 if (novel) { bam_destroy1(b); }
258 }
259
260 void parse_error(const char* s) {
261 err_die("BAM parsing error: %s\n", s);
262 }
263
264 bam1_t* get_b() { return b; }
265
266 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
267 int32_t isize=0) { //mate info for current record
268 b->core.mtid=mtid;
269 b->core.mpos=m0pos; // should be -1 if '*'
270 b->core.isize=isize; //should be 0 if not available
271 }
272
273 void set_flags(uint16_t flags) {
274 b->core.flag=flags;
275 }
276
277 //creates a new record from 1-based alignment coordinate
278 //quals should be given as Phred33
279 //Warning: pos and mate_pos must be given 1-based!
280 GBamRecord(const char* qname, int32_t gseq_tid,
281 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
282 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
283 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
284 int insert_size, const char* qseq, const char* quals=NULL,
285 const std::vector<std::string>* aux_strings=NULL);
286 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
287 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
288 void add_quals(const char* quals); //quality values string in Phred33 format
289 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
290 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
291 int ori_len = b->data_len;
292 b->data_len += 3 + len;
293 b->l_aux += 3 + len;
294 if (b->m_data < b->data_len) {
295 b->m_data = b->data_len;
296 kroundup32(b->m_data);
297 b->data = (uint8_t*)realloc(b->data, b->m_data);
298 }
299 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
300 b->data[ori_len + 2] = atype;
301 memcpy(b->data + ori_len + 3, data, len);
302 }
303 };
304
305 class GBamWriter {
306 samfile_t* bam_file;
307 bam_header_t* bam_header;
308 public:
309 void create(const char* fname, bool uncompressed=false) {
310 if (bam_header==NULL)
311 err_die("Error: no bam_header for GBamWriter::create()!\n");
312 if (uncompressed) {
313 bam_file=samopen(fname, "wbu", bam_header);
314 }
315 else {
316 bam_file=samopen(fname, "wb", bam_header);
317 }
318 if (bam_file==NULL)
319 err_die("Error: could not create BAM file %s!\n",fname);
320 //do we need to call bam_header_write() ?
321 }
322 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
323 bam_header=bh;
324 create(fname,uncompressed);
325 }
326
327 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
328 create(fname, bh, uncompressed);
329 }
330
331 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
332 tamFile samf_in=sam_open(samfname);
333 if (samf_in==NULL)
334 err_die("Error: could not open SAM file %s\n", samfname);
335 bam_header=sam_header_read(samf_in);
336 if (bam_header==NULL)
337 err_die("Error: could not read SAM header from %s!\n",samfname);
338 sam_close(samf_in);
339 create(fname, uncompressed);
340 }
341
342 ~GBamWriter() {
343 samclose(bam_file);
344 bam_header_destroy(bam_header);
345 }
346 bam_header_t* get_header() { return bam_header; }
347 int32_t get_tid(const char *seq_name) {
348 if (bam_header==NULL)
349 err_die("Error: missing SAM header (get_tid())\n");
350 return bam_get_tid(bam_header, seq_name);
351 }
352
353 //just a convenience function for creating a new record, but it's NOT written
354 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
355 GBamRecord* new_record(const char* qname, const char* gseqname,
356 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
357 int32_t gseq_tid=get_tid(gseqname);
358 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
359 if (bam_header->n_targets == 0) {
360 err_die("Error: missing/invalid SAM header\n");
361 } else
362 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
363 gseqname);
364 }
365
366 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
367 }
368
369 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
370 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
371 int insert_size, const char* qseq, const char* quals=NULL,
372 const std::vector<std::string>* aux_strings=NULL) {
373 int32_t gseq_tid=get_tid(gseqname);
374 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
375 if (bam_header->n_targets == 0) {
376 err_die("Error: missing/invalid SAM header\n");
377 } else
378 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
379 gseqname);
380 }
381 int32_t mgseq_tid=-1;
382 if (mgseqname!=NULL) {
383 if (strcmp(mgseqname, "=")==0) {
384 mgseq_tid=gseq_tid;
385 }
386 else {
387 mgseq_tid=get_tid(mgseqname);
388 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
389 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
390 mgseqname);
391 }
392 }
393 }
394 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
395 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
396 }
397
398 void write(GBamRecord* brec) {
399 if (brec!=NULL)
400 samwrite(this->bam_file,brec->get_b());
401 }
402 void write(bam1_t* b) {
403 samwrite(this->bam_file, b);
404 }
405
406 int64_t tell() {
407 return bam_tell(this->bam_file->x.bam);
408 }
409
410 void flush() {
411 bgzf_flush(this->bam_file->x.bam);
412 }
413
414 void seek(int64_t offset) {
415 bam_seek(this->bam_file->x.bam, offset, SEEK_SET);
416 }
417 };
418
419 #endif