ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.h
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 9 months ago) by gpertea
File size: 12759 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 #ifndef COMMON_H
2     #define COMMON_H
3     /*
4     * common.h
5     * TopHat
6     *
7     * Created by Cole Trapnell on 11/26/08.
8     * Copyright 2008 Cole Trapnell. All rights reserved.
9     *
10     */
11     #include <stdint.h>
12     #include <cassert>
13     #include <cstring>
14     #include <cstdlib>
15     #include <cstdio>
16     #include <string>
17     #include <vector>
18     #include "bam/bam.h"
19     #include "bam/sam.h"
20    
21    
22 gpertea 135 #define VMAXINT32 0xFFFFFFFF
23    
24 gpertea 29 #ifdef MEM_DEBUG
25     void process_mem_usage(double& vm_usage, double& resident_set);
26     void print_mem_usage();
27     #endif
28    
29 gpertea 154 extern bool bowtie2;
30     extern int bowtie2_min_score;
31    
32     // daehwan - temporary for parallelization
33     extern bool parallel;
34    
35 gpertea 29 /*
36     * Maximum allowable length of an
37     * an insertion. Used mainly in
38     * segment_juncs.cpp
39     */
40     extern unsigned int max_insertion_length;
41    
42     /*
43     * Maximum allowable length of a
44     * deletion. Used mainly in segment_juncs.cpp
45     * and long_spanning_reads.cpp
46     */
47     extern unsigned int max_deletion_length;
48    
49     extern int inner_dist_mean;
50     extern int inner_dist_std_dev;
51     extern int max_mate_inner_dist;
52    
53     extern int min_anchor_len;
54     extern int min_report_intron_length;
55     extern int max_report_intron_length;
56    
57     extern int min_closure_intron_length;
58     extern int max_closure_intron_length;
59    
60     extern int min_coverage_intron_length;
61     extern int max_coverage_intron_length;
62    
63     extern int min_segment_intron_length;
64     extern int max_segment_intron_length;
65    
66     extern uint32_t min_closure_exon_length;
67     extern int island_extension;
68 gpertea 154 extern int num_threads;
69 gpertea 29 extern int segment_length; // the read segment length used by the pipeline
70     extern int segment_mismatches;
71 gpertea 135 extern int max_read_mismatches;
72 gpertea 29
73     extern int max_splice_mismatches;
74    
75     enum ReadFormat {FASTA, FASTQ};
76     extern ReadFormat reads_format;
77    
78     extern bool verbose;
79 gpertea 135 extern unsigned int max_multihits;
80 gpertea 29 extern bool no_closure_search;
81     extern bool no_coverage_search;
82     extern bool no_microexon_search;
83     extern bool butterfly_search;
84    
85     extern float min_isoform_fraction;
86    
87     extern std::string output_dir;
88     extern std::string gff_file;
89     extern std::string gene_filter;
90    
91     extern std::string ium_reads;
92     extern std::string sam_header;
93     extern std::string sam_readgroup_id;
94     extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2)
95     extern std::string samtools_path; //path to samtools executable
96     extern std::string aux_outfile; //auxiliary output file name
97 gpertea 154 extern std::string index_outfile; //index output file name
98 gpertea 29 extern bool solexa_quals;
99     extern bool phred64_quals;
100     extern bool quals;
101     extern bool integer_quals;
102     extern bool color;
103     extern bool color_out;
104     extern std::string gtf_juncs;
105    
106 gpertea 135 //prep_reads only: --flt-reads <bowtie-fastq_for--max>
107     // filter out reads if their numeric ID is in this fastq file
108     // OR if flt_mappings was given too, filter out reads if their ID
109     // is NOT in this fastq file
110     extern std::string flt_reads;
111    
112     //prep_reads special usage: filter out mappings whose read ID
113     //is NOT found in the flt_reads file, and write them into
114     // aux_outfile; also reverses the flt_reads filter itself
115     extern std::string flt_mappings;
116    
117 gpertea 154 extern bool fusion_search;
118     extern size_t fusion_anchor_length;
119     extern size_t fusion_min_dist;
120     extern size_t fusion_read_mismatches;
121     extern size_t fusion_multireads;
122     extern size_t fusion_multipairs;
123    
124 gpertea 29 enum eLIBRARY_TYPE
125     {
126     LIBRARY_TYPE_NONE = 0,
127    
128     FR_UNSTRANDED,
129     FR_FIRSTSTRAND,
130     FR_SECONDSTRAND,
131    
132     FF_UNSTRANDED,
133     FF_FIRSTSTRAND,
134     FF_SECONDSTRAND,
135    
136     NUM_LIBRARY_TYPE
137     };
138    
139     extern eLIBRARY_TYPE library_type;
140     std::string getFext(const std::string& s); //returns file extension converted to lowercase
141     bool str_endsWith(std::string& str, const char* suffix);
142     void str_appendInt(std::string& str, int v);
143     FILE* fzOpen(std::string& fname, const char* mode);
144    
145     int parseIntOpt(int lower, const char *errmsg, void (*print_usage)());
146     int parse_options(int argc, char** argv, void (*print_usage)());
147    
148     void err_exit(const char* format,...); // exit with an error
149    
150     char* get_token(char** str, const char* delims);
151     std::string guess_packer(const std::string& fname, bool use_all_cpus);
152     std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false);
153    
154     void checkSamHeader();
155     void writeSamHeader(FILE* fout);
156    
157 gpertea 33 class FZPipe {
158     public:
159     FILE* file;
160     std::string filename;
161     std::string pipecmd;
162 gpertea 135 bool is_bam;
163 gpertea 154 FZPipe(const std::string& fname, bool is_mapping):filename(fname),pipecmd() {
164 gpertea 135 //this constructor is only to use FZPipe as a READER
165     //also accepts/recognizes BAM files
166     //for which it only stores the filename, other fields/methods are unused
167     openRead(fname, is_mapping);
168 gpertea 33 }
169 gpertea 29
170 gpertea 154 void openRead(const std::string& fname, bool is_mapping) {
171 gpertea 135 filename=fname;
172     pipecmd="";
173     is_bam=false;
174     if (is_mapping && getFext(fname) == "bam") {
175     file=(FILE*)this;
176     is_bam=true;
177     return;
178     }
179     pipecmd=getUnpackCmd(fname); //also bam2fastx
180     this->openRead(fname.c_str(), pipecmd);
181     }
182    
183 gpertea 33 FZPipe():filename(),pipecmd() {
184 gpertea 135 is_bam=false;
185 gpertea 33 file=NULL;
186     }
187     FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) {
188     //open as a compressed file reader
189 gpertea 135 is_bam=false;
190 gpertea 33 file=NULL;
191     this->openRead(fname.c_str(), pipecmd);
192     }
193     void close() {
194     if (file!=NULL) {
195     if (pipecmd.empty()) fclose(file);
196     else pclose(file);
197     file=NULL;
198     }
199     }
200     FILE* openWrite(const char* fname, std::string& popencmd);
201     FILE* openWrite(const char* fname);
202     FILE* openRead(const char* fname, std::string& popencmd);
203    
204     FILE* openRead(const char* fname);
205     FILE* openRead(const std::string fname, std::string& popencmd) {
206     return this->openRead(fname.c_str(),popencmd);
207     }
208     FILE* openRead(const std::string fname) {
209     return this->openRead(fname.c_str());
210     }
211     void rewind();
212 gpertea 154
213     void seek(int64_t offset)
214     {
215     fseek(this->file, offset, SEEK_SET);
216     }
217 gpertea 29 };
218    
219     void err_die(const char* format,...);
220    
221     //uint8_t* realloc_bdata(bam1_t *b, int size);
222     //uint8_t* dupalloc_bdata(bam1_t *b, int size);
223    
224     class GBamRecord {
225     bam1_t* b;
226     // b->data has the following strings concatenated:
227     // qname (including the terminal \0)
228     // +cigar (each event encoded on 32 bits)
229     // +seq (4bit-encoded)
230     // +qual
231     // +aux
232     bool novel;
233 gpertea 154
234     char tag[2];
235     uint8_t abuf[512];
236 gpertea 29 public:
237     GBamRecord(bam1_t* from_b=NULL) {
238     if (from_b==NULL) {
239     b=bam_init1();
240     novel=true;
241     }
242     else {
243     b=from_b;
244     novel=false;
245     }
246     }
247    
248     void clear() {
249     if (novel) {
250     bam_destroy1(b);
251     }
252     novel=true;
253     b=bam_init1();
254     }
255    
256     ~GBamRecord() {
257     if (novel) { bam_destroy1(b); }
258     }
259    
260     void parse_error(const char* s) {
261     err_die("BAM parsing error: %s\n", s);
262     }
263    
264     bam1_t* get_b() { return b; }
265    
266     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
267     int32_t isize=0) { //mate info for current record
268     b->core.mtid=mtid;
269     b->core.mpos=m0pos; // should be -1 if '*'
270     b->core.isize=isize; //should be 0 if not available
271     }
272    
273     void set_flags(uint16_t flags) {
274     b->core.flag=flags;
275     }
276    
277     //creates a new record from 1-based alignment coordinate
278     //quals should be given as Phred33
279     //Warning: pos and mate_pos must be given 1-based!
280     GBamRecord(const char* qname, int32_t gseq_tid,
281     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
282     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
283     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
284     int insert_size, const char* qseq, const char* quals=NULL,
285     const std::vector<std::string>* aux_strings=NULL);
286     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
287     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
288     void add_quals(const char* quals); //quality values string in Phred33 format
289     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
290     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
291     int ori_len = b->data_len;
292     b->data_len += 3 + len;
293     b->l_aux += 3 + len;
294     if (b->m_data < b->data_len) {
295     b->m_data = b->data_len;
296     kroundup32(b->m_data);
297     b->data = (uint8_t*)realloc(b->data, b->m_data);
298     }
299     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
300     b->data[ori_len + 2] = atype;
301     memcpy(b->data + ori_len + 3, data, len);
302     }
303     };
304    
305     class GBamWriter {
306     samfile_t* bam_file;
307     bam_header_t* bam_header;
308     public:
309     void create(const char* fname, bool uncompressed=false) {
310     if (bam_header==NULL)
311     err_die("Error: no bam_header for GBamWriter::create()!\n");
312     if (uncompressed) {
313     bam_file=samopen(fname, "wbu", bam_header);
314     }
315     else {
316     bam_file=samopen(fname, "wb", bam_header);
317     }
318     if (bam_file==NULL)
319     err_die("Error: could not create BAM file %s!\n",fname);
320     //do we need to call bam_header_write() ?
321     }
322 gpertea 33 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
323     bam_header=bh;
324     create(fname,uncompressed);
325     }
326 gpertea 29
327     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
328 gpertea 33 create(fname, bh, uncompressed);
329 gpertea 29 }
330    
331     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
332     tamFile samf_in=sam_open(samfname);
333     if (samf_in==NULL)
334     err_die("Error: could not open SAM file %s\n", samfname);
335     bam_header=sam_header_read(samf_in);
336     if (bam_header==NULL)
337     err_die("Error: could not read SAM header from %s!\n",samfname);
338     sam_close(samf_in);
339     create(fname, uncompressed);
340     }
341    
342     ~GBamWriter() {
343     samclose(bam_file);
344     bam_header_destroy(bam_header);
345     }
346     bam_header_t* get_header() { return bam_header; }
347     int32_t get_tid(const char *seq_name) {
348     if (bam_header==NULL)
349     err_die("Error: missing SAM header (get_tid())\n");
350     return bam_get_tid(bam_header, seq_name);
351     }
352    
353     //just a convenience function for creating a new record, but it's NOT written
354     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
355     GBamRecord* new_record(const char* qname, const char* gseqname,
356     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
357     int32_t gseq_tid=get_tid(gseqname);
358     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
359     if (bam_header->n_targets == 0) {
360     err_die("Error: missing/invalid SAM header\n");
361     } else
362     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
363     gseqname);
364     }
365    
366     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
367     }
368    
369     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
370     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
371     int insert_size, const char* qseq, const char* quals=NULL,
372     const std::vector<std::string>* aux_strings=NULL) {
373     int32_t gseq_tid=get_tid(gseqname);
374     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
375     if (bam_header->n_targets == 0) {
376     err_die("Error: missing/invalid SAM header\n");
377     } else
378     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
379     gseqname);
380     }
381     int32_t mgseq_tid=-1;
382     if (mgseqname!=NULL) {
383     if (strcmp(mgseqname, "=")==0) {
384     mgseq_tid=gseq_tid;
385     }
386     else {
387     mgseq_tid=get_tid(mgseqname);
388     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
389     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
390     mgseqname);
391     }
392     }
393     }
394     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
395     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
396     }
397    
398     void write(GBamRecord* brec) {
399     if (brec!=NULL)
400     samwrite(this->bam_file,brec->get_b());
401     }
402 gpertea 33 void write(bam1_t* b) {
403     samwrite(this->bam_file, b);
404     }
405 gpertea 154
406     int64_t tell() {
407     return bam_tell(this->bam_file->x.bam);
408     }
409    
410     void flush() {
411     bgzf_flush(this->bam_file->x.bam);
412     }
413    
414     void seek(int64_t offset) {
415     bam_seek(this->bam_file->x.bam, offset, SEEK_SET);
416     }
417 gpertea 29 };
418    
419     #endif