ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (9 years, 2 months ago) by gpertea
File size: 30113 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line User Rev File contents
1 gpertea 29 /*
2     * common.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/26/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <iostream>
15     #include <sstream>
16     #include <cstdarg>
17     #include <getopt.h>
18    
19     #include "common.h"
20    
21     using namespace std;
22    
23     #ifdef MEM_DEBUG
24     //function for debugging memory usage of current program in Linux
25    
26     #include <unistd.h>
27     #include <ios>
28     #include <fstream>
29    
30     //////////////////////////////////////////////////////////////////////////////
31     // process_mem_usage(double &, double &) - takes two doubles by reference,
32     // attempts to read the system-dependent data for a process' virtual memory
33     // size and resident set size, and return the results in KB.
34     //
35     // On failure, returns 0.0, 0.0
36    
37     void process_mem_usage(double& vm_usage, double& resident_set) {
38     using std::ios_base;
39     using std::ifstream;
40     using std::string;
41     vm_usage = 0.0;
42     resident_set = 0.0;
43     // 'file' stat seems to give the most reliable results
44     ifstream stat_stream("/proc/self/stat",ios_base::in);
45     // dummy vars for leading entries in stat that we don't care about
46     string pid, comm, state, ppid, pgrp, session, tty_nr;
47     string tpgid, flags, minflt, cminflt, majflt, cmajflt;
48     string utime, stime, cutime, cstime, priority, nice;
49     string O, itrealvalue, starttime;
50    
51     // the two fields we want
52     //
53     unsigned long vsize;
54     long rss;
55    
56     stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
57     >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
58     >> utime >> stime >> cutime >> cstime >> priority >> nice
59     >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
60    
61     stat_stream.close();
62    
63     long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
64     vm_usage = vsize / 1024.0;
65     resident_set = rss * page_size_kb;
66     }
67    
68     void print_mem_usage() {
69     double vs, rs;
70     process_mem_usage(vs,rs);
71     vs/=1024;
72     rs/=1024;
73     fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
74     }
75     #endif
76    
77    
78     unsigned int max_insertion_length = 3;
79     unsigned int max_deletion_length = 3;
80    
81    
82     int inner_dist_mean = 200;
83     int inner_dist_std_dev = 20;
84     int max_mate_inner_dist = -1;
85    
86     int min_anchor_len = 8;
87     int min_report_intron_length = 50;
88     int max_report_intron_length = 500000;
89    
90     int min_closure_intron_length = 50;
91     int max_closure_intron_length = 5000;
92    
93     int min_coverage_intron_length = 50;
94     int max_coverage_intron_length = 20000;
95    
96     int min_segment_intron_length = 50;
97     int max_segment_intron_length = 500000;
98    
99     uint32_t min_closure_exon_length = 100;
100    
101     int island_extension = 25;
102     int segment_length = 25;
103     int segment_mismatches = 2;
104 gpertea 135 int max_read_mismatches = 2;
105 gpertea 29 int max_splice_mismatches = 1;
106    
107     ReadFormat reads_format = FASTQ;
108    
109     bool verbose = false;
110    
111 gpertea 135 unsigned int max_multihits = 40;
112 gpertea 29 bool no_closure_search = false;
113     bool no_coverage_search = false;
114     bool no_microexon_search = false;
115     bool butterfly_search = false;
116     int num_cpus = 1;
117     float min_isoform_fraction = 0.15f;
118    
119     string output_dir = "tophat_out";
120     string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
121     string gene_filter = "";
122     string gff_file = "";
123     string ium_reads = "";
124     string sam_header = "";
125     string sam_readgroup_id = "";
126     string zpacker = "";
127     string samtools_path = "samtools";
128    
129     bool solexa_quals = false;
130     bool phred64_quals = false;
131     bool quals = false;
132     bool integer_quals = false;
133     bool color = false;
134     bool color_out = false;
135    
136     string gtf_juncs = "";
137    
138 gpertea 135 string flt_reads = "";
139     string flt_mappings = "";
140    
141 gpertea 29 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
142    
143     extern void print_usage();
144    
145     /**
146     * Parse an int out of optarg and enforce that it be at least 'lower';
147     * if it is less than 'lower', than output the given error message and
148     * exit with an error and a usage message.
149     */
150    
151     int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
152     long l;
153     char *endPtr= NULL;
154     l = strtol(optarg, &endPtr, 10);
155     if (endPtr != NULL) {
156     if (l < lower) {
157     cerr << errmsg << endl;
158     print_usage();
159     exit(1);
160     }
161     return (int32_t)l;
162     }
163     cerr << errmsg << endl;
164     print_usage();
165     exit(1);
166     return -1;
167     }
168    
169     /**
170     * Parse an int out of optarg and enforce that it be at least 'lower';
171     * if it is less than 'lower', than output the given error message and
172     * exit with an error and a usage message.
173     */
174     static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
175     float l;
176     l = (float)atof(optarg);
177    
178     if (l < lower) {
179     cerr << errmsg << endl;
180     print_usage();
181     exit(1);
182     }
183    
184     if (l > upper)
185     {
186     cerr << errmsg << endl;
187     print_usage();
188     exit(1);
189     }
190    
191     return l;
192    
193     cerr << errmsg << endl;
194     print_usage();
195     exit(1);
196     return -1;
197     }
198    
199     /*
200     this is from
201     http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
202     */
203     char* get_token(char** str, const char* delims)
204     {
205     char* token;
206     if (*str == NULL)
207     return NULL;
208    
209     token = *str;
210     while (**str != '\0')
211     {
212     if (strchr(delims, **str) != NULL)
213     {
214     **str = '\0';
215     ++(*str);
216     return token;
217     }
218    
219     ++(*str);
220     }
221    
222     *str = NULL;
223     return token;
224     }
225    
226    
227 gpertea 135 const char *short_options = "QCp:z:N:";
228 gpertea 29
229     enum
230     {
231     OPT_FASTA = 127,
232     OPT_FASTQ,
233     OPT_MIN_ANCHOR,
234     OPT_SPLICE_MISMATCHES,
235     OPT_VERBOSE,
236     OPT_INSERT_LENGTH_MEAN,
237     OPT_INSERT_LENGTH_STD_DEV,
238     OPT_MIN_ISOFORM_FRACTION,
239     OPT_OUTPUT_DIR,
240     OPT_GENE_FILTER,
241     OPT_GFF_ANNOTATIONS,
242     OPT_MAX_MULTIHITS,
243     OPT_NO_CLOSURE_SEARCH,
244     OPT_NO_COVERAGE_SEARCH,
245     OPT_NO_MICROEXON_SEARCH,
246     OPT_SEGMENT_LENGTH,
247 gpertea 135 OPT_READ_MISMATCHES,
248 gpertea 29 OPT_SEGMENT_MISMATCHES,
249     OPT_MIN_CLOSURE_EXON,
250     OPT_MAX_CLOSURE_INTRON,
251     OPT_MIN_CLOSURE_INTRON,
252     OPT_MAX_COVERAGE_INTRON,
253     OPT_MIN_COVERAGE_INTRON,
254     OPT_MIN_SEGMENT_INTRON,
255     OPT_MAX_SEGMENT_INTRON,
256     OPT_MIN_REPORT_INTRON,
257     OPT_MAX_REPORT_INTRON,
258     OPT_IUM_READS,
259     OPT_BUTTERFLY_SEARCH,
260     OPT_SOLEXA_QUALS,
261     OPT_PHRED64_QUALS,
262     OPT_SAM_HEADER,
263     OPT_SAM_READGROUP_ID,
264     OPT_QUALS,
265     OPT_INTEGER_QUALS,
266     OPT_COLOR,
267     OPT_COLOR_OUT,
268     OPT_LIBRARY_TYPE,
269     OPT_MAX_DELETION_LENGTH,
270     OPT_MAX_INSERTION_LENGTH,
271     OPT_NUM_CPUS,
272     OPT_ZPACKER,
273     OPT_SAMTOOLS,
274     OPT_AUX_OUT,
275 gpertea 135 OPT_GTF_JUNCS,
276     OPT_FILTER_READS,
277     OPT_FILTER_HITS
278 gpertea 29 };
279    
280     static struct option long_options[] = {
281     {"fasta", no_argument, 0, OPT_FASTA},
282     {"fastq", no_argument, 0, OPT_FASTQ},
283     {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
284     {"sam-header", required_argument, 0, OPT_SAM_HEADER},
285     {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
286     {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
287     {"verbose", no_argument, 0, OPT_VERBOSE},
288     {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
289     {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
290     {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
291     {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
292     {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
293     {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
294     {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
295     {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
296     {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
297     {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
298     {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
299 gpertea 135 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
300 gpertea 29 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
301     {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
302     {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
303     {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
304     {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
305     {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
306     {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
307     {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
308     {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
309     {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
310     {"ium-reads", required_argument, 0, OPT_IUM_READS},
311     {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
312     {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
313     {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
314     {"quals", no_argument, 0, OPT_QUALS},
315     {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
316     {"color", no_argument, 0, OPT_COLOR},
317     {"color-out", no_argument, 0, OPT_COLOR_OUT},
318     {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
319     {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
320     {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
321     {"num-threads", required_argument, 0, OPT_NUM_CPUS},
322     {"zpacker", required_argument, 0, OPT_ZPACKER},
323     {"samtools", required_argument, 0, OPT_SAMTOOLS},
324     {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
325     {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
326 gpertea 135 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
327     {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
328 gpertea 29 {0, 0, 0, 0} // terminator
329     };
330    
331    
332     void str_appendInt(string& str, int v) {
333     stringstream ss;
334     ss << v;
335     str.append(ss.str());
336     }
337    
338     bool str_endsWith(string& str, const char* suffix) {
339     if (str.empty() || str.length()<3) return false;
340     size_t l=strlen(suffix);
341     if (str.length()<=l) return false;
342     if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
343     return false;
344     }
345    
346     int parse_options(int argc, char** argv, void (*print_usage)())
347     {
348     int option_index = 0;
349     int next_option;
350     do {
351     next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
352     switch (next_option) {
353     case -1:
354     break;
355     case OPT_FASTA:
356     reads_format = FASTA;
357     break;
358     case OPT_FASTQ:
359     reads_format = FASTQ;
360     break;
361     case OPT_MIN_ANCHOR:
362     min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
363     break;
364     case OPT_SPLICE_MISMATCHES:
365     max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
366     break;
367     case OPT_VERBOSE:
368     verbose = true;
369     break;
370     case OPT_INSERT_LENGTH_MEAN:
371     inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
372     break;
373     case OPT_INSERT_LENGTH_STD_DEV:
374     inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
375     break;
376     case OPT_OUTPUT_DIR:
377     output_dir = optarg;
378     break;
379     case OPT_GENE_FILTER:
380     gene_filter = optarg;
381     break;
382     case OPT_GFF_ANNOTATIONS:
383     gff_file = optarg;
384     break;
385     case OPT_MAX_MULTIHITS:
386     max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
387     break;
388     case OPT_NO_CLOSURE_SEARCH:
389     no_closure_search = true;
390     break;
391     case OPT_NO_COVERAGE_SEARCH:
392     no_coverage_search = true;
393     break;
394     case OPT_NO_MICROEXON_SEARCH:
395     no_microexon_search = true;
396     break;
397     case OPT_SEGMENT_LENGTH:
398     segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
399     break;
400     case OPT_SEGMENT_MISMATCHES:
401     segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
402     break;
403 gpertea 135 case 'N':
404     case OPT_READ_MISMATCHES:
405     max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
406     break;
407 gpertea 29 case OPT_MIN_CLOSURE_EXON:
408     min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
409     break;
410     case OPT_MIN_CLOSURE_INTRON:
411     min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
412     break;
413     case OPT_MAX_CLOSURE_INTRON:
414     max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
415     break;
416     case OPT_MIN_COVERAGE_INTRON:
417     min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
418     break;
419     case OPT_MAX_COVERAGE_INTRON:
420     max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
421     break;
422     case OPT_MIN_SEGMENT_INTRON:
423     min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
424     break;
425     case OPT_MAX_SEGMENT_INTRON:
426     max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
427     break;
428     case OPT_MIN_REPORT_INTRON:
429     min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
430     break;
431     case OPT_MAX_REPORT_INTRON:
432     max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
433     break;
434     case OPT_MIN_ISOFORM_FRACTION:
435     min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
436     break;
437     case OPT_IUM_READS:
438     ium_reads = optarg;
439     break;
440     case OPT_SAM_HEADER:
441     sam_header = optarg;
442     break;
443     case OPT_SAM_READGROUP_ID:
444     sam_readgroup_id = optarg;
445     break;
446     case OPT_BUTTERFLY_SEARCH:
447     butterfly_search = true;
448     break;
449     case OPT_SOLEXA_QUALS:
450     solexa_quals = true;
451     break;
452     case OPT_PHRED64_QUALS:
453     phred64_quals = true;
454     break;
455     case 'Q':
456     case OPT_QUALS:
457     quals = true;
458     break;
459     case OPT_INTEGER_QUALS:
460     integer_quals = true;
461     break;
462     case 'C':
463     case OPT_COLOR:
464     color = true;
465     break;
466     case OPT_COLOR_OUT:
467     color_out = true;
468     break;
469     case OPT_LIBRARY_TYPE:
470     if (strcmp(optarg, "fr-unstranded") == 0)
471     library_type = FR_UNSTRANDED;
472     else if (strcmp(optarg, "fr-firststrand") == 0)
473     library_type = FR_FIRSTSTRAND;
474     else if (strcmp(optarg, "fr-secondstrand") == 0)
475     library_type = FR_SECONDSTRAND;
476     else if (strcmp(optarg, "ff-unstranded") == 0)
477     library_type = FF_UNSTRANDED;
478     else if (strcmp(optarg, "ff-firststrand") == 0)
479     library_type = FF_FIRSTSTRAND;
480     else if (strcmp(optarg, "ff-secondstrand") == 0)
481     library_type = FF_SECONDSTRAND;
482     break;
483     case OPT_MAX_DELETION_LENGTH:
484     max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
485     break;
486     case OPT_MAX_INSERTION_LENGTH:
487     max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
488     break;
489     case 'z':
490     case OPT_ZPACKER:
491     zpacker = optarg;
492     break;
493     case OPT_SAMTOOLS:
494     samtools_path = optarg;
495     break;
496     case OPT_AUX_OUT:
497     aux_outfile = optarg;
498     break;
499     case 'p':
500     case OPT_NUM_CPUS:
501     num_cpus=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
502     break;
503     case OPT_GTF_JUNCS:
504     gtf_juncs = optarg;
505     break;
506 gpertea 135 case OPT_FILTER_READS:
507     flt_reads = optarg;
508     break;
509     case OPT_FILTER_HITS:
510     flt_mappings = optarg;
511     break;
512 gpertea 29 default:
513     print_usage();
514     return 1;
515     }
516     } while(next_option != -1);
517    
518     return 0;
519     }
520    
521    
522     // Error routine (prints error message and exits!)
523     void err_exit(const char* format,...){
524     va_list arguments;
525     va_start(arguments,format);
526     vfprintf(stderr,format,arguments);
527     va_end(arguments);
528     #ifdef DEBUG
529     // trigger a core dump for later inspection
530     abort();
531     #endif
532     exit(1);
533     }
534    
535     FILE* FZPipe::openRead(const char* fname, string& popencmd) {
536     pipecmd=popencmd;
537     filename=fname;
538     if (pipecmd.empty()) {
539     file=fopen(filename.c_str(), "r");
540     }
541     else {
542     string pcmd(pipecmd);
543     pcmd.append(" '");
544     pcmd.append(filename);
545     pcmd.append("'");
546     file=popen(pcmd.c_str(), "r");
547     }
548     return file;
549     }
550    
551     FILE* FZPipe::openRead(const char* fname) {
552     string pcmd;
553     return this->openRead(fname,pcmd);
554     }
555    
556     FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
557     pipecmd=popencmd;
558     filename=fname;
559     if (pipecmd.empty()) {
560     file=fopen(filename.c_str(), "w");
561     }
562     else {
563     string pcmd(pipecmd);
564     pcmd.append(" - > '");
565     pcmd.append(filename.c_str());
566     pcmd.append("'");
567     file=popen(pcmd.c_str(), "w");
568     }
569     return file;
570     }
571    
572    
573     FILE* FZPipe::openWrite(const char* fname) {
574     string pcmd;
575     return this->openWrite(fname,pcmd);
576     }
577    
578     void FZPipe::rewind() {
579     if (pipecmd.empty()) {
580     if (file!=NULL) {
581     ::rewind(file);
582     return;
583     }
584     if (!filename.empty()) {
585     file=fopen(filename.c_str(),"r");
586     return;
587     }
588     }
589     if (filename.empty())
590     err_die("Error: FZStream::rewind() failed (missing filename)!\n");
591     this->close();
592     string pcmd(pipecmd);
593     pcmd.append(" '");
594     pcmd.append(filename);
595     pcmd.append("'");
596     file=popen(pcmd.c_str(), "r");
597     if (file==NULL) {
598     err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
599     }
600     }
601    
602    
603     string getFext(const string& s) {
604     string r("");
605     //if (xpos!=NULL) *xpos=0;
606     if (s.empty() || s=="-") return r;
607     int slen=(int)s.length();
608     int p=s.rfind('.');
609     int d=s.rfind('/');
610     if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
611     r=s.substr(p+1);
612     //if (xpos!=NULL) *xpos=p+1;
613     for(size_t i=0; i!=r.length(); i++)
614     r[i] = std::tolower(r[i]);
615     return r;
616     }
617    
618     string guess_packer(const string& fname, bool use_all_cpus) {
619     //only needed for the primary input files (given by user)
620     string picmd("");
621     string fext=getFext(fname);
622 gpertea 32 if (fext=="bam") {
623     picmd="bam2fastx";
624     return picmd;
625     }
626 gpertea 29 if (fext=="gz" || fext=="gzip" || fext=="z") {
627     if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
628     picmd=zpacker;
629     if (num_cpus<2) picmd.append(" -p1");
630     else {
631     picmd.append(" -p");
632     str_appendInt(picmd, num_cpus);
633     //picmd.append(" -cd");
634     }
635     }
636     else picmd="gzip";
637     }
638     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
639     if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
640     picmd=zpacker;
641     if (num_cpus<2) picmd.append(" -p1");
642     else {
643     picmd.append(" -p");
644     str_appendInt(picmd, num_cpus);
645     //picmd.append(" -cd");
646     }
647     }
648     else picmd="bzip2";
649     }
650     return picmd;
651     }
652    
653     /*
654     string getBam2SamCmd(const string& fname) {
655     string pipecmd("");
656     string fext=getFext(fname);
657     if (fext=="bam") {
658     pipecmd=samtools_path;
659     pipecmd.append(" view");
660     }
661     return pipecmd;
662     }
663     */
664    
665     void err_die(const char* format,...) { // Error exit
666     va_list arguments;
667     va_start(arguments,format);
668     vfprintf(stderr,format,arguments);
669     va_end(arguments);
670     exit(1);
671     }
672    
673     string getUnpackCmd(const string& fname, bool use_all_cpus) {
674 gpertea 32 //prep_reads should use guess_packer() instead
675 gpertea 135 //otherwise compressed files MUST have the .z extension,
676     //as they are all internally generated
677 gpertea 29 string pipecmd("");
678 gpertea 32 string fext=getFext(fname);
679     if (fext=="bam") {
680     pipecmd="bam2fastx";
681     return pipecmd;
682     }
683     if (zpacker.empty() || fext!="z") {
684 gpertea 135 return pipecmd; //no packer used
685 gpertea 29 }
686     pipecmd=zpacker;
687     if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
688     if (use_all_cpus==false) pipecmd.append(" -p1");
689     else if (num_cpus>1) {
690     pipecmd.append(" -p");
691     str_appendInt(pipecmd,num_cpus);
692     }
693     }
694     if (!pipecmd.empty()) pipecmd.append(" -cd");
695     return pipecmd;
696     }
697    
698     void checkSamHeader() {
699     if (sam_header.empty())
700     err_die("Error: writeSamHeader() with empty sam_header string\n");
701     //copy the SAM header
702     FILE* fh=fopen(sam_header.c_str(), "r");
703     if (fh==NULL)
704     err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
705     fclose(fh);
706     }
707    
708     void writeSamHeader(FILE* fout) {
709     if (fout==NULL)
710     err_die("Error: writeSamHeader(NULL)\n");
711     checkSamHeader();
712     //copy the SAM header
713     FILE* fh=fopen(sam_header.c_str(), "r");
714     int ch=-1;
715     while ((ch=fgetc(fh))!=EOF) {
716     if (fputc(ch, fout)==EOF)
717     err_die("Error copying SAM header\n");
718     }
719     fclose(fh);
720     }
721    
722     //auxiliary functions for BAM record handling
723     uint8_t* realloc_bdata(bam1_t *b, int size) {
724     if (b->m_data < size) {
725     b->m_data = size;
726     kroundup32(b->m_data);
727     b->data = (uint8_t*)realloc(b->data, b->m_data);
728     }
729     if (b->data_len<size) b->data_len=size;
730     return b->data;
731     }
732    
733     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
734     //same as realloc_bdata, but does not free previous data
735     //but returns it instead
736     //it ALWAYS duplicates data
737     b->m_data = size;
738     kroundup32(b->m_data);
739     uint8_t* odata=b->data;
740     b->data = (uint8_t*)malloc(b->m_data);
741     memcpy((void*)b->data, (void*)odata, b->data_len);
742     b->data_len=size;
743     return odata; //user must FREE this after
744     }
745    
746     extern unsigned short bam_char2flag_table[];
747    
748     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
749     int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
750     novel=true;
751     b=bam_init1();
752     b->core.tid=gseq_tid;
753     if (pos<=0) {
754     b->core.pos=-1; //unmapped
755     //if (gseq_tid<0)
756     b->core.flag |= BAM_FUNMAP;
757     }
758     else b->core.pos=pos-1; //BAM is 0-based
759     b->core.qual=255;
760     int l_qseq=strlen(qseq);
761     //this may not be accurate, setting CIGAR is the correct way
762     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
763     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
764     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
765     set_cigar(cigar); //this will also set core.bin
766     add_sequence(qseq, l_qseq);
767     add_quals(quals); //quals must be given as Phred33
768     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
769     }
770    
771     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
772     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
773     int insert_size, const char* qseq, const char* quals,
774     const vector<string>* aux_strings) {
775     novel=true;
776     b=bam_init1();
777     b->core.tid=g_tid;
778     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
779     b->core.qual=map_qual;
780     int l_qseq=strlen(qseq);
781     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
782     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
783     set_cigar(cigar); //this will also set core.bin
784     add_sequence(qseq, l_qseq);
785     add_quals(quals); //quals must be given as Phred33
786     set_flags(flags);
787     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
788     if (aux_strings!=NULL) {
789     for (vector<string>::const_iterator itr=aux_strings->begin();
790     itr!=aux_strings->end(); ++itr) {
791     add_aux(itr->c_str());
792     }
793     }
794     }
795     void GBamRecord::set_cigar(const char* cigar) {
796     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
797     int doff=b->core.l_qname;
798     uint8_t* after_cigar=NULL;
799     int after_cigar_len=0;
800     uint8_t* prev_bdata=NULL;
801     if (b->data_len>doff) {
802     //cigar string already allocated, replace it
803     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
804     after_cigar=b->data+d;
805     after_cigar_len=b->data_len-d;
806     }
807     const char *s;
808     char *t;
809     int i, op;
810     long x;
811     b->core.n_cigar = 0;
812     if (cigar != NULL && strcmp(cigar, "*") != 0) {
813     for (s = cigar; *s; ++s) {
814     if (isalpha(*s)) b->core.n_cigar++;
815     else if (!isdigit(*s)) {
816     err_die("Error: invalid CIGAR character (%s)\n",cigar);
817     }
818     }
819     if (after_cigar_len>0) { //replace/insert into existing full data
820     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
821     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
822     free(prev_bdata);
823     }
824     else {
825     realloc_bdata(b, doff + b->core.n_cigar * 4);
826     }
827     for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
828     x = strtol(s, &t, 10);
829     op = toupper(*t);
830     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
831     else if (op == 'I') op = BAM_CINS;
832     else if (op == 'D') op = BAM_CDEL;
833     else if (op == 'N') op = BAM_CREF_SKIP;
834     else if (op == 'S') op = BAM_CSOFT_CLIP;
835     else if (op == 'H') op = BAM_CHARD_CLIP;
836     else if (op == 'P') op = BAM_CPAD;
837     else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
838     s = t + 1;
839     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
840     }
841     if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
842     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
843     } else {//no CIGAR string given
844     if (!(b->core.flag&BAM_FUNMAP)) {
845     fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
846     b->core.flag |= BAM_FUNMAP;
847     }
848     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
849     }
850     } //set_cigar()
851    
852     void GBamRecord::add_sequence(const char* qseq, int slen) {
853     //must be called AFTER set_cigar (cannot replace existing sequence for now)
854     if (qseq==NULL) return; //should we ever care about this?
855     if (slen<0) slen=strlen(qseq);
856     int doff = b->core.l_qname + b->core.n_cigar * 4;
857     if (strcmp(qseq, "*")!=0) {
858     b->core.l_qseq=slen;
859     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
860     err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
861     qseq);
862     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
863     //also allocated quals memory
864     memset(p, 0, (b->core.l_qseq+1)/2);
865     for (int i = 0; i < b->core.l_qseq; ++i)
866     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
867     } else b->core.l_qseq = 0;
868     }
869    
870     void GBamRecord::add_quals(const char* quals) {
871     //requires core.l_qseq already set
872     //and must be called AFTER add_sequence(), which also allocates the memory for quals
873     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
874     if (quals==NULL || strcmp(quals, "*") == 0) {
875     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
876     return;
877     }
878     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
879     }
880    
881     void GBamRecord::add_aux(const char* str) {
882     //requires: being called AFTER add_quals()
883     static char tag[2];
884     static uint8_t abuf[512];
885     //requires: being called AFTER add_quals()
886     int strl=strlen(str);
887     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
888     //int doff0=doff;
889     if (strl < 6 || str[2] != ':' || str[4] != ':')
890     parse_error("missing colon in auxiliary data");
891     tag[0] = str[0]; tag[1] = str[1];
892     uint8_t atype = str[3];
893     uint8_t* adata=abuf;
894     int alen=0;
895     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
896     atype='A';
897     alen=1;
898     adata=(uint8_t*)&str[5];
899     }
900     else if (atype == 'I' || atype == 'i') {
901     long long x=(long long)atoll(str + 5);
902     if (x < 0) {
903     if (x >= -127) {
904     atype='c';
905     abuf[0] = (int8_t)x;
906     alen=1;
907     }
908     else if (x >= -32767) {
909     atype = 's';
910     *(int16_t*)abuf = (int16_t)x;
911     alen=2;
912     }
913     else {
914     atype='i';
915     *(int32_t*)abuf = (int32_t)x;
916     alen=4;
917     if (x < -2147483648ll)
918     fprintf(stderr, "Parse warning: integer %lld is out of range.",
919     x);
920     }
921     } else { //x >=0
922     if (x <= 255) {
923     atype = 'C';
924     abuf[0] = (uint8_t)x;
925     alen=1;
926     }
927     else if (x <= 65535) {
928     atype='S';
929     *(uint16_t*)abuf = (uint16_t)x;
930     alen=2;
931     }
932     else {
933     atype='I';
934     *(uint32_t*)abuf = (uint32_t)x;
935     alen=4;
936     if (x > 4294967295ll)
937     fprintf(stderr, "Parse warning: integer %lld is out of range.",
938     x);
939     }
940     }
941     } //integer type
942     else if (atype == 'f') {
943     *(float*)abuf = (float)atof(str + 5);
944     alen = sizeof(float);
945     }
946     else if (atype == 'd') { //?
947     *(float*)abuf = (float)atof(str + 9);
948     alen=8;
949     }
950     else if (atype == 'Z' || atype == 'H') {
951     if (atype == 'H') { // check whether the hex string is valid
952     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
953     for (int i = 0; i < strl - 5; ++i) {
954     int c = toupper(str[5 + i]);
955     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
956     parse_error("invalid hex character");
957     }
958     }
959     memcpy(abuf, str + 5, strl - 5);
960     abuf[strl-5] = 0;
961     alen=strl-4;
962     } else parse_error("unrecognized aux type");
963     this->add_aux(tag, atype, alen, adata);
964     }//add_aux()