ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 9 months ago) by gpertea
File size: 32476 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 /*
2     * common.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/26/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <iostream>
15     #include <sstream>
16     #include <cstdarg>
17 gpertea 154 #include <limits>
18 gpertea 29 #include <getopt.h>
19    
20     #include "common.h"
21    
22     using namespace std;
23    
24     #ifdef MEM_DEBUG
25     //function for debugging memory usage of current program in Linux
26    
27     #include <unistd.h>
28     #include <ios>
29     #include <fstream>
30    
31     //////////////////////////////////////////////////////////////////////////////
32     // process_mem_usage(double &, double &) - takes two doubles by reference,
33     // attempts to read the system-dependent data for a process' virtual memory
34     // size and resident set size, and return the results in KB.
35     //
36     // On failure, returns 0.0, 0.0
37    
38     void process_mem_usage(double& vm_usage, double& resident_set) {
39     using std::ios_base;
40     using std::ifstream;
41     using std::string;
42     vm_usage = 0.0;
43     resident_set = 0.0;
44     // 'file' stat seems to give the most reliable results
45     ifstream stat_stream("/proc/self/stat",ios_base::in);
46     // dummy vars for leading entries in stat that we don't care about
47     string pid, comm, state, ppid, pgrp, session, tty_nr;
48     string tpgid, flags, minflt, cminflt, majflt, cmajflt;
49     string utime, stime, cutime, cstime, priority, nice;
50     string O, itrealvalue, starttime;
51    
52     // the two fields we want
53     //
54     unsigned long vsize;
55     long rss;
56    
57     stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
58     >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
59     >> utime >> stime >> cutime >> cstime >> priority >> nice
60     >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
61    
62     stat_stream.close();
63    
64     long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
65     vm_usage = vsize / 1024.0;
66     resident_set = rss * page_size_kb;
67     }
68    
69     void print_mem_usage() {
70     double vs, rs;
71     process_mem_usage(vs,rs);
72     vs/=1024;
73     rs/=1024;
74     fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
75     }
76     #endif
77    
78 gpertea 154 bool bowtie2 = true;
79     int bowtie2_min_score = -10;
80     int max_segment_mapping = 20;
81 gpertea 29
82 gpertea 154 // daehwan - temporary
83     bool parallel = true;
84    
85 gpertea 29 unsigned int max_insertion_length = 3;
86     unsigned int max_deletion_length = 3;
87    
88    
89     int inner_dist_mean = 200;
90     int inner_dist_std_dev = 20;
91     int max_mate_inner_dist = -1;
92    
93     int min_anchor_len = 8;
94     int min_report_intron_length = 50;
95     int max_report_intron_length = 500000;
96    
97     int min_closure_intron_length = 50;
98     int max_closure_intron_length = 5000;
99    
100     int min_coverage_intron_length = 50;
101     int max_coverage_intron_length = 20000;
102    
103     int min_segment_intron_length = 50;
104     int max_segment_intron_length = 500000;
105    
106     uint32_t min_closure_exon_length = 100;
107    
108     int island_extension = 25;
109     int segment_length = 25;
110     int segment_mismatches = 2;
111 gpertea 135 int max_read_mismatches = 2;
112 gpertea 29 int max_splice_mismatches = 1;
113    
114     ReadFormat reads_format = FASTQ;
115    
116     bool verbose = false;
117    
118 gpertea 135 unsigned int max_multihits = 40;
119 gpertea 29 bool no_closure_search = false;
120     bool no_coverage_search = false;
121     bool no_microexon_search = false;
122     bool butterfly_search = false;
123 gpertea 154 int num_threads = 1;
124    
125 gpertea 29 float min_isoform_fraction = 0.15f;
126    
127     string output_dir = "tophat_out";
128     string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
129 gpertea 154 string index_outfile = "";
130 gpertea 29 string gene_filter = "";
131     string gff_file = "";
132     string ium_reads = "";
133     string sam_header = "";
134     string sam_readgroup_id = "";
135     string zpacker = "";
136     string samtools_path = "samtools";
137    
138     bool solexa_quals = false;
139     bool phred64_quals = false;
140     bool quals = false;
141     bool integer_quals = false;
142     bool color = false;
143     bool color_out = false;
144    
145     string gtf_juncs = "";
146    
147 gpertea 135 string flt_reads = "";
148     string flt_mappings = "";
149    
150 gpertea 154 bool fusion_search = false;
151     size_t fusion_anchor_length = 20;
152     size_t fusion_min_dist = 10000000;
153     size_t fusion_read_mismatches = 2;
154     size_t fusion_multireads = 2;
155     size_t fusion_multipairs = 2;
156    
157 gpertea 29 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
158    
159     extern void print_usage();
160    
161     /**
162     * Parse an int out of optarg and enforce that it be at least 'lower';
163     * if it is less than 'lower', than output the given error message and
164     * exit with an error and a usage message.
165     */
166    
167     int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
168     long l;
169     char *endPtr= NULL;
170     l = strtol(optarg, &endPtr, 10);
171     if (endPtr != NULL) {
172     if (l < lower) {
173     cerr << errmsg << endl;
174     print_usage();
175     exit(1);
176     }
177     return (int32_t)l;
178     }
179     cerr << errmsg << endl;
180     print_usage();
181     exit(1);
182     return -1;
183     }
184    
185     /**
186     * Parse an int out of optarg and enforce that it be at least 'lower';
187     * if it is less than 'lower', than output the given error message and
188     * exit with an error and a usage message.
189     */
190     static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
191     float l;
192     l = (float)atof(optarg);
193    
194     if (l < lower) {
195     cerr << errmsg << endl;
196     print_usage();
197     exit(1);
198     }
199    
200     if (l > upper)
201     {
202     cerr << errmsg << endl;
203     print_usage();
204     exit(1);
205     }
206    
207     return l;
208    
209     cerr << errmsg << endl;
210     print_usage();
211     exit(1);
212     return -1;
213     }
214    
215     /*
216     this is from
217     http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
218     */
219     char* get_token(char** str, const char* delims)
220     {
221     char* token;
222     if (*str == NULL)
223     return NULL;
224    
225     token = *str;
226     while (**str != '\0')
227     {
228     if (strchr(delims, **str) != NULL)
229     {
230     **str = '\0';
231     ++(*str);
232     return token;
233     }
234    
235     ++(*str);
236     }
237    
238     *str = NULL;
239     return token;
240     }
241    
242 gpertea 135 const char *short_options = "QCp:z:N:";
243 gpertea 29
244     enum
245     {
246     OPT_FASTA = 127,
247     OPT_FASTQ,
248     OPT_MIN_ANCHOR,
249     OPT_SPLICE_MISMATCHES,
250     OPT_VERBOSE,
251     OPT_INSERT_LENGTH_MEAN,
252     OPT_INSERT_LENGTH_STD_DEV,
253     OPT_MIN_ISOFORM_FRACTION,
254     OPT_OUTPUT_DIR,
255     OPT_GENE_FILTER,
256     OPT_GFF_ANNOTATIONS,
257     OPT_MAX_MULTIHITS,
258     OPT_NO_CLOSURE_SEARCH,
259     OPT_NO_COVERAGE_SEARCH,
260     OPT_NO_MICROEXON_SEARCH,
261     OPT_SEGMENT_LENGTH,
262 gpertea 135 OPT_READ_MISMATCHES,
263 gpertea 29 OPT_SEGMENT_MISMATCHES,
264     OPT_MIN_CLOSURE_EXON,
265     OPT_MAX_CLOSURE_INTRON,
266     OPT_MIN_CLOSURE_INTRON,
267     OPT_MAX_COVERAGE_INTRON,
268     OPT_MIN_COVERAGE_INTRON,
269     OPT_MIN_SEGMENT_INTRON,
270     OPT_MAX_SEGMENT_INTRON,
271     OPT_MIN_REPORT_INTRON,
272     OPT_MAX_REPORT_INTRON,
273     OPT_IUM_READS,
274     OPT_BUTTERFLY_SEARCH,
275     OPT_SOLEXA_QUALS,
276     OPT_PHRED64_QUALS,
277     OPT_SAM_HEADER,
278     OPT_SAM_READGROUP_ID,
279     OPT_QUALS,
280     OPT_INTEGER_QUALS,
281     OPT_COLOR,
282     OPT_COLOR_OUT,
283     OPT_LIBRARY_TYPE,
284     OPT_MAX_DELETION_LENGTH,
285     OPT_MAX_INSERTION_LENGTH,
286 gpertea 154 OPT_NUM_THREADS,
287 gpertea 29 OPT_ZPACKER,
288     OPT_SAMTOOLS,
289     OPT_AUX_OUT,
290 gpertea 154 OPT_INDEX_OUT,
291 gpertea 135 OPT_GTF_JUNCS,
292     OPT_FILTER_READS,
293 gpertea 154 OPT_FILTER_HITS,
294     OPT_FUSION_SEARCH,
295     OPT_FUSION_ANCHOR_LENGTH,
296     OPT_FUSION_MIN_DIST,
297     OPT_FUSION_READ_MISMATCHES,
298     OPT_FUSION_MULTIREADS,
299     OPT_FUSION_MULTIPAIRS,
300     OPT_BOWTIE1,
301     OPT_BOWTIE2_MIN_SCORE,
302 gpertea 29 };
303    
304     static struct option long_options[] = {
305     {"fasta", no_argument, 0, OPT_FASTA},
306     {"fastq", no_argument, 0, OPT_FASTQ},
307     {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
308     {"sam-header", required_argument, 0, OPT_SAM_HEADER},
309     {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
310     {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
311     {"verbose", no_argument, 0, OPT_VERBOSE},
312     {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
313     {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
314     {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
315     {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
316     {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
317     {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
318     {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
319     {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
320     {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
321     {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
322     {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
323 gpertea 135 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
324 gpertea 29 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
325     {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
326     {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
327     {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
328     {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
329     {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
330     {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
331     {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
332     {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
333     {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
334     {"ium-reads", required_argument, 0, OPT_IUM_READS},
335     {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
336     {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
337     {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
338     {"quals", no_argument, 0, OPT_QUALS},
339     {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
340     {"color", no_argument, 0, OPT_COLOR},
341     {"color-out", no_argument, 0, OPT_COLOR_OUT},
342     {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
343     {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
344     {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
345 gpertea 154 {"num-threads", required_argument, 0, OPT_NUM_THREADS},
346 gpertea 29 {"zpacker", required_argument, 0, OPT_ZPACKER},
347     {"samtools", required_argument, 0, OPT_SAMTOOLS},
348     {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
349 gpertea 154 {"index-outfile", required_argument, 0, OPT_INDEX_OUT},
350 gpertea 29 {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
351 gpertea 135 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
352     {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
353 gpertea 154 {"fusion-search", no_argument, 0, OPT_FUSION_SEARCH},
354     {"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH},
355     {"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST},
356     {"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES},
357     {"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS},
358     {"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS},
359     {"bowtie1", no_argument, 0, OPT_BOWTIE1},
360     {"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE},
361 gpertea 29 {0, 0, 0, 0} // terminator
362     };
363    
364    
365     void str_appendInt(string& str, int v) {
366     stringstream ss;
367     ss << v;
368     str.append(ss.str());
369     }
370    
371     bool str_endsWith(string& str, const char* suffix) {
372     if (str.empty() || str.length()<3) return false;
373     size_t l=strlen(suffix);
374     if (str.length()<=l) return false;
375     if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
376     return false;
377     }
378    
379     int parse_options(int argc, char** argv, void (*print_usage)())
380     {
381     int option_index = 0;
382     int next_option;
383     do {
384     next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
385     switch (next_option) {
386     case -1:
387     break;
388     case OPT_FASTA:
389     reads_format = FASTA;
390     break;
391     case OPT_FASTQ:
392     reads_format = FASTQ;
393     break;
394     case OPT_MIN_ANCHOR:
395     min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
396     break;
397     case OPT_SPLICE_MISMATCHES:
398     max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
399     break;
400     case OPT_VERBOSE:
401     verbose = true;
402     break;
403     case OPT_INSERT_LENGTH_MEAN:
404     inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
405     break;
406     case OPT_INSERT_LENGTH_STD_DEV:
407     inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
408     break;
409     case OPT_OUTPUT_DIR:
410     output_dir = optarg;
411     break;
412     case OPT_GENE_FILTER:
413     gene_filter = optarg;
414     break;
415     case OPT_GFF_ANNOTATIONS:
416     gff_file = optarg;
417     break;
418     case OPT_MAX_MULTIHITS:
419     max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
420     break;
421     case OPT_NO_CLOSURE_SEARCH:
422     no_closure_search = true;
423     break;
424     case OPT_NO_COVERAGE_SEARCH:
425     no_coverage_search = true;
426     break;
427     case OPT_NO_MICROEXON_SEARCH:
428     no_microexon_search = true;
429     break;
430     case OPT_SEGMENT_LENGTH:
431     segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
432     break;
433     case OPT_SEGMENT_MISMATCHES:
434     segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
435     break;
436 gpertea 135 case 'N':
437     case OPT_READ_MISMATCHES:
438     max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
439     break;
440 gpertea 29 case OPT_MIN_CLOSURE_EXON:
441     min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
442     break;
443     case OPT_MIN_CLOSURE_INTRON:
444     min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
445     break;
446     case OPT_MAX_CLOSURE_INTRON:
447     max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
448     break;
449     case OPT_MIN_COVERAGE_INTRON:
450     min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
451     break;
452     case OPT_MAX_COVERAGE_INTRON:
453     max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
454     break;
455     case OPT_MIN_SEGMENT_INTRON:
456     min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
457     break;
458     case OPT_MAX_SEGMENT_INTRON:
459     max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
460     break;
461     case OPT_MIN_REPORT_INTRON:
462     min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
463     break;
464     case OPT_MAX_REPORT_INTRON:
465     max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
466     break;
467     case OPT_MIN_ISOFORM_FRACTION:
468     min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
469     break;
470     case OPT_IUM_READS:
471     ium_reads = optarg;
472     break;
473     case OPT_SAM_HEADER:
474     sam_header = optarg;
475     break;
476     case OPT_SAM_READGROUP_ID:
477     sam_readgroup_id = optarg;
478     break;
479     case OPT_BUTTERFLY_SEARCH:
480     butterfly_search = true;
481     break;
482     case OPT_SOLEXA_QUALS:
483     solexa_quals = true;
484     break;
485     case OPT_PHRED64_QUALS:
486     phred64_quals = true;
487     break;
488     case 'Q':
489     case OPT_QUALS:
490     quals = true;
491     break;
492     case OPT_INTEGER_QUALS:
493     integer_quals = true;
494     break;
495     case 'C':
496     case OPT_COLOR:
497     color = true;
498     break;
499     case OPT_COLOR_OUT:
500     color_out = true;
501     break;
502     case OPT_LIBRARY_TYPE:
503     if (strcmp(optarg, "fr-unstranded") == 0)
504     library_type = FR_UNSTRANDED;
505     else if (strcmp(optarg, "fr-firststrand") == 0)
506     library_type = FR_FIRSTSTRAND;
507     else if (strcmp(optarg, "fr-secondstrand") == 0)
508     library_type = FR_SECONDSTRAND;
509     else if (strcmp(optarg, "ff-unstranded") == 0)
510     library_type = FF_UNSTRANDED;
511     else if (strcmp(optarg, "ff-firststrand") == 0)
512     library_type = FF_FIRSTSTRAND;
513     else if (strcmp(optarg, "ff-secondstrand") == 0)
514     library_type = FF_SECONDSTRAND;
515     break;
516     case OPT_MAX_DELETION_LENGTH:
517     max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
518     break;
519     case OPT_MAX_INSERTION_LENGTH:
520     max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
521     break;
522     case 'z':
523     case OPT_ZPACKER:
524     zpacker = optarg;
525     break;
526     case OPT_SAMTOOLS:
527     samtools_path = optarg;
528     break;
529     case OPT_AUX_OUT:
530     aux_outfile = optarg;
531     break;
532 gpertea 154 case OPT_INDEX_OUT:
533     index_outfile = optarg;
534     break;
535 gpertea 29 case 'p':
536 gpertea 154 case OPT_NUM_THREADS:
537     num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
538 gpertea 29 break;
539     case OPT_GTF_JUNCS:
540     gtf_juncs = optarg;
541     break;
542 gpertea 135 case OPT_FILTER_READS:
543     flt_reads = optarg;
544     break;
545     case OPT_FILTER_HITS:
546     flt_mappings = optarg;
547     break;
548 gpertea 154 case OPT_FUSION_SEARCH:
549     fusion_search = true;
550     break;
551     case OPT_FUSION_ANCHOR_LENGTH:
552     fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage);
553     break;
554     case OPT_FUSION_MIN_DIST:
555     fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage);
556     break;
557     case OPT_FUSION_READ_MISMATCHES:
558     fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage);
559     break;
560     case OPT_FUSION_MULTIREADS:
561     fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage);
562     break;
563     case OPT_FUSION_MULTIPAIRS:
564     fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage);
565     break;
566     case OPT_BOWTIE1:
567     bowtie2 = false;
568     break;
569     case OPT_BOWTIE2_MIN_SCORE:
570     bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage);
571     break;
572 gpertea 29 default:
573     print_usage();
574     return 1;
575     }
576     } while(next_option != -1);
577    
578     return 0;
579     }
580    
581    
582     // Error routine (prints error message and exits!)
583     void err_exit(const char* format,...){
584     va_list arguments;
585     va_start(arguments,format);
586     vfprintf(stderr,format,arguments);
587     va_end(arguments);
588     #ifdef DEBUG
589     // trigger a core dump for later inspection
590     abort();
591     #endif
592     exit(1);
593     }
594    
595     FILE* FZPipe::openRead(const char* fname, string& popencmd) {
596     pipecmd=popencmd;
597     filename=fname;
598     if (pipecmd.empty()) {
599     file=fopen(filename.c_str(), "r");
600     }
601     else {
602     string pcmd(pipecmd);
603     pcmd.append(" '");
604     pcmd.append(filename);
605     pcmd.append("'");
606     file=popen(pcmd.c_str(), "r");
607     }
608     return file;
609     }
610    
611     FILE* FZPipe::openRead(const char* fname) {
612     string pcmd;
613     return this->openRead(fname,pcmd);
614     }
615    
616     FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
617     pipecmd=popencmd;
618     filename=fname;
619     if (pipecmd.empty()) {
620     file=fopen(filename.c_str(), "w");
621     }
622     else {
623     string pcmd(pipecmd);
624     pcmd.append(" - > '");
625     pcmd.append(filename.c_str());
626     pcmd.append("'");
627     file=popen(pcmd.c_str(), "w");
628     }
629     return file;
630     }
631    
632    
633     FILE* FZPipe::openWrite(const char* fname) {
634     string pcmd;
635     return this->openWrite(fname,pcmd);
636     }
637    
638     void FZPipe::rewind() {
639     if (pipecmd.empty()) {
640     if (file!=NULL) {
641     ::rewind(file);
642     return;
643     }
644     if (!filename.empty()) {
645     file=fopen(filename.c_str(),"r");
646     return;
647     }
648     }
649     if (filename.empty())
650     err_die("Error: FZStream::rewind() failed (missing filename)!\n");
651     this->close();
652     string pcmd(pipecmd);
653     pcmd.append(" '");
654     pcmd.append(filename);
655     pcmd.append("'");
656     file=popen(pcmd.c_str(), "r");
657     if (file==NULL) {
658     err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
659     }
660     }
661    
662    
663     string getFext(const string& s) {
664     string r("");
665     //if (xpos!=NULL) *xpos=0;
666     if (s.empty() || s=="-") return r;
667     int slen=(int)s.length();
668     int p=s.rfind('.');
669     int d=s.rfind('/');
670     if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
671     r=s.substr(p+1);
672     //if (xpos!=NULL) *xpos=p+1;
673     for(size_t i=0; i!=r.length(); i++)
674     r[i] = std::tolower(r[i]);
675     return r;
676     }
677    
678     string guess_packer(const string& fname, bool use_all_cpus) {
679     //only needed for the primary input files (given by user)
680     string picmd("");
681     string fext=getFext(fname);
682 gpertea 32 if (fext=="bam") {
683     picmd="bam2fastx";
684     return picmd;
685     }
686 gpertea 29 if (fext=="gz" || fext=="gzip" || fext=="z") {
687     if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
688     picmd=zpacker;
689 gpertea 154 if (num_threads<2) picmd.append(" -p1");
690 gpertea 29 else {
691     picmd.append(" -p");
692 gpertea 154 str_appendInt(picmd, num_threads);
693 gpertea 29 //picmd.append(" -cd");
694     }
695     }
696     else picmd="gzip";
697     }
698     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
699     if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
700     picmd=zpacker;
701 gpertea 154 if (num_threads<2) picmd.append(" -p1");
702 gpertea 29 else {
703     picmd.append(" -p");
704 gpertea 154 str_appendInt(picmd, num_threads);
705 gpertea 29 //picmd.append(" -cd");
706     }
707     }
708     else picmd="bzip2";
709     }
710     return picmd;
711     }
712    
713     /*
714     string getBam2SamCmd(const string& fname) {
715     string pipecmd("");
716     string fext=getFext(fname);
717     if (fext=="bam") {
718     pipecmd=samtools_path;
719     pipecmd.append(" view");
720     }
721     return pipecmd;
722     }
723     */
724    
725     void err_die(const char* format,...) { // Error exit
726     va_list arguments;
727     va_start(arguments,format);
728     vfprintf(stderr,format,arguments);
729     va_end(arguments);
730     exit(1);
731     }
732    
733     string getUnpackCmd(const string& fname, bool use_all_cpus) {
734 gpertea 32 //prep_reads should use guess_packer() instead
735 gpertea 135 //otherwise compressed files MUST have the .z extension,
736     //as they are all internally generated
737 gpertea 29 string pipecmd("");
738 gpertea 32 string fext=getFext(fname);
739     if (fext=="bam") {
740     pipecmd="bam2fastx";
741     return pipecmd;
742     }
743     if (zpacker.empty() || fext!="z") {
744 gpertea 135 return pipecmd; //no packer used
745 gpertea 29 }
746     pipecmd=zpacker;
747     if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
748     if (use_all_cpus==false) pipecmd.append(" -p1");
749 gpertea 154 else if (num_threads>1) {
750 gpertea 29 pipecmd.append(" -p");
751 gpertea 154 str_appendInt(pipecmd,num_threads);
752 gpertea 29 }
753     }
754     if (!pipecmd.empty()) pipecmd.append(" -cd");
755     return pipecmd;
756     }
757    
758     void checkSamHeader() {
759     if (sam_header.empty())
760     err_die("Error: writeSamHeader() with empty sam_header string\n");
761     //copy the SAM header
762     FILE* fh=fopen(sam_header.c_str(), "r");
763     if (fh==NULL)
764     err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
765     fclose(fh);
766     }
767    
768     void writeSamHeader(FILE* fout) {
769     if (fout==NULL)
770     err_die("Error: writeSamHeader(NULL)\n");
771     checkSamHeader();
772     //copy the SAM header
773     FILE* fh=fopen(sam_header.c_str(), "r");
774     int ch=-1;
775     while ((ch=fgetc(fh))!=EOF) {
776     if (fputc(ch, fout)==EOF)
777     err_die("Error copying SAM header\n");
778     }
779     fclose(fh);
780     }
781    
782     //auxiliary functions for BAM record handling
783     uint8_t* realloc_bdata(bam1_t *b, int size) {
784     if (b->m_data < size) {
785     b->m_data = size;
786     kroundup32(b->m_data);
787     b->data = (uint8_t*)realloc(b->data, b->m_data);
788     }
789     if (b->data_len<size) b->data_len=size;
790     return b->data;
791     }
792    
793     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
794     //same as realloc_bdata, but does not free previous data
795     //but returns it instead
796     //it ALWAYS duplicates data
797     b->m_data = size;
798     kroundup32(b->m_data);
799     uint8_t* odata=b->data;
800     b->data = (uint8_t*)malloc(b->m_data);
801     memcpy((void*)b->data, (void*)odata, b->data_len);
802     b->data_len=size;
803     return odata; //user must FREE this after
804     }
805    
806     extern unsigned short bam_char2flag_table[];
807    
808     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
809     int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
810     novel=true;
811     b=bam_init1();
812     b->core.tid=gseq_tid;
813     if (pos<=0) {
814     b->core.pos=-1; //unmapped
815     //if (gseq_tid<0)
816     b->core.flag |= BAM_FUNMAP;
817     }
818     else b->core.pos=pos-1; //BAM is 0-based
819     b->core.qual=255;
820     int l_qseq=strlen(qseq);
821     //this may not be accurate, setting CIGAR is the correct way
822     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
823     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
824     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
825     set_cigar(cigar); //this will also set core.bin
826     add_sequence(qseq, l_qseq);
827     add_quals(quals); //quals must be given as Phred33
828     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
829     }
830    
831     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
832     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
833     int insert_size, const char* qseq, const char* quals,
834     const vector<string>* aux_strings) {
835     novel=true;
836     b=bam_init1();
837     b->core.tid=g_tid;
838     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
839     b->core.qual=map_qual;
840     int l_qseq=strlen(qseq);
841     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
842     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
843     set_cigar(cigar); //this will also set core.bin
844     add_sequence(qseq, l_qseq);
845     add_quals(quals); //quals must be given as Phred33
846     set_flags(flags);
847     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
848     if (aux_strings!=NULL) {
849     for (vector<string>::const_iterator itr=aux_strings->begin();
850     itr!=aux_strings->end(); ++itr) {
851     add_aux(itr->c_str());
852     }
853     }
854     }
855     void GBamRecord::set_cigar(const char* cigar) {
856     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
857     int doff=b->core.l_qname;
858     uint8_t* after_cigar=NULL;
859     int after_cigar_len=0;
860     uint8_t* prev_bdata=NULL;
861     if (b->data_len>doff) {
862     //cigar string already allocated, replace it
863     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
864     after_cigar=b->data+d;
865     after_cigar_len=b->data_len-d;
866     }
867     const char *s;
868     char *t;
869     int i, op;
870     long x;
871     b->core.n_cigar = 0;
872     if (cigar != NULL && strcmp(cigar, "*") != 0) {
873     for (s = cigar; *s; ++s) {
874     if (isalpha(*s)) b->core.n_cigar++;
875     else if (!isdigit(*s)) {
876     err_die("Error: invalid CIGAR character (%s)\n",cigar);
877     }
878     }
879     if (after_cigar_len>0) { //replace/insert into existing full data
880     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
881     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
882     free(prev_bdata);
883     }
884     else {
885     realloc_bdata(b, doff + b->core.n_cigar * 4);
886     }
887 gpertea 154 for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) {
888 gpertea 29 x = strtol(s, &t, 10);
889     op = toupper(*t);
890     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
891     else if (op == 'I') op = BAM_CINS;
892     else if (op == 'D') op = BAM_CDEL;
893     else if (op == 'N') op = BAM_CREF_SKIP;
894     else if (op == 'S') op = BAM_CSOFT_CLIP;
895     else if (op == 'H') op = BAM_CHARD_CLIP;
896     else if (op == 'P') op = BAM_CPAD;
897     else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
898     s = t + 1;
899     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
900     }
901     if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
902     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
903     } else {//no CIGAR string given
904     if (!(b->core.flag&BAM_FUNMAP)) {
905     fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
906     b->core.flag |= BAM_FUNMAP;
907     }
908     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
909     }
910     } //set_cigar()
911    
912     void GBamRecord::add_sequence(const char* qseq, int slen) {
913     //must be called AFTER set_cigar (cannot replace existing sequence for now)
914     if (qseq==NULL) return; //should we ever care about this?
915     if (slen<0) slen=strlen(qseq);
916     int doff = b->core.l_qname + b->core.n_cigar * 4;
917     if (strcmp(qseq, "*")!=0) {
918     b->core.l_qseq=slen;
919     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
920     err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
921     qseq);
922     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
923     //also allocated quals memory
924     memset(p, 0, (b->core.l_qseq+1)/2);
925     for (int i = 0; i < b->core.l_qseq; ++i)
926     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
927     } else b->core.l_qseq = 0;
928     }
929    
930     void GBamRecord::add_quals(const char* quals) {
931     //requires core.l_qseq already set
932     //and must be called AFTER add_sequence(), which also allocates the memory for quals
933     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
934     if (quals==NULL || strcmp(quals, "*") == 0) {
935     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
936     return;
937     }
938     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
939     }
940    
941     void GBamRecord::add_aux(const char* str) {
942 gpertea 154 // daehwan - this is not thread-safe - I made these varaiables as class members
943 gpertea 29 //requires: being called AFTER add_quals()
944 gpertea 154 // static char tag[2];
945     // static uint8_t abuf[512];
946 gpertea 29 //requires: being called AFTER add_quals()
947     int strl=strlen(str);
948     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
949     //int doff0=doff;
950     if (strl < 6 || str[2] != ':' || str[4] != ':')
951     parse_error("missing colon in auxiliary data");
952     tag[0] = str[0]; tag[1] = str[1];
953     uint8_t atype = str[3];
954     uint8_t* adata=abuf;
955     int alen=0;
956     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
957     atype='A';
958     alen=1;
959     adata=(uint8_t*)&str[5];
960     }
961     else if (atype == 'I' || atype == 'i') {
962     long long x=(long long)atoll(str + 5);
963     if (x < 0) {
964     if (x >= -127) {
965     atype='c';
966     abuf[0] = (int8_t)x;
967     alen=1;
968     }
969     else if (x >= -32767) {
970     atype = 's';
971     *(int16_t*)abuf = (int16_t)x;
972     alen=2;
973     }
974     else {
975     atype='i';
976     *(int32_t*)abuf = (int32_t)x;
977     alen=4;
978     if (x < -2147483648ll)
979     fprintf(stderr, "Parse warning: integer %lld is out of range.",
980     x);
981     }
982     } else { //x >=0
983     if (x <= 255) {
984     atype = 'C';
985     abuf[0] = (uint8_t)x;
986     alen=1;
987     }
988     else if (x <= 65535) {
989     atype='S';
990     *(uint16_t*)abuf = (uint16_t)x;
991     alen=2;
992     }
993     else {
994     atype='I';
995     *(uint32_t*)abuf = (uint32_t)x;
996     alen=4;
997     if (x > 4294967295ll)
998     fprintf(stderr, "Parse warning: integer %lld is out of range.",
999     x);
1000     }
1001     }
1002     } //integer type
1003     else if (atype == 'f') {
1004     *(float*)abuf = (float)atof(str + 5);
1005     alen = sizeof(float);
1006     }
1007     else if (atype == 'd') { //?
1008     *(float*)abuf = (float)atof(str + 9);
1009     alen=8;
1010     }
1011     else if (atype == 'Z' || atype == 'H') {
1012     if (atype == 'H') { // check whether the hex string is valid
1013     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
1014     for (int i = 0; i < strl - 5; ++i) {
1015     int c = toupper(str[5 + i]);
1016     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
1017     parse_error("invalid hex character");
1018     }
1019     }
1020     memcpy(abuf, str + 5, strl - 5);
1021     abuf[strl-5] = 0;
1022     alen=strl-4;
1023     } else parse_error("unrecognized aux type");
1024     this->add_aux(tag, atype, alen, adata);
1025     }//add_aux()