ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (8 years, 9 months ago) by gpertea
File size: 34750 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 29 /*
2     * common.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/26/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <iostream>
15     #include <sstream>
16     #include <cstdarg>
17 gpertea 154 #include <limits>
18 gpertea 29 #include <getopt.h>
19    
20     #include "common.h"
21    
22     using namespace std;
23    
24     #ifdef MEM_DEBUG
25     //function for debugging memory usage of current program in Linux
26    
27     #include <unistd.h>
28     #include <ios>
29     #include <fstream>
30    
31     //////////////////////////////////////////////////////////////////////////////
32     // process_mem_usage(double &, double &) - takes two doubles by reference,
33     // attempts to read the system-dependent data for a process' virtual memory
34     // size and resident set size, and return the results in KB.
35     //
36     // On failure, returns 0.0, 0.0
37    
38     void process_mem_usage(double& vm_usage, double& resident_set) {
39     using std::ios_base;
40     using std::ifstream;
41     using std::string;
42     vm_usage = 0.0;
43     resident_set = 0.0;
44     // 'file' stat seems to give the most reliable results
45     ifstream stat_stream("/proc/self/stat",ios_base::in);
46     // dummy vars for leading entries in stat that we don't care about
47     string pid, comm, state, ppid, pgrp, session, tty_nr;
48     string tpgid, flags, minflt, cminflt, majflt, cmajflt;
49     string utime, stime, cutime, cstime, priority, nice;
50     string O, itrealvalue, starttime;
51    
52     // the two fields we want
53     //
54     unsigned long vsize;
55     long rss;
56    
57     stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
58     >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
59     >> utime >> stime >> cutime >> cstime >> priority >> nice
60     >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
61    
62     stat_stream.close();
63    
64     long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
65     vm_usage = vsize / 1024.0;
66     resident_set = rss * page_size_kb;
67     }
68    
69     void print_mem_usage() {
70     double vs, rs;
71     process_mem_usage(vs,rs);
72     vs/=1024;
73     rs/=1024;
74     fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
75     }
76     #endif
77    
78 gpertea 154 bool bowtie2 = true;
79     int bowtie2_min_score = -10;
80 gpertea 159 int bowtie2_max_penalty = 6;
81     int bowtie2_min_penalty = 2;
82     int bowtie2_penalty_for_N = 1;
83     int bowtie2_read_gap_open = 5;
84     int bowtie2_read_gap_cont = 3;
85     int bowtie2_ref_gap_open = 5;
86     int bowtie2_ref_gap_cont = 3;
87 gpertea 29
88 gpertea 154 // daehwan - temporary
89     bool parallel = true;
90    
91 gpertea 29 unsigned int max_insertion_length = 3;
92     unsigned int max_deletion_length = 3;
93    
94    
95     int inner_dist_mean = 200;
96     int inner_dist_std_dev = 20;
97     int max_mate_inner_dist = -1;
98    
99     int min_anchor_len = 8;
100     int min_report_intron_length = 50;
101     int max_report_intron_length = 500000;
102    
103     int min_closure_intron_length = 50;
104     int max_closure_intron_length = 5000;
105    
106     int min_coverage_intron_length = 50;
107     int max_coverage_intron_length = 20000;
108    
109     int min_segment_intron_length = 50;
110     int max_segment_intron_length = 500000;
111    
112     uint32_t min_closure_exon_length = 100;
113    
114     int island_extension = 25;
115     int segment_length = 25;
116     int segment_mismatches = 2;
117 gpertea 135 int max_read_mismatches = 2;
118 gpertea 29 int max_splice_mismatches = 1;
119    
120     ReadFormat reads_format = FASTQ;
121    
122     bool verbose = false;
123    
124 gpertea 159 unsigned int max_multihits = 20;
125     unsigned int max_seg_multihits = 40;
126 gpertea 29 bool no_closure_search = false;
127     bool no_coverage_search = false;
128     bool no_microexon_search = false;
129     bool butterfly_search = false;
130 gpertea 154 int num_threads = 1;
131    
132 gpertea 29 float min_isoform_fraction = 0.15f;
133    
134     string output_dir = "tophat_out";
135     string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
136 gpertea 154 string index_outfile = "";
137 gpertea 29 string gene_filter = "";
138     string gff_file = "";
139     string ium_reads = "";
140     string sam_header = "";
141     string sam_readgroup_id = "";
142     string zpacker = "";
143     string samtools_path = "samtools";
144    
145     bool solexa_quals = false;
146     bool phred64_quals = false;
147     bool quals = false;
148     bool integer_quals = false;
149     bool color = false;
150     string gtf_juncs = "";
151    
152 gpertea 159 bool report_secondary_alignments = false;
153    
154 gpertea 135 string flt_reads = "";
155     string flt_mappings = "";
156    
157 gpertea 154 bool fusion_search = false;
158     size_t fusion_anchor_length = 20;
159     size_t fusion_min_dist = 10000000;
160     size_t fusion_read_mismatches = 2;
161     size_t fusion_multireads = 2;
162     size_t fusion_multipairs = 2;
163    
164 gpertea 29 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
165    
166     extern void print_usage();
167    
168     /**
169     * Parse an int out of optarg and enforce that it be at least 'lower';
170     * if it is less than 'lower', than output the given error message and
171     * exit with an error and a usage message.
172     */
173    
174     int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
175     long l;
176     char *endPtr= NULL;
177     l = strtol(optarg, &endPtr, 10);
178     if (endPtr != NULL) {
179     if (l < lower) {
180     cerr << errmsg << endl;
181     print_usage();
182     exit(1);
183     }
184     return (int32_t)l;
185     }
186     cerr << errmsg << endl;
187     print_usage();
188     exit(1);
189     return -1;
190     }
191    
192     /**
193     * Parse an int out of optarg and enforce that it be at least 'lower';
194     * if it is less than 'lower', than output the given error message and
195     * exit with an error and a usage message.
196     */
197     static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
198     float l;
199     l = (float)atof(optarg);
200    
201     if (l < lower) {
202     cerr << errmsg << endl;
203     print_usage();
204     exit(1);
205     }
206    
207     if (l > upper)
208     {
209     cerr << errmsg << endl;
210     print_usage();
211     exit(1);
212     }
213    
214     return l;
215    
216     cerr << errmsg << endl;
217     print_usage();
218     exit(1);
219     return -1;
220     }
221    
222     /*
223     this is from
224     http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
225     */
226     char* get_token(char** str, const char* delims)
227     {
228     char* token;
229     if (*str == NULL)
230     return NULL;
231    
232     token = *str;
233     while (**str != '\0')
234     {
235     if (strchr(delims, **str) != NULL)
236     {
237     **str = '\0';
238     ++(*str);
239     return token;
240     }
241    
242     ++(*str);
243     }
244    
245     *str = NULL;
246     return token;
247     }
248    
249 gpertea 135 const char *short_options = "QCp:z:N:";
250 gpertea 29
251     enum
252     {
253     OPT_FASTA = 127,
254     OPT_FASTQ,
255     OPT_MIN_ANCHOR,
256     OPT_SPLICE_MISMATCHES,
257     OPT_VERBOSE,
258     OPT_INSERT_LENGTH_MEAN,
259     OPT_INSERT_LENGTH_STD_DEV,
260     OPT_MIN_ISOFORM_FRACTION,
261     OPT_OUTPUT_DIR,
262     OPT_GENE_FILTER,
263     OPT_GFF_ANNOTATIONS,
264     OPT_MAX_MULTIHITS,
265 gpertea 159 OPT_MAX_SEG_MULTIHITS,
266 gpertea 29 OPT_NO_CLOSURE_SEARCH,
267     OPT_NO_COVERAGE_SEARCH,
268     OPT_NO_MICROEXON_SEARCH,
269     OPT_SEGMENT_LENGTH,
270 gpertea 135 OPT_READ_MISMATCHES,
271 gpertea 29 OPT_SEGMENT_MISMATCHES,
272     OPT_MIN_CLOSURE_EXON,
273     OPT_MAX_CLOSURE_INTRON,
274     OPT_MIN_CLOSURE_INTRON,
275     OPT_MAX_COVERAGE_INTRON,
276     OPT_MIN_COVERAGE_INTRON,
277     OPT_MIN_SEGMENT_INTRON,
278     OPT_MAX_SEGMENT_INTRON,
279     OPT_MIN_REPORT_INTRON,
280     OPT_MAX_REPORT_INTRON,
281     OPT_IUM_READS,
282     OPT_BUTTERFLY_SEARCH,
283     OPT_SOLEXA_QUALS,
284     OPT_PHRED64_QUALS,
285     OPT_SAM_HEADER,
286     OPT_SAM_READGROUP_ID,
287     OPT_QUALS,
288     OPT_INTEGER_QUALS,
289     OPT_COLOR,
290     OPT_LIBRARY_TYPE,
291     OPT_MAX_DELETION_LENGTH,
292     OPT_MAX_INSERTION_LENGTH,
293 gpertea 154 OPT_NUM_THREADS,
294 gpertea 29 OPT_ZPACKER,
295     OPT_SAMTOOLS,
296     OPT_AUX_OUT,
297 gpertea 154 OPT_INDEX_OUT,
298 gpertea 135 OPT_GTF_JUNCS,
299     OPT_FILTER_READS,
300 gpertea 154 OPT_FILTER_HITS,
301 gpertea 159 OPT_REPORT_SECONDARY_ALIGNMENTS,
302 gpertea 154 OPT_FUSION_SEARCH,
303     OPT_FUSION_ANCHOR_LENGTH,
304     OPT_FUSION_MIN_DIST,
305     OPT_FUSION_READ_MISMATCHES,
306     OPT_FUSION_MULTIREADS,
307     OPT_FUSION_MULTIPAIRS,
308     OPT_BOWTIE1,
309     OPT_BOWTIE2_MIN_SCORE,
310 gpertea 159 OPT_BOWTIE2_MAX_PENALTY,
311     OPT_BOWTIE2_MIN_PENALTY,
312     OPT_BOWTIE2_PENALTY_FOR_N,
313     OPT_BOWTIE2_READ_GAP_OPEN,
314     OPT_BOWTIE2_READ_GAP_CONT,
315     OPT_BOWTIE2_REF_GAP_OPEN,
316     OPT_BOWTIE2_REF_GAP_CONT,
317 gpertea 29 };
318    
319     static struct option long_options[] = {
320     {"fasta", no_argument, 0, OPT_FASTA},
321     {"fastq", no_argument, 0, OPT_FASTQ},
322     {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
323     {"sam-header", required_argument, 0, OPT_SAM_HEADER},
324     {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
325     {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
326     {"verbose", no_argument, 0, OPT_VERBOSE},
327     {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
328     {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
329     {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
330     {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
331     {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
332     {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
333 gpertea 159 {"max-seg-multihits", required_argument, 0, OPT_MAX_SEG_MULTIHITS},
334 gpertea 29 {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
335     {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
336     {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
337     {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
338     {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
339 gpertea 135 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
340 gpertea 29 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
341     {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
342     {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
343     {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
344     {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
345     {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
346     {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
347     {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
348     {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
349     {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
350     {"ium-reads", required_argument, 0, OPT_IUM_READS},
351     {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
352     {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
353     {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
354     {"quals", no_argument, 0, OPT_QUALS},
355     {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
356     {"color", no_argument, 0, OPT_COLOR},
357     {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
358     {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
359     {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
360 gpertea 154 {"num-threads", required_argument, 0, OPT_NUM_THREADS},
361 gpertea 29 {"zpacker", required_argument, 0, OPT_ZPACKER},
362     {"samtools", required_argument, 0, OPT_SAMTOOLS},
363     {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
364 gpertea 154 {"index-outfile", required_argument, 0, OPT_INDEX_OUT},
365 gpertea 29 {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
366 gpertea 135 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
367     {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
368 gpertea 159 {"report-secondary-alignments", no_argument, 0, OPT_REPORT_SECONDARY_ALIGNMENTS},
369 gpertea 154 {"fusion-search", no_argument, 0, OPT_FUSION_SEARCH},
370     {"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH},
371     {"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST},
372     {"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES},
373     {"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS},
374     {"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS},
375     {"bowtie1", no_argument, 0, OPT_BOWTIE1},
376     {"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE},
377 gpertea 159 {"bowtie2-max-penalty", required_argument, 0, OPT_BOWTIE2_MAX_PENALTY},
378     {"bowtie2-min-penalty", required_argument, 0, OPT_BOWTIE2_MIN_PENALTY},
379     {"bowtie2-penalty-for-N", required_argument, 0, OPT_BOWTIE2_PENALTY_FOR_N},
380     {"bowtie2-read-gap-open", required_argument, 0, OPT_BOWTIE2_READ_GAP_OPEN},
381     {"bowtie2-read-gap-cont", required_argument, 0, OPT_BOWTIE2_READ_GAP_CONT},
382     {"bowtie2-ref-gap-open", required_argument, 0, OPT_BOWTIE2_REF_GAP_OPEN},
383     {"bowtie2-ref-gap-cont", required_argument, 0, OPT_BOWTIE2_REF_GAP_CONT},
384 gpertea 29 {0, 0, 0, 0} // terminator
385     };
386    
387    
388     void str_appendInt(string& str, int v) {
389     stringstream ss;
390     ss << v;
391     str.append(ss.str());
392     }
393    
394     bool str_endsWith(string& str, const char* suffix) {
395     if (str.empty() || str.length()<3) return false;
396     size_t l=strlen(suffix);
397     if (str.length()<=l) return false;
398     if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
399     return false;
400     }
401    
402     int parse_options(int argc, char** argv, void (*print_usage)())
403     {
404     int option_index = 0;
405     int next_option;
406     do {
407     next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
408     switch (next_option) {
409     case -1:
410     break;
411     case OPT_FASTA:
412     reads_format = FASTA;
413     break;
414     case OPT_FASTQ:
415     reads_format = FASTQ;
416     break;
417     case OPT_MIN_ANCHOR:
418     min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
419     break;
420     case OPT_SPLICE_MISMATCHES:
421     max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
422     break;
423     case OPT_VERBOSE:
424     verbose = true;
425     break;
426     case OPT_INSERT_LENGTH_MEAN:
427     inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
428     break;
429     case OPT_INSERT_LENGTH_STD_DEV:
430     inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
431     break;
432     case OPT_OUTPUT_DIR:
433     output_dir = optarg;
434     break;
435     case OPT_GENE_FILTER:
436     gene_filter = optarg;
437     break;
438     case OPT_GFF_ANNOTATIONS:
439     gff_file = optarg;
440     break;
441     case OPT_MAX_MULTIHITS:
442     max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
443     break;
444 gpertea 159 case OPT_MAX_SEG_MULTIHITS:
445     max_seg_multihits = parseIntOpt(1, "--max-seg-multihits arg must be at least 1", print_usage);
446     break;
447 gpertea 29 case OPT_NO_CLOSURE_SEARCH:
448     no_closure_search = true;
449     break;
450     case OPT_NO_COVERAGE_SEARCH:
451     no_coverage_search = true;
452     break;
453     case OPT_NO_MICROEXON_SEARCH:
454     no_microexon_search = true;
455     break;
456     case OPT_SEGMENT_LENGTH:
457     segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
458     break;
459     case OPT_SEGMENT_MISMATCHES:
460     segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
461     break;
462 gpertea 135 case 'N':
463     case OPT_READ_MISMATCHES:
464     max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
465     break;
466 gpertea 29 case OPT_MIN_CLOSURE_EXON:
467     min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
468     break;
469     case OPT_MIN_CLOSURE_INTRON:
470     min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
471     break;
472     case OPT_MAX_CLOSURE_INTRON:
473     max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
474     break;
475     case OPT_MIN_COVERAGE_INTRON:
476     min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
477     break;
478     case OPT_MAX_COVERAGE_INTRON:
479     max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
480     break;
481     case OPT_MIN_SEGMENT_INTRON:
482     min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
483     break;
484     case OPT_MAX_SEGMENT_INTRON:
485     max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
486     break;
487     case OPT_MIN_REPORT_INTRON:
488     min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
489     break;
490     case OPT_MAX_REPORT_INTRON:
491     max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
492     break;
493     case OPT_MIN_ISOFORM_FRACTION:
494     min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
495     break;
496     case OPT_IUM_READS:
497     ium_reads = optarg;
498     break;
499     case OPT_SAM_HEADER:
500     sam_header = optarg;
501     break;
502     case OPT_SAM_READGROUP_ID:
503     sam_readgroup_id = optarg;
504     break;
505     case OPT_BUTTERFLY_SEARCH:
506     butterfly_search = true;
507     break;
508     case OPT_SOLEXA_QUALS:
509     solexa_quals = true;
510     break;
511     case OPT_PHRED64_QUALS:
512     phred64_quals = true;
513     break;
514     case 'Q':
515     case OPT_QUALS:
516     quals = true;
517     break;
518     case OPT_INTEGER_QUALS:
519     integer_quals = true;
520     break;
521     case 'C':
522     case OPT_COLOR:
523     color = true;
524     break;
525     case OPT_LIBRARY_TYPE:
526     if (strcmp(optarg, "fr-unstranded") == 0)
527     library_type = FR_UNSTRANDED;
528     else if (strcmp(optarg, "fr-firststrand") == 0)
529     library_type = FR_FIRSTSTRAND;
530     else if (strcmp(optarg, "fr-secondstrand") == 0)
531     library_type = FR_SECONDSTRAND;
532     else if (strcmp(optarg, "ff-unstranded") == 0)
533     library_type = FF_UNSTRANDED;
534     else if (strcmp(optarg, "ff-firststrand") == 0)
535     library_type = FF_FIRSTSTRAND;
536     else if (strcmp(optarg, "ff-secondstrand") == 0)
537     library_type = FF_SECONDSTRAND;
538     break;
539     case OPT_MAX_DELETION_LENGTH:
540     max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
541     break;
542     case OPT_MAX_INSERTION_LENGTH:
543     max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
544     break;
545     case 'z':
546     case OPT_ZPACKER:
547     zpacker = optarg;
548     break;
549     case OPT_SAMTOOLS:
550     samtools_path = optarg;
551     break;
552     case OPT_AUX_OUT:
553     aux_outfile = optarg;
554     break;
555 gpertea 154 case OPT_INDEX_OUT:
556     index_outfile = optarg;
557     break;
558 gpertea 29 case 'p':
559 gpertea 154 case OPT_NUM_THREADS:
560     num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
561 gpertea 29 break;
562     case OPT_GTF_JUNCS:
563     gtf_juncs = optarg;
564     break;
565 gpertea 135 case OPT_FILTER_READS:
566     flt_reads = optarg;
567     break;
568     case OPT_FILTER_HITS:
569     flt_mappings = optarg;
570     break;
571 gpertea 159 case OPT_REPORT_SECONDARY_ALIGNMENTS:
572     report_secondary_alignments = true;
573     break;
574 gpertea 154 case OPT_FUSION_SEARCH:
575     fusion_search = true;
576     break;
577     case OPT_FUSION_ANCHOR_LENGTH:
578     fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage);
579     break;
580     case OPT_FUSION_MIN_DIST:
581     fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage);
582     break;
583     case OPT_FUSION_READ_MISMATCHES:
584     fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage);
585     break;
586     case OPT_FUSION_MULTIREADS:
587     fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage);
588     break;
589     case OPT_FUSION_MULTIPAIRS:
590     fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage);
591     break;
592     case OPT_BOWTIE1:
593     bowtie2 = false;
594     break;
595     case OPT_BOWTIE2_MIN_SCORE:
596     bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage);
597     break;
598 gpertea 159 case OPT_BOWTIE2_MAX_PENALTY:
599     bowtie2_max_penalty = parseIntOpt(0, "--bowtie2-max-penalty must be at least 0", print_usage);
600     break;
601     case OPT_BOWTIE2_MIN_PENALTY:
602     bowtie2_min_penalty = parseIntOpt(0, "--bowtie2-min-penalty must be at least 0", print_usage);
603     break;
604     case OPT_BOWTIE2_PENALTY_FOR_N:
605     bowtie2_penalty_for_N = parseIntOpt(0, "--bowtie2-penalty-for-N must be at least 0", print_usage);
606     break;
607     case OPT_BOWTIE2_READ_GAP_OPEN:
608     bowtie2_read_gap_open = parseIntOpt(0, "--bowtie2-read-gap-open must be at least 0", print_usage);
609     break;
610     case OPT_BOWTIE2_READ_GAP_CONT:
611     bowtie2_read_gap_cont = parseIntOpt(0, "--bowtie2-read-gap-cont must be at least 0", print_usage);
612     break;
613     case OPT_BOWTIE2_REF_GAP_OPEN:
614     bowtie2_ref_gap_open = parseIntOpt(0, "--bowtie2-ref-gap-open must be at least 0", print_usage);
615     break;
616     case OPT_BOWTIE2_REF_GAP_CONT:
617     bowtie2_ref_gap_cont = parseIntOpt(0, "--bowtie2-ref-gap-cont must be at least 0", print_usage);
618     break;
619 gpertea 29 default:
620     print_usage();
621     return 1;
622     }
623     } while(next_option != -1);
624    
625     return 0;
626     }
627    
628    
629     // Error routine (prints error message and exits!)
630     void err_exit(const char* format,...){
631     va_list arguments;
632     va_start(arguments,format);
633     vfprintf(stderr,format,arguments);
634     va_end(arguments);
635     #ifdef DEBUG
636     // trigger a core dump for later inspection
637     abort();
638     #endif
639     exit(1);
640     }
641    
642     FILE* FZPipe::openRead(const char* fname, string& popencmd) {
643     pipecmd=popencmd;
644     filename=fname;
645     if (pipecmd.empty()) {
646     file=fopen(filename.c_str(), "r");
647     }
648     else {
649     string pcmd(pipecmd);
650     pcmd.append(" '");
651     pcmd.append(filename);
652     pcmd.append("'");
653     file=popen(pcmd.c_str(), "r");
654     }
655     return file;
656     }
657    
658     FILE* FZPipe::openRead(const char* fname) {
659     string pcmd;
660     return this->openRead(fname,pcmd);
661     }
662    
663     FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
664     pipecmd=popencmd;
665     filename=fname;
666     if (pipecmd.empty()) {
667     file=fopen(filename.c_str(), "w");
668     }
669     else {
670     string pcmd(pipecmd);
671     pcmd.append(" - > '");
672     pcmd.append(filename.c_str());
673     pcmd.append("'");
674     file=popen(pcmd.c_str(), "w");
675     }
676     return file;
677     }
678    
679    
680     FILE* FZPipe::openWrite(const char* fname) {
681     string pcmd;
682     return this->openWrite(fname,pcmd);
683     }
684    
685     void FZPipe::rewind() {
686     if (pipecmd.empty()) {
687     if (file!=NULL) {
688     ::rewind(file);
689     return;
690     }
691     if (!filename.empty()) {
692     file=fopen(filename.c_str(),"r");
693     return;
694     }
695     }
696     if (filename.empty())
697     err_die("Error: FZStream::rewind() failed (missing filename)!\n");
698     this->close();
699     string pcmd(pipecmd);
700     pcmd.append(" '");
701     pcmd.append(filename);
702     pcmd.append("'");
703     file=popen(pcmd.c_str(), "r");
704     if (file==NULL) {
705     err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
706     }
707     }
708    
709    
710     string getFext(const string& s) {
711     string r("");
712     //if (xpos!=NULL) *xpos=0;
713     if (s.empty() || s=="-") return r;
714     int slen=(int)s.length();
715     int p=s.rfind('.');
716     int d=s.rfind('/');
717     if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
718     r=s.substr(p+1);
719     //if (xpos!=NULL) *xpos=p+1;
720     for(size_t i=0; i!=r.length(); i++)
721     r[i] = std::tolower(r[i]);
722     return r;
723     }
724    
725     string guess_packer(const string& fname, bool use_all_cpus) {
726     //only needed for the primary input files (given by user)
727     string picmd("");
728     string fext=getFext(fname);
729 gpertea 32 if (fext=="bam") {
730     picmd="bam2fastx";
731     return picmd;
732     }
733 gpertea 29 if (fext=="gz" || fext=="gzip" || fext=="z") {
734     if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
735     picmd=zpacker;
736 gpertea 154 if (num_threads<2) picmd.append(" -p1");
737 gpertea 29 else {
738     picmd.append(" -p");
739 gpertea 154 str_appendInt(picmd, num_threads);
740 gpertea 29 //picmd.append(" -cd");
741     }
742     }
743     else picmd="gzip";
744     }
745     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
746     if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
747     picmd=zpacker;
748 gpertea 154 if (num_threads<2) picmd.append(" -p1");
749 gpertea 29 else {
750     picmd.append(" -p");
751 gpertea 154 str_appendInt(picmd, num_threads);
752 gpertea 29 //picmd.append(" -cd");
753     }
754     }
755     else picmd="bzip2";
756     }
757     return picmd;
758     }
759    
760     /*
761     string getBam2SamCmd(const string& fname) {
762     string pipecmd("");
763     string fext=getFext(fname);
764     if (fext=="bam") {
765     pipecmd=samtools_path;
766     pipecmd.append(" view");
767     }
768     return pipecmd;
769     }
770     */
771    
772     void err_die(const char* format,...) { // Error exit
773     va_list arguments;
774     va_start(arguments,format);
775     vfprintf(stderr,format,arguments);
776     va_end(arguments);
777     exit(1);
778     }
779    
780     string getUnpackCmd(const string& fname, bool use_all_cpus) {
781 gpertea 32 //prep_reads should use guess_packer() instead
782 gpertea 135 //otherwise compressed files MUST have the .z extension,
783     //as they are all internally generated
784 gpertea 29 string pipecmd("");
785 gpertea 32 string fext=getFext(fname);
786     if (fext=="bam") {
787     pipecmd="bam2fastx";
788     return pipecmd;
789     }
790     if (zpacker.empty() || fext!="z") {
791 gpertea 135 return pipecmd; //no packer used
792 gpertea 29 }
793     pipecmd=zpacker;
794     if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
795     if (use_all_cpus==false) pipecmd.append(" -p1");
796 gpertea 154 else if (num_threads>1) {
797 gpertea 29 pipecmd.append(" -p");
798 gpertea 154 str_appendInt(pipecmd,num_threads);
799 gpertea 29 }
800     }
801     if (!pipecmd.empty()) pipecmd.append(" -cd");
802     return pipecmd;
803     }
804    
805     void checkSamHeader() {
806     if (sam_header.empty())
807     err_die("Error: writeSamHeader() with empty sam_header string\n");
808     //copy the SAM header
809     FILE* fh=fopen(sam_header.c_str(), "r");
810     if (fh==NULL)
811     err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
812     fclose(fh);
813     }
814    
815     void writeSamHeader(FILE* fout) {
816     if (fout==NULL)
817     err_die("Error: writeSamHeader(NULL)\n");
818     checkSamHeader();
819     //copy the SAM header
820     FILE* fh=fopen(sam_header.c_str(), "r");
821     int ch=-1;
822     while ((ch=fgetc(fh))!=EOF) {
823     if (fputc(ch, fout)==EOF)
824     err_die("Error copying SAM header\n");
825     }
826     fclose(fh);
827     }
828    
829     //auxiliary functions for BAM record handling
830     uint8_t* realloc_bdata(bam1_t *b, int size) {
831     if (b->m_data < size) {
832     b->m_data = size;
833     kroundup32(b->m_data);
834     b->data = (uint8_t*)realloc(b->data, b->m_data);
835     }
836     if (b->data_len<size) b->data_len=size;
837     return b->data;
838     }
839    
840     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
841     //same as realloc_bdata, but does not free previous data
842     //but returns it instead
843     //it ALWAYS duplicates data
844     b->m_data = size;
845     kroundup32(b->m_data);
846     uint8_t* odata=b->data;
847     b->data = (uint8_t*)malloc(b->m_data);
848     memcpy((void*)b->data, (void*)odata, b->data_len);
849     b->data_len=size;
850     return odata; //user must FREE this after
851     }
852    
853     extern unsigned short bam_char2flag_table[];
854    
855     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
856     int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
857     novel=true;
858     b=bam_init1();
859     b->core.tid=gseq_tid;
860     if (pos<=0) {
861     b->core.pos=-1; //unmapped
862     //if (gseq_tid<0)
863     b->core.flag |= BAM_FUNMAP;
864     }
865     else b->core.pos=pos-1; //BAM is 0-based
866     b->core.qual=255;
867     int l_qseq=strlen(qseq);
868     //this may not be accurate, setting CIGAR is the correct way
869     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
870     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
871     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
872     set_cigar(cigar); //this will also set core.bin
873     add_sequence(qseq, l_qseq);
874     add_quals(quals); //quals must be given as Phred33
875     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
876     }
877    
878     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
879     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
880     int insert_size, const char* qseq, const char* quals,
881     const vector<string>* aux_strings) {
882     novel=true;
883     b=bam_init1();
884     b->core.tid=g_tid;
885     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
886     b->core.qual=map_qual;
887     int l_qseq=strlen(qseq);
888     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
889     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
890     set_cigar(cigar); //this will also set core.bin
891     add_sequence(qseq, l_qseq);
892     add_quals(quals); //quals must be given as Phred33
893     set_flags(flags);
894     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
895     if (aux_strings!=NULL) {
896     for (vector<string>::const_iterator itr=aux_strings->begin();
897     itr!=aux_strings->end(); ++itr) {
898     add_aux(itr->c_str());
899     }
900     }
901     }
902     void GBamRecord::set_cigar(const char* cigar) {
903     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
904     int doff=b->core.l_qname;
905     uint8_t* after_cigar=NULL;
906     int after_cigar_len=0;
907     uint8_t* prev_bdata=NULL;
908     if (b->data_len>doff) {
909     //cigar string already allocated, replace it
910     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
911     after_cigar=b->data+d;
912     after_cigar_len=b->data_len-d;
913     }
914     const char *s;
915     char *t;
916     int i, op;
917     long x;
918     b->core.n_cigar = 0;
919     if (cigar != NULL && strcmp(cigar, "*") != 0) {
920     for (s = cigar; *s; ++s) {
921     if (isalpha(*s)) b->core.n_cigar++;
922     else if (!isdigit(*s)) {
923     err_die("Error: invalid CIGAR character (%s)\n",cigar);
924     }
925     }
926     if (after_cigar_len>0) { //replace/insert into existing full data
927     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
928     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
929     free(prev_bdata);
930     }
931     else {
932     realloc_bdata(b, doff + b->core.n_cigar * 4);
933     }
934 gpertea 154 for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) {
935 gpertea 29 x = strtol(s, &t, 10);
936     op = toupper(*t);
937     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
938     else if (op == 'I') op = BAM_CINS;
939     else if (op == 'D') op = BAM_CDEL;
940     else if (op == 'N') op = BAM_CREF_SKIP;
941     else if (op == 'S') op = BAM_CSOFT_CLIP;
942     else if (op == 'H') op = BAM_CHARD_CLIP;
943     else if (op == 'P') op = BAM_CPAD;
944     else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
945     s = t + 1;
946     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
947     }
948     if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
949     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
950     } else {//no CIGAR string given
951     if (!(b->core.flag&BAM_FUNMAP)) {
952     fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
953     b->core.flag |= BAM_FUNMAP;
954     }
955     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
956     }
957     } //set_cigar()
958    
959     void GBamRecord::add_sequence(const char* qseq, int slen) {
960     //must be called AFTER set_cigar (cannot replace existing sequence for now)
961     if (qseq==NULL) return; //should we ever care about this?
962     if (slen<0) slen=strlen(qseq);
963     int doff = b->core.l_qname + b->core.n_cigar * 4;
964     if (strcmp(qseq, "*")!=0) {
965     b->core.l_qseq=slen;
966     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
967     err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
968     qseq);
969     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
970     //also allocated quals memory
971     memset(p, 0, (b->core.l_qseq+1)/2);
972     for (int i = 0; i < b->core.l_qseq; ++i)
973     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
974     } else b->core.l_qseq = 0;
975     }
976    
977     void GBamRecord::add_quals(const char* quals) {
978     //requires core.l_qseq already set
979     //and must be called AFTER add_sequence(), which also allocates the memory for quals
980     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
981     if (quals==NULL || strcmp(quals, "*") == 0) {
982     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
983     return;
984     }
985     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
986     }
987    
988     void GBamRecord::add_aux(const char* str) {
989     //requires: being called AFTER add_quals()
990 gpertea 154 // static char tag[2];
991     // static uint8_t abuf[512];
992 gpertea 29 //requires: being called AFTER add_quals()
993     int strl=strlen(str);
994     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
995     //int doff0=doff;
996     if (strl < 6 || str[2] != ':' || str[4] != ':')
997     parse_error("missing colon in auxiliary data");
998     tag[0] = str[0]; tag[1] = str[1];
999     uint8_t atype = str[3];
1000     uint8_t* adata=abuf;
1001     int alen=0;
1002     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
1003     atype='A';
1004     alen=1;
1005     adata=(uint8_t*)&str[5];
1006     }
1007     else if (atype == 'I' || atype == 'i') {
1008     long long x=(long long)atoll(str + 5);
1009     if (x < 0) {
1010     if (x >= -127) {
1011     atype='c';
1012     abuf[0] = (int8_t)x;
1013     alen=1;
1014     }
1015     else if (x >= -32767) {
1016     atype = 's';
1017     *(int16_t*)abuf = (int16_t)x;
1018     alen=2;
1019     }
1020     else {
1021     atype='i';
1022     *(int32_t*)abuf = (int32_t)x;
1023     alen=4;
1024     if (x < -2147483648ll)
1025     fprintf(stderr, "Parse warning: integer %lld is out of range.",
1026     x);
1027     }
1028     } else { //x >=0
1029     if (x <= 255) {
1030     atype = 'C';
1031     abuf[0] = (uint8_t)x;
1032     alen=1;
1033     }
1034     else if (x <= 65535) {
1035     atype='S';
1036     *(uint16_t*)abuf = (uint16_t)x;
1037     alen=2;
1038     }
1039     else {
1040     atype='I';
1041     *(uint32_t*)abuf = (uint32_t)x;
1042     alen=4;
1043     if (x > 4294967295ll)
1044     fprintf(stderr, "Parse warning: integer %lld is out of range.",
1045     x);
1046     }
1047     }
1048     } //integer type
1049     else if (atype == 'f') {
1050     *(float*)abuf = (float)atof(str + 5);
1051     alen = sizeof(float);
1052     }
1053     else if (atype == 'd') { //?
1054     *(float*)abuf = (float)atof(str + 9);
1055     alen=8;
1056     }
1057     else if (atype == 'Z' || atype == 'H') {
1058     if (atype == 'H') { // check whether the hex string is valid
1059     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
1060     for (int i = 0; i < strl - 5; ++i) {
1061     int c = toupper(str[5 + i]);
1062     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
1063     parse_error("invalid hex character");
1064     }
1065     }
1066     memcpy(abuf, str + 5, strl - 5);
1067     abuf[strl-5] = 0;
1068     alen=strl-4;
1069     } else parse_error("unrecognized aux type");
1070     this->add_aux(tag, atype, alen, adata);
1071     }//add_aux()