ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 34750 byte(s)
Log Message:
wip

Line File contents
1 /*
2 * common.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/26/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <sstream>
16 #include <cstdarg>
17 #include <limits>
18 #include <getopt.h>
19
20 #include "common.h"
21
22 using namespace std;
23
24 #ifdef MEM_DEBUG
25 //function for debugging memory usage of current program in Linux
26
27 #include <unistd.h>
28 #include <ios>
29 #include <fstream>
30
31 //////////////////////////////////////////////////////////////////////////////
32 // process_mem_usage(double &, double &) - takes two doubles by reference,
33 // attempts to read the system-dependent data for a process' virtual memory
34 // size and resident set size, and return the results in KB.
35 //
36 // On failure, returns 0.0, 0.0
37
38 void process_mem_usage(double& vm_usage, double& resident_set) {
39 using std::ios_base;
40 using std::ifstream;
41 using std::string;
42 vm_usage = 0.0;
43 resident_set = 0.0;
44 // 'file' stat seems to give the most reliable results
45 ifstream stat_stream("/proc/self/stat",ios_base::in);
46 // dummy vars for leading entries in stat that we don't care about
47 string pid, comm, state, ppid, pgrp, session, tty_nr;
48 string tpgid, flags, minflt, cminflt, majflt, cmajflt;
49 string utime, stime, cutime, cstime, priority, nice;
50 string O, itrealvalue, starttime;
51
52 // the two fields we want
53 //
54 unsigned long vsize;
55 long rss;
56
57 stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
58 >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
59 >> utime >> stime >> cutime >> cstime >> priority >> nice
60 >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
61
62 stat_stream.close();
63
64 long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
65 vm_usage = vsize / 1024.0;
66 resident_set = rss * page_size_kb;
67 }
68
69 void print_mem_usage() {
70 double vs, rs;
71 process_mem_usage(vs,rs);
72 vs/=1024;
73 rs/=1024;
74 fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
75 }
76 #endif
77
78 bool bowtie2 = true;
79 int bowtie2_min_score = -10;
80 int bowtie2_max_penalty = 6;
81 int bowtie2_min_penalty = 2;
82 int bowtie2_penalty_for_N = 1;
83 int bowtie2_read_gap_open = 5;
84 int bowtie2_read_gap_cont = 3;
85 int bowtie2_ref_gap_open = 5;
86 int bowtie2_ref_gap_cont = 3;
87
88 // daehwan - temporary
89 bool parallel = true;
90
91 unsigned int max_insertion_length = 3;
92 unsigned int max_deletion_length = 3;
93
94
95 int inner_dist_mean = 200;
96 int inner_dist_std_dev = 20;
97 int max_mate_inner_dist = -1;
98
99 int min_anchor_len = 8;
100 int min_report_intron_length = 50;
101 int max_report_intron_length = 500000;
102
103 int min_closure_intron_length = 50;
104 int max_closure_intron_length = 5000;
105
106 int min_coverage_intron_length = 50;
107 int max_coverage_intron_length = 20000;
108
109 int min_segment_intron_length = 50;
110 int max_segment_intron_length = 500000;
111
112 uint32_t min_closure_exon_length = 100;
113
114 int island_extension = 25;
115 int segment_length = 25;
116 int segment_mismatches = 2;
117 int max_read_mismatches = 2;
118 int max_splice_mismatches = 1;
119
120 ReadFormat reads_format = FASTQ;
121
122 bool verbose = false;
123
124 unsigned int max_multihits = 20;
125 unsigned int max_seg_multihits = 40;
126 bool no_closure_search = false;
127 bool no_coverage_search = false;
128 bool no_microexon_search = false;
129 bool butterfly_search = false;
130 int num_threads = 1;
131
132 float min_isoform_fraction = 0.15f;
133
134 string output_dir = "tophat_out";
135 string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
136 string index_outfile = "";
137 string gene_filter = "";
138 string gff_file = "";
139 string ium_reads = "";
140 string sam_header = "";
141 string sam_readgroup_id = "";
142 string zpacker = "";
143 string samtools_path = "samtools";
144
145 bool solexa_quals = false;
146 bool phred64_quals = false;
147 bool quals = false;
148 bool integer_quals = false;
149 bool color = false;
150 string gtf_juncs = "";
151
152 bool report_secondary_alignments = false;
153
154 string flt_reads = "";
155 string flt_mappings = "";
156
157 bool fusion_search = false;
158 size_t fusion_anchor_length = 20;
159 size_t fusion_min_dist = 10000000;
160 size_t fusion_read_mismatches = 2;
161 size_t fusion_multireads = 2;
162 size_t fusion_multipairs = 2;
163
164 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
165
166 extern void print_usage();
167
168 /**
169 * Parse an int out of optarg and enforce that it be at least 'lower';
170 * if it is less than 'lower', than output the given error message and
171 * exit with an error and a usage message.
172 */
173
174 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
175 long l;
176 char *endPtr= NULL;
177 l = strtol(optarg, &endPtr, 10);
178 if (endPtr != NULL) {
179 if (l < lower) {
180 cerr << errmsg << endl;
181 print_usage();
182 exit(1);
183 }
184 return (int32_t)l;
185 }
186 cerr << errmsg << endl;
187 print_usage();
188 exit(1);
189 return -1;
190 }
191
192 /**
193 * Parse an int out of optarg and enforce that it be at least 'lower';
194 * if it is less than 'lower', than output the given error message and
195 * exit with an error and a usage message.
196 */
197 static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
198 float l;
199 l = (float)atof(optarg);
200
201 if (l < lower) {
202 cerr << errmsg << endl;
203 print_usage();
204 exit(1);
205 }
206
207 if (l > upper)
208 {
209 cerr << errmsg << endl;
210 print_usage();
211 exit(1);
212 }
213
214 return l;
215
216 cerr << errmsg << endl;
217 print_usage();
218 exit(1);
219 return -1;
220 }
221
222 /*
223 this is from
224 http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
225 */
226 char* get_token(char** str, const char* delims)
227 {
228 char* token;
229 if (*str == NULL)
230 return NULL;
231
232 token = *str;
233 while (**str != '\0')
234 {
235 if (strchr(delims, **str) != NULL)
236 {
237 **str = '\0';
238 ++(*str);
239 return token;
240 }
241
242 ++(*str);
243 }
244
245 *str = NULL;
246 return token;
247 }
248
249 const char *short_options = "QCp:z:N:";
250
251 enum
252 {
253 OPT_FASTA = 127,
254 OPT_FASTQ,
255 OPT_MIN_ANCHOR,
256 OPT_SPLICE_MISMATCHES,
257 OPT_VERBOSE,
258 OPT_INSERT_LENGTH_MEAN,
259 OPT_INSERT_LENGTH_STD_DEV,
260 OPT_MIN_ISOFORM_FRACTION,
261 OPT_OUTPUT_DIR,
262 OPT_GENE_FILTER,
263 OPT_GFF_ANNOTATIONS,
264 OPT_MAX_MULTIHITS,
265 OPT_MAX_SEG_MULTIHITS,
266 OPT_NO_CLOSURE_SEARCH,
267 OPT_NO_COVERAGE_SEARCH,
268 OPT_NO_MICROEXON_SEARCH,
269 OPT_SEGMENT_LENGTH,
270 OPT_READ_MISMATCHES,
271 OPT_SEGMENT_MISMATCHES,
272 OPT_MIN_CLOSURE_EXON,
273 OPT_MAX_CLOSURE_INTRON,
274 OPT_MIN_CLOSURE_INTRON,
275 OPT_MAX_COVERAGE_INTRON,
276 OPT_MIN_COVERAGE_INTRON,
277 OPT_MIN_SEGMENT_INTRON,
278 OPT_MAX_SEGMENT_INTRON,
279 OPT_MIN_REPORT_INTRON,
280 OPT_MAX_REPORT_INTRON,
281 OPT_IUM_READS,
282 OPT_BUTTERFLY_SEARCH,
283 OPT_SOLEXA_QUALS,
284 OPT_PHRED64_QUALS,
285 OPT_SAM_HEADER,
286 OPT_SAM_READGROUP_ID,
287 OPT_QUALS,
288 OPT_INTEGER_QUALS,
289 OPT_COLOR,
290 OPT_LIBRARY_TYPE,
291 OPT_MAX_DELETION_LENGTH,
292 OPT_MAX_INSERTION_LENGTH,
293 OPT_NUM_THREADS,
294 OPT_ZPACKER,
295 OPT_SAMTOOLS,
296 OPT_AUX_OUT,
297 OPT_INDEX_OUT,
298 OPT_GTF_JUNCS,
299 OPT_FILTER_READS,
300 OPT_FILTER_HITS,
301 OPT_REPORT_SECONDARY_ALIGNMENTS,
302 OPT_FUSION_SEARCH,
303 OPT_FUSION_ANCHOR_LENGTH,
304 OPT_FUSION_MIN_DIST,
305 OPT_FUSION_READ_MISMATCHES,
306 OPT_FUSION_MULTIREADS,
307 OPT_FUSION_MULTIPAIRS,
308 OPT_BOWTIE1,
309 OPT_BOWTIE2_MIN_SCORE,
310 OPT_BOWTIE2_MAX_PENALTY,
311 OPT_BOWTIE2_MIN_PENALTY,
312 OPT_BOWTIE2_PENALTY_FOR_N,
313 OPT_BOWTIE2_READ_GAP_OPEN,
314 OPT_BOWTIE2_READ_GAP_CONT,
315 OPT_BOWTIE2_REF_GAP_OPEN,
316 OPT_BOWTIE2_REF_GAP_CONT,
317 };
318
319 static struct option long_options[] = {
320 {"fasta", no_argument, 0, OPT_FASTA},
321 {"fastq", no_argument, 0, OPT_FASTQ},
322 {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
323 {"sam-header", required_argument, 0, OPT_SAM_HEADER},
324 {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
325 {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
326 {"verbose", no_argument, 0, OPT_VERBOSE},
327 {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
328 {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
329 {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
330 {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
331 {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
332 {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
333 {"max-seg-multihits", required_argument, 0, OPT_MAX_SEG_MULTIHITS},
334 {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
335 {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
336 {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
337 {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
338 {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
339 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
340 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
341 {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
342 {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
343 {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
344 {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
345 {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
346 {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
347 {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
348 {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
349 {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
350 {"ium-reads", required_argument, 0, OPT_IUM_READS},
351 {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
352 {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
353 {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
354 {"quals", no_argument, 0, OPT_QUALS},
355 {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
356 {"color", no_argument, 0, OPT_COLOR},
357 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
358 {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
359 {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
360 {"num-threads", required_argument, 0, OPT_NUM_THREADS},
361 {"zpacker", required_argument, 0, OPT_ZPACKER},
362 {"samtools", required_argument, 0, OPT_SAMTOOLS},
363 {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
364 {"index-outfile", required_argument, 0, OPT_INDEX_OUT},
365 {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
366 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
367 {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
368 {"report-secondary-alignments", no_argument, 0, OPT_REPORT_SECONDARY_ALIGNMENTS},
369 {"fusion-search", no_argument, 0, OPT_FUSION_SEARCH},
370 {"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH},
371 {"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST},
372 {"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES},
373 {"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS},
374 {"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS},
375 {"bowtie1", no_argument, 0, OPT_BOWTIE1},
376 {"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE},
377 {"bowtie2-max-penalty", required_argument, 0, OPT_BOWTIE2_MAX_PENALTY},
378 {"bowtie2-min-penalty", required_argument, 0, OPT_BOWTIE2_MIN_PENALTY},
379 {"bowtie2-penalty-for-N", required_argument, 0, OPT_BOWTIE2_PENALTY_FOR_N},
380 {"bowtie2-read-gap-open", required_argument, 0, OPT_BOWTIE2_READ_GAP_OPEN},
381 {"bowtie2-read-gap-cont", required_argument, 0, OPT_BOWTIE2_READ_GAP_CONT},
382 {"bowtie2-ref-gap-open", required_argument, 0, OPT_BOWTIE2_REF_GAP_OPEN},
383 {"bowtie2-ref-gap-cont", required_argument, 0, OPT_BOWTIE2_REF_GAP_CONT},
384 {0, 0, 0, 0} // terminator
385 };
386
387
388 void str_appendInt(string& str, int v) {
389 stringstream ss;
390 ss << v;
391 str.append(ss.str());
392 }
393
394 bool str_endsWith(string& str, const char* suffix) {
395 if (str.empty() || str.length()<3) return false;
396 size_t l=strlen(suffix);
397 if (str.length()<=l) return false;
398 if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
399 return false;
400 }
401
402 int parse_options(int argc, char** argv, void (*print_usage)())
403 {
404 int option_index = 0;
405 int next_option;
406 do {
407 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
408 switch (next_option) {
409 case -1:
410 break;
411 case OPT_FASTA:
412 reads_format = FASTA;
413 break;
414 case OPT_FASTQ:
415 reads_format = FASTQ;
416 break;
417 case OPT_MIN_ANCHOR:
418 min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
419 break;
420 case OPT_SPLICE_MISMATCHES:
421 max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
422 break;
423 case OPT_VERBOSE:
424 verbose = true;
425 break;
426 case OPT_INSERT_LENGTH_MEAN:
427 inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
428 break;
429 case OPT_INSERT_LENGTH_STD_DEV:
430 inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
431 break;
432 case OPT_OUTPUT_DIR:
433 output_dir = optarg;
434 break;
435 case OPT_GENE_FILTER:
436 gene_filter = optarg;
437 break;
438 case OPT_GFF_ANNOTATIONS:
439 gff_file = optarg;
440 break;
441 case OPT_MAX_MULTIHITS:
442 max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
443 break;
444 case OPT_MAX_SEG_MULTIHITS:
445 max_seg_multihits = parseIntOpt(1, "--max-seg-multihits arg must be at least 1", print_usage);
446 break;
447 case OPT_NO_CLOSURE_SEARCH:
448 no_closure_search = true;
449 break;
450 case OPT_NO_COVERAGE_SEARCH:
451 no_coverage_search = true;
452 break;
453 case OPT_NO_MICROEXON_SEARCH:
454 no_microexon_search = true;
455 break;
456 case OPT_SEGMENT_LENGTH:
457 segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
458 break;
459 case OPT_SEGMENT_MISMATCHES:
460 segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
461 break;
462 case 'N':
463 case OPT_READ_MISMATCHES:
464 max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
465 break;
466 case OPT_MIN_CLOSURE_EXON:
467 min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
468 break;
469 case OPT_MIN_CLOSURE_INTRON:
470 min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
471 break;
472 case OPT_MAX_CLOSURE_INTRON:
473 max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
474 break;
475 case OPT_MIN_COVERAGE_INTRON:
476 min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
477 break;
478 case OPT_MAX_COVERAGE_INTRON:
479 max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
480 break;
481 case OPT_MIN_SEGMENT_INTRON:
482 min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
483 break;
484 case OPT_MAX_SEGMENT_INTRON:
485 max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
486 break;
487 case OPT_MIN_REPORT_INTRON:
488 min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
489 break;
490 case OPT_MAX_REPORT_INTRON:
491 max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
492 break;
493 case OPT_MIN_ISOFORM_FRACTION:
494 min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
495 break;
496 case OPT_IUM_READS:
497 ium_reads = optarg;
498 break;
499 case OPT_SAM_HEADER:
500 sam_header = optarg;
501 break;
502 case OPT_SAM_READGROUP_ID:
503 sam_readgroup_id = optarg;
504 break;
505 case OPT_BUTTERFLY_SEARCH:
506 butterfly_search = true;
507 break;
508 case OPT_SOLEXA_QUALS:
509 solexa_quals = true;
510 break;
511 case OPT_PHRED64_QUALS:
512 phred64_quals = true;
513 break;
514 case 'Q':
515 case OPT_QUALS:
516 quals = true;
517 break;
518 case OPT_INTEGER_QUALS:
519 integer_quals = true;
520 break;
521 case 'C':
522 case OPT_COLOR:
523 color = true;
524 break;
525 case OPT_LIBRARY_TYPE:
526 if (strcmp(optarg, "fr-unstranded") == 0)
527 library_type = FR_UNSTRANDED;
528 else if (strcmp(optarg, "fr-firststrand") == 0)
529 library_type = FR_FIRSTSTRAND;
530 else if (strcmp(optarg, "fr-secondstrand") == 0)
531 library_type = FR_SECONDSTRAND;
532 else if (strcmp(optarg, "ff-unstranded") == 0)
533 library_type = FF_UNSTRANDED;
534 else if (strcmp(optarg, "ff-firststrand") == 0)
535 library_type = FF_FIRSTSTRAND;
536 else if (strcmp(optarg, "ff-secondstrand") == 0)
537 library_type = FF_SECONDSTRAND;
538 break;
539 case OPT_MAX_DELETION_LENGTH:
540 max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
541 break;
542 case OPT_MAX_INSERTION_LENGTH:
543 max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
544 break;
545 case 'z':
546 case OPT_ZPACKER:
547 zpacker = optarg;
548 break;
549 case OPT_SAMTOOLS:
550 samtools_path = optarg;
551 break;
552 case OPT_AUX_OUT:
553 aux_outfile = optarg;
554 break;
555 case OPT_INDEX_OUT:
556 index_outfile = optarg;
557 break;
558 case 'p':
559 case OPT_NUM_THREADS:
560 num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
561 break;
562 case OPT_GTF_JUNCS:
563 gtf_juncs = optarg;
564 break;
565 case OPT_FILTER_READS:
566 flt_reads = optarg;
567 break;
568 case OPT_FILTER_HITS:
569 flt_mappings = optarg;
570 break;
571 case OPT_REPORT_SECONDARY_ALIGNMENTS:
572 report_secondary_alignments = true;
573 break;
574 case OPT_FUSION_SEARCH:
575 fusion_search = true;
576 break;
577 case OPT_FUSION_ANCHOR_LENGTH:
578 fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage);
579 break;
580 case OPT_FUSION_MIN_DIST:
581 fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage);
582 break;
583 case OPT_FUSION_READ_MISMATCHES:
584 fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage);
585 break;
586 case OPT_FUSION_MULTIREADS:
587 fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage);
588 break;
589 case OPT_FUSION_MULTIPAIRS:
590 fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage);
591 break;
592 case OPT_BOWTIE1:
593 bowtie2 = false;
594 break;
595 case OPT_BOWTIE2_MIN_SCORE:
596 bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage);
597 break;
598 case OPT_BOWTIE2_MAX_PENALTY:
599 bowtie2_max_penalty = parseIntOpt(0, "--bowtie2-max-penalty must be at least 0", print_usage);
600 break;
601 case OPT_BOWTIE2_MIN_PENALTY:
602 bowtie2_min_penalty = parseIntOpt(0, "--bowtie2-min-penalty must be at least 0", print_usage);
603 break;
604 case OPT_BOWTIE2_PENALTY_FOR_N:
605 bowtie2_penalty_for_N = parseIntOpt(0, "--bowtie2-penalty-for-N must be at least 0", print_usage);
606 break;
607 case OPT_BOWTIE2_READ_GAP_OPEN:
608 bowtie2_read_gap_open = parseIntOpt(0, "--bowtie2-read-gap-open must be at least 0", print_usage);
609 break;
610 case OPT_BOWTIE2_READ_GAP_CONT:
611 bowtie2_read_gap_cont = parseIntOpt(0, "--bowtie2-read-gap-cont must be at least 0", print_usage);
612 break;
613 case OPT_BOWTIE2_REF_GAP_OPEN:
614 bowtie2_ref_gap_open = parseIntOpt(0, "--bowtie2-ref-gap-open must be at least 0", print_usage);
615 break;
616 case OPT_BOWTIE2_REF_GAP_CONT:
617 bowtie2_ref_gap_cont = parseIntOpt(0, "--bowtie2-ref-gap-cont must be at least 0", print_usage);
618 break;
619 default:
620 print_usage();
621 return 1;
622 }
623 } while(next_option != -1);
624
625 return 0;
626 }
627
628
629 // Error routine (prints error message and exits!)
630 void err_exit(const char* format,...){
631 va_list arguments;
632 va_start(arguments,format);
633 vfprintf(stderr,format,arguments);
634 va_end(arguments);
635 #ifdef DEBUG
636 // trigger a core dump for later inspection
637 abort();
638 #endif
639 exit(1);
640 }
641
642 FILE* FZPipe::openRead(const char* fname, string& popencmd) {
643 pipecmd=popencmd;
644 filename=fname;
645 if (pipecmd.empty()) {
646 file=fopen(filename.c_str(), "r");
647 }
648 else {
649 string pcmd(pipecmd);
650 pcmd.append(" '");
651 pcmd.append(filename);
652 pcmd.append("'");
653 file=popen(pcmd.c_str(), "r");
654 }
655 return file;
656 }
657
658 FILE* FZPipe::openRead(const char* fname) {
659 string pcmd;
660 return this->openRead(fname,pcmd);
661 }
662
663 FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
664 pipecmd=popencmd;
665 filename=fname;
666 if (pipecmd.empty()) {
667 file=fopen(filename.c_str(), "w");
668 }
669 else {
670 string pcmd(pipecmd);
671 pcmd.append(" - > '");
672 pcmd.append(filename.c_str());
673 pcmd.append("'");
674 file=popen(pcmd.c_str(), "w");
675 }
676 return file;
677 }
678
679
680 FILE* FZPipe::openWrite(const char* fname) {
681 string pcmd;
682 return this->openWrite(fname,pcmd);
683 }
684
685 void FZPipe::rewind() {
686 if (pipecmd.empty()) {
687 if (file!=NULL) {
688 ::rewind(file);
689 return;
690 }
691 if (!filename.empty()) {
692 file=fopen(filename.c_str(),"r");
693 return;
694 }
695 }
696 if (filename.empty())
697 err_die("Error: FZStream::rewind() failed (missing filename)!\n");
698 this->close();
699 string pcmd(pipecmd);
700 pcmd.append(" '");
701 pcmd.append(filename);
702 pcmd.append("'");
703 file=popen(pcmd.c_str(), "r");
704 if (file==NULL) {
705 err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
706 }
707 }
708
709
710 string getFext(const string& s) {
711 string r("");
712 //if (xpos!=NULL) *xpos=0;
713 if (s.empty() || s=="-") return r;
714 int slen=(int)s.length();
715 int p=s.rfind('.');
716 int d=s.rfind('/');
717 if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
718 r=s.substr(p+1);
719 //if (xpos!=NULL) *xpos=p+1;
720 for(size_t i=0; i!=r.length(); i++)
721 r[i] = std::tolower(r[i]);
722 return r;
723 }
724
725 string guess_packer(const string& fname, bool use_all_cpus) {
726 //only needed for the primary input files (given by user)
727 string picmd("");
728 string fext=getFext(fname);
729 if (fext=="bam") {
730 picmd="bam2fastx";
731 return picmd;
732 }
733 if (fext=="gz" || fext=="gzip" || fext=="z") {
734 if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
735 picmd=zpacker;
736 if (num_threads<2) picmd.append(" -p1");
737 else {
738 picmd.append(" -p");
739 str_appendInt(picmd, num_threads);
740 //picmd.append(" -cd");
741 }
742 }
743 else picmd="gzip";
744 }
745 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
746 if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
747 picmd=zpacker;
748 if (num_threads<2) picmd.append(" -p1");
749 else {
750 picmd.append(" -p");
751 str_appendInt(picmd, num_threads);
752 //picmd.append(" -cd");
753 }
754 }
755 else picmd="bzip2";
756 }
757 return picmd;
758 }
759
760 /*
761 string getBam2SamCmd(const string& fname) {
762 string pipecmd("");
763 string fext=getFext(fname);
764 if (fext=="bam") {
765 pipecmd=samtools_path;
766 pipecmd.append(" view");
767 }
768 return pipecmd;
769 }
770 */
771
772 void err_die(const char* format,...) { // Error exit
773 va_list arguments;
774 va_start(arguments,format);
775 vfprintf(stderr,format,arguments);
776 va_end(arguments);
777 exit(1);
778 }
779
780 string getUnpackCmd(const string& fname, bool use_all_cpus) {
781 //prep_reads should use guess_packer() instead
782 //otherwise compressed files MUST have the .z extension,
783 //as they are all internally generated
784 string pipecmd("");
785 string fext=getFext(fname);
786 if (fext=="bam") {
787 pipecmd="bam2fastx";
788 return pipecmd;
789 }
790 if (zpacker.empty() || fext!="z") {
791 return pipecmd; //no packer used
792 }
793 pipecmd=zpacker;
794 if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
795 if (use_all_cpus==false) pipecmd.append(" -p1");
796 else if (num_threads>1) {
797 pipecmd.append(" -p");
798 str_appendInt(pipecmd,num_threads);
799 }
800 }
801 if (!pipecmd.empty()) pipecmd.append(" -cd");
802 return pipecmd;
803 }
804
805 void checkSamHeader() {
806 if (sam_header.empty())
807 err_die("Error: writeSamHeader() with empty sam_header string\n");
808 //copy the SAM header
809 FILE* fh=fopen(sam_header.c_str(), "r");
810 if (fh==NULL)
811 err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
812 fclose(fh);
813 }
814
815 void writeSamHeader(FILE* fout) {
816 if (fout==NULL)
817 err_die("Error: writeSamHeader(NULL)\n");
818 checkSamHeader();
819 //copy the SAM header
820 FILE* fh=fopen(sam_header.c_str(), "r");
821 int ch=-1;
822 while ((ch=fgetc(fh))!=EOF) {
823 if (fputc(ch, fout)==EOF)
824 err_die("Error copying SAM header\n");
825 }
826 fclose(fh);
827 }
828
829 //auxiliary functions for BAM record handling
830 uint8_t* realloc_bdata(bam1_t *b, int size) {
831 if (b->m_data < size) {
832 b->m_data = size;
833 kroundup32(b->m_data);
834 b->data = (uint8_t*)realloc(b->data, b->m_data);
835 }
836 if (b->data_len<size) b->data_len=size;
837 return b->data;
838 }
839
840 uint8_t* dupalloc_bdata(bam1_t *b, int size) {
841 //same as realloc_bdata, but does not free previous data
842 //but returns it instead
843 //it ALWAYS duplicates data
844 b->m_data = size;
845 kroundup32(b->m_data);
846 uint8_t* odata=b->data;
847 b->data = (uint8_t*)malloc(b->m_data);
848 memcpy((void*)b->data, (void*)odata, b->data_len);
849 b->data_len=size;
850 return odata; //user must FREE this after
851 }
852
853 extern unsigned short bam_char2flag_table[];
854
855 GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
856 int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
857 novel=true;
858 b=bam_init1();
859 b->core.tid=gseq_tid;
860 if (pos<=0) {
861 b->core.pos=-1; //unmapped
862 //if (gseq_tid<0)
863 b->core.flag |= BAM_FUNMAP;
864 }
865 else b->core.pos=pos-1; //BAM is 0-based
866 b->core.qual=255;
867 int l_qseq=strlen(qseq);
868 //this may not be accurate, setting CIGAR is the correct way
869 //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
870 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
871 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
872 set_cigar(cigar); //this will also set core.bin
873 add_sequence(qseq, l_qseq);
874 add_quals(quals); //quals must be given as Phred33
875 if (reverse) { b->core.flag |= BAM_FREVERSE ; }
876 }
877
878 GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
879 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
880 int insert_size, const char* qseq, const char* quals,
881 const vector<string>* aux_strings) {
882 novel=true;
883 b=bam_init1();
884 b->core.tid=g_tid;
885 b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
886 b->core.qual=map_qual;
887 int l_qseq=strlen(qseq);
888 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
889 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
890 set_cigar(cigar); //this will also set core.bin
891 add_sequence(qseq, l_qseq);
892 add_quals(quals); //quals must be given as Phred33
893 set_flags(flags);
894 set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
895 if (aux_strings!=NULL) {
896 for (vector<string>::const_iterator itr=aux_strings->begin();
897 itr!=aux_strings->end(); ++itr) {
898 add_aux(itr->c_str());
899 }
900 }
901 }
902 void GBamRecord::set_cigar(const char* cigar) {
903 //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
904 int doff=b->core.l_qname;
905 uint8_t* after_cigar=NULL;
906 int after_cigar_len=0;
907 uint8_t* prev_bdata=NULL;
908 if (b->data_len>doff) {
909 //cigar string already allocated, replace it
910 int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
911 after_cigar=b->data+d;
912 after_cigar_len=b->data_len-d;
913 }
914 const char *s;
915 char *t;
916 int i, op;
917 long x;
918 b->core.n_cigar = 0;
919 if (cigar != NULL && strcmp(cigar, "*") != 0) {
920 for (s = cigar; *s; ++s) {
921 if (isalpha(*s)) b->core.n_cigar++;
922 else if (!isdigit(*s)) {
923 err_die("Error: invalid CIGAR character (%s)\n",cigar);
924 }
925 }
926 if (after_cigar_len>0) { //replace/insert into existing full data
927 prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
928 memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
929 free(prev_bdata);
930 }
931 else {
932 realloc_bdata(b, doff + b->core.n_cigar * 4);
933 }
934 for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) {
935 x = strtol(s, &t, 10);
936 op = toupper(*t);
937 if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
938 else if (op == 'I') op = BAM_CINS;
939 else if (op == 'D') op = BAM_CDEL;
940 else if (op == 'N') op = BAM_CREF_SKIP;
941 else if (op == 'S') op = BAM_CSOFT_CLIP;
942 else if (op == 'H') op = BAM_CHARD_CLIP;
943 else if (op == 'P') op = BAM_CPAD;
944 else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
945 s = t + 1;
946 bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
947 }
948 if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
949 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
950 } else {//no CIGAR string given
951 if (!(b->core.flag&BAM_FUNMAP)) {
952 fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
953 b->core.flag |= BAM_FUNMAP;
954 }
955 b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
956 }
957 } //set_cigar()
958
959 void GBamRecord::add_sequence(const char* qseq, int slen) {
960 //must be called AFTER set_cigar (cannot replace existing sequence for now)
961 if (qseq==NULL) return; //should we ever care about this?
962 if (slen<0) slen=strlen(qseq);
963 int doff = b->core.l_qname + b->core.n_cigar * 4;
964 if (strcmp(qseq, "*")!=0) {
965 b->core.l_qseq=slen;
966 if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
967 err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
968 qseq);
969 uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
970 //also allocated quals memory
971 memset(p, 0, (b->core.l_qseq+1)/2);
972 for (int i = 0; i < b->core.l_qseq; ++i)
973 p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
974 } else b->core.l_qseq = 0;
975 }
976
977 void GBamRecord::add_quals(const char* quals) {
978 //requires core.l_qseq already set
979 //and must be called AFTER add_sequence(), which also allocates the memory for quals
980 uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
981 if (quals==NULL || strcmp(quals, "*") == 0) {
982 for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
983 return;
984 }
985 for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
986 }
987
988 void GBamRecord::add_aux(const char* str) {
989 //requires: being called AFTER add_quals()
990 // static char tag[2];
991 // static uint8_t abuf[512];
992 //requires: being called AFTER add_quals()
993 int strl=strlen(str);
994 //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
995 //int doff0=doff;
996 if (strl < 6 || str[2] != ':' || str[4] != ':')
997 parse_error("missing colon in auxiliary data");
998 tag[0] = str[0]; tag[1] = str[1];
999 uint8_t atype = str[3];
1000 uint8_t* adata=abuf;
1001 int alen=0;
1002 if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
1003 atype='A';
1004 alen=1;
1005 adata=(uint8_t*)&str[5];
1006 }
1007 else if (atype == 'I' || atype == 'i') {
1008 long long x=(long long)atoll(str + 5);
1009 if (x < 0) {
1010 if (x >= -127) {
1011 atype='c';
1012 abuf[0] = (int8_t)x;
1013 alen=1;
1014 }
1015 else if (x >= -32767) {
1016 atype = 's';
1017 *(int16_t*)abuf = (int16_t)x;
1018 alen=2;
1019 }
1020 else {
1021 atype='i';
1022 *(int32_t*)abuf = (int32_t)x;
1023 alen=4;
1024 if (x < -2147483648ll)
1025 fprintf(stderr, "Parse warning: integer %lld is out of range.",
1026 x);
1027 }
1028 } else { //x >=0
1029 if (x <= 255) {
1030 atype = 'C';
1031 abuf[0] = (uint8_t)x;
1032 alen=1;
1033 }
1034 else if (x <= 65535) {
1035 atype='S';
1036 *(uint16_t*)abuf = (uint16_t)x;
1037 alen=2;
1038 }
1039 else {
1040 atype='I';
1041 *(uint32_t*)abuf = (uint32_t)x;
1042 alen=4;
1043 if (x > 4294967295ll)
1044 fprintf(stderr, "Parse warning: integer %lld is out of range.",
1045 x);
1046 }
1047 }
1048 } //integer type
1049 else if (atype == 'f') {
1050 *(float*)abuf = (float)atof(str + 5);
1051 alen = sizeof(float);
1052 }
1053 else if (atype == 'd') { //?
1054 *(float*)abuf = (float)atof(str + 9);
1055 alen=8;
1056 }
1057 else if (atype == 'Z' || atype == 'H') {
1058 if (atype == 'H') { // check whether the hex string is valid
1059 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
1060 for (int i = 0; i < strl - 5; ++i) {
1061 int c = toupper(str[5 + i]);
1062 if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
1063 parse_error("invalid hex character");
1064 }
1065 }
1066 memcpy(abuf, str + 5, strl - 5);
1067 abuf[strl-5] = 0;
1068 alen=strl-4;
1069 } else parse_error("unrecognized aux type");
1070 this->add_aux(tag, atype, alen, adata);
1071 }//add_aux()