ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 32476 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * common.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/26/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <sstream>
16 #include <cstdarg>
17 #include <limits>
18 #include <getopt.h>
19
20 #include "common.h"
21
22 using namespace std;
23
24 #ifdef MEM_DEBUG
25 //function for debugging memory usage of current program in Linux
26
27 #include <unistd.h>
28 #include <ios>
29 #include <fstream>
30
31 //////////////////////////////////////////////////////////////////////////////
32 // process_mem_usage(double &, double &) - takes two doubles by reference,
33 // attempts to read the system-dependent data for a process' virtual memory
34 // size and resident set size, and return the results in KB.
35 //
36 // On failure, returns 0.0, 0.0
37
38 void process_mem_usage(double& vm_usage, double& resident_set) {
39 using std::ios_base;
40 using std::ifstream;
41 using std::string;
42 vm_usage = 0.0;
43 resident_set = 0.0;
44 // 'file' stat seems to give the most reliable results
45 ifstream stat_stream("/proc/self/stat",ios_base::in);
46 // dummy vars for leading entries in stat that we don't care about
47 string pid, comm, state, ppid, pgrp, session, tty_nr;
48 string tpgid, flags, minflt, cminflt, majflt, cmajflt;
49 string utime, stime, cutime, cstime, priority, nice;
50 string O, itrealvalue, starttime;
51
52 // the two fields we want
53 //
54 unsigned long vsize;
55 long rss;
56
57 stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
58 >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
59 >> utime >> stime >> cutime >> cstime >> priority >> nice
60 >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
61
62 stat_stream.close();
63
64 long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
65 vm_usage = vsize / 1024.0;
66 resident_set = rss * page_size_kb;
67 }
68
69 void print_mem_usage() {
70 double vs, rs;
71 process_mem_usage(vs,rs);
72 vs/=1024;
73 rs/=1024;
74 fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
75 }
76 #endif
77
78 bool bowtie2 = true;
79 int bowtie2_min_score = -10;
80 int max_segment_mapping = 20;
81
82 // daehwan - temporary
83 bool parallel = true;
84
85 unsigned int max_insertion_length = 3;
86 unsigned int max_deletion_length = 3;
87
88
89 int inner_dist_mean = 200;
90 int inner_dist_std_dev = 20;
91 int max_mate_inner_dist = -1;
92
93 int min_anchor_len = 8;
94 int min_report_intron_length = 50;
95 int max_report_intron_length = 500000;
96
97 int min_closure_intron_length = 50;
98 int max_closure_intron_length = 5000;
99
100 int min_coverage_intron_length = 50;
101 int max_coverage_intron_length = 20000;
102
103 int min_segment_intron_length = 50;
104 int max_segment_intron_length = 500000;
105
106 uint32_t min_closure_exon_length = 100;
107
108 int island_extension = 25;
109 int segment_length = 25;
110 int segment_mismatches = 2;
111 int max_read_mismatches = 2;
112 int max_splice_mismatches = 1;
113
114 ReadFormat reads_format = FASTQ;
115
116 bool verbose = false;
117
118 unsigned int max_multihits = 40;
119 bool no_closure_search = false;
120 bool no_coverage_search = false;
121 bool no_microexon_search = false;
122 bool butterfly_search = false;
123 int num_threads = 1;
124
125 float min_isoform_fraction = 0.15f;
126
127 string output_dir = "tophat_out";
128 string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
129 string index_outfile = "";
130 string gene_filter = "";
131 string gff_file = "";
132 string ium_reads = "";
133 string sam_header = "";
134 string sam_readgroup_id = "";
135 string zpacker = "";
136 string samtools_path = "samtools";
137
138 bool solexa_quals = false;
139 bool phred64_quals = false;
140 bool quals = false;
141 bool integer_quals = false;
142 bool color = false;
143 bool color_out = false;
144
145 string gtf_juncs = "";
146
147 string flt_reads = "";
148 string flt_mappings = "";
149
150 bool fusion_search = false;
151 size_t fusion_anchor_length = 20;
152 size_t fusion_min_dist = 10000000;
153 size_t fusion_read_mismatches = 2;
154 size_t fusion_multireads = 2;
155 size_t fusion_multipairs = 2;
156
157 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
158
159 extern void print_usage();
160
161 /**
162 * Parse an int out of optarg and enforce that it be at least 'lower';
163 * if it is less than 'lower', than output the given error message and
164 * exit with an error and a usage message.
165 */
166
167 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
168 long l;
169 char *endPtr= NULL;
170 l = strtol(optarg, &endPtr, 10);
171 if (endPtr != NULL) {
172 if (l < lower) {
173 cerr << errmsg << endl;
174 print_usage();
175 exit(1);
176 }
177 return (int32_t)l;
178 }
179 cerr << errmsg << endl;
180 print_usage();
181 exit(1);
182 return -1;
183 }
184
185 /**
186 * Parse an int out of optarg and enforce that it be at least 'lower';
187 * if it is less than 'lower', than output the given error message and
188 * exit with an error and a usage message.
189 */
190 static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
191 float l;
192 l = (float)atof(optarg);
193
194 if (l < lower) {
195 cerr << errmsg << endl;
196 print_usage();
197 exit(1);
198 }
199
200 if (l > upper)
201 {
202 cerr << errmsg << endl;
203 print_usage();
204 exit(1);
205 }
206
207 return l;
208
209 cerr << errmsg << endl;
210 print_usage();
211 exit(1);
212 return -1;
213 }
214
215 /*
216 this is from
217 http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
218 */
219 char* get_token(char** str, const char* delims)
220 {
221 char* token;
222 if (*str == NULL)
223 return NULL;
224
225 token = *str;
226 while (**str != '\0')
227 {
228 if (strchr(delims, **str) != NULL)
229 {
230 **str = '\0';
231 ++(*str);
232 return token;
233 }
234
235 ++(*str);
236 }
237
238 *str = NULL;
239 return token;
240 }
241
242 const char *short_options = "QCp:z:N:";
243
244 enum
245 {
246 OPT_FASTA = 127,
247 OPT_FASTQ,
248 OPT_MIN_ANCHOR,
249 OPT_SPLICE_MISMATCHES,
250 OPT_VERBOSE,
251 OPT_INSERT_LENGTH_MEAN,
252 OPT_INSERT_LENGTH_STD_DEV,
253 OPT_MIN_ISOFORM_FRACTION,
254 OPT_OUTPUT_DIR,
255 OPT_GENE_FILTER,
256 OPT_GFF_ANNOTATIONS,
257 OPT_MAX_MULTIHITS,
258 OPT_NO_CLOSURE_SEARCH,
259 OPT_NO_COVERAGE_SEARCH,
260 OPT_NO_MICROEXON_SEARCH,
261 OPT_SEGMENT_LENGTH,
262 OPT_READ_MISMATCHES,
263 OPT_SEGMENT_MISMATCHES,
264 OPT_MIN_CLOSURE_EXON,
265 OPT_MAX_CLOSURE_INTRON,
266 OPT_MIN_CLOSURE_INTRON,
267 OPT_MAX_COVERAGE_INTRON,
268 OPT_MIN_COVERAGE_INTRON,
269 OPT_MIN_SEGMENT_INTRON,
270 OPT_MAX_SEGMENT_INTRON,
271 OPT_MIN_REPORT_INTRON,
272 OPT_MAX_REPORT_INTRON,
273 OPT_IUM_READS,
274 OPT_BUTTERFLY_SEARCH,
275 OPT_SOLEXA_QUALS,
276 OPT_PHRED64_QUALS,
277 OPT_SAM_HEADER,
278 OPT_SAM_READGROUP_ID,
279 OPT_QUALS,
280 OPT_INTEGER_QUALS,
281 OPT_COLOR,
282 OPT_COLOR_OUT,
283 OPT_LIBRARY_TYPE,
284 OPT_MAX_DELETION_LENGTH,
285 OPT_MAX_INSERTION_LENGTH,
286 OPT_NUM_THREADS,
287 OPT_ZPACKER,
288 OPT_SAMTOOLS,
289 OPT_AUX_OUT,
290 OPT_INDEX_OUT,
291 OPT_GTF_JUNCS,
292 OPT_FILTER_READS,
293 OPT_FILTER_HITS,
294 OPT_FUSION_SEARCH,
295 OPT_FUSION_ANCHOR_LENGTH,
296 OPT_FUSION_MIN_DIST,
297 OPT_FUSION_READ_MISMATCHES,
298 OPT_FUSION_MULTIREADS,
299 OPT_FUSION_MULTIPAIRS,
300 OPT_BOWTIE1,
301 OPT_BOWTIE2_MIN_SCORE,
302 };
303
304 static struct option long_options[] = {
305 {"fasta", no_argument, 0, OPT_FASTA},
306 {"fastq", no_argument, 0, OPT_FASTQ},
307 {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
308 {"sam-header", required_argument, 0, OPT_SAM_HEADER},
309 {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
310 {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
311 {"verbose", no_argument, 0, OPT_VERBOSE},
312 {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
313 {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
314 {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
315 {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
316 {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
317 {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
318 {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
319 {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
320 {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
321 {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
322 {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
323 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
324 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
325 {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
326 {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
327 {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
328 {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
329 {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
330 {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
331 {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
332 {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
333 {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
334 {"ium-reads", required_argument, 0, OPT_IUM_READS},
335 {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
336 {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
337 {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
338 {"quals", no_argument, 0, OPT_QUALS},
339 {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
340 {"color", no_argument, 0, OPT_COLOR},
341 {"color-out", no_argument, 0, OPT_COLOR_OUT},
342 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
343 {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
344 {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
345 {"num-threads", required_argument, 0, OPT_NUM_THREADS},
346 {"zpacker", required_argument, 0, OPT_ZPACKER},
347 {"samtools", required_argument, 0, OPT_SAMTOOLS},
348 {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
349 {"index-outfile", required_argument, 0, OPT_INDEX_OUT},
350 {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
351 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
352 {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
353 {"fusion-search", no_argument, 0, OPT_FUSION_SEARCH},
354 {"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH},
355 {"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST},
356 {"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES},
357 {"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS},
358 {"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS},
359 {"bowtie1", no_argument, 0, OPT_BOWTIE1},
360 {"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE},
361 {0, 0, 0, 0} // terminator
362 };
363
364
365 void str_appendInt(string& str, int v) {
366 stringstream ss;
367 ss << v;
368 str.append(ss.str());
369 }
370
371 bool str_endsWith(string& str, const char* suffix) {
372 if (str.empty() || str.length()<3) return false;
373 size_t l=strlen(suffix);
374 if (str.length()<=l) return false;
375 if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
376 return false;
377 }
378
379 int parse_options(int argc, char** argv, void (*print_usage)())
380 {
381 int option_index = 0;
382 int next_option;
383 do {
384 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
385 switch (next_option) {
386 case -1:
387 break;
388 case OPT_FASTA:
389 reads_format = FASTA;
390 break;
391 case OPT_FASTQ:
392 reads_format = FASTQ;
393 break;
394 case OPT_MIN_ANCHOR:
395 min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
396 break;
397 case OPT_SPLICE_MISMATCHES:
398 max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
399 break;
400 case OPT_VERBOSE:
401 verbose = true;
402 break;
403 case OPT_INSERT_LENGTH_MEAN:
404 inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
405 break;
406 case OPT_INSERT_LENGTH_STD_DEV:
407 inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
408 break;
409 case OPT_OUTPUT_DIR:
410 output_dir = optarg;
411 break;
412 case OPT_GENE_FILTER:
413 gene_filter = optarg;
414 break;
415 case OPT_GFF_ANNOTATIONS:
416 gff_file = optarg;
417 break;
418 case OPT_MAX_MULTIHITS:
419 max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
420 break;
421 case OPT_NO_CLOSURE_SEARCH:
422 no_closure_search = true;
423 break;
424 case OPT_NO_COVERAGE_SEARCH:
425 no_coverage_search = true;
426 break;
427 case OPT_NO_MICROEXON_SEARCH:
428 no_microexon_search = true;
429 break;
430 case OPT_SEGMENT_LENGTH:
431 segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
432 break;
433 case OPT_SEGMENT_MISMATCHES:
434 segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
435 break;
436 case 'N':
437 case OPT_READ_MISMATCHES:
438 max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
439 break;
440 case OPT_MIN_CLOSURE_EXON:
441 min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
442 break;
443 case OPT_MIN_CLOSURE_INTRON:
444 min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
445 break;
446 case OPT_MAX_CLOSURE_INTRON:
447 max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
448 break;
449 case OPT_MIN_COVERAGE_INTRON:
450 min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
451 break;
452 case OPT_MAX_COVERAGE_INTRON:
453 max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
454 break;
455 case OPT_MIN_SEGMENT_INTRON:
456 min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
457 break;
458 case OPT_MAX_SEGMENT_INTRON:
459 max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
460 break;
461 case OPT_MIN_REPORT_INTRON:
462 min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
463 break;
464 case OPT_MAX_REPORT_INTRON:
465 max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
466 break;
467 case OPT_MIN_ISOFORM_FRACTION:
468 min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
469 break;
470 case OPT_IUM_READS:
471 ium_reads = optarg;
472 break;
473 case OPT_SAM_HEADER:
474 sam_header = optarg;
475 break;
476 case OPT_SAM_READGROUP_ID:
477 sam_readgroup_id = optarg;
478 break;
479 case OPT_BUTTERFLY_SEARCH:
480 butterfly_search = true;
481 break;
482 case OPT_SOLEXA_QUALS:
483 solexa_quals = true;
484 break;
485 case OPT_PHRED64_QUALS:
486 phred64_quals = true;
487 break;
488 case 'Q':
489 case OPT_QUALS:
490 quals = true;
491 break;
492 case OPT_INTEGER_QUALS:
493 integer_quals = true;
494 break;
495 case 'C':
496 case OPT_COLOR:
497 color = true;
498 break;
499 case OPT_COLOR_OUT:
500 color_out = true;
501 break;
502 case OPT_LIBRARY_TYPE:
503 if (strcmp(optarg, "fr-unstranded") == 0)
504 library_type = FR_UNSTRANDED;
505 else if (strcmp(optarg, "fr-firststrand") == 0)
506 library_type = FR_FIRSTSTRAND;
507 else if (strcmp(optarg, "fr-secondstrand") == 0)
508 library_type = FR_SECONDSTRAND;
509 else if (strcmp(optarg, "ff-unstranded") == 0)
510 library_type = FF_UNSTRANDED;
511 else if (strcmp(optarg, "ff-firststrand") == 0)
512 library_type = FF_FIRSTSTRAND;
513 else if (strcmp(optarg, "ff-secondstrand") == 0)
514 library_type = FF_SECONDSTRAND;
515 break;
516 case OPT_MAX_DELETION_LENGTH:
517 max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
518 break;
519 case OPT_MAX_INSERTION_LENGTH:
520 max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
521 break;
522 case 'z':
523 case OPT_ZPACKER:
524 zpacker = optarg;
525 break;
526 case OPT_SAMTOOLS:
527 samtools_path = optarg;
528 break;
529 case OPT_AUX_OUT:
530 aux_outfile = optarg;
531 break;
532 case OPT_INDEX_OUT:
533 index_outfile = optarg;
534 break;
535 case 'p':
536 case OPT_NUM_THREADS:
537 num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
538 break;
539 case OPT_GTF_JUNCS:
540 gtf_juncs = optarg;
541 break;
542 case OPT_FILTER_READS:
543 flt_reads = optarg;
544 break;
545 case OPT_FILTER_HITS:
546 flt_mappings = optarg;
547 break;
548 case OPT_FUSION_SEARCH:
549 fusion_search = true;
550 break;
551 case OPT_FUSION_ANCHOR_LENGTH:
552 fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage);
553 break;
554 case OPT_FUSION_MIN_DIST:
555 fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage);
556 break;
557 case OPT_FUSION_READ_MISMATCHES:
558 fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage);
559 break;
560 case OPT_FUSION_MULTIREADS:
561 fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage);
562 break;
563 case OPT_FUSION_MULTIPAIRS:
564 fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage);
565 break;
566 case OPT_BOWTIE1:
567 bowtie2 = false;
568 break;
569 case OPT_BOWTIE2_MIN_SCORE:
570 bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage);
571 break;
572 default:
573 print_usage();
574 return 1;
575 }
576 } while(next_option != -1);
577
578 return 0;
579 }
580
581
582 // Error routine (prints error message and exits!)
583 void err_exit(const char* format,...){
584 va_list arguments;
585 va_start(arguments,format);
586 vfprintf(stderr,format,arguments);
587 va_end(arguments);
588 #ifdef DEBUG
589 // trigger a core dump for later inspection
590 abort();
591 #endif
592 exit(1);
593 }
594
595 FILE* FZPipe::openRead(const char* fname, string& popencmd) {
596 pipecmd=popencmd;
597 filename=fname;
598 if (pipecmd.empty()) {
599 file=fopen(filename.c_str(), "r");
600 }
601 else {
602 string pcmd(pipecmd);
603 pcmd.append(" '");
604 pcmd.append(filename);
605 pcmd.append("'");
606 file=popen(pcmd.c_str(), "r");
607 }
608 return file;
609 }
610
611 FILE* FZPipe::openRead(const char* fname) {
612 string pcmd;
613 return this->openRead(fname,pcmd);
614 }
615
616 FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
617 pipecmd=popencmd;
618 filename=fname;
619 if (pipecmd.empty()) {
620 file=fopen(filename.c_str(), "w");
621 }
622 else {
623 string pcmd(pipecmd);
624 pcmd.append(" - > '");
625 pcmd.append(filename.c_str());
626 pcmd.append("'");
627 file=popen(pcmd.c_str(), "w");
628 }
629 return file;
630 }
631
632
633 FILE* FZPipe::openWrite(const char* fname) {
634 string pcmd;
635 return this->openWrite(fname,pcmd);
636 }
637
638 void FZPipe::rewind() {
639 if (pipecmd.empty()) {
640 if (file!=NULL) {
641 ::rewind(file);
642 return;
643 }
644 if (!filename.empty()) {
645 file=fopen(filename.c_str(),"r");
646 return;
647 }
648 }
649 if (filename.empty())
650 err_die("Error: FZStream::rewind() failed (missing filename)!\n");
651 this->close();
652 string pcmd(pipecmd);
653 pcmd.append(" '");
654 pcmd.append(filename);
655 pcmd.append("'");
656 file=popen(pcmd.c_str(), "r");
657 if (file==NULL) {
658 err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
659 }
660 }
661
662
663 string getFext(const string& s) {
664 string r("");
665 //if (xpos!=NULL) *xpos=0;
666 if (s.empty() || s=="-") return r;
667 int slen=(int)s.length();
668 int p=s.rfind('.');
669 int d=s.rfind('/');
670 if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
671 r=s.substr(p+1);
672 //if (xpos!=NULL) *xpos=p+1;
673 for(size_t i=0; i!=r.length(); i++)
674 r[i] = std::tolower(r[i]);
675 return r;
676 }
677
678 string guess_packer(const string& fname, bool use_all_cpus) {
679 //only needed for the primary input files (given by user)
680 string picmd("");
681 string fext=getFext(fname);
682 if (fext=="bam") {
683 picmd="bam2fastx";
684 return picmd;
685 }
686 if (fext=="gz" || fext=="gzip" || fext=="z") {
687 if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
688 picmd=zpacker;
689 if (num_threads<2) picmd.append(" -p1");
690 else {
691 picmd.append(" -p");
692 str_appendInt(picmd, num_threads);
693 //picmd.append(" -cd");
694 }
695 }
696 else picmd="gzip";
697 }
698 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
699 if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
700 picmd=zpacker;
701 if (num_threads<2) picmd.append(" -p1");
702 else {
703 picmd.append(" -p");
704 str_appendInt(picmd, num_threads);
705 //picmd.append(" -cd");
706 }
707 }
708 else picmd="bzip2";
709 }
710 return picmd;
711 }
712
713 /*
714 string getBam2SamCmd(const string& fname) {
715 string pipecmd("");
716 string fext=getFext(fname);
717 if (fext=="bam") {
718 pipecmd=samtools_path;
719 pipecmd.append(" view");
720 }
721 return pipecmd;
722 }
723 */
724
725 void err_die(const char* format,...) { // Error exit
726 va_list arguments;
727 va_start(arguments,format);
728 vfprintf(stderr,format,arguments);
729 va_end(arguments);
730 exit(1);
731 }
732
733 string getUnpackCmd(const string& fname, bool use_all_cpus) {
734 //prep_reads should use guess_packer() instead
735 //otherwise compressed files MUST have the .z extension,
736 //as they are all internally generated
737 string pipecmd("");
738 string fext=getFext(fname);
739 if (fext=="bam") {
740 pipecmd="bam2fastx";
741 return pipecmd;
742 }
743 if (zpacker.empty() || fext!="z") {
744 return pipecmd; //no packer used
745 }
746 pipecmd=zpacker;
747 if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
748 if (use_all_cpus==false) pipecmd.append(" -p1");
749 else if (num_threads>1) {
750 pipecmd.append(" -p");
751 str_appendInt(pipecmd,num_threads);
752 }
753 }
754 if (!pipecmd.empty()) pipecmd.append(" -cd");
755 return pipecmd;
756 }
757
758 void checkSamHeader() {
759 if (sam_header.empty())
760 err_die("Error: writeSamHeader() with empty sam_header string\n");
761 //copy the SAM header
762 FILE* fh=fopen(sam_header.c_str(), "r");
763 if (fh==NULL)
764 err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
765 fclose(fh);
766 }
767
768 void writeSamHeader(FILE* fout) {
769 if (fout==NULL)
770 err_die("Error: writeSamHeader(NULL)\n");
771 checkSamHeader();
772 //copy the SAM header
773 FILE* fh=fopen(sam_header.c_str(), "r");
774 int ch=-1;
775 while ((ch=fgetc(fh))!=EOF) {
776 if (fputc(ch, fout)==EOF)
777 err_die("Error copying SAM header\n");
778 }
779 fclose(fh);
780 }
781
782 //auxiliary functions for BAM record handling
783 uint8_t* realloc_bdata(bam1_t *b, int size) {
784 if (b->m_data < size) {
785 b->m_data = size;
786 kroundup32(b->m_data);
787 b->data = (uint8_t*)realloc(b->data, b->m_data);
788 }
789 if (b->data_len<size) b->data_len=size;
790 return b->data;
791 }
792
793 uint8_t* dupalloc_bdata(bam1_t *b, int size) {
794 //same as realloc_bdata, but does not free previous data
795 //but returns it instead
796 //it ALWAYS duplicates data
797 b->m_data = size;
798 kroundup32(b->m_data);
799 uint8_t* odata=b->data;
800 b->data = (uint8_t*)malloc(b->m_data);
801 memcpy((void*)b->data, (void*)odata, b->data_len);
802 b->data_len=size;
803 return odata; //user must FREE this after
804 }
805
806 extern unsigned short bam_char2flag_table[];
807
808 GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
809 int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
810 novel=true;
811 b=bam_init1();
812 b->core.tid=gseq_tid;
813 if (pos<=0) {
814 b->core.pos=-1; //unmapped
815 //if (gseq_tid<0)
816 b->core.flag |= BAM_FUNMAP;
817 }
818 else b->core.pos=pos-1; //BAM is 0-based
819 b->core.qual=255;
820 int l_qseq=strlen(qseq);
821 //this may not be accurate, setting CIGAR is the correct way
822 //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
823 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
824 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
825 set_cigar(cigar); //this will also set core.bin
826 add_sequence(qseq, l_qseq);
827 add_quals(quals); //quals must be given as Phred33
828 if (reverse) { b->core.flag |= BAM_FREVERSE ; }
829 }
830
831 GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
832 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
833 int insert_size, const char* qseq, const char* quals,
834 const vector<string>* aux_strings) {
835 novel=true;
836 b=bam_init1();
837 b->core.tid=g_tid;
838 b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
839 b->core.qual=map_qual;
840 int l_qseq=strlen(qseq);
841 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
842 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
843 set_cigar(cigar); //this will also set core.bin
844 add_sequence(qseq, l_qseq);
845 add_quals(quals); //quals must be given as Phred33
846 set_flags(flags);
847 set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
848 if (aux_strings!=NULL) {
849 for (vector<string>::const_iterator itr=aux_strings->begin();
850 itr!=aux_strings->end(); ++itr) {
851 add_aux(itr->c_str());
852 }
853 }
854 }
855 void GBamRecord::set_cigar(const char* cigar) {
856 //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
857 int doff=b->core.l_qname;
858 uint8_t* after_cigar=NULL;
859 int after_cigar_len=0;
860 uint8_t* prev_bdata=NULL;
861 if (b->data_len>doff) {
862 //cigar string already allocated, replace it
863 int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
864 after_cigar=b->data+d;
865 after_cigar_len=b->data_len-d;
866 }
867 const char *s;
868 char *t;
869 int i, op;
870 long x;
871 b->core.n_cigar = 0;
872 if (cigar != NULL && strcmp(cigar, "*") != 0) {
873 for (s = cigar; *s; ++s) {
874 if (isalpha(*s)) b->core.n_cigar++;
875 else if (!isdigit(*s)) {
876 err_die("Error: invalid CIGAR character (%s)\n",cigar);
877 }
878 }
879 if (after_cigar_len>0) { //replace/insert into existing full data
880 prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
881 memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
882 free(prev_bdata);
883 }
884 else {
885 realloc_bdata(b, doff + b->core.n_cigar * 4);
886 }
887 for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) {
888 x = strtol(s, &t, 10);
889 op = toupper(*t);
890 if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
891 else if (op == 'I') op = BAM_CINS;
892 else if (op == 'D') op = BAM_CDEL;
893 else if (op == 'N') op = BAM_CREF_SKIP;
894 else if (op == 'S') op = BAM_CSOFT_CLIP;
895 else if (op == 'H') op = BAM_CHARD_CLIP;
896 else if (op == 'P') op = BAM_CPAD;
897 else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
898 s = t + 1;
899 bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
900 }
901 if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
902 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
903 } else {//no CIGAR string given
904 if (!(b->core.flag&BAM_FUNMAP)) {
905 fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
906 b->core.flag |= BAM_FUNMAP;
907 }
908 b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
909 }
910 } //set_cigar()
911
912 void GBamRecord::add_sequence(const char* qseq, int slen) {
913 //must be called AFTER set_cigar (cannot replace existing sequence for now)
914 if (qseq==NULL) return; //should we ever care about this?
915 if (slen<0) slen=strlen(qseq);
916 int doff = b->core.l_qname + b->core.n_cigar * 4;
917 if (strcmp(qseq, "*")!=0) {
918 b->core.l_qseq=slen;
919 if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
920 err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
921 qseq);
922 uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
923 //also allocated quals memory
924 memset(p, 0, (b->core.l_qseq+1)/2);
925 for (int i = 0; i < b->core.l_qseq; ++i)
926 p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
927 } else b->core.l_qseq = 0;
928 }
929
930 void GBamRecord::add_quals(const char* quals) {
931 //requires core.l_qseq already set
932 //and must be called AFTER add_sequence(), which also allocates the memory for quals
933 uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
934 if (quals==NULL || strcmp(quals, "*") == 0) {
935 for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
936 return;
937 }
938 for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
939 }
940
941 void GBamRecord::add_aux(const char* str) {
942 // daehwan - this is not thread-safe - I made these varaiables as class members
943 //requires: being called AFTER add_quals()
944 // static char tag[2];
945 // static uint8_t abuf[512];
946 //requires: being called AFTER add_quals()
947 int strl=strlen(str);
948 //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
949 //int doff0=doff;
950 if (strl < 6 || str[2] != ':' || str[4] != ':')
951 parse_error("missing colon in auxiliary data");
952 tag[0] = str[0]; tag[1] = str[1];
953 uint8_t atype = str[3];
954 uint8_t* adata=abuf;
955 int alen=0;
956 if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
957 atype='A';
958 alen=1;
959 adata=(uint8_t*)&str[5];
960 }
961 else if (atype == 'I' || atype == 'i') {
962 long long x=(long long)atoll(str + 5);
963 if (x < 0) {
964 if (x >= -127) {
965 atype='c';
966 abuf[0] = (int8_t)x;
967 alen=1;
968 }
969 else if (x >= -32767) {
970 atype = 's';
971 *(int16_t*)abuf = (int16_t)x;
972 alen=2;
973 }
974 else {
975 atype='i';
976 *(int32_t*)abuf = (int32_t)x;
977 alen=4;
978 if (x < -2147483648ll)
979 fprintf(stderr, "Parse warning: integer %lld is out of range.",
980 x);
981 }
982 } else { //x >=0
983 if (x <= 255) {
984 atype = 'C';
985 abuf[0] = (uint8_t)x;
986 alen=1;
987 }
988 else if (x <= 65535) {
989 atype='S';
990 *(uint16_t*)abuf = (uint16_t)x;
991 alen=2;
992 }
993 else {
994 atype='I';
995 *(uint32_t*)abuf = (uint32_t)x;
996 alen=4;
997 if (x > 4294967295ll)
998 fprintf(stderr, "Parse warning: integer %lld is out of range.",
999 x);
1000 }
1001 }
1002 } //integer type
1003 else if (atype == 'f') {
1004 *(float*)abuf = (float)atof(str + 5);
1005 alen = sizeof(float);
1006 }
1007 else if (atype == 'd') { //?
1008 *(float*)abuf = (float)atof(str + 9);
1009 alen=8;
1010 }
1011 else if (atype == 'Z' || atype == 'H') {
1012 if (atype == 'H') { // check whether the hex string is valid
1013 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
1014 for (int i = 0; i < strl - 5; ++i) {
1015 int c = toupper(str[5 + i]);
1016 if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
1017 parse_error("invalid hex character");
1018 }
1019 }
1020 memcpy(abuf, str + 5, strl - 5);
1021 abuf[strl-5] = 0;
1022 alen=strl-4;
1023 } else parse_error("unrecognized aux type");
1024 this->add_aux(tag, atype, alen, adata);
1025 }//add_aux()