ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (9 years, 2 months ago) by gpertea
File size: 30113 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * common.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/26/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <sstream>
16 #include <cstdarg>
17 #include <getopt.h>
18
19 #include "common.h"
20
21 using namespace std;
22
23 #ifdef MEM_DEBUG
24 //function for debugging memory usage of current program in Linux
25
26 #include <unistd.h>
27 #include <ios>
28 #include <fstream>
29
30 //////////////////////////////////////////////////////////////////////////////
31 // process_mem_usage(double &, double &) - takes two doubles by reference,
32 // attempts to read the system-dependent data for a process' virtual memory
33 // size and resident set size, and return the results in KB.
34 //
35 // On failure, returns 0.0, 0.0
36
37 void process_mem_usage(double& vm_usage, double& resident_set) {
38 using std::ios_base;
39 using std::ifstream;
40 using std::string;
41 vm_usage = 0.0;
42 resident_set = 0.0;
43 // 'file' stat seems to give the most reliable results
44 ifstream stat_stream("/proc/self/stat",ios_base::in);
45 // dummy vars for leading entries in stat that we don't care about
46 string pid, comm, state, ppid, pgrp, session, tty_nr;
47 string tpgid, flags, minflt, cminflt, majflt, cmajflt;
48 string utime, stime, cutime, cstime, priority, nice;
49 string O, itrealvalue, starttime;
50
51 // the two fields we want
52 //
53 unsigned long vsize;
54 long rss;
55
56 stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
57 >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
58 >> utime >> stime >> cutime >> cstime >> priority >> nice
59 >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
60
61 stat_stream.close();
62
63 long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
64 vm_usage = vsize / 1024.0;
65 resident_set = rss * page_size_kb;
66 }
67
68 void print_mem_usage() {
69 double vs, rs;
70 process_mem_usage(vs,rs);
71 vs/=1024;
72 rs/=1024;
73 fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs);
74 }
75 #endif
76
77
78 unsigned int max_insertion_length = 3;
79 unsigned int max_deletion_length = 3;
80
81
82 int inner_dist_mean = 200;
83 int inner_dist_std_dev = 20;
84 int max_mate_inner_dist = -1;
85
86 int min_anchor_len = 8;
87 int min_report_intron_length = 50;
88 int max_report_intron_length = 500000;
89
90 int min_closure_intron_length = 50;
91 int max_closure_intron_length = 5000;
92
93 int min_coverage_intron_length = 50;
94 int max_coverage_intron_length = 20000;
95
96 int min_segment_intron_length = 50;
97 int max_segment_intron_length = 500000;
98
99 uint32_t min_closure_exon_length = 100;
100
101 int island_extension = 25;
102 int segment_length = 25;
103 int segment_mismatches = 2;
104 int max_read_mismatches = 2;
105 int max_splice_mismatches = 1;
106
107 ReadFormat reads_format = FASTQ;
108
109 bool verbose = false;
110
111 unsigned int max_multihits = 40;
112 bool no_closure_search = false;
113 bool no_coverage_search = false;
114 bool no_microexon_search = false;
115 bool butterfly_search = false;
116 int num_cpus = 1;
117 float min_isoform_fraction = 0.15f;
118
119 string output_dir = "tophat_out";
120 string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats)
121 string gene_filter = "";
122 string gff_file = "";
123 string ium_reads = "";
124 string sam_header = "";
125 string sam_readgroup_id = "";
126 string zpacker = "";
127 string samtools_path = "samtools";
128
129 bool solexa_quals = false;
130 bool phred64_quals = false;
131 bool quals = false;
132 bool integer_quals = false;
133 bool color = false;
134 bool color_out = false;
135
136 string gtf_juncs = "";
137
138 string flt_reads = "";
139 string flt_mappings = "";
140
141 eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE;
142
143 extern void print_usage();
144
145 /**
146 * Parse an int out of optarg and enforce that it be at least 'lower';
147 * if it is less than 'lower', than output the given error message and
148 * exit with an error and a usage message.
149 */
150
151 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) {
152 long l;
153 char *endPtr= NULL;
154 l = strtol(optarg, &endPtr, 10);
155 if (endPtr != NULL) {
156 if (l < lower) {
157 cerr << errmsg << endl;
158 print_usage();
159 exit(1);
160 }
161 return (int32_t)l;
162 }
163 cerr << errmsg << endl;
164 print_usage();
165 exit(1);
166 return -1;
167 }
168
169 /**
170 * Parse an int out of optarg and enforce that it be at least 'lower';
171 * if it is less than 'lower', than output the given error message and
172 * exit with an error and a usage message.
173 */
174 static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) {
175 float l;
176 l = (float)atof(optarg);
177
178 if (l < lower) {
179 cerr << errmsg << endl;
180 print_usage();
181 exit(1);
182 }
183
184 if (l > upper)
185 {
186 cerr << errmsg << endl;
187 print_usage();
188 exit(1);
189 }
190
191 return l;
192
193 cerr << errmsg << endl;
194 print_usage();
195 exit(1);
196 return -1;
197 }
198
199 /*
200 this is from
201 http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html
202 */
203 char* get_token(char** str, const char* delims)
204 {
205 char* token;
206 if (*str == NULL)
207 return NULL;
208
209 token = *str;
210 while (**str != '\0')
211 {
212 if (strchr(delims, **str) != NULL)
213 {
214 **str = '\0';
215 ++(*str);
216 return token;
217 }
218
219 ++(*str);
220 }
221
222 *str = NULL;
223 return token;
224 }
225
226
227 const char *short_options = "QCp:z:N:";
228
229 enum
230 {
231 OPT_FASTA = 127,
232 OPT_FASTQ,
233 OPT_MIN_ANCHOR,
234 OPT_SPLICE_MISMATCHES,
235 OPT_VERBOSE,
236 OPT_INSERT_LENGTH_MEAN,
237 OPT_INSERT_LENGTH_STD_DEV,
238 OPT_MIN_ISOFORM_FRACTION,
239 OPT_OUTPUT_DIR,
240 OPT_GENE_FILTER,
241 OPT_GFF_ANNOTATIONS,
242 OPT_MAX_MULTIHITS,
243 OPT_NO_CLOSURE_SEARCH,
244 OPT_NO_COVERAGE_SEARCH,
245 OPT_NO_MICROEXON_SEARCH,
246 OPT_SEGMENT_LENGTH,
247 OPT_READ_MISMATCHES,
248 OPT_SEGMENT_MISMATCHES,
249 OPT_MIN_CLOSURE_EXON,
250 OPT_MAX_CLOSURE_INTRON,
251 OPT_MIN_CLOSURE_INTRON,
252 OPT_MAX_COVERAGE_INTRON,
253 OPT_MIN_COVERAGE_INTRON,
254 OPT_MIN_SEGMENT_INTRON,
255 OPT_MAX_SEGMENT_INTRON,
256 OPT_MIN_REPORT_INTRON,
257 OPT_MAX_REPORT_INTRON,
258 OPT_IUM_READS,
259 OPT_BUTTERFLY_SEARCH,
260 OPT_SOLEXA_QUALS,
261 OPT_PHRED64_QUALS,
262 OPT_SAM_HEADER,
263 OPT_SAM_READGROUP_ID,
264 OPT_QUALS,
265 OPT_INTEGER_QUALS,
266 OPT_COLOR,
267 OPT_COLOR_OUT,
268 OPT_LIBRARY_TYPE,
269 OPT_MAX_DELETION_LENGTH,
270 OPT_MAX_INSERTION_LENGTH,
271 OPT_NUM_CPUS,
272 OPT_ZPACKER,
273 OPT_SAMTOOLS,
274 OPT_AUX_OUT,
275 OPT_GTF_JUNCS,
276 OPT_FILTER_READS,
277 OPT_FILTER_HITS
278 };
279
280 static struct option long_options[] = {
281 {"fasta", no_argument, 0, OPT_FASTA},
282 {"fastq", no_argument, 0, OPT_FASTQ},
283 {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR},
284 {"sam-header", required_argument, 0, OPT_SAM_HEADER},
285 {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID},
286 {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES},
287 {"verbose", no_argument, 0, OPT_VERBOSE},
288 {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN},
289 {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV},
290 {"output-dir", required_argument, 0, OPT_OUTPUT_DIR},
291 {"gene-filter", required_argument, 0, OPT_GENE_FILTER},
292 {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS},
293 {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS},
294 {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH},
295 {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH},
296 {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH},
297 {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH},
298 {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES},
299 {"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES},
300 {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON},
301 {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON},
302 {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON},
303 {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON},
304 {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON},
305 {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON},
306 {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON},
307 {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON},
308 {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON},
309 {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION},
310 {"ium-reads", required_argument, 0, OPT_IUM_READS},
311 {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH},
312 {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS},
313 {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS},
314 {"quals", no_argument, 0, OPT_QUALS},
315 {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS},
316 {"color", no_argument, 0, OPT_COLOR},
317 {"color-out", no_argument, 0, OPT_COLOR_OUT},
318 {"library-type", required_argument, 0, OPT_LIBRARY_TYPE},
319 {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH},
320 {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH},
321 {"num-threads", required_argument, 0, OPT_NUM_CPUS},
322 {"zpacker", required_argument, 0, OPT_ZPACKER},
323 {"samtools", required_argument, 0, OPT_SAMTOOLS},
324 {"aux-outfile", required_argument, 0, OPT_AUX_OUT},
325 {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS},
326 {"flt-reads",required_argument, 0, OPT_FILTER_READS},
327 {"flt-hits",required_argument, 0, OPT_FILTER_HITS},
328 {0, 0, 0, 0} // terminator
329 };
330
331
332 void str_appendInt(string& str, int v) {
333 stringstream ss;
334 ss << v;
335 str.append(ss.str());
336 }
337
338 bool str_endsWith(string& str, const char* suffix) {
339 if (str.empty() || str.length()<3) return false;
340 size_t l=strlen(suffix);
341 if (str.length()<=l) return false;
342 if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true;
343 return false;
344 }
345
346 int parse_options(int argc, char** argv, void (*print_usage)())
347 {
348 int option_index = 0;
349 int next_option;
350 do {
351 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
352 switch (next_option) {
353 case -1:
354 break;
355 case OPT_FASTA:
356 reads_format = FASTA;
357 break;
358 case OPT_FASTQ:
359 reads_format = FASTQ;
360 break;
361 case OPT_MIN_ANCHOR:
362 min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
363 break;
364 case OPT_SPLICE_MISMATCHES:
365 max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage);
366 break;
367 case OPT_VERBOSE:
368 verbose = true;
369 break;
370 case OPT_INSERT_LENGTH_MEAN:
371 inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage);
372 break;
373 case OPT_INSERT_LENGTH_STD_DEV:
374 inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage);
375 break;
376 case OPT_OUTPUT_DIR:
377 output_dir = optarg;
378 break;
379 case OPT_GENE_FILTER:
380 gene_filter = optarg;
381 break;
382 case OPT_GFF_ANNOTATIONS:
383 gff_file = optarg;
384 break;
385 case OPT_MAX_MULTIHITS:
386 max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage);
387 break;
388 case OPT_NO_CLOSURE_SEARCH:
389 no_closure_search = true;
390 break;
391 case OPT_NO_COVERAGE_SEARCH:
392 no_coverage_search = true;
393 break;
394 case OPT_NO_MICROEXON_SEARCH:
395 no_microexon_search = true;
396 break;
397 case OPT_SEGMENT_LENGTH:
398 segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage);
399 break;
400 case OPT_SEGMENT_MISMATCHES:
401 segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage);
402 break;
403 case 'N':
404 case OPT_READ_MISMATCHES:
405 max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage);
406 break;
407 case OPT_MIN_CLOSURE_EXON:
408 min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage);
409 break;
410 case OPT_MIN_CLOSURE_INTRON:
411 min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage);
412 break;
413 case OPT_MAX_CLOSURE_INTRON:
414 max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage);
415 break;
416 case OPT_MIN_COVERAGE_INTRON:
417 min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage);
418 break;
419 case OPT_MAX_COVERAGE_INTRON:
420 max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage);
421 break;
422 case OPT_MIN_SEGMENT_INTRON:
423 min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage);
424 break;
425 case OPT_MAX_SEGMENT_INTRON:
426 max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage);
427 break;
428 case OPT_MIN_REPORT_INTRON:
429 min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage);
430 break;
431 case OPT_MAX_REPORT_INTRON:
432 max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage);
433 break;
434 case OPT_MIN_ISOFORM_FRACTION:
435 min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage);
436 break;
437 case OPT_IUM_READS:
438 ium_reads = optarg;
439 break;
440 case OPT_SAM_HEADER:
441 sam_header = optarg;
442 break;
443 case OPT_SAM_READGROUP_ID:
444 sam_readgroup_id = optarg;
445 break;
446 case OPT_BUTTERFLY_SEARCH:
447 butterfly_search = true;
448 break;
449 case OPT_SOLEXA_QUALS:
450 solexa_quals = true;
451 break;
452 case OPT_PHRED64_QUALS:
453 phred64_quals = true;
454 break;
455 case 'Q':
456 case OPT_QUALS:
457 quals = true;
458 break;
459 case OPT_INTEGER_QUALS:
460 integer_quals = true;
461 break;
462 case 'C':
463 case OPT_COLOR:
464 color = true;
465 break;
466 case OPT_COLOR_OUT:
467 color_out = true;
468 break;
469 case OPT_LIBRARY_TYPE:
470 if (strcmp(optarg, "fr-unstranded") == 0)
471 library_type = FR_UNSTRANDED;
472 else if (strcmp(optarg, "fr-firststrand") == 0)
473 library_type = FR_FIRSTSTRAND;
474 else if (strcmp(optarg, "fr-secondstrand") == 0)
475 library_type = FR_SECONDSTRAND;
476 else if (strcmp(optarg, "ff-unstranded") == 0)
477 library_type = FF_UNSTRANDED;
478 else if (strcmp(optarg, "ff-firststrand") == 0)
479 library_type = FF_FIRSTSTRAND;
480 else if (strcmp(optarg, "ff-secondstrand") == 0)
481 library_type = FF_SECONDSTRAND;
482 break;
483 case OPT_MAX_DELETION_LENGTH:
484 max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage);
485 break;
486 case OPT_MAX_INSERTION_LENGTH:
487 max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage);
488 break;
489 case 'z':
490 case OPT_ZPACKER:
491 zpacker = optarg;
492 break;
493 case OPT_SAMTOOLS:
494 samtools_path = optarg;
495 break;
496 case OPT_AUX_OUT:
497 aux_outfile = optarg;
498 break;
499 case 'p':
500 case OPT_NUM_CPUS:
501 num_cpus=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage);
502 break;
503 case OPT_GTF_JUNCS:
504 gtf_juncs = optarg;
505 break;
506 case OPT_FILTER_READS:
507 flt_reads = optarg;
508 break;
509 case OPT_FILTER_HITS:
510 flt_mappings = optarg;
511 break;
512 default:
513 print_usage();
514 return 1;
515 }
516 } while(next_option != -1);
517
518 return 0;
519 }
520
521
522 // Error routine (prints error message and exits!)
523 void err_exit(const char* format,...){
524 va_list arguments;
525 va_start(arguments,format);
526 vfprintf(stderr,format,arguments);
527 va_end(arguments);
528 #ifdef DEBUG
529 // trigger a core dump for later inspection
530 abort();
531 #endif
532 exit(1);
533 }
534
535 FILE* FZPipe::openRead(const char* fname, string& popencmd) {
536 pipecmd=popencmd;
537 filename=fname;
538 if (pipecmd.empty()) {
539 file=fopen(filename.c_str(), "r");
540 }
541 else {
542 string pcmd(pipecmd);
543 pcmd.append(" '");
544 pcmd.append(filename);
545 pcmd.append("'");
546 file=popen(pcmd.c_str(), "r");
547 }
548 return file;
549 }
550
551 FILE* FZPipe::openRead(const char* fname) {
552 string pcmd;
553 return this->openRead(fname,pcmd);
554 }
555
556 FILE* FZPipe::openWrite(const char* fname, string& popencmd) {
557 pipecmd=popencmd;
558 filename=fname;
559 if (pipecmd.empty()) {
560 file=fopen(filename.c_str(), "w");
561 }
562 else {
563 string pcmd(pipecmd);
564 pcmd.append(" - > '");
565 pcmd.append(filename.c_str());
566 pcmd.append("'");
567 file=popen(pcmd.c_str(), "w");
568 }
569 return file;
570 }
571
572
573 FILE* FZPipe::openWrite(const char* fname) {
574 string pcmd;
575 return this->openWrite(fname,pcmd);
576 }
577
578 void FZPipe::rewind() {
579 if (pipecmd.empty()) {
580 if (file!=NULL) {
581 ::rewind(file);
582 return;
583 }
584 if (!filename.empty()) {
585 file=fopen(filename.c_str(),"r");
586 return;
587 }
588 }
589 if (filename.empty())
590 err_die("Error: FZStream::rewind() failed (missing filename)!\n");
591 this->close();
592 string pcmd(pipecmd);
593 pcmd.append(" '");
594 pcmd.append(filename);
595 pcmd.append("'");
596 file=popen(pcmd.c_str(), "r");
597 if (file==NULL) {
598 err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str());
599 }
600 }
601
602
603 string getFext(const string& s) {
604 string r("");
605 //if (xpos!=NULL) *xpos=0;
606 if (s.empty() || s=="-") return r;
607 int slen=(int)s.length();
608 int p=s.rfind('.');
609 int d=s.rfind('/');
610 if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r;
611 r=s.substr(p+1);
612 //if (xpos!=NULL) *xpos=p+1;
613 for(size_t i=0; i!=r.length(); i++)
614 r[i] = std::tolower(r[i]);
615 return r;
616 }
617
618 string guess_packer(const string& fname, bool use_all_cpus) {
619 //only needed for the primary input files (given by user)
620 string picmd("");
621 string fext=getFext(fname);
622 if (fext=="bam") {
623 picmd="bam2fastx";
624 return picmd;
625 }
626 if (fext=="gz" || fext=="gzip" || fext=="z") {
627 if (use_all_cpus && str_endsWith(zpacker,"pigz")) {
628 picmd=zpacker;
629 if (num_cpus<2) picmd.append(" -p1");
630 else {
631 picmd.append(" -p");
632 str_appendInt(picmd, num_cpus);
633 //picmd.append(" -cd");
634 }
635 }
636 else picmd="gzip";
637 }
638 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
639 if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) {
640 picmd=zpacker;
641 if (num_cpus<2) picmd.append(" -p1");
642 else {
643 picmd.append(" -p");
644 str_appendInt(picmd, num_cpus);
645 //picmd.append(" -cd");
646 }
647 }
648 else picmd="bzip2";
649 }
650 return picmd;
651 }
652
653 /*
654 string getBam2SamCmd(const string& fname) {
655 string pipecmd("");
656 string fext=getFext(fname);
657 if (fext=="bam") {
658 pipecmd=samtools_path;
659 pipecmd.append(" view");
660 }
661 return pipecmd;
662 }
663 */
664
665 void err_die(const char* format,...) { // Error exit
666 va_list arguments;
667 va_start(arguments,format);
668 vfprintf(stderr,format,arguments);
669 va_end(arguments);
670 exit(1);
671 }
672
673 string getUnpackCmd(const string& fname, bool use_all_cpus) {
674 //prep_reads should use guess_packer() instead
675 //otherwise compressed files MUST have the .z extension,
676 //as they are all internally generated
677 string pipecmd("");
678 string fext=getFext(fname);
679 if (fext=="bam") {
680 pipecmd="bam2fastx";
681 return pipecmd;
682 }
683 if (zpacker.empty() || fext!="z") {
684 return pipecmd; //no packer used
685 }
686 pipecmd=zpacker;
687 if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) {
688 if (use_all_cpus==false) pipecmd.append(" -p1");
689 else if (num_cpus>1) {
690 pipecmd.append(" -p");
691 str_appendInt(pipecmd,num_cpus);
692 }
693 }
694 if (!pipecmd.empty()) pipecmd.append(" -cd");
695 return pipecmd;
696 }
697
698 void checkSamHeader() {
699 if (sam_header.empty())
700 err_die("Error: writeSamHeader() with empty sam_header string\n");
701 //copy the SAM header
702 FILE* fh=fopen(sam_header.c_str(), "r");
703 if (fh==NULL)
704 err_die("Error: cannot open SAM header file %s\n",sam_header.c_str());
705 fclose(fh);
706 }
707
708 void writeSamHeader(FILE* fout) {
709 if (fout==NULL)
710 err_die("Error: writeSamHeader(NULL)\n");
711 checkSamHeader();
712 //copy the SAM header
713 FILE* fh=fopen(sam_header.c_str(), "r");
714 int ch=-1;
715 while ((ch=fgetc(fh))!=EOF) {
716 if (fputc(ch, fout)==EOF)
717 err_die("Error copying SAM header\n");
718 }
719 fclose(fh);
720 }
721
722 //auxiliary functions for BAM record handling
723 uint8_t* realloc_bdata(bam1_t *b, int size) {
724 if (b->m_data < size) {
725 b->m_data = size;
726 kroundup32(b->m_data);
727 b->data = (uint8_t*)realloc(b->data, b->m_data);
728 }
729 if (b->data_len<size) b->data_len=size;
730 return b->data;
731 }
732
733 uint8_t* dupalloc_bdata(bam1_t *b, int size) {
734 //same as realloc_bdata, but does not free previous data
735 //but returns it instead
736 //it ALWAYS duplicates data
737 b->m_data = size;
738 kroundup32(b->m_data);
739 uint8_t* odata=b->data;
740 b->data = (uint8_t*)malloc(b->m_data);
741 memcpy((void*)b->data, (void*)odata, b->data_len);
742 b->data_len=size;
743 return odata; //user must FREE this after
744 }
745
746 extern unsigned short bam_char2flag_table[];
747
748 GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
749 int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
750 novel=true;
751 b=bam_init1();
752 b->core.tid=gseq_tid;
753 if (pos<=0) {
754 b->core.pos=-1; //unmapped
755 //if (gseq_tid<0)
756 b->core.flag |= BAM_FUNMAP;
757 }
758 else b->core.pos=pos-1; //BAM is 0-based
759 b->core.qual=255;
760 int l_qseq=strlen(qseq);
761 //this may not be accurate, setting CIGAR is the correct way
762 //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
763 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
764 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
765 set_cigar(cigar); //this will also set core.bin
766 add_sequence(qseq, l_qseq);
767 add_quals(quals); //quals must be given as Phred33
768 if (reverse) { b->core.flag |= BAM_FREVERSE ; }
769 }
770
771 GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
772 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
773 int insert_size, const char* qseq, const char* quals,
774 const vector<string>* aux_strings) {
775 novel=true;
776 b=bam_init1();
777 b->core.tid=g_tid;
778 b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
779 b->core.qual=map_qual;
780 int l_qseq=strlen(qseq);
781 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
782 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
783 set_cigar(cigar); //this will also set core.bin
784 add_sequence(qseq, l_qseq);
785 add_quals(quals); //quals must be given as Phred33
786 set_flags(flags);
787 set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
788 if (aux_strings!=NULL) {
789 for (vector<string>::const_iterator itr=aux_strings->begin();
790 itr!=aux_strings->end(); ++itr) {
791 add_aux(itr->c_str());
792 }
793 }
794 }
795 void GBamRecord::set_cigar(const char* cigar) {
796 //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
797 int doff=b->core.l_qname;
798 uint8_t* after_cigar=NULL;
799 int after_cigar_len=0;
800 uint8_t* prev_bdata=NULL;
801 if (b->data_len>doff) {
802 //cigar string already allocated, replace it
803 int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
804 after_cigar=b->data+d;
805 after_cigar_len=b->data_len-d;
806 }
807 const char *s;
808 char *t;
809 int i, op;
810 long x;
811 b->core.n_cigar = 0;
812 if (cigar != NULL && strcmp(cigar, "*") != 0) {
813 for (s = cigar; *s; ++s) {
814 if (isalpha(*s)) b->core.n_cigar++;
815 else if (!isdigit(*s)) {
816 err_die("Error: invalid CIGAR character (%s)\n",cigar);
817 }
818 }
819 if (after_cigar_len>0) { //replace/insert into existing full data
820 prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
821 memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
822 free(prev_bdata);
823 }
824 else {
825 realloc_bdata(b, doff + b->core.n_cigar * 4);
826 }
827 for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
828 x = strtol(s, &t, 10);
829 op = toupper(*t);
830 if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
831 else if (op == 'I') op = BAM_CINS;
832 else if (op == 'D') op = BAM_CDEL;
833 else if (op == 'N') op = BAM_CREF_SKIP;
834 else if (op == 'S') op = BAM_CSOFT_CLIP;
835 else if (op == 'H') op = BAM_CHARD_CLIP;
836 else if (op == 'P') op = BAM_CPAD;
837 else err_die("Error: invalid CIGAR operation (%s)\n",cigar);
838 s = t + 1;
839 bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
840 }
841 if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar);
842 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
843 } else {//no CIGAR string given
844 if (!(b->core.flag&BAM_FUNMAP)) {
845 fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
846 b->core.flag |= BAM_FUNMAP;
847 }
848 b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
849 }
850 } //set_cigar()
851
852 void GBamRecord::add_sequence(const char* qseq, int slen) {
853 //must be called AFTER set_cigar (cannot replace existing sequence for now)
854 if (qseq==NULL) return; //should we ever care about this?
855 if (slen<0) slen=strlen(qseq);
856 int doff = b->core.l_qname + b->core.n_cigar * 4;
857 if (strcmp(qseq, "*")!=0) {
858 b->core.l_qseq=slen;
859 if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
860 err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n",
861 qseq);
862 uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
863 //also allocated quals memory
864 memset(p, 0, (b->core.l_qseq+1)/2);
865 for (int i = 0; i < b->core.l_qseq; ++i)
866 p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
867 } else b->core.l_qseq = 0;
868 }
869
870 void GBamRecord::add_quals(const char* quals) {
871 //requires core.l_qseq already set
872 //and must be called AFTER add_sequence(), which also allocates the memory for quals
873 uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
874 if (quals==NULL || strcmp(quals, "*") == 0) {
875 for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
876 return;
877 }
878 for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
879 }
880
881 void GBamRecord::add_aux(const char* str) {
882 //requires: being called AFTER add_quals()
883 static char tag[2];
884 static uint8_t abuf[512];
885 //requires: being called AFTER add_quals()
886 int strl=strlen(str);
887 //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
888 //int doff0=doff;
889 if (strl < 6 || str[2] != ':' || str[4] != ':')
890 parse_error("missing colon in auxiliary data");
891 tag[0] = str[0]; tag[1] = str[1];
892 uint8_t atype = str[3];
893 uint8_t* adata=abuf;
894 int alen=0;
895 if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
896 atype='A';
897 alen=1;
898 adata=(uint8_t*)&str[5];
899 }
900 else if (atype == 'I' || atype == 'i') {
901 long long x=(long long)atoll(str + 5);
902 if (x < 0) {
903 if (x >= -127) {
904 atype='c';
905 abuf[0] = (int8_t)x;
906 alen=1;
907 }
908 else if (x >= -32767) {
909 atype = 's';
910 *(int16_t*)abuf = (int16_t)x;
911 alen=2;
912 }
913 else {
914 atype='i';
915 *(int32_t*)abuf = (int32_t)x;
916 alen=4;
917 if (x < -2147483648ll)
918 fprintf(stderr, "Parse warning: integer %lld is out of range.",
919 x);
920 }
921 } else { //x >=0
922 if (x <= 255) {
923 atype = 'C';
924 abuf[0] = (uint8_t)x;
925 alen=1;
926 }
927 else if (x <= 65535) {
928 atype='S';
929 *(uint16_t*)abuf = (uint16_t)x;
930 alen=2;
931 }
932 else {
933 atype='I';
934 *(uint32_t*)abuf = (uint32_t)x;
935 alen=4;
936 if (x > 4294967295ll)
937 fprintf(stderr, "Parse warning: integer %lld is out of range.",
938 x);
939 }
940 }
941 } //integer type
942 else if (atype == 'f') {
943 *(float*)abuf = (float)atof(str + 5);
944 alen = sizeof(float);
945 }
946 else if (atype == 'd') { //?
947 *(float*)abuf = (float)atof(str + 9);
948 alen=8;
949 }
950 else if (atype == 'Z' || atype == 'H') {
951 if (atype == 'H') { // check whether the hex string is valid
952 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
953 for (int i = 0; i < strl - 5; ++i) {
954 int c = toupper(str[5 + i]);
955 if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
956 parse_error("invalid hex character");
957 }
958 }
959 memcpy(abuf, str + 5, strl - 5);
960 abuf[strl-5] = 0;
961 alen=strl-4;
962 } else parse_error("unrecognized aux type");
963 this->add_aux(tag, atype, alen, adata);
964 }//add_aux()