1 |
/* |
2 |
* common.cpp |
3 |
* TopHat |
4 |
* |
5 |
* Created by Cole Trapnell on 11/26/08. |
6 |
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
* |
8 |
*/ |
9 |
|
10 |
#ifdef HAVE_CONFIG_H |
11 |
#include <config.h> |
12 |
#endif |
13 |
|
14 |
#include <iostream> |
15 |
#include <sstream> |
16 |
#include <cstdarg> |
17 |
#include <getopt.h> |
18 |
|
19 |
#include "common.h" |
20 |
|
21 |
using namespace std; |
22 |
|
23 |
#ifdef MEM_DEBUG |
24 |
//function for debugging memory usage of current program in Linux |
25 |
|
26 |
#include <unistd.h> |
27 |
#include <ios> |
28 |
#include <fstream> |
29 |
|
30 |
////////////////////////////////////////////////////////////////////////////// |
31 |
// process_mem_usage(double &, double &) - takes two doubles by reference, |
32 |
// attempts to read the system-dependent data for a process' virtual memory |
33 |
// size and resident set size, and return the results in KB. |
34 |
// |
35 |
// On failure, returns 0.0, 0.0 |
36 |
|
37 |
void process_mem_usage(double& vm_usage, double& resident_set) { |
38 |
using std::ios_base; |
39 |
using std::ifstream; |
40 |
using std::string; |
41 |
vm_usage = 0.0; |
42 |
resident_set = 0.0; |
43 |
// 'file' stat seems to give the most reliable results |
44 |
ifstream stat_stream("/proc/self/stat",ios_base::in); |
45 |
// dummy vars for leading entries in stat that we don't care about |
46 |
string pid, comm, state, ppid, pgrp, session, tty_nr; |
47 |
string tpgid, flags, minflt, cminflt, majflt, cmajflt; |
48 |
string utime, stime, cutime, cstime, priority, nice; |
49 |
string O, itrealvalue, starttime; |
50 |
|
51 |
// the two fields we want |
52 |
// |
53 |
unsigned long vsize; |
54 |
long rss; |
55 |
|
56 |
stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr |
57 |
>> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt |
58 |
>> utime >> stime >> cutime >> cstime >> priority >> nice |
59 |
>> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest |
60 |
|
61 |
stat_stream.close(); |
62 |
|
63 |
long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages |
64 |
vm_usage = vsize / 1024.0; |
65 |
resident_set = rss * page_size_kb; |
66 |
} |
67 |
|
68 |
void print_mem_usage() { |
69 |
double vs, rs; |
70 |
process_mem_usage(vs,rs); |
71 |
vs/=1024; |
72 |
rs/=1024; |
73 |
fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs); |
74 |
} |
75 |
#endif |
76 |
|
77 |
|
78 |
unsigned int max_insertion_length = 3; |
79 |
unsigned int max_deletion_length = 3; |
80 |
|
81 |
|
82 |
int inner_dist_mean = 200; |
83 |
int inner_dist_std_dev = 20; |
84 |
int max_mate_inner_dist = -1; |
85 |
|
86 |
int min_anchor_len = 8; |
87 |
int min_report_intron_length = 50; |
88 |
int max_report_intron_length = 500000; |
89 |
|
90 |
int min_closure_intron_length = 50; |
91 |
int max_closure_intron_length = 5000; |
92 |
|
93 |
int min_coverage_intron_length = 50; |
94 |
int max_coverage_intron_length = 20000; |
95 |
|
96 |
int min_segment_intron_length = 50; |
97 |
int max_segment_intron_length = 500000; |
98 |
|
99 |
uint32_t min_closure_exon_length = 100; |
100 |
|
101 |
int island_extension = 25; |
102 |
int segment_length = 25; |
103 |
int segment_mismatches = 2; |
104 |
|
105 |
int max_splice_mismatches = 1; |
106 |
|
107 |
ReadFormat reads_format = FASTQ; |
108 |
|
109 |
bool verbose = false; |
110 |
|
111 |
int max_multihits = 40; |
112 |
bool no_closure_search = false; |
113 |
bool no_coverage_search = false; |
114 |
bool no_microexon_search = false; |
115 |
bool butterfly_search = false; |
116 |
int num_cpus = 1; |
117 |
float min_isoform_fraction = 0.15f; |
118 |
|
119 |
string output_dir = "tophat_out"; |
120 |
string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats) |
121 |
string gene_filter = ""; |
122 |
string gff_file = ""; |
123 |
string ium_reads = ""; |
124 |
string sam_header = ""; |
125 |
string sam_readgroup_id = ""; |
126 |
string zpacker = ""; |
127 |
string samtools_path = "samtools"; |
128 |
|
129 |
bool solexa_quals = false; |
130 |
bool phred64_quals = false; |
131 |
bool quals = false; |
132 |
bool integer_quals = false; |
133 |
bool color = false; |
134 |
bool color_out = false; |
135 |
|
136 |
string gtf_juncs = ""; |
137 |
|
138 |
eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE; |
139 |
|
140 |
extern void print_usage(); |
141 |
|
142 |
/** |
143 |
* Parse an int out of optarg and enforce that it be at least 'lower'; |
144 |
* if it is less than 'lower', than output the given error message and |
145 |
* exit with an error and a usage message. |
146 |
*/ |
147 |
|
148 |
int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) { |
149 |
long l; |
150 |
char *endPtr= NULL; |
151 |
l = strtol(optarg, &endPtr, 10); |
152 |
if (endPtr != NULL) { |
153 |
if (l < lower) { |
154 |
cerr << errmsg << endl; |
155 |
print_usage(); |
156 |
exit(1); |
157 |
} |
158 |
return (int32_t)l; |
159 |
} |
160 |
cerr << errmsg << endl; |
161 |
print_usage(); |
162 |
exit(1); |
163 |
return -1; |
164 |
} |
165 |
|
166 |
/** |
167 |
* Parse an int out of optarg and enforce that it be at least 'lower'; |
168 |
* if it is less than 'lower', than output the given error message and |
169 |
* exit with an error and a usage message. |
170 |
*/ |
171 |
static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) { |
172 |
float l; |
173 |
l = (float)atof(optarg); |
174 |
|
175 |
if (l < lower) { |
176 |
cerr << errmsg << endl; |
177 |
print_usage(); |
178 |
exit(1); |
179 |
} |
180 |
|
181 |
if (l > upper) |
182 |
{ |
183 |
cerr << errmsg << endl; |
184 |
print_usage(); |
185 |
exit(1); |
186 |
} |
187 |
|
188 |
return l; |
189 |
|
190 |
cerr << errmsg << endl; |
191 |
print_usage(); |
192 |
exit(1); |
193 |
return -1; |
194 |
} |
195 |
|
196 |
/* |
197 |
this is from |
198 |
http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html |
199 |
*/ |
200 |
char* get_token(char** str, const char* delims) |
201 |
{ |
202 |
char* token; |
203 |
if (*str == NULL) |
204 |
return NULL; |
205 |
|
206 |
token = *str; |
207 |
while (**str != '\0') |
208 |
{ |
209 |
if (strchr(delims, **str) != NULL) |
210 |
{ |
211 |
**str = '\0'; |
212 |
++(*str); |
213 |
return token; |
214 |
} |
215 |
|
216 |
++(*str); |
217 |
} |
218 |
|
219 |
*str = NULL; |
220 |
return token; |
221 |
} |
222 |
|
223 |
|
224 |
const char *short_options = "QCp:z:"; |
225 |
|
226 |
enum |
227 |
{ |
228 |
OPT_FASTA = 127, |
229 |
OPT_FASTQ, |
230 |
OPT_MIN_ANCHOR, |
231 |
OPT_SPLICE_MISMATCHES, |
232 |
OPT_VERBOSE, |
233 |
OPT_INSERT_LENGTH_MEAN, |
234 |
OPT_INSERT_LENGTH_STD_DEV, |
235 |
OPT_MIN_ISOFORM_FRACTION, |
236 |
OPT_OUTPUT_DIR, |
237 |
OPT_GENE_FILTER, |
238 |
OPT_GFF_ANNOTATIONS, |
239 |
OPT_MAX_MULTIHITS, |
240 |
OPT_NO_CLOSURE_SEARCH, |
241 |
OPT_NO_COVERAGE_SEARCH, |
242 |
OPT_NO_MICROEXON_SEARCH, |
243 |
OPT_SEGMENT_LENGTH, |
244 |
OPT_SEGMENT_MISMATCHES, |
245 |
OPT_MIN_CLOSURE_EXON, |
246 |
OPT_MAX_CLOSURE_INTRON, |
247 |
OPT_MIN_CLOSURE_INTRON, |
248 |
OPT_MAX_COVERAGE_INTRON, |
249 |
OPT_MIN_COVERAGE_INTRON, |
250 |
OPT_MIN_SEGMENT_INTRON, |
251 |
OPT_MAX_SEGMENT_INTRON, |
252 |
OPT_MIN_REPORT_INTRON, |
253 |
OPT_MAX_REPORT_INTRON, |
254 |
OPT_IUM_READS, |
255 |
OPT_BUTTERFLY_SEARCH, |
256 |
OPT_SOLEXA_QUALS, |
257 |
OPT_PHRED64_QUALS, |
258 |
OPT_SAM_HEADER, |
259 |
OPT_SAM_READGROUP_ID, |
260 |
OPT_QUALS, |
261 |
OPT_INTEGER_QUALS, |
262 |
OPT_COLOR, |
263 |
OPT_COLOR_OUT, |
264 |
OPT_LIBRARY_TYPE, |
265 |
OPT_MAX_DELETION_LENGTH, |
266 |
OPT_MAX_INSERTION_LENGTH, |
267 |
OPT_NUM_CPUS, |
268 |
OPT_ZPACKER, |
269 |
OPT_SAMTOOLS, |
270 |
OPT_AUX_OUT, |
271 |
OPT_GTF_JUNCS |
272 |
}; |
273 |
|
274 |
static struct option long_options[] = { |
275 |
{"fasta", no_argument, 0, OPT_FASTA}, |
276 |
{"fastq", no_argument, 0, OPT_FASTQ}, |
277 |
{"min-anchor", required_argument, 0, OPT_MIN_ANCHOR}, |
278 |
{"sam-header", required_argument, 0, OPT_SAM_HEADER}, |
279 |
{"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID}, |
280 |
{"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES}, |
281 |
{"verbose", no_argument, 0, OPT_VERBOSE}, |
282 |
{"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN}, |
283 |
{"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV}, |
284 |
{"output-dir", required_argument, 0, OPT_OUTPUT_DIR}, |
285 |
{"gene-filter", required_argument, 0, OPT_GENE_FILTER}, |
286 |
{"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS}, |
287 |
{"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS}, |
288 |
{"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH}, |
289 |
{"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH}, |
290 |
{"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH}, |
291 |
{"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH}, |
292 |
{"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES}, |
293 |
{"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON}, |
294 |
{"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON}, |
295 |
{"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON}, |
296 |
{"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON}, |
297 |
{"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON}, |
298 |
{"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON}, |
299 |
{"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON}, |
300 |
{"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON}, |
301 |
{"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON}, |
302 |
{"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION}, |
303 |
{"ium-reads", required_argument, 0, OPT_IUM_READS}, |
304 |
{"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH}, |
305 |
{"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS}, |
306 |
{"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS}, |
307 |
{"quals", no_argument, 0, OPT_QUALS}, |
308 |
{"integer-quals", no_argument, 0, OPT_INTEGER_QUALS}, |
309 |
{"color", no_argument, 0, OPT_COLOR}, |
310 |
{"color-out", no_argument, 0, OPT_COLOR_OUT}, |
311 |
{"library-type", required_argument, 0, OPT_LIBRARY_TYPE}, |
312 |
{"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH}, |
313 |
{"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH}, |
314 |
{"num-threads", required_argument, 0, OPT_NUM_CPUS}, |
315 |
{"zpacker", required_argument, 0, OPT_ZPACKER}, |
316 |
{"samtools", required_argument, 0, OPT_SAMTOOLS}, |
317 |
{"aux-outfile", required_argument, 0, OPT_AUX_OUT}, |
318 |
{"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS}, |
319 |
{0, 0, 0, 0} // terminator |
320 |
}; |
321 |
|
322 |
|
323 |
void str_appendInt(string& str, int v) { |
324 |
stringstream ss; |
325 |
ss << v; |
326 |
str.append(ss.str()); |
327 |
} |
328 |
|
329 |
bool str_endsWith(string& str, const char* suffix) { |
330 |
if (str.empty() || str.length()<3) return false; |
331 |
size_t l=strlen(suffix); |
332 |
if (str.length()<=l) return false; |
333 |
if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true; |
334 |
return false; |
335 |
} |
336 |
|
337 |
int parse_options(int argc, char** argv, void (*print_usage)()) |
338 |
{ |
339 |
int option_index = 0; |
340 |
int next_option; |
341 |
do { |
342 |
next_option = getopt_long(argc, argv, short_options, long_options, &option_index); |
343 |
switch (next_option) { |
344 |
case -1: |
345 |
break; |
346 |
case OPT_FASTA: |
347 |
reads_format = FASTA; |
348 |
break; |
349 |
case OPT_FASTQ: |
350 |
reads_format = FASTQ; |
351 |
break; |
352 |
case OPT_MIN_ANCHOR: |
353 |
min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage); |
354 |
break; |
355 |
case OPT_SPLICE_MISMATCHES: |
356 |
max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage); |
357 |
break; |
358 |
case OPT_VERBOSE: |
359 |
verbose = true; |
360 |
break; |
361 |
case OPT_INSERT_LENGTH_MEAN: |
362 |
inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage); |
363 |
break; |
364 |
case OPT_INSERT_LENGTH_STD_DEV: |
365 |
inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage); |
366 |
break; |
367 |
case OPT_OUTPUT_DIR: |
368 |
output_dir = optarg; |
369 |
break; |
370 |
case OPT_GENE_FILTER: |
371 |
gene_filter = optarg; |
372 |
break; |
373 |
case OPT_GFF_ANNOTATIONS: |
374 |
gff_file = optarg; |
375 |
break; |
376 |
case OPT_MAX_MULTIHITS: |
377 |
max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage); |
378 |
break; |
379 |
case OPT_NO_CLOSURE_SEARCH: |
380 |
no_closure_search = true; |
381 |
break; |
382 |
case OPT_NO_COVERAGE_SEARCH: |
383 |
no_coverage_search = true; |
384 |
break; |
385 |
case OPT_NO_MICROEXON_SEARCH: |
386 |
no_microexon_search = true; |
387 |
break; |
388 |
case OPT_SEGMENT_LENGTH: |
389 |
segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage); |
390 |
break; |
391 |
case OPT_SEGMENT_MISMATCHES: |
392 |
segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage); |
393 |
break; |
394 |
case OPT_MIN_CLOSURE_EXON: |
395 |
min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage); |
396 |
break; |
397 |
case OPT_MIN_CLOSURE_INTRON: |
398 |
min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage); |
399 |
break; |
400 |
case OPT_MAX_CLOSURE_INTRON: |
401 |
max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage); |
402 |
break; |
403 |
case OPT_MIN_COVERAGE_INTRON: |
404 |
min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage); |
405 |
break; |
406 |
case OPT_MAX_COVERAGE_INTRON: |
407 |
max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage); |
408 |
break; |
409 |
case OPT_MIN_SEGMENT_INTRON: |
410 |
min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage); |
411 |
break; |
412 |
case OPT_MAX_SEGMENT_INTRON: |
413 |
max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage); |
414 |
break; |
415 |
case OPT_MIN_REPORT_INTRON: |
416 |
min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage); |
417 |
break; |
418 |
case OPT_MAX_REPORT_INTRON: |
419 |
max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage); |
420 |
break; |
421 |
case OPT_MIN_ISOFORM_FRACTION: |
422 |
min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage); |
423 |
break; |
424 |
case OPT_IUM_READS: |
425 |
ium_reads = optarg; |
426 |
break; |
427 |
case OPT_SAM_HEADER: |
428 |
sam_header = optarg; |
429 |
break; |
430 |
case OPT_SAM_READGROUP_ID: |
431 |
sam_readgroup_id = optarg; |
432 |
break; |
433 |
case OPT_BUTTERFLY_SEARCH: |
434 |
butterfly_search = true; |
435 |
break; |
436 |
case OPT_SOLEXA_QUALS: |
437 |
solexa_quals = true; |
438 |
break; |
439 |
case OPT_PHRED64_QUALS: |
440 |
phred64_quals = true; |
441 |
break; |
442 |
case 'Q': |
443 |
case OPT_QUALS: |
444 |
quals = true; |
445 |
break; |
446 |
case OPT_INTEGER_QUALS: |
447 |
integer_quals = true; |
448 |
break; |
449 |
case 'C': |
450 |
case OPT_COLOR: |
451 |
color = true; |
452 |
break; |
453 |
case OPT_COLOR_OUT: |
454 |
color_out = true; |
455 |
break; |
456 |
case OPT_LIBRARY_TYPE: |
457 |
if (strcmp(optarg, "fr-unstranded") == 0) |
458 |
library_type = FR_UNSTRANDED; |
459 |
else if (strcmp(optarg, "fr-firststrand") == 0) |
460 |
library_type = FR_FIRSTSTRAND; |
461 |
else if (strcmp(optarg, "fr-secondstrand") == 0) |
462 |
library_type = FR_SECONDSTRAND; |
463 |
else if (strcmp(optarg, "ff-unstranded") == 0) |
464 |
library_type = FF_UNSTRANDED; |
465 |
else if (strcmp(optarg, "ff-firststrand") == 0) |
466 |
library_type = FF_FIRSTSTRAND; |
467 |
else if (strcmp(optarg, "ff-secondstrand") == 0) |
468 |
library_type = FF_SECONDSTRAND; |
469 |
break; |
470 |
case OPT_MAX_DELETION_LENGTH: |
471 |
max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage); |
472 |
break; |
473 |
case OPT_MAX_INSERTION_LENGTH: |
474 |
max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage); |
475 |
break; |
476 |
case 'z': |
477 |
case OPT_ZPACKER: |
478 |
zpacker = optarg; |
479 |
break; |
480 |
case OPT_SAMTOOLS: |
481 |
samtools_path = optarg; |
482 |
break; |
483 |
case OPT_AUX_OUT: |
484 |
aux_outfile = optarg; |
485 |
break; |
486 |
case 'p': |
487 |
case OPT_NUM_CPUS: |
488 |
num_cpus=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage); |
489 |
break; |
490 |
case OPT_GTF_JUNCS: |
491 |
gtf_juncs = optarg; |
492 |
break; |
493 |
default: |
494 |
print_usage(); |
495 |
return 1; |
496 |
} |
497 |
} while(next_option != -1); |
498 |
|
499 |
return 0; |
500 |
} |
501 |
|
502 |
|
503 |
// Error routine (prints error message and exits!) |
504 |
void err_exit(const char* format,...){ |
505 |
va_list arguments; |
506 |
va_start(arguments,format); |
507 |
vfprintf(stderr,format,arguments); |
508 |
va_end(arguments); |
509 |
#ifdef DEBUG |
510 |
// trigger a core dump for later inspection |
511 |
abort(); |
512 |
#endif |
513 |
exit(1); |
514 |
} |
515 |
|
516 |
FILE* FZPipe::openRead(const char* fname, string& popencmd) { |
517 |
pipecmd=popencmd; |
518 |
filename=fname; |
519 |
if (pipecmd.empty()) { |
520 |
file=fopen(filename.c_str(), "r"); |
521 |
} |
522 |
else { |
523 |
string pcmd(pipecmd); |
524 |
pcmd.append(" '"); |
525 |
pcmd.append(filename); |
526 |
pcmd.append("'"); |
527 |
file=popen(pcmd.c_str(), "r"); |
528 |
} |
529 |
return file; |
530 |
} |
531 |
|
532 |
FILE* FZPipe::openRead(const char* fname) { |
533 |
string pcmd; |
534 |
return this->openRead(fname,pcmd); |
535 |
} |
536 |
|
537 |
FILE* FZPipe::openWrite(const char* fname, string& popencmd) { |
538 |
pipecmd=popencmd; |
539 |
filename=fname; |
540 |
if (pipecmd.empty()) { |
541 |
file=fopen(filename.c_str(), "w"); |
542 |
} |
543 |
else { |
544 |
string pcmd(pipecmd); |
545 |
pcmd.append(" - > '"); |
546 |
pcmd.append(filename.c_str()); |
547 |
pcmd.append("'"); |
548 |
file=popen(pcmd.c_str(), "w"); |
549 |
} |
550 |
return file; |
551 |
} |
552 |
|
553 |
|
554 |
FILE* FZPipe::openWrite(const char* fname) { |
555 |
string pcmd; |
556 |
return this->openWrite(fname,pcmd); |
557 |
} |
558 |
|
559 |
void FZPipe::rewind() { |
560 |
if (pipecmd.empty()) { |
561 |
if (file!=NULL) { |
562 |
::rewind(file); |
563 |
return; |
564 |
} |
565 |
if (!filename.empty()) { |
566 |
file=fopen(filename.c_str(),"r"); |
567 |
return; |
568 |
} |
569 |
} |
570 |
if (filename.empty()) |
571 |
err_die("Error: FZStream::rewind() failed (missing filename)!\n"); |
572 |
this->close(); |
573 |
string pcmd(pipecmd); |
574 |
pcmd.append(" '"); |
575 |
pcmd.append(filename); |
576 |
pcmd.append("'"); |
577 |
file=popen(pcmd.c_str(), "r"); |
578 |
if (file==NULL) { |
579 |
err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str()); |
580 |
} |
581 |
} |
582 |
|
583 |
|
584 |
string getFext(const string& s) { |
585 |
string r(""); |
586 |
//if (xpos!=NULL) *xpos=0; |
587 |
if (s.empty() || s=="-") return r; |
588 |
int slen=(int)s.length(); |
589 |
int p=s.rfind('.'); |
590 |
int d=s.rfind('/'); |
591 |
if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r; |
592 |
r=s.substr(p+1); |
593 |
//if (xpos!=NULL) *xpos=p+1; |
594 |
for(size_t i=0; i!=r.length(); i++) |
595 |
r[i] = std::tolower(r[i]); |
596 |
return r; |
597 |
} |
598 |
|
599 |
string guess_packer(const string& fname, bool use_all_cpus) { |
600 |
//only needed for the primary input files (given by user) |
601 |
string picmd(""); |
602 |
string fext=getFext(fname); |
603 |
if (fext=="gz" || fext=="gzip" || fext=="z") { |
604 |
if (use_all_cpus && str_endsWith(zpacker,"pigz")) { |
605 |
picmd=zpacker; |
606 |
if (num_cpus<2) picmd.append(" -p1"); |
607 |
else { |
608 |
picmd.append(" -p"); |
609 |
str_appendInt(picmd, num_cpus); |
610 |
//picmd.append(" -cd"); |
611 |
} |
612 |
} |
613 |
else picmd="gzip"; |
614 |
} |
615 |
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
616 |
if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) { |
617 |
picmd=zpacker; |
618 |
if (num_cpus<2) picmd.append(" -p1"); |
619 |
else { |
620 |
picmd.append(" -p"); |
621 |
str_appendInt(picmd, num_cpus); |
622 |
//picmd.append(" -cd"); |
623 |
} |
624 |
} |
625 |
else picmd="bzip2"; |
626 |
} |
627 |
return picmd; |
628 |
} |
629 |
|
630 |
/* |
631 |
string getBam2SamCmd(const string& fname) { |
632 |
string pipecmd(""); |
633 |
string fext=getFext(fname); |
634 |
if (fext=="bam") { |
635 |
pipecmd=samtools_path; |
636 |
pipecmd.append(" view"); |
637 |
} |
638 |
return pipecmd; |
639 |
} |
640 |
*/ |
641 |
|
642 |
void err_die(const char* format,...) { // Error exit |
643 |
va_list arguments; |
644 |
va_start(arguments,format); |
645 |
vfprintf(stderr,format,arguments); |
646 |
va_end(arguments); |
647 |
exit(1); |
648 |
} |
649 |
|
650 |
string getUnpackCmd(const string& fname, bool use_all_cpus) { |
651 |
//prep_reads should use guess_packer() instead |
652 |
string pipecmd(""); |
653 |
if (zpacker.empty() || getFext(fname)!="z") { |
654 |
return pipecmd; |
655 |
} |
656 |
pipecmd=zpacker; |
657 |
if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) { |
658 |
if (use_all_cpus==false) pipecmd.append(" -p1"); |
659 |
else if (num_cpus>1) { |
660 |
pipecmd.append(" -p"); |
661 |
str_appendInt(pipecmd,num_cpus); |
662 |
} |
663 |
} |
664 |
if (!pipecmd.empty()) pipecmd.append(" -cd"); |
665 |
return pipecmd; |
666 |
} |
667 |
|
668 |
void checkSamHeader() { |
669 |
if (sam_header.empty()) |
670 |
err_die("Error: writeSamHeader() with empty sam_header string\n"); |
671 |
//copy the SAM header |
672 |
FILE* fh=fopen(sam_header.c_str(), "r"); |
673 |
if (fh==NULL) |
674 |
err_die("Error: cannot open SAM header file %s\n",sam_header.c_str()); |
675 |
fclose(fh); |
676 |
} |
677 |
|
678 |
void writeSamHeader(FILE* fout) { |
679 |
if (fout==NULL) |
680 |
err_die("Error: writeSamHeader(NULL)\n"); |
681 |
checkSamHeader(); |
682 |
//copy the SAM header |
683 |
FILE* fh=fopen(sam_header.c_str(), "r"); |
684 |
int ch=-1; |
685 |
while ((ch=fgetc(fh))!=EOF) { |
686 |
if (fputc(ch, fout)==EOF) |
687 |
err_die("Error copying SAM header\n"); |
688 |
} |
689 |
fclose(fh); |
690 |
} |
691 |
|
692 |
//auxiliary functions for BAM record handling |
693 |
uint8_t* realloc_bdata(bam1_t *b, int size) { |
694 |
if (b->m_data < size) { |
695 |
b->m_data = size; |
696 |
kroundup32(b->m_data); |
697 |
b->data = (uint8_t*)realloc(b->data, b->m_data); |
698 |
} |
699 |
if (b->data_len<size) b->data_len=size; |
700 |
return b->data; |
701 |
} |
702 |
|
703 |
uint8_t* dupalloc_bdata(bam1_t *b, int size) { |
704 |
//same as realloc_bdata, but does not free previous data |
705 |
//but returns it instead |
706 |
//it ALWAYS duplicates data |
707 |
b->m_data = size; |
708 |
kroundup32(b->m_data); |
709 |
uint8_t* odata=b->data; |
710 |
b->data = (uint8_t*)malloc(b->m_data); |
711 |
memcpy((void*)b->data, (void*)odata, b->data_len); |
712 |
b->data_len=size; |
713 |
return odata; //user must FREE this after |
714 |
} |
715 |
|
716 |
extern unsigned short bam_char2flag_table[]; |
717 |
|
718 |
GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, |
719 |
int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) { |
720 |
novel=true; |
721 |
b=bam_init1(); |
722 |
b->core.tid=gseq_tid; |
723 |
if (pos<=0) { |
724 |
b->core.pos=-1; //unmapped |
725 |
//if (gseq_tid<0) |
726 |
b->core.flag |= BAM_FUNMAP; |
727 |
} |
728 |
else b->core.pos=pos-1; //BAM is 0-based |
729 |
b->core.qual=255; |
730 |
int l_qseq=strlen(qseq); |
731 |
//this may not be accurate, setting CIGAR is the correct way |
732 |
//b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); |
733 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
734 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
735 |
set_cigar(cigar); //this will also set core.bin |
736 |
add_sequence(qseq, l_qseq); |
737 |
add_quals(quals); //quals must be given as Phred33 |
738 |
if (reverse) { b->core.flag |= BAM_FREVERSE ; } |
739 |
} |
740 |
|
741 |
GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
742 |
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
743 |
int insert_size, const char* qseq, const char* quals, |
744 |
const vector<string>* aux_strings) { |
745 |
novel=true; |
746 |
b=bam_init1(); |
747 |
b->core.tid=g_tid; |
748 |
b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based |
749 |
b->core.qual=map_qual; |
750 |
int l_qseq=strlen(qseq); |
751 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
752 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
753 |
set_cigar(cigar); //this will also set core.bin |
754 |
add_sequence(qseq, l_qseq); |
755 |
add_quals(quals); //quals must be given as Phred33 |
756 |
set_flags(flags); |
757 |
set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); |
758 |
if (aux_strings!=NULL) { |
759 |
for (vector<string>::const_iterator itr=aux_strings->begin(); |
760 |
itr!=aux_strings->end(); ++itr) { |
761 |
add_aux(itr->c_str()); |
762 |
} |
763 |
} |
764 |
} |
765 |
void GBamRecord::set_cigar(const char* cigar) { |
766 |
//requires b->core.pos and b->core.flag to have been set properly PRIOR to this call |
767 |
int doff=b->core.l_qname; |
768 |
uint8_t* after_cigar=NULL; |
769 |
int after_cigar_len=0; |
770 |
uint8_t* prev_bdata=NULL; |
771 |
if (b->data_len>doff) { |
772 |
//cigar string already allocated, replace it |
773 |
int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data |
774 |
after_cigar=b->data+d; |
775 |
after_cigar_len=b->data_len-d; |
776 |
} |
777 |
const char *s; |
778 |
char *t; |
779 |
int i, op; |
780 |
long x; |
781 |
b->core.n_cigar = 0; |
782 |
if (cigar != NULL && strcmp(cigar, "*") != 0) { |
783 |
for (s = cigar; *s; ++s) { |
784 |
if (isalpha(*s)) b->core.n_cigar++; |
785 |
else if (!isdigit(*s)) { |
786 |
err_die("Error: invalid CIGAR character (%s)\n",cigar); |
787 |
} |
788 |
} |
789 |
if (after_cigar_len>0) { //replace/insert into existing full data |
790 |
prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); |
791 |
memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); |
792 |
free(prev_bdata); |
793 |
} |
794 |
else { |
795 |
realloc_bdata(b, doff + b->core.n_cigar * 4); |
796 |
} |
797 |
for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { |
798 |
x = strtol(s, &t, 10); |
799 |
op = toupper(*t); |
800 |
if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; |
801 |
else if (op == 'I') op = BAM_CINS; |
802 |
else if (op == 'D') op = BAM_CDEL; |
803 |
else if (op == 'N') op = BAM_CREF_SKIP; |
804 |
else if (op == 'S') op = BAM_CSOFT_CLIP; |
805 |
else if (op == 'H') op = BAM_CHARD_CLIP; |
806 |
else if (op == 'P') op = BAM_CPAD; |
807 |
else err_die("Error: invalid CIGAR operation (%s)\n",cigar); |
808 |
s = t + 1; |
809 |
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; |
810 |
} |
811 |
if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar); |
812 |
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); |
813 |
} else {//no CIGAR string given |
814 |
if (!(b->core.flag&BAM_FUNMAP)) { |
815 |
fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); |
816 |
b->core.flag |= BAM_FUNMAP; |
817 |
} |
818 |
b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); |
819 |
} |
820 |
} //set_cigar() |
821 |
|
822 |
void GBamRecord::add_sequence(const char* qseq, int slen) { |
823 |
//must be called AFTER set_cigar (cannot replace existing sequence for now) |
824 |
if (qseq==NULL) return; //should we ever care about this? |
825 |
if (slen<0) slen=strlen(qseq); |
826 |
int doff = b->core.l_qname + b->core.n_cigar * 4; |
827 |
if (strcmp(qseq, "*")!=0) { |
828 |
b->core.l_qseq=slen; |
829 |
if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) |
830 |
err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n", |
831 |
qseq); |
832 |
uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; |
833 |
//also allocated quals memory |
834 |
memset(p, 0, (b->core.l_qseq+1)/2); |
835 |
for (int i = 0; i < b->core.l_qseq; ++i) |
836 |
p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); |
837 |
} else b->core.l_qseq = 0; |
838 |
} |
839 |
|
840 |
void GBamRecord::add_quals(const char* quals) { |
841 |
//requires core.l_qseq already set |
842 |
//and must be called AFTER add_sequence(), which also allocates the memory for quals |
843 |
uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); |
844 |
if (quals==NULL || strcmp(quals, "*") == 0) { |
845 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; |
846 |
return; |
847 |
} |
848 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; |
849 |
} |
850 |
|
851 |
void GBamRecord::add_aux(const char* str) { |
852 |
//requires: being called AFTER add_quals() |
853 |
static char tag[2]; |
854 |
static uint8_t abuf[512]; |
855 |
//requires: being called AFTER add_quals() |
856 |
int strl=strlen(str); |
857 |
//int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; |
858 |
//int doff0=doff; |
859 |
if (strl < 6 || str[2] != ':' || str[4] != ':') |
860 |
parse_error("missing colon in auxiliary data"); |
861 |
tag[0] = str[0]; tag[1] = str[1]; |
862 |
uint8_t atype = str[3]; |
863 |
uint8_t* adata=abuf; |
864 |
int alen=0; |
865 |
if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility |
866 |
atype='A'; |
867 |
alen=1; |
868 |
adata=(uint8_t*)&str[5]; |
869 |
} |
870 |
else if (atype == 'I' || atype == 'i') { |
871 |
long long x=(long long)atoll(str + 5); |
872 |
if (x < 0) { |
873 |
if (x >= -127) { |
874 |
atype='c'; |
875 |
abuf[0] = (int8_t)x; |
876 |
alen=1; |
877 |
} |
878 |
else if (x >= -32767) { |
879 |
atype = 's'; |
880 |
*(int16_t*)abuf = (int16_t)x; |
881 |
alen=2; |
882 |
} |
883 |
else { |
884 |
atype='i'; |
885 |
*(int32_t*)abuf = (int32_t)x; |
886 |
alen=4; |
887 |
if (x < -2147483648ll) |
888 |
fprintf(stderr, "Parse warning: integer %lld is out of range.", |
889 |
x); |
890 |
} |
891 |
} else { //x >=0 |
892 |
if (x <= 255) { |
893 |
atype = 'C'; |
894 |
abuf[0] = (uint8_t)x; |
895 |
alen=1; |
896 |
} |
897 |
else if (x <= 65535) { |
898 |
atype='S'; |
899 |
*(uint16_t*)abuf = (uint16_t)x; |
900 |
alen=2; |
901 |
} |
902 |
else { |
903 |
atype='I'; |
904 |
*(uint32_t*)abuf = (uint32_t)x; |
905 |
alen=4; |
906 |
if (x > 4294967295ll) |
907 |
fprintf(stderr, "Parse warning: integer %lld is out of range.", |
908 |
x); |
909 |
} |
910 |
} |
911 |
} //integer type |
912 |
else if (atype == 'f') { |
913 |
*(float*)abuf = (float)atof(str + 5); |
914 |
alen = sizeof(float); |
915 |
} |
916 |
else if (atype == 'd') { //? |
917 |
*(float*)abuf = (float)atof(str + 9); |
918 |
alen=8; |
919 |
} |
920 |
else if (atype == 'Z' || atype == 'H') { |
921 |
if (atype == 'H') { // check whether the hex string is valid |
922 |
if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); |
923 |
for (int i = 0; i < strl - 5; ++i) { |
924 |
int c = toupper(str[5 + i]); |
925 |
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
926 |
parse_error("invalid hex character"); |
927 |
} |
928 |
} |
929 |
memcpy(abuf, str + 5, strl - 5); |
930 |
abuf[strl-5] = 0; |
931 |
alen=strl-4; |
932 |
} else parse_error("unrecognized aux type"); |
933 |
this->add_aux(tag, atype, alen, adata); |
934 |
}//add_aux() |