1 |
gpertea |
29 |
/* |
2 |
|
|
* common.cpp |
3 |
|
|
* TopHat |
4 |
|
|
* |
5 |
|
|
* Created by Cole Trapnell on 11/26/08. |
6 |
|
|
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
|
|
* |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
#ifdef HAVE_CONFIG_H |
11 |
|
|
#include <config.h> |
12 |
|
|
#endif |
13 |
|
|
|
14 |
|
|
#include <iostream> |
15 |
|
|
#include <sstream> |
16 |
|
|
#include <cstdarg> |
17 |
gpertea |
154 |
#include <limits> |
18 |
gpertea |
29 |
#include <getopt.h> |
19 |
|
|
|
20 |
|
|
#include "common.h" |
21 |
|
|
|
22 |
|
|
using namespace std; |
23 |
|
|
|
24 |
|
|
#ifdef MEM_DEBUG |
25 |
|
|
//function for debugging memory usage of current program in Linux |
26 |
|
|
|
27 |
|
|
#include <unistd.h> |
28 |
|
|
#include <ios> |
29 |
|
|
#include <fstream> |
30 |
|
|
|
31 |
|
|
////////////////////////////////////////////////////////////////////////////// |
32 |
|
|
// process_mem_usage(double &, double &) - takes two doubles by reference, |
33 |
|
|
// attempts to read the system-dependent data for a process' virtual memory |
34 |
|
|
// size and resident set size, and return the results in KB. |
35 |
|
|
// |
36 |
|
|
// On failure, returns 0.0, 0.0 |
37 |
|
|
|
38 |
|
|
void process_mem_usage(double& vm_usage, double& resident_set) { |
39 |
|
|
using std::ios_base; |
40 |
|
|
using std::ifstream; |
41 |
|
|
using std::string; |
42 |
|
|
vm_usage = 0.0; |
43 |
|
|
resident_set = 0.0; |
44 |
|
|
// 'file' stat seems to give the most reliable results |
45 |
|
|
ifstream stat_stream("/proc/self/stat",ios_base::in); |
46 |
|
|
// dummy vars for leading entries in stat that we don't care about |
47 |
|
|
string pid, comm, state, ppid, pgrp, session, tty_nr; |
48 |
|
|
string tpgid, flags, minflt, cminflt, majflt, cmajflt; |
49 |
|
|
string utime, stime, cutime, cstime, priority, nice; |
50 |
|
|
string O, itrealvalue, starttime; |
51 |
|
|
|
52 |
|
|
// the two fields we want |
53 |
|
|
// |
54 |
|
|
unsigned long vsize; |
55 |
|
|
long rss; |
56 |
|
|
|
57 |
|
|
stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr |
58 |
|
|
>> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt |
59 |
|
|
>> utime >> stime >> cutime >> cstime >> priority >> nice |
60 |
|
|
>> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest |
61 |
|
|
|
62 |
|
|
stat_stream.close(); |
63 |
|
|
|
64 |
|
|
long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages |
65 |
|
|
vm_usage = vsize / 1024.0; |
66 |
|
|
resident_set = rss * page_size_kb; |
67 |
|
|
} |
68 |
|
|
|
69 |
|
|
void print_mem_usage() { |
70 |
|
|
double vs, rs; |
71 |
|
|
process_mem_usage(vs,rs); |
72 |
|
|
vs/=1024; |
73 |
|
|
rs/=1024; |
74 |
|
|
fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs); |
75 |
|
|
} |
76 |
|
|
#endif |
77 |
|
|
|
78 |
gpertea |
154 |
bool bowtie2 = true; |
79 |
|
|
int bowtie2_min_score = -10; |
80 |
|
|
int max_segment_mapping = 20; |
81 |
gpertea |
29 |
|
82 |
gpertea |
154 |
// daehwan - temporary |
83 |
|
|
bool parallel = true; |
84 |
|
|
|
85 |
gpertea |
29 |
unsigned int max_insertion_length = 3; |
86 |
|
|
unsigned int max_deletion_length = 3; |
87 |
|
|
|
88 |
|
|
|
89 |
|
|
int inner_dist_mean = 200; |
90 |
|
|
int inner_dist_std_dev = 20; |
91 |
|
|
int max_mate_inner_dist = -1; |
92 |
|
|
|
93 |
|
|
int min_anchor_len = 8; |
94 |
|
|
int min_report_intron_length = 50; |
95 |
|
|
int max_report_intron_length = 500000; |
96 |
|
|
|
97 |
|
|
int min_closure_intron_length = 50; |
98 |
|
|
int max_closure_intron_length = 5000; |
99 |
|
|
|
100 |
|
|
int min_coverage_intron_length = 50; |
101 |
|
|
int max_coverage_intron_length = 20000; |
102 |
|
|
|
103 |
|
|
int min_segment_intron_length = 50; |
104 |
|
|
int max_segment_intron_length = 500000; |
105 |
|
|
|
106 |
|
|
uint32_t min_closure_exon_length = 100; |
107 |
|
|
|
108 |
|
|
int island_extension = 25; |
109 |
|
|
int segment_length = 25; |
110 |
|
|
int segment_mismatches = 2; |
111 |
gpertea |
135 |
int max_read_mismatches = 2; |
112 |
gpertea |
29 |
int max_splice_mismatches = 1; |
113 |
|
|
|
114 |
|
|
ReadFormat reads_format = FASTQ; |
115 |
|
|
|
116 |
|
|
bool verbose = false; |
117 |
|
|
|
118 |
gpertea |
135 |
unsigned int max_multihits = 40; |
119 |
gpertea |
29 |
bool no_closure_search = false; |
120 |
|
|
bool no_coverage_search = false; |
121 |
|
|
bool no_microexon_search = false; |
122 |
|
|
bool butterfly_search = false; |
123 |
gpertea |
154 |
int num_threads = 1; |
124 |
|
|
|
125 |
gpertea |
29 |
float min_isoform_fraction = 0.15f; |
126 |
|
|
|
127 |
|
|
string output_dir = "tophat_out"; |
128 |
|
|
string aux_outfile = ""; //auxiliary output file name (e.g. prep_reads read stats) |
129 |
gpertea |
154 |
string index_outfile = ""; |
130 |
gpertea |
29 |
string gene_filter = ""; |
131 |
|
|
string gff_file = ""; |
132 |
|
|
string ium_reads = ""; |
133 |
|
|
string sam_header = ""; |
134 |
|
|
string sam_readgroup_id = ""; |
135 |
|
|
string zpacker = ""; |
136 |
|
|
string samtools_path = "samtools"; |
137 |
|
|
|
138 |
|
|
bool solexa_quals = false; |
139 |
|
|
bool phred64_quals = false; |
140 |
|
|
bool quals = false; |
141 |
|
|
bool integer_quals = false; |
142 |
|
|
bool color = false; |
143 |
|
|
bool color_out = false; |
144 |
|
|
|
145 |
|
|
string gtf_juncs = ""; |
146 |
|
|
|
147 |
gpertea |
135 |
string flt_reads = ""; |
148 |
|
|
string flt_mappings = ""; |
149 |
|
|
|
150 |
gpertea |
154 |
bool fusion_search = false; |
151 |
|
|
size_t fusion_anchor_length = 20; |
152 |
|
|
size_t fusion_min_dist = 10000000; |
153 |
|
|
size_t fusion_read_mismatches = 2; |
154 |
|
|
size_t fusion_multireads = 2; |
155 |
|
|
size_t fusion_multipairs = 2; |
156 |
|
|
|
157 |
gpertea |
29 |
eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE; |
158 |
|
|
|
159 |
|
|
extern void print_usage(); |
160 |
|
|
|
161 |
|
|
/** |
162 |
|
|
* Parse an int out of optarg and enforce that it be at least 'lower'; |
163 |
|
|
* if it is less than 'lower', than output the given error message and |
164 |
|
|
* exit with an error and a usage message. |
165 |
|
|
*/ |
166 |
|
|
|
167 |
|
|
int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) { |
168 |
|
|
long l; |
169 |
|
|
char *endPtr= NULL; |
170 |
|
|
l = strtol(optarg, &endPtr, 10); |
171 |
|
|
if (endPtr != NULL) { |
172 |
|
|
if (l < lower) { |
173 |
|
|
cerr << errmsg << endl; |
174 |
|
|
print_usage(); |
175 |
|
|
exit(1); |
176 |
|
|
} |
177 |
|
|
return (int32_t)l; |
178 |
|
|
} |
179 |
|
|
cerr << errmsg << endl; |
180 |
|
|
print_usage(); |
181 |
|
|
exit(1); |
182 |
|
|
return -1; |
183 |
|
|
} |
184 |
|
|
|
185 |
|
|
/** |
186 |
|
|
* Parse an int out of optarg and enforce that it be at least 'lower'; |
187 |
|
|
* if it is less than 'lower', than output the given error message and |
188 |
|
|
* exit with an error and a usage message. |
189 |
|
|
*/ |
190 |
|
|
static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) { |
191 |
|
|
float l; |
192 |
|
|
l = (float)atof(optarg); |
193 |
|
|
|
194 |
|
|
if (l < lower) { |
195 |
|
|
cerr << errmsg << endl; |
196 |
|
|
print_usage(); |
197 |
|
|
exit(1); |
198 |
|
|
} |
199 |
|
|
|
200 |
|
|
if (l > upper) |
201 |
|
|
{ |
202 |
|
|
cerr << errmsg << endl; |
203 |
|
|
print_usage(); |
204 |
|
|
exit(1); |
205 |
|
|
} |
206 |
|
|
|
207 |
|
|
return l; |
208 |
|
|
|
209 |
|
|
cerr << errmsg << endl; |
210 |
|
|
print_usage(); |
211 |
|
|
exit(1); |
212 |
|
|
return -1; |
213 |
|
|
} |
214 |
|
|
|
215 |
|
|
/* |
216 |
|
|
this is from |
217 |
|
|
http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html |
218 |
|
|
*/ |
219 |
|
|
char* get_token(char** str, const char* delims) |
220 |
|
|
{ |
221 |
|
|
char* token; |
222 |
|
|
if (*str == NULL) |
223 |
|
|
return NULL; |
224 |
|
|
|
225 |
|
|
token = *str; |
226 |
|
|
while (**str != '\0') |
227 |
|
|
{ |
228 |
|
|
if (strchr(delims, **str) != NULL) |
229 |
|
|
{ |
230 |
|
|
**str = '\0'; |
231 |
|
|
++(*str); |
232 |
|
|
return token; |
233 |
|
|
} |
234 |
|
|
|
235 |
|
|
++(*str); |
236 |
|
|
} |
237 |
|
|
|
238 |
|
|
*str = NULL; |
239 |
|
|
return token; |
240 |
|
|
} |
241 |
|
|
|
242 |
gpertea |
135 |
const char *short_options = "QCp:z:N:"; |
243 |
gpertea |
29 |
|
244 |
|
|
enum |
245 |
|
|
{ |
246 |
|
|
OPT_FASTA = 127, |
247 |
|
|
OPT_FASTQ, |
248 |
|
|
OPT_MIN_ANCHOR, |
249 |
|
|
OPT_SPLICE_MISMATCHES, |
250 |
|
|
OPT_VERBOSE, |
251 |
|
|
OPT_INSERT_LENGTH_MEAN, |
252 |
|
|
OPT_INSERT_LENGTH_STD_DEV, |
253 |
|
|
OPT_MIN_ISOFORM_FRACTION, |
254 |
|
|
OPT_OUTPUT_DIR, |
255 |
|
|
OPT_GENE_FILTER, |
256 |
|
|
OPT_GFF_ANNOTATIONS, |
257 |
|
|
OPT_MAX_MULTIHITS, |
258 |
|
|
OPT_NO_CLOSURE_SEARCH, |
259 |
|
|
OPT_NO_COVERAGE_SEARCH, |
260 |
|
|
OPT_NO_MICROEXON_SEARCH, |
261 |
|
|
OPT_SEGMENT_LENGTH, |
262 |
gpertea |
135 |
OPT_READ_MISMATCHES, |
263 |
gpertea |
29 |
OPT_SEGMENT_MISMATCHES, |
264 |
|
|
OPT_MIN_CLOSURE_EXON, |
265 |
|
|
OPT_MAX_CLOSURE_INTRON, |
266 |
|
|
OPT_MIN_CLOSURE_INTRON, |
267 |
|
|
OPT_MAX_COVERAGE_INTRON, |
268 |
|
|
OPT_MIN_COVERAGE_INTRON, |
269 |
|
|
OPT_MIN_SEGMENT_INTRON, |
270 |
|
|
OPT_MAX_SEGMENT_INTRON, |
271 |
|
|
OPT_MIN_REPORT_INTRON, |
272 |
|
|
OPT_MAX_REPORT_INTRON, |
273 |
|
|
OPT_IUM_READS, |
274 |
|
|
OPT_BUTTERFLY_SEARCH, |
275 |
|
|
OPT_SOLEXA_QUALS, |
276 |
|
|
OPT_PHRED64_QUALS, |
277 |
|
|
OPT_SAM_HEADER, |
278 |
|
|
OPT_SAM_READGROUP_ID, |
279 |
|
|
OPT_QUALS, |
280 |
|
|
OPT_INTEGER_QUALS, |
281 |
|
|
OPT_COLOR, |
282 |
|
|
OPT_COLOR_OUT, |
283 |
|
|
OPT_LIBRARY_TYPE, |
284 |
|
|
OPT_MAX_DELETION_LENGTH, |
285 |
|
|
OPT_MAX_INSERTION_LENGTH, |
286 |
gpertea |
154 |
OPT_NUM_THREADS, |
287 |
gpertea |
29 |
OPT_ZPACKER, |
288 |
|
|
OPT_SAMTOOLS, |
289 |
|
|
OPT_AUX_OUT, |
290 |
gpertea |
154 |
OPT_INDEX_OUT, |
291 |
gpertea |
135 |
OPT_GTF_JUNCS, |
292 |
|
|
OPT_FILTER_READS, |
293 |
gpertea |
154 |
OPT_FILTER_HITS, |
294 |
|
|
OPT_FUSION_SEARCH, |
295 |
|
|
OPT_FUSION_ANCHOR_LENGTH, |
296 |
|
|
OPT_FUSION_MIN_DIST, |
297 |
|
|
OPT_FUSION_READ_MISMATCHES, |
298 |
|
|
OPT_FUSION_MULTIREADS, |
299 |
|
|
OPT_FUSION_MULTIPAIRS, |
300 |
|
|
OPT_BOWTIE1, |
301 |
|
|
OPT_BOWTIE2_MIN_SCORE, |
302 |
gpertea |
29 |
}; |
303 |
|
|
|
304 |
|
|
static struct option long_options[] = { |
305 |
|
|
{"fasta", no_argument, 0, OPT_FASTA}, |
306 |
|
|
{"fastq", no_argument, 0, OPT_FASTQ}, |
307 |
|
|
{"min-anchor", required_argument, 0, OPT_MIN_ANCHOR}, |
308 |
|
|
{"sam-header", required_argument, 0, OPT_SAM_HEADER}, |
309 |
|
|
{"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID}, |
310 |
|
|
{"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES}, |
311 |
|
|
{"verbose", no_argument, 0, OPT_VERBOSE}, |
312 |
|
|
{"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN}, |
313 |
|
|
{"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV}, |
314 |
|
|
{"output-dir", required_argument, 0, OPT_OUTPUT_DIR}, |
315 |
|
|
{"gene-filter", required_argument, 0, OPT_GENE_FILTER}, |
316 |
|
|
{"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS}, |
317 |
|
|
{"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS}, |
318 |
|
|
{"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH}, |
319 |
|
|
{"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH}, |
320 |
|
|
{"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH}, |
321 |
|
|
{"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH}, |
322 |
|
|
{"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES}, |
323 |
gpertea |
135 |
{"max-mismatches", required_argument, 0, OPT_READ_MISMATCHES}, |
324 |
gpertea |
29 |
{"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON}, |
325 |
|
|
{"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON}, |
326 |
|
|
{"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON}, |
327 |
|
|
{"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON}, |
328 |
|
|
{"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON}, |
329 |
|
|
{"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON}, |
330 |
|
|
{"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON}, |
331 |
|
|
{"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON}, |
332 |
|
|
{"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON}, |
333 |
|
|
{"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION}, |
334 |
|
|
{"ium-reads", required_argument, 0, OPT_IUM_READS}, |
335 |
|
|
{"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH}, |
336 |
|
|
{"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS}, |
337 |
|
|
{"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS}, |
338 |
|
|
{"quals", no_argument, 0, OPT_QUALS}, |
339 |
|
|
{"integer-quals", no_argument, 0, OPT_INTEGER_QUALS}, |
340 |
|
|
{"color", no_argument, 0, OPT_COLOR}, |
341 |
|
|
{"color-out", no_argument, 0, OPT_COLOR_OUT}, |
342 |
|
|
{"library-type", required_argument, 0, OPT_LIBRARY_TYPE}, |
343 |
|
|
{"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH}, |
344 |
|
|
{"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH}, |
345 |
gpertea |
154 |
{"num-threads", required_argument, 0, OPT_NUM_THREADS}, |
346 |
gpertea |
29 |
{"zpacker", required_argument, 0, OPT_ZPACKER}, |
347 |
|
|
{"samtools", required_argument, 0, OPT_SAMTOOLS}, |
348 |
|
|
{"aux-outfile", required_argument, 0, OPT_AUX_OUT}, |
349 |
gpertea |
154 |
{"index-outfile", required_argument, 0, OPT_INDEX_OUT}, |
350 |
gpertea |
29 |
{"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS}, |
351 |
gpertea |
135 |
{"flt-reads",required_argument, 0, OPT_FILTER_READS}, |
352 |
|
|
{"flt-hits",required_argument, 0, OPT_FILTER_HITS}, |
353 |
gpertea |
154 |
{"fusion-search", no_argument, 0, OPT_FUSION_SEARCH}, |
354 |
|
|
{"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH}, |
355 |
|
|
{"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST}, |
356 |
|
|
{"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES}, |
357 |
|
|
{"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS}, |
358 |
|
|
{"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS}, |
359 |
|
|
{"bowtie1", no_argument, 0, OPT_BOWTIE1}, |
360 |
|
|
{"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE}, |
361 |
gpertea |
29 |
{0, 0, 0, 0} // terminator |
362 |
|
|
}; |
363 |
|
|
|
364 |
|
|
|
365 |
|
|
void str_appendInt(string& str, int v) { |
366 |
|
|
stringstream ss; |
367 |
|
|
ss << v; |
368 |
|
|
str.append(ss.str()); |
369 |
|
|
} |
370 |
|
|
|
371 |
|
|
bool str_endsWith(string& str, const char* suffix) { |
372 |
|
|
if (str.empty() || str.length()<3) return false; |
373 |
|
|
size_t l=strlen(suffix); |
374 |
|
|
if (str.length()<=l) return false; |
375 |
|
|
if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true; |
376 |
|
|
return false; |
377 |
|
|
} |
378 |
|
|
|
379 |
|
|
int parse_options(int argc, char** argv, void (*print_usage)()) |
380 |
|
|
{ |
381 |
|
|
int option_index = 0; |
382 |
|
|
int next_option; |
383 |
|
|
do { |
384 |
|
|
next_option = getopt_long(argc, argv, short_options, long_options, &option_index); |
385 |
|
|
switch (next_option) { |
386 |
|
|
case -1: |
387 |
|
|
break; |
388 |
|
|
case OPT_FASTA: |
389 |
|
|
reads_format = FASTA; |
390 |
|
|
break; |
391 |
|
|
case OPT_FASTQ: |
392 |
|
|
reads_format = FASTQ; |
393 |
|
|
break; |
394 |
|
|
case OPT_MIN_ANCHOR: |
395 |
|
|
min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage); |
396 |
|
|
break; |
397 |
|
|
case OPT_SPLICE_MISMATCHES: |
398 |
|
|
max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage); |
399 |
|
|
break; |
400 |
|
|
case OPT_VERBOSE: |
401 |
|
|
verbose = true; |
402 |
|
|
break; |
403 |
|
|
case OPT_INSERT_LENGTH_MEAN: |
404 |
|
|
inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage); |
405 |
|
|
break; |
406 |
|
|
case OPT_INSERT_LENGTH_STD_DEV: |
407 |
|
|
inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage); |
408 |
|
|
break; |
409 |
|
|
case OPT_OUTPUT_DIR: |
410 |
|
|
output_dir = optarg; |
411 |
|
|
break; |
412 |
|
|
case OPT_GENE_FILTER: |
413 |
|
|
gene_filter = optarg; |
414 |
|
|
break; |
415 |
|
|
case OPT_GFF_ANNOTATIONS: |
416 |
|
|
gff_file = optarg; |
417 |
|
|
break; |
418 |
|
|
case OPT_MAX_MULTIHITS: |
419 |
|
|
max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage); |
420 |
|
|
break; |
421 |
|
|
case OPT_NO_CLOSURE_SEARCH: |
422 |
|
|
no_closure_search = true; |
423 |
|
|
break; |
424 |
|
|
case OPT_NO_COVERAGE_SEARCH: |
425 |
|
|
no_coverage_search = true; |
426 |
|
|
break; |
427 |
|
|
case OPT_NO_MICROEXON_SEARCH: |
428 |
|
|
no_microexon_search = true; |
429 |
|
|
break; |
430 |
|
|
case OPT_SEGMENT_LENGTH: |
431 |
|
|
segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage); |
432 |
|
|
break; |
433 |
|
|
case OPT_SEGMENT_MISMATCHES: |
434 |
|
|
segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage); |
435 |
|
|
break; |
436 |
gpertea |
135 |
case 'N': |
437 |
|
|
case OPT_READ_MISMATCHES: |
438 |
|
|
max_read_mismatches = parseIntOpt(0, "--max-mismatches arg must be at least 0", print_usage); |
439 |
|
|
break; |
440 |
gpertea |
29 |
case OPT_MIN_CLOSURE_EXON: |
441 |
|
|
min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage); |
442 |
|
|
break; |
443 |
|
|
case OPT_MIN_CLOSURE_INTRON: |
444 |
|
|
min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage); |
445 |
|
|
break; |
446 |
|
|
case OPT_MAX_CLOSURE_INTRON: |
447 |
|
|
max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage); |
448 |
|
|
break; |
449 |
|
|
case OPT_MIN_COVERAGE_INTRON: |
450 |
|
|
min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage); |
451 |
|
|
break; |
452 |
|
|
case OPT_MAX_COVERAGE_INTRON: |
453 |
|
|
max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage); |
454 |
|
|
break; |
455 |
|
|
case OPT_MIN_SEGMENT_INTRON: |
456 |
|
|
min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage); |
457 |
|
|
break; |
458 |
|
|
case OPT_MAX_SEGMENT_INTRON: |
459 |
|
|
max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage); |
460 |
|
|
break; |
461 |
|
|
case OPT_MIN_REPORT_INTRON: |
462 |
|
|
min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage); |
463 |
|
|
break; |
464 |
|
|
case OPT_MAX_REPORT_INTRON: |
465 |
|
|
max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage); |
466 |
|
|
break; |
467 |
|
|
case OPT_MIN_ISOFORM_FRACTION: |
468 |
|
|
min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage); |
469 |
|
|
break; |
470 |
|
|
case OPT_IUM_READS: |
471 |
|
|
ium_reads = optarg; |
472 |
|
|
break; |
473 |
|
|
case OPT_SAM_HEADER: |
474 |
|
|
sam_header = optarg; |
475 |
|
|
break; |
476 |
|
|
case OPT_SAM_READGROUP_ID: |
477 |
|
|
sam_readgroup_id = optarg; |
478 |
|
|
break; |
479 |
|
|
case OPT_BUTTERFLY_SEARCH: |
480 |
|
|
butterfly_search = true; |
481 |
|
|
break; |
482 |
|
|
case OPT_SOLEXA_QUALS: |
483 |
|
|
solexa_quals = true; |
484 |
|
|
break; |
485 |
|
|
case OPT_PHRED64_QUALS: |
486 |
|
|
phred64_quals = true; |
487 |
|
|
break; |
488 |
|
|
case 'Q': |
489 |
|
|
case OPT_QUALS: |
490 |
|
|
quals = true; |
491 |
|
|
break; |
492 |
|
|
case OPT_INTEGER_QUALS: |
493 |
|
|
integer_quals = true; |
494 |
|
|
break; |
495 |
|
|
case 'C': |
496 |
|
|
case OPT_COLOR: |
497 |
|
|
color = true; |
498 |
|
|
break; |
499 |
|
|
case OPT_COLOR_OUT: |
500 |
|
|
color_out = true; |
501 |
|
|
break; |
502 |
|
|
case OPT_LIBRARY_TYPE: |
503 |
|
|
if (strcmp(optarg, "fr-unstranded") == 0) |
504 |
|
|
library_type = FR_UNSTRANDED; |
505 |
|
|
else if (strcmp(optarg, "fr-firststrand") == 0) |
506 |
|
|
library_type = FR_FIRSTSTRAND; |
507 |
|
|
else if (strcmp(optarg, "fr-secondstrand") == 0) |
508 |
|
|
library_type = FR_SECONDSTRAND; |
509 |
|
|
else if (strcmp(optarg, "ff-unstranded") == 0) |
510 |
|
|
library_type = FF_UNSTRANDED; |
511 |
|
|
else if (strcmp(optarg, "ff-firststrand") == 0) |
512 |
|
|
library_type = FF_FIRSTSTRAND; |
513 |
|
|
else if (strcmp(optarg, "ff-secondstrand") == 0) |
514 |
|
|
library_type = FF_SECONDSTRAND; |
515 |
|
|
break; |
516 |
|
|
case OPT_MAX_DELETION_LENGTH: |
517 |
|
|
max_deletion_length = parseIntOpt(0, "--max-deletion-length must be at least 0", print_usage); |
518 |
|
|
break; |
519 |
|
|
case OPT_MAX_INSERTION_LENGTH: |
520 |
|
|
max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage); |
521 |
|
|
break; |
522 |
|
|
case 'z': |
523 |
|
|
case OPT_ZPACKER: |
524 |
|
|
zpacker = optarg; |
525 |
|
|
break; |
526 |
|
|
case OPT_SAMTOOLS: |
527 |
|
|
samtools_path = optarg; |
528 |
|
|
break; |
529 |
|
|
case OPT_AUX_OUT: |
530 |
|
|
aux_outfile = optarg; |
531 |
|
|
break; |
532 |
gpertea |
154 |
case OPT_INDEX_OUT: |
533 |
|
|
index_outfile = optarg; |
534 |
|
|
break; |
535 |
gpertea |
29 |
case 'p': |
536 |
gpertea |
154 |
case OPT_NUM_THREADS: |
537 |
|
|
num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage); |
538 |
gpertea |
29 |
break; |
539 |
|
|
case OPT_GTF_JUNCS: |
540 |
|
|
gtf_juncs = optarg; |
541 |
|
|
break; |
542 |
gpertea |
135 |
case OPT_FILTER_READS: |
543 |
|
|
flt_reads = optarg; |
544 |
|
|
break; |
545 |
|
|
case OPT_FILTER_HITS: |
546 |
|
|
flt_mappings = optarg; |
547 |
|
|
break; |
548 |
gpertea |
154 |
case OPT_FUSION_SEARCH: |
549 |
|
|
fusion_search = true; |
550 |
|
|
break; |
551 |
|
|
case OPT_FUSION_ANCHOR_LENGTH: |
552 |
|
|
fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage); |
553 |
|
|
break; |
554 |
|
|
case OPT_FUSION_MIN_DIST: |
555 |
|
|
fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage); |
556 |
|
|
break; |
557 |
|
|
case OPT_FUSION_READ_MISMATCHES: |
558 |
|
|
fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage); |
559 |
|
|
break; |
560 |
|
|
case OPT_FUSION_MULTIREADS: |
561 |
|
|
fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage); |
562 |
|
|
break; |
563 |
|
|
case OPT_FUSION_MULTIPAIRS: |
564 |
|
|
fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage); |
565 |
|
|
break; |
566 |
|
|
case OPT_BOWTIE1: |
567 |
|
|
bowtie2 = false; |
568 |
|
|
break; |
569 |
|
|
case OPT_BOWTIE2_MIN_SCORE: |
570 |
|
|
bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage); |
571 |
|
|
break; |
572 |
gpertea |
29 |
default: |
573 |
|
|
print_usage(); |
574 |
|
|
return 1; |
575 |
|
|
} |
576 |
|
|
} while(next_option != -1); |
577 |
|
|
|
578 |
|
|
return 0; |
579 |
|
|
} |
580 |
|
|
|
581 |
|
|
|
582 |
|
|
// Error routine (prints error message and exits!) |
583 |
|
|
void err_exit(const char* format,...){ |
584 |
|
|
va_list arguments; |
585 |
|
|
va_start(arguments,format); |
586 |
|
|
vfprintf(stderr,format,arguments); |
587 |
|
|
va_end(arguments); |
588 |
|
|
#ifdef DEBUG |
589 |
|
|
// trigger a core dump for later inspection |
590 |
|
|
abort(); |
591 |
|
|
#endif |
592 |
|
|
exit(1); |
593 |
|
|
} |
594 |
|
|
|
595 |
|
|
FILE* FZPipe::openRead(const char* fname, string& popencmd) { |
596 |
|
|
pipecmd=popencmd; |
597 |
|
|
filename=fname; |
598 |
|
|
if (pipecmd.empty()) { |
599 |
|
|
file=fopen(filename.c_str(), "r"); |
600 |
|
|
} |
601 |
|
|
else { |
602 |
|
|
string pcmd(pipecmd); |
603 |
|
|
pcmd.append(" '"); |
604 |
|
|
pcmd.append(filename); |
605 |
|
|
pcmd.append("'"); |
606 |
|
|
file=popen(pcmd.c_str(), "r"); |
607 |
|
|
} |
608 |
|
|
return file; |
609 |
|
|
} |
610 |
|
|
|
611 |
|
|
FILE* FZPipe::openRead(const char* fname) { |
612 |
|
|
string pcmd; |
613 |
|
|
return this->openRead(fname,pcmd); |
614 |
|
|
} |
615 |
|
|
|
616 |
|
|
FILE* FZPipe::openWrite(const char* fname, string& popencmd) { |
617 |
|
|
pipecmd=popencmd; |
618 |
|
|
filename=fname; |
619 |
|
|
if (pipecmd.empty()) { |
620 |
|
|
file=fopen(filename.c_str(), "w"); |
621 |
|
|
} |
622 |
|
|
else { |
623 |
|
|
string pcmd(pipecmd); |
624 |
|
|
pcmd.append(" - > '"); |
625 |
|
|
pcmd.append(filename.c_str()); |
626 |
|
|
pcmd.append("'"); |
627 |
|
|
file=popen(pcmd.c_str(), "w"); |
628 |
|
|
} |
629 |
|
|
return file; |
630 |
|
|
} |
631 |
|
|
|
632 |
|
|
|
633 |
|
|
FILE* FZPipe::openWrite(const char* fname) { |
634 |
|
|
string pcmd; |
635 |
|
|
return this->openWrite(fname,pcmd); |
636 |
|
|
} |
637 |
|
|
|
638 |
|
|
void FZPipe::rewind() { |
639 |
|
|
if (pipecmd.empty()) { |
640 |
|
|
if (file!=NULL) { |
641 |
|
|
::rewind(file); |
642 |
|
|
return; |
643 |
|
|
} |
644 |
|
|
if (!filename.empty()) { |
645 |
|
|
file=fopen(filename.c_str(),"r"); |
646 |
|
|
return; |
647 |
|
|
} |
648 |
|
|
} |
649 |
|
|
if (filename.empty()) |
650 |
|
|
err_die("Error: FZStream::rewind() failed (missing filename)!\n"); |
651 |
|
|
this->close(); |
652 |
|
|
string pcmd(pipecmd); |
653 |
|
|
pcmd.append(" '"); |
654 |
|
|
pcmd.append(filename); |
655 |
|
|
pcmd.append("'"); |
656 |
|
|
file=popen(pcmd.c_str(), "r"); |
657 |
|
|
if (file==NULL) { |
658 |
|
|
err_die("Error: FZStream::rewind() popen(%s) failed!\n",pcmd.c_str()); |
659 |
|
|
} |
660 |
|
|
} |
661 |
|
|
|
662 |
|
|
|
663 |
|
|
string getFext(const string& s) { |
664 |
|
|
string r(""); |
665 |
|
|
//if (xpos!=NULL) *xpos=0; |
666 |
|
|
if (s.empty() || s=="-") return r; |
667 |
|
|
int slen=(int)s.length(); |
668 |
|
|
int p=s.rfind('.'); |
669 |
|
|
int d=s.rfind('/'); |
670 |
|
|
if (p<=0 || p>slen-2 || p<slen-7 || p<d) return r; |
671 |
|
|
r=s.substr(p+1); |
672 |
|
|
//if (xpos!=NULL) *xpos=p+1; |
673 |
|
|
for(size_t i=0; i!=r.length(); i++) |
674 |
|
|
r[i] = std::tolower(r[i]); |
675 |
|
|
return r; |
676 |
|
|
} |
677 |
|
|
|
678 |
|
|
string guess_packer(const string& fname, bool use_all_cpus) { |
679 |
|
|
//only needed for the primary input files (given by user) |
680 |
|
|
string picmd(""); |
681 |
|
|
string fext=getFext(fname); |
682 |
gpertea |
32 |
if (fext=="bam") { |
683 |
|
|
picmd="bam2fastx"; |
684 |
|
|
return picmd; |
685 |
|
|
} |
686 |
gpertea |
29 |
if (fext=="gz" || fext=="gzip" || fext=="z") { |
687 |
|
|
if (use_all_cpus && str_endsWith(zpacker,"pigz")) { |
688 |
|
|
picmd=zpacker; |
689 |
gpertea |
154 |
if (num_threads<2) picmd.append(" -p1"); |
690 |
gpertea |
29 |
else { |
691 |
|
|
picmd.append(" -p"); |
692 |
gpertea |
154 |
str_appendInt(picmd, num_threads); |
693 |
gpertea |
29 |
//picmd.append(" -cd"); |
694 |
|
|
} |
695 |
|
|
} |
696 |
|
|
else picmd="gzip"; |
697 |
|
|
} |
698 |
|
|
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
699 |
|
|
if (use_all_cpus && str_endsWith(zpacker,"pbzip2")) { |
700 |
|
|
picmd=zpacker; |
701 |
gpertea |
154 |
if (num_threads<2) picmd.append(" -p1"); |
702 |
gpertea |
29 |
else { |
703 |
|
|
picmd.append(" -p"); |
704 |
gpertea |
154 |
str_appendInt(picmd, num_threads); |
705 |
gpertea |
29 |
//picmd.append(" -cd"); |
706 |
|
|
} |
707 |
|
|
} |
708 |
|
|
else picmd="bzip2"; |
709 |
|
|
} |
710 |
|
|
return picmd; |
711 |
|
|
} |
712 |
|
|
|
713 |
|
|
/* |
714 |
|
|
string getBam2SamCmd(const string& fname) { |
715 |
|
|
string pipecmd(""); |
716 |
|
|
string fext=getFext(fname); |
717 |
|
|
if (fext=="bam") { |
718 |
|
|
pipecmd=samtools_path; |
719 |
|
|
pipecmd.append(" view"); |
720 |
|
|
} |
721 |
|
|
return pipecmd; |
722 |
|
|
} |
723 |
|
|
*/ |
724 |
|
|
|
725 |
|
|
void err_die(const char* format,...) { // Error exit |
726 |
|
|
va_list arguments; |
727 |
|
|
va_start(arguments,format); |
728 |
|
|
vfprintf(stderr,format,arguments); |
729 |
|
|
va_end(arguments); |
730 |
|
|
exit(1); |
731 |
|
|
} |
732 |
|
|
|
733 |
|
|
string getUnpackCmd(const string& fname, bool use_all_cpus) { |
734 |
gpertea |
32 |
//prep_reads should use guess_packer() instead |
735 |
gpertea |
135 |
//otherwise compressed files MUST have the .z extension, |
736 |
|
|
//as they are all internally generated |
737 |
gpertea |
29 |
string pipecmd(""); |
738 |
gpertea |
32 |
string fext=getFext(fname); |
739 |
|
|
if (fext=="bam") { |
740 |
|
|
pipecmd="bam2fastx"; |
741 |
|
|
return pipecmd; |
742 |
|
|
} |
743 |
|
|
if (zpacker.empty() || fext!="z") { |
744 |
gpertea |
135 |
return pipecmd; //no packer used |
745 |
gpertea |
29 |
} |
746 |
|
|
pipecmd=zpacker; |
747 |
|
|
if (str_endsWith(pipecmd, "pigz") ||str_endsWith(pipecmd, "pbzip2")) { |
748 |
|
|
if (use_all_cpus==false) pipecmd.append(" -p1"); |
749 |
gpertea |
154 |
else if (num_threads>1) { |
750 |
gpertea |
29 |
pipecmd.append(" -p"); |
751 |
gpertea |
154 |
str_appendInt(pipecmd,num_threads); |
752 |
gpertea |
29 |
} |
753 |
|
|
} |
754 |
|
|
if (!pipecmd.empty()) pipecmd.append(" -cd"); |
755 |
|
|
return pipecmd; |
756 |
|
|
} |
757 |
|
|
|
758 |
|
|
void checkSamHeader() { |
759 |
|
|
if (sam_header.empty()) |
760 |
|
|
err_die("Error: writeSamHeader() with empty sam_header string\n"); |
761 |
|
|
//copy the SAM header |
762 |
|
|
FILE* fh=fopen(sam_header.c_str(), "r"); |
763 |
|
|
if (fh==NULL) |
764 |
|
|
err_die("Error: cannot open SAM header file %s\n",sam_header.c_str()); |
765 |
|
|
fclose(fh); |
766 |
|
|
} |
767 |
|
|
|
768 |
|
|
void writeSamHeader(FILE* fout) { |
769 |
|
|
if (fout==NULL) |
770 |
|
|
err_die("Error: writeSamHeader(NULL)\n"); |
771 |
|
|
checkSamHeader(); |
772 |
|
|
//copy the SAM header |
773 |
|
|
FILE* fh=fopen(sam_header.c_str(), "r"); |
774 |
|
|
int ch=-1; |
775 |
|
|
while ((ch=fgetc(fh))!=EOF) { |
776 |
|
|
if (fputc(ch, fout)==EOF) |
777 |
|
|
err_die("Error copying SAM header\n"); |
778 |
|
|
} |
779 |
|
|
fclose(fh); |
780 |
|
|
} |
781 |
|
|
|
782 |
|
|
//auxiliary functions for BAM record handling |
783 |
|
|
uint8_t* realloc_bdata(bam1_t *b, int size) { |
784 |
|
|
if (b->m_data < size) { |
785 |
|
|
b->m_data = size; |
786 |
|
|
kroundup32(b->m_data); |
787 |
|
|
b->data = (uint8_t*)realloc(b->data, b->m_data); |
788 |
|
|
} |
789 |
|
|
if (b->data_len<size) b->data_len=size; |
790 |
|
|
return b->data; |
791 |
|
|
} |
792 |
|
|
|
793 |
|
|
uint8_t* dupalloc_bdata(bam1_t *b, int size) { |
794 |
|
|
//same as realloc_bdata, but does not free previous data |
795 |
|
|
//but returns it instead |
796 |
|
|
//it ALWAYS duplicates data |
797 |
|
|
b->m_data = size; |
798 |
|
|
kroundup32(b->m_data); |
799 |
|
|
uint8_t* odata=b->data; |
800 |
|
|
b->data = (uint8_t*)malloc(b->m_data); |
801 |
|
|
memcpy((void*)b->data, (void*)odata, b->data_len); |
802 |
|
|
b->data_len=size; |
803 |
|
|
return odata; //user must FREE this after |
804 |
|
|
} |
805 |
|
|
|
806 |
|
|
extern unsigned short bam_char2flag_table[]; |
807 |
|
|
|
808 |
|
|
GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, |
809 |
|
|
int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) { |
810 |
|
|
novel=true; |
811 |
|
|
b=bam_init1(); |
812 |
|
|
b->core.tid=gseq_tid; |
813 |
|
|
if (pos<=0) { |
814 |
|
|
b->core.pos=-1; //unmapped |
815 |
|
|
//if (gseq_tid<0) |
816 |
|
|
b->core.flag |= BAM_FUNMAP; |
817 |
|
|
} |
818 |
|
|
else b->core.pos=pos-1; //BAM is 0-based |
819 |
|
|
b->core.qual=255; |
820 |
|
|
int l_qseq=strlen(qseq); |
821 |
|
|
//this may not be accurate, setting CIGAR is the correct way |
822 |
|
|
//b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); |
823 |
|
|
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
824 |
|
|
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
825 |
|
|
set_cigar(cigar); //this will also set core.bin |
826 |
|
|
add_sequence(qseq, l_qseq); |
827 |
|
|
add_quals(quals); //quals must be given as Phred33 |
828 |
|
|
if (reverse) { b->core.flag |= BAM_FREVERSE ; } |
829 |
|
|
} |
830 |
|
|
|
831 |
|
|
GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
832 |
|
|
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
833 |
|
|
int insert_size, const char* qseq, const char* quals, |
834 |
|
|
const vector<string>* aux_strings) { |
835 |
|
|
novel=true; |
836 |
|
|
b=bam_init1(); |
837 |
|
|
b->core.tid=g_tid; |
838 |
|
|
b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based |
839 |
|
|
b->core.qual=map_qual; |
840 |
|
|
int l_qseq=strlen(qseq); |
841 |
|
|
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
842 |
|
|
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
843 |
|
|
set_cigar(cigar); //this will also set core.bin |
844 |
|
|
add_sequence(qseq, l_qseq); |
845 |
|
|
add_quals(quals); //quals must be given as Phred33 |
846 |
|
|
set_flags(flags); |
847 |
|
|
set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); |
848 |
|
|
if (aux_strings!=NULL) { |
849 |
|
|
for (vector<string>::const_iterator itr=aux_strings->begin(); |
850 |
|
|
itr!=aux_strings->end(); ++itr) { |
851 |
|
|
add_aux(itr->c_str()); |
852 |
|
|
} |
853 |
|
|
} |
854 |
|
|
} |
855 |
|
|
void GBamRecord::set_cigar(const char* cigar) { |
856 |
|
|
//requires b->core.pos and b->core.flag to have been set properly PRIOR to this call |
857 |
|
|
int doff=b->core.l_qname; |
858 |
|
|
uint8_t* after_cigar=NULL; |
859 |
|
|
int after_cigar_len=0; |
860 |
|
|
uint8_t* prev_bdata=NULL; |
861 |
|
|
if (b->data_len>doff) { |
862 |
|
|
//cigar string already allocated, replace it |
863 |
|
|
int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data |
864 |
|
|
after_cigar=b->data+d; |
865 |
|
|
after_cigar_len=b->data_len-d; |
866 |
|
|
} |
867 |
|
|
const char *s; |
868 |
|
|
char *t; |
869 |
|
|
int i, op; |
870 |
|
|
long x; |
871 |
|
|
b->core.n_cigar = 0; |
872 |
|
|
if (cigar != NULL && strcmp(cigar, "*") != 0) { |
873 |
|
|
for (s = cigar; *s; ++s) { |
874 |
|
|
if (isalpha(*s)) b->core.n_cigar++; |
875 |
|
|
else if (!isdigit(*s)) { |
876 |
|
|
err_die("Error: invalid CIGAR character (%s)\n",cigar); |
877 |
|
|
} |
878 |
|
|
} |
879 |
|
|
if (after_cigar_len>0) { //replace/insert into existing full data |
880 |
|
|
prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); |
881 |
|
|
memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); |
882 |
|
|
free(prev_bdata); |
883 |
|
|
} |
884 |
|
|
else { |
885 |
|
|
realloc_bdata(b, doff + b->core.n_cigar * 4); |
886 |
|
|
} |
887 |
gpertea |
154 |
for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) { |
888 |
gpertea |
29 |
x = strtol(s, &t, 10); |
889 |
|
|
op = toupper(*t); |
890 |
|
|
if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; |
891 |
|
|
else if (op == 'I') op = BAM_CINS; |
892 |
|
|
else if (op == 'D') op = BAM_CDEL; |
893 |
|
|
else if (op == 'N') op = BAM_CREF_SKIP; |
894 |
|
|
else if (op == 'S') op = BAM_CSOFT_CLIP; |
895 |
|
|
else if (op == 'H') op = BAM_CHARD_CLIP; |
896 |
|
|
else if (op == 'P') op = BAM_CPAD; |
897 |
|
|
else err_die("Error: invalid CIGAR operation (%s)\n",cigar); |
898 |
|
|
s = t + 1; |
899 |
|
|
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; |
900 |
|
|
} |
901 |
|
|
if (*s) err_die("Error: unmatched CIGAR operation (%s)\n",cigar); |
902 |
|
|
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); |
903 |
|
|
} else {//no CIGAR string given |
904 |
|
|
if (!(b->core.flag&BAM_FUNMAP)) { |
905 |
|
|
fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); |
906 |
|
|
b->core.flag |= BAM_FUNMAP; |
907 |
|
|
} |
908 |
|
|
b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); |
909 |
|
|
} |
910 |
|
|
} //set_cigar() |
911 |
|
|
|
912 |
|
|
void GBamRecord::add_sequence(const char* qseq, int slen) { |
913 |
|
|
//must be called AFTER set_cigar (cannot replace existing sequence for now) |
914 |
|
|
if (qseq==NULL) return; //should we ever care about this? |
915 |
|
|
if (slen<0) slen=strlen(qseq); |
916 |
|
|
int doff = b->core.l_qname + b->core.n_cigar * 4; |
917 |
|
|
if (strcmp(qseq, "*")!=0) { |
918 |
|
|
b->core.l_qseq=slen; |
919 |
|
|
if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) |
920 |
|
|
err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n", |
921 |
|
|
qseq); |
922 |
|
|
uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; |
923 |
|
|
//also allocated quals memory |
924 |
|
|
memset(p, 0, (b->core.l_qseq+1)/2); |
925 |
|
|
for (int i = 0; i < b->core.l_qseq; ++i) |
926 |
|
|
p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); |
927 |
|
|
} else b->core.l_qseq = 0; |
928 |
|
|
} |
929 |
|
|
|
930 |
|
|
void GBamRecord::add_quals(const char* quals) { |
931 |
|
|
//requires core.l_qseq already set |
932 |
|
|
//and must be called AFTER add_sequence(), which also allocates the memory for quals |
933 |
|
|
uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); |
934 |
|
|
if (quals==NULL || strcmp(quals, "*") == 0) { |
935 |
|
|
for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; |
936 |
|
|
return; |
937 |
|
|
} |
938 |
|
|
for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; |
939 |
|
|
} |
940 |
|
|
|
941 |
|
|
void GBamRecord::add_aux(const char* str) { |
942 |
gpertea |
154 |
// daehwan - this is not thread-safe - I made these varaiables as class members |
943 |
gpertea |
29 |
//requires: being called AFTER add_quals() |
944 |
gpertea |
154 |
// static char tag[2]; |
945 |
|
|
// static uint8_t abuf[512]; |
946 |
gpertea |
29 |
//requires: being called AFTER add_quals() |
947 |
|
|
int strl=strlen(str); |
948 |
|
|
//int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; |
949 |
|
|
//int doff0=doff; |
950 |
|
|
if (strl < 6 || str[2] != ':' || str[4] != ':') |
951 |
|
|
parse_error("missing colon in auxiliary data"); |
952 |
|
|
tag[0] = str[0]; tag[1] = str[1]; |
953 |
|
|
uint8_t atype = str[3]; |
954 |
|
|
uint8_t* adata=abuf; |
955 |
|
|
int alen=0; |
956 |
|
|
if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility |
957 |
|
|
atype='A'; |
958 |
|
|
alen=1; |
959 |
|
|
adata=(uint8_t*)&str[5]; |
960 |
|
|
} |
961 |
|
|
else if (atype == 'I' || atype == 'i') { |
962 |
|
|
long long x=(long long)atoll(str + 5); |
963 |
|
|
if (x < 0) { |
964 |
|
|
if (x >= -127) { |
965 |
|
|
atype='c'; |
966 |
|
|
abuf[0] = (int8_t)x; |
967 |
|
|
alen=1; |
968 |
|
|
} |
969 |
|
|
else if (x >= -32767) { |
970 |
|
|
atype = 's'; |
971 |
|
|
*(int16_t*)abuf = (int16_t)x; |
972 |
|
|
alen=2; |
973 |
|
|
} |
974 |
|
|
else { |
975 |
|
|
atype='i'; |
976 |
|
|
*(int32_t*)abuf = (int32_t)x; |
977 |
|
|
alen=4; |
978 |
|
|
if (x < -2147483648ll) |
979 |
|
|
fprintf(stderr, "Parse warning: integer %lld is out of range.", |
980 |
|
|
x); |
981 |
|
|
} |
982 |
|
|
} else { //x >=0 |
983 |
|
|
if (x <= 255) { |
984 |
|
|
atype = 'C'; |
985 |
|
|
abuf[0] = (uint8_t)x; |
986 |
|
|
alen=1; |
987 |
|
|
} |
988 |
|
|
else if (x <= 65535) { |
989 |
|
|
atype='S'; |
990 |
|
|
*(uint16_t*)abuf = (uint16_t)x; |
991 |
|
|
alen=2; |
992 |
|
|
} |
993 |
|
|
else { |
994 |
|
|
atype='I'; |
995 |
|
|
*(uint32_t*)abuf = (uint32_t)x; |
996 |
|
|
alen=4; |
997 |
|
|
if (x > 4294967295ll) |
998 |
|
|
fprintf(stderr, "Parse warning: integer %lld is out of range.", |
999 |
|
|
x); |
1000 |
|
|
} |
1001 |
|
|
} |
1002 |
|
|
} //integer type |
1003 |
|
|
else if (atype == 'f') { |
1004 |
|
|
*(float*)abuf = (float)atof(str + 5); |
1005 |
|
|
alen = sizeof(float); |
1006 |
|
|
} |
1007 |
|
|
else if (atype == 'd') { //? |
1008 |
|
|
*(float*)abuf = (float)atof(str + 9); |
1009 |
|
|
alen=8; |
1010 |
|
|
} |
1011 |
|
|
else if (atype == 'Z' || atype == 'H') { |
1012 |
|
|
if (atype == 'H') { // check whether the hex string is valid |
1013 |
|
|
if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); |
1014 |
|
|
for (int i = 0; i < strl - 5; ++i) { |
1015 |
|
|
int c = toupper(str[5 + i]); |
1016 |
|
|
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
1017 |
|
|
parse_error("invalid hex character"); |
1018 |
|
|
} |
1019 |
|
|
} |
1020 |
|
|
memcpy(abuf, str + 5, strl - 5); |
1021 |
|
|
abuf[strl-5] = 0; |
1022 |
|
|
alen=strl-4; |
1023 |
|
|
} else parse_error("unrecognized aux type"); |
1024 |
|
|
this->add_aux(tag, atype, alen, adata); |
1025 |
|
|
}//add_aux() |