1 |
#include "GArgs.h" |
2 |
#include "GStr.h" |
3 |
#include "GHash.hh" |
4 |
#include "GList.hh" |
5 |
#include <ctype.h> |
6 |
#include "GAlnExtend.h" |
7 |
|
8 |
#define USAGE "Usage:\n\ |
9 |
fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\ |
10 |
[-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\ |
11 |
[-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\ |
12 |
<input.fq>[,<input_mates.fq>\n\ |
13 |
\n\ |
14 |
Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\ |
15 |
for low complexity and collapse duplicate reads.\n\ |
16 |
If read pairs should be trimmed and kept together (i.e. without discarding\n\ |
17 |
one read in a pair), the two file names should be given delimited by a comma\n\ |
18 |
or a colon character\n\ |
19 |
\n\ |
20 |
Options:\n\ |
21 |
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
22 |
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
23 |
the read duplication count\n\ |
24 |
-o unless this parameter is '-', write the trimmed/filtered reads to \n\ |
25 |
file(s) named <input>.<outsuffix> which will be created in the \n\ |
26 |
current (working) directory; (writes to stdout if -o- is given);\n\ |
27 |
a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\ |
28 |
-f file with adaptor sequences to trim, each line having this format:\n\ |
29 |
<5'-adaptor-sequence> <3'-adaptor-sequence>\n\ |
30 |
-5 trim the given adaptor or primer sequence at the 5' end of each read\n\ |
31 |
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
32 |
-3 trim the given adaptor sequence at the 3' end of each read\n\ |
33 |
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
34 |
-A disable polyA/T trimming (enabled by default)\n\ |
35 |
-y minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
36 |
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
37 |
-t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\ |
38 |
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
39 |
-l minimum \"clean\" length after trimming that a read must have\n\ |
40 |
in order to pass the filter (default: 16)\n\ |
41 |
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
42 |
one letter code for the reason why they were trashed\n\ |
43 |
-D apply a low-complexity (dust) filter and discard any read that has over\n\ |
44 |
50% of its length detected as low complexity\n\ |
45 |
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
46 |
name (e.g. for microRNAs)\n\ |
47 |
-p input is phred64/phred33 (use -p64 or -p33)\n\ |
48 |
-Q convert quality values to the other Phred qv type\n\ |
49 |
-V verbose processing\n\ |
50 |
" |
51 |
|
52 |
//-z for -o option, the output stream(s) will be first piped into the given\n |
53 |
// <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n |
54 |
|
55 |
|
56 |
// example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG |
57 |
|
58 |
//For paired reads sequencing: |
59 |
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
60 |
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
61 |
//FILE* f_out=NULL; //stdout if not provided |
62 |
//FILE* f_out2=NULL; //for paired reads |
63 |
//FILE* f_in=NULL; //input fastq (stdin if not provided) |
64 |
//FILE* f_in2=NULL; //for paired reads |
65 |
|
66 |
FILE* freport=NULL; |
67 |
|
68 |
bool debug=false; |
69 |
bool verbose=false; |
70 |
bool doCollapse=false; |
71 |
bool doDust=false; |
72 |
bool doPolyTrim=true; |
73 |
bool fastaOutput=false; |
74 |
bool trashReport=false; |
75 |
bool revCompl=false; //also reverse complement adaptor sequences |
76 |
int min_read_len=16; |
77 |
double max_perc_N=7.0; |
78 |
int dust_cutoff=16; |
79 |
bool isfasta=false; |
80 |
bool convert_phred=false; |
81 |
GStr outsuffix; // -o |
82 |
GStr prefix; |
83 |
GStr zcmd; |
84 |
int num_trimmed5=0; |
85 |
int num_trimmed3=0; |
86 |
int num_trimmedN=0; |
87 |
int num_trimmedQ=0; |
88 |
int min_trimmed5=INT_MAX; |
89 |
int min_trimmed3=INT_MAX; |
90 |
|
91 |
int qvtrim_qmin=0; |
92 |
int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases |
93 |
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
94 |
int qv_cvtadd=0; //could be -31 or +31 |
95 |
|
96 |
// adaptor matching metrics -- for X-drop ungapped extension |
97 |
//const int match_reward=2; |
98 |
//const int mismatch_penalty=3; |
99 |
const int match_reward=2; |
100 |
const int mismatch_penalty=4; |
101 |
const int Xdrop=10; |
102 |
|
103 |
const int poly_m_score=2; //match score |
104 |
const int poly_mis_score=-3; //mismatch |
105 |
const int poly_dropoff_score=7; |
106 |
int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed |
107 |
|
108 |
const char *polyA_seed="AAAA"; |
109 |
const char *polyT_seed="TTTT"; |
110 |
|
111 |
struct CASeqData { |
112 |
//positional data for every possible hexamer in an adaptor |
113 |
GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence |
114 |
GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence |
115 |
GStr seq; //actual adaptor sequence data |
116 |
GStr seqr; //reverse complement sequence |
117 |
int amlen; //fraction of adaptor length matching that's |
118 |
//enough to consider the alignment |
119 |
GAlnTrimType trim_type; |
120 |
bool use_reverse; |
121 |
CASeqData(bool rev=false):seq(),seqr(), |
122 |
amlen(0), use_reverse(rev) { |
123 |
trim_type=galn_None; //should be updated later! |
124 |
for (int i=0;i<4096;i++) { |
125 |
pz[i]=NULL; |
126 |
pzr[i]=NULL; |
127 |
} |
128 |
} |
129 |
|
130 |
void update(const char* s) { |
131 |
seq=s; |
132 |
table6mers(seq.chars(), seq.length(), pz); |
133 |
amlen=iround(double(seq.length())*0.8); |
134 |
if (amlen<12) |
135 |
amlen=12; |
136 |
if (!use_reverse) return; |
137 |
//reverse complement |
138 |
seqr=s; |
139 |
int slen=seq.length(); |
140 |
for (int i=0;i<slen;i++) |
141 |
seqr[i]=ntComplement(seq[slen-i-1]); |
142 |
table6mers(seqr.chars(), seqr.length(), pzr); |
143 |
} |
144 |
|
145 |
~CASeqData() { |
146 |
for (int i=0;i<4096;i++) { |
147 |
delete pz[i]; |
148 |
delete pzr[i]; |
149 |
} |
150 |
} |
151 |
}; |
152 |
|
153 |
GVec<CASeqData> adaptors5; |
154 |
GVec<CASeqData> adaptors3; |
155 |
|
156 |
CGreedyAlignData* gxmem_l=NULL; |
157 |
CGreedyAlignData* gxmem_r=NULL; |
158 |
|
159 |
// element in dhash: |
160 |
class FqDupRec { |
161 |
public: |
162 |
int count; //how many of these reads are the same |
163 |
int len; //length of qv |
164 |
char* firstname; //optional, only if we want to keep the original read names |
165 |
char* qv; |
166 |
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
167 |
len=0; |
168 |
qv=NULL; |
169 |
firstname=NULL; |
170 |
count=0; |
171 |
if (qstr!=NULL) { |
172 |
qv=Gstrdup(qstr->chars()); |
173 |
len=qstr->length(); |
174 |
count++; |
175 |
} |
176 |
if (rname!=NULL) firstname=Gstrdup(rname); |
177 |
} |
178 |
~FqDupRec() { |
179 |
GFREE(qv); |
180 |
GFREE(firstname); |
181 |
} |
182 |
void add(GStr& d) { //collapse another record into this one |
183 |
if (d.length()!=len) |
184 |
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
185 |
count++; |
186 |
for (int i=0;i<len;i++) |
187 |
qv[i]=(qv[i]+d[i])/2; |
188 |
} |
189 |
}; |
190 |
|
191 |
void openfw(FILE* &f, GArgs& args, char opt) { |
192 |
GStr s=args.getOpt(opt); |
193 |
if (!s.is_empty()) { |
194 |
if (s=='-') f=stdout; |
195 |
else { |
196 |
f=fopen(s.chars(),"w"); |
197 |
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
198 |
} |
199 |
} |
200 |
} |
201 |
|
202 |
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
203 |
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
204 |
|
205 |
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
206 |
|
207 |
void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type); |
208 |
int loadAdaptors(const char* fname); |
209 |
|
210 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
211 |
GStr& s, GStr& infname, GStr& infname2); |
212 |
// uses outsuffix to generate output file names and open file handles as needed |
213 |
|
214 |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter); |
215 |
void trash_report(char trashcode, GStr& rname, FILE* freport); |
216 |
|
217 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
218 |
GStr& rname, GStr& rinfo, GStr& infname); |
219 |
|
220 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
221 |
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
222 |
|
223 |
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
224 |
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
225 |
int dust(GStr& seq); |
226 |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured |
227 |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed); |
228 |
bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
229 |
bool trim_adaptor3(GStr& seq, int &l5, int &l3); |
230 |
|
231 |
void convertPhred(char* q, int len); |
232 |
void convertPhred(GStr& q); |
233 |
|
234 |
int main(int argc, char * const argv[]) { |
235 |
GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:"); |
236 |
int e; |
237 |
if ((e=args.isError())>0) { |
238 |
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
239 |
exit(224); |
240 |
} |
241 |
debug=(args.getOpt('Y')!=NULL); |
242 |
verbose=(args.getOpt('V')!=NULL); |
243 |
convert_phred=(args.getOpt('Q')!=NULL); |
244 |
doCollapse=(args.getOpt('C')!=NULL); |
245 |
doDust=(args.getOpt('D')!=NULL); |
246 |
revCompl=(args.getOpt('R')!=NULL); |
247 |
if (args.getOpt('A')) doPolyTrim=false; |
248 |
/* |
249 |
rawFormat=(args.getOpt('R')!=NULL); |
250 |
if (rawFormat) { |
251 |
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
252 |
} |
253 |
*/ |
254 |
prefix=args.getOpt('n'); |
255 |
GStr s=args.getOpt('l'); |
256 |
if (!s.is_empty()) |
257 |
min_read_len=s.asInt(); |
258 |
s=args.getOpt('m'); |
259 |
if (!s.is_empty()) |
260 |
max_perc_N=s.asDouble(); |
261 |
s=args.getOpt('d'); |
262 |
if (!s.is_empty()) { |
263 |
dust_cutoff=s.asInt(); |
264 |
doDust=true; |
265 |
} |
266 |
s=args.getOpt('q'); |
267 |
if (!s.is_empty()) { |
268 |
qvtrim_qmin=s.asInt(); |
269 |
} |
270 |
s=args.getOpt('t'); |
271 |
if (!s.is_empty()) { |
272 |
qvtrim_max=s.asInt(); |
273 |
} |
274 |
s=args.getOpt('p'); |
275 |
if (!s.is_empty()) { |
276 |
int v=s.asInt(); |
277 |
if (v==33) { |
278 |
qv_phredtype=33; |
279 |
qv_cvtadd=31; |
280 |
} |
281 |
else if (v==64) { |
282 |
qv_phredtype=64; |
283 |
qv_cvtadd=-31; |
284 |
} |
285 |
else |
286 |
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
287 |
} |
288 |
s=args.getOpt('f'); |
289 |
if (!s.is_empty()) { |
290 |
loadAdaptors(s.chars()); |
291 |
} |
292 |
bool fileAdaptors=adaptors5.Count()+adaptors3.Count(); |
293 |
s=args.getOpt('5'); |
294 |
if (!s.is_empty()) { |
295 |
if (fileAdaptors) |
296 |
GError("Error: options -5 and -f cannot be used together!\n"); |
297 |
s.upper(); |
298 |
addAdaptor(adaptors5, s, galn_TrimLeft); |
299 |
} |
300 |
s=args.getOpt('3'); |
301 |
if (!s.is_empty()) { |
302 |
if (fileAdaptors) |
303 |
GError("Error: options -3 and -f cannot be used together!\n"); |
304 |
s.upper(); |
305 |
addAdaptor(adaptors3, s, galn_TrimRight); |
306 |
} |
307 |
s=args.getOpt('y'); |
308 |
if (!s.is_empty()) { |
309 |
int minmatch=s.asInt(); |
310 |
poly_minScore=minmatch*poly_m_score; |
311 |
} |
312 |
|
313 |
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
314 |
else outsuffix="-"; |
315 |
trashReport= (args.getOpt('r')!=NULL); |
316 |
int fcount=args.startNonOpt(); |
317 |
if (fcount==0) { |
318 |
GMessage(USAGE); |
319 |
exit(224); |
320 |
} |
321 |
if (fcount>1 && doCollapse) { |
322 |
GError("%s Sorry, the -C option only works with a single input.\n", USAGE); |
323 |
} |
324 |
//openfw(f_out, args, 'o'); |
325 |
//if (f_out==NULL) f_out=stdout; |
326 |
if (trashReport) |
327 |
openfw(freport, args, 'r'); |
328 |
char* infile=NULL; |
329 |
|
330 |
if (adaptors5.Count()>0) |
331 |
//gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2); |
332 |
gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop); |
333 |
if (adaptors3.Count()>0) |
334 |
gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop); |
335 |
|
336 |
while ((infile=args.nextNonOpt())!=NULL) { |
337 |
//for each input file |
338 |
int incounter=0; //counter for input reads |
339 |
int outcounter=0; //counter for output reads |
340 |
int trash_s=0; //too short from the get go |
341 |
int trash_Q=0; |
342 |
int trash_N=0; |
343 |
int trash_D=0; |
344 |
int trash_poly=0; |
345 |
int trash_A3=0; |
346 |
int trash_A5=0; |
347 |
s=infile; |
348 |
GStr infname; |
349 |
GStr infname2; |
350 |
FILE* f_in=NULL; |
351 |
FILE* f_in2=NULL; |
352 |
FILE* f_out=NULL; |
353 |
FILE* f_out2=NULL; |
354 |
bool paired_reads=false; |
355 |
setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2); |
356 |
GLineReader fq(f_in); |
357 |
GLineReader* fq2=NULL; |
358 |
if (f_in2!=NULL) { |
359 |
fq2=new GLineReader(f_in2); |
360 |
paired_reads=true; |
361 |
} |
362 |
GStr rseq, rqv, seqid, seqinfo; |
363 |
GStr rseq2, rqv2, seqid2, seqinfo2; |
364 |
while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) { |
365 |
incounter++; |
366 |
int a5=0, a3=0, b5=0, b3=0; |
367 |
char tcode=0, tcode2=0; |
368 |
tcode=process_read(seqid, rseq, rqv, a5, a3); |
369 |
if (fq2!=NULL) { |
370 |
getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2); |
371 |
if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) { |
372 |
GError("Error: no paired match for read %s vs %s (%s,%s)\n", |
373 |
seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars()); |
374 |
} |
375 |
tcode2=process_read(seqid2, rseq2, rqv2, b5, b3); |
376 |
//decide what to do with this pair and print rseq2 if the pair makes it |
377 |
if (tcode>1 && tcode2<=1) { |
378 |
//"untrash" rseq |
379 |
if (a3-a5+1<min_read_len) { |
380 |
a5=1; |
381 |
if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); } |
382 |
} |
383 |
tcode=1; |
384 |
} |
385 |
else if (tcode<=1 && tcode2>1) { |
386 |
//"untrash" rseq2 |
387 |
if (b3-b5+1<min_read_len) { |
388 |
b5=1; |
389 |
if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); } |
390 |
} |
391 |
tcode2=1; |
392 |
} |
393 |
if (tcode<=1) { //trimmed or left intact -- write it! |
394 |
if (tcode>0) { |
395 |
rseq2=rseq2.substr(b5,b3-b5+1); |
396 |
if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1); |
397 |
} |
398 |
int nocounter=0; |
399 |
writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter); |
400 |
} |
401 |
} //pair read |
402 |
if (tcode>1) { //trashed |
403 |
#ifdef GDEBUG |
404 |
GMessage(" !!!!TRASH code = %c\n",tcode); |
405 |
#endif |
406 |
if (tcode=='s') trash_s++; |
407 |
else if (tcode=='A' || tcode=='T') trash_poly++; |
408 |
else if (tcode=='Q') trash_Q++; |
409 |
else if (tcode=='N') trash_N++; |
410 |
else if (tcode=='D') trash_D++; |
411 |
else if (tcode=='3') trash_A3++; |
412 |
else if (tcode=='5') trash_A5++; |
413 |
if (trashReport) trash_report(tcode, seqid, freport); |
414 |
} |
415 |
else if (!doCollapse) { //write it |
416 |
if (tcode>0) { |
417 |
rseq=rseq.substr(a5,a3-a5+1); |
418 |
if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1); |
419 |
} |
420 |
#ifdef GDEBUG |
421 |
GMessage(" After trimming:\n"); |
422 |
GMessage("%s\n",rseq.chars()); |
423 |
#endif |
424 |
writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter); |
425 |
} |
426 |
} //for each fastq record |
427 |
delete fq2; |
428 |
FRCLOSE(f_in); |
429 |
FRCLOSE(f_in2); |
430 |
if (doCollapse) { |
431 |
outcounter=0; |
432 |
int maxdup_count=1; |
433 |
char* maxdup_seq=NULL; |
434 |
dhash.startIterate(); |
435 |
FqDupRec* qd=NULL; |
436 |
char* seq=NULL; |
437 |
while ((qd=dhash.NextData(seq))!=NULL) { |
438 |
GStr rseq(seq); |
439 |
//do the dusting here |
440 |
if (doDust) { |
441 |
int dustbases=dust(rseq); |
442 |
if (dustbases>(rseq.length()>>1)) { |
443 |
if (trashReport && qd->firstname!=NULL) { |
444 |
fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count); |
445 |
} |
446 |
trash_D+=qd->count; |
447 |
continue; |
448 |
} |
449 |
} |
450 |
outcounter++; |
451 |
if (qd->count>maxdup_count) { |
452 |
maxdup_count=qd->count; |
453 |
maxdup_seq=seq; |
454 |
} |
455 |
if (isfasta) { |
456 |
if (prefix.is_empty()) { |
457 |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
458 |
rseq.chars()); |
459 |
} |
460 |
else { //use custom read name |
461 |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
462 |
qd->count, rseq.chars()); |
463 |
} |
464 |
} |
465 |
else { //fastq format |
466 |
if (convert_phred) convertPhred(qd->qv, qd->len); |
467 |
if (prefix.is_empty()) { |
468 |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
469 |
rseq.chars(), qd->qv); |
470 |
} |
471 |
else { //use custom read name |
472 |
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
473 |
qd->count, rseq.chars(), qd->qv); |
474 |
} |
475 |
} |
476 |
}//for each element of dhash |
477 |
if (maxdup_count>1) { |
478 |
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
479 |
} |
480 |
} //collapse entries |
481 |
if (verbose) { |
482 |
if (paired_reads) { |
483 |
GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars()); |
484 |
GMessage("Number of input pairs :%9d\n", incounter); |
485 |
GMessage(" Output pairs :%9d\n", outcounter); |
486 |
} |
487 |
else { |
488 |
GMessage(">Input file : %s\n", infname.chars()); |
489 |
GMessage("Number of input reads :%9d\n", incounter); |
490 |
GMessage(" Output reads :%9d\n", outcounter); |
491 |
} |
492 |
GMessage("------------------------------------\n"); |
493 |
if (num_trimmed5) |
494 |
GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5); |
495 |
if (num_trimmed3) |
496 |
GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3); |
497 |
GMessage("------------------------------------\n"); |
498 |
if (trash_s>0) |
499 |
GMessage(" Trashed by length:%9d\n", trash_s); |
500 |
if (trash_N>0) |
501 |
GMessage(" Trashed by N%%:%9d\n", trash_N); |
502 |
if (trash_Q>0) |
503 |
GMessage("Trashed by low quality:%9d\n", trash_Q); |
504 |
if (trash_poly>0) |
505 |
GMessage(" Trashed by poly-A/T:%9d\n", trash_poly); |
506 |
if (trash_A5>0) |
507 |
GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5); |
508 |
if (trash_A3>0) |
509 |
GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3); |
510 |
} |
511 |
if (trashReport) { |
512 |
FWCLOSE(freport); |
513 |
} |
514 |
FWCLOSE(f_out); |
515 |
FWCLOSE(f_out2); |
516 |
} //while each input file |
517 |
delete gxmem_l; |
518 |
delete gxmem_r; |
519 |
//getc(stdin); |
520 |
} |
521 |
|
522 |
class NData { |
523 |
public: |
524 |
int NPos[1024]; //there should be no reads longer than 1K ? |
525 |
int NCount; |
526 |
int end5; |
527 |
int end3; |
528 |
int seqlen; |
529 |
double perc_N; //percentage of Ns in end5..end3 range only! |
530 |
const char* seq; |
531 |
bool valid; |
532 |
NData() { |
533 |
seqlen=0; |
534 |
NCount=0; |
535 |
end5=0; |
536 |
end3=0; |
537 |
seq=NULL; |
538 |
perc_N=0; |
539 |
valid=true; |
540 |
} |
541 |
void init(GStr& rseq) { |
542 |
NCount=0; |
543 |
perc_N=0; |
544 |
seqlen=rseq.length(); |
545 |
seq=rseq.chars(); |
546 |
end5=1; |
547 |
end3=seqlen; |
548 |
for (int i=0;i<seqlen;i++) |
549 |
if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
550 |
NPos[NCount]=i; |
551 |
NCount++; |
552 |
} |
553 |
perc_N=(NCount*100.0)/seqlen; |
554 |
valid=true; |
555 |
} |
556 |
void N_calc() { //only in the region after trimming |
557 |
int n=0; |
558 |
for (int i=end3-1;i<end5;i++) { |
559 |
if (seq[i]=='N') n++; |
560 |
} |
561 |
perc_N=(n*100.0)/(end5-end3+1); |
562 |
} |
563 |
}; |
564 |
|
565 |
static NData feat; |
566 |
int perc_lenN=12; // incremental distance from ends, in percentage of |
567 |
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
568 |
|
569 |
void N_analyze(int l5, int l3, int p5, int p3) { |
570 |
/* assumes feat was filled properly */ |
571 |
int old_dif, t5,t3,v; |
572 |
if (l3<l5+2 || p5>p3 ) { |
573 |
feat.end5=l5+1; |
574 |
feat.end3=l3+1; |
575 |
return; |
576 |
} |
577 |
|
578 |
t5=feat.NPos[p5]-l5; |
579 |
t3=l3-feat.NPos[p3]; |
580 |
old_dif=p3-p5; |
581 |
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
582 |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
583 |
if (t5 < v ) { |
584 |
l5=feat.NPos[p5]+1; |
585 |
p5++; |
586 |
} |
587 |
if (t3 < v) { |
588 |
l3=feat.NPos[p3]-1; |
589 |
p3--; |
590 |
} |
591 |
/* restNs=p3-p5; number of Ns in the new CLR */ |
592 |
if (p3-p5==old_dif) { /* no change, return */ |
593 |
/* don't trim if this may shorten the read too much */ |
594 |
//if (l5-l3<min_read_len) return; |
595 |
feat.end5=l5+1; |
596 |
feat.end3=l3+1; |
597 |
return; |
598 |
} |
599 |
else |
600 |
N_analyze(l5,l3, p5,p3); |
601 |
} |
602 |
|
603 |
|
604 |
bool qtrim(GStr& qvs, int &l5, int &l3) { |
605 |
if (qvtrim_qmin==0 || qvs.is_empty()) return false; |
606 |
if (qv_phredtype==0) { |
607 |
//try to guess the Phred type |
608 |
int vmin=256, vmax=0; |
609 |
for (int i=0;i<qvs.length();i++) { |
610 |
if (vmin>qvs[i]) vmin=qvs[i]; |
611 |
if (vmax<qvs[i]) vmax=qvs[i]; |
612 |
} |
613 |
if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; } |
614 |
if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; } |
615 |
if (qv_phredtype==0) { |
616 |
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
617 |
} |
618 |
if (verbose) |
619 |
GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64)); |
620 |
} //guessing Phred type |
621 |
for (l3=qvs.length()-1;l3>2;l3--) { |
622 |
if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break; |
623 |
} |
624 |
//just in case, check also the 5' the end (?) |
625 |
for (l5=0;l5<qvs.length()-3;l5++) { |
626 |
if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break; |
627 |
} |
628 |
if (qvtrim_max>0) { |
629 |
if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max; |
630 |
if (l5>qvtrim_max) l5=qvtrim_max; |
631 |
} |
632 |
return (l5>0 || l3<qvs.length()-1); |
633 |
} |
634 |
|
635 |
bool ntrim(GStr& rseq, int &l5, int &l3) { |
636 |
//count Ns in the sequence, trim N-rich ends |
637 |
feat.init(rseq); |
638 |
l5=feat.end5-1; |
639 |
l3=feat.end3-1; |
640 |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
641 |
if (l5==feat.end5-1 && l3==feat.end3-1) { |
642 |
if (feat.perc_N>max_perc_N) { |
643 |
feat.valid=false; |
644 |
l3=l5+1; |
645 |
return true; |
646 |
} |
647 |
else { |
648 |
return false; //nothing changed |
649 |
} |
650 |
} |
651 |
l5=feat.end5-1; |
652 |
l3=feat.end3-1; |
653 |
if (l3-l5+1<min_read_len) { |
654 |
feat.valid=false; |
655 |
return true; |
656 |
} |
657 |
feat.N_calc(); |
658 |
|
659 |
if (feat.perc_N>max_perc_N) { |
660 |
feat.valid=false; |
661 |
l3=l5+1; |
662 |
return true; |
663 |
} |
664 |
return true; |
665 |
} |
666 |
|
667 |
//--------------- dust functions ---------------- |
668 |
class DNADuster { |
669 |
public: |
670 |
int dustword; |
671 |
int dustwindow; |
672 |
int dustwindow2; |
673 |
int dustcutoff; |
674 |
int mv, iv, jv; |
675 |
int counts[32*32*32]; |
676 |
int iis[32*32*32]; |
677 |
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
678 |
dustword=wordsize; |
679 |
dustwindow=winsize; |
680 |
dustwindow2 = (winsize>>1); |
681 |
dustcutoff=cutoff; |
682 |
mv=0; |
683 |
iv=0; |
684 |
jv=0; |
685 |
} |
686 |
void setWindowSize(int value) { |
687 |
dustwindow = value; |
688 |
dustwindow2 = (dustwindow >> 1); |
689 |
} |
690 |
void setWordSize(int value) { |
691 |
dustword=value; |
692 |
} |
693 |
void wo1(int len, const char* s, int ivv) { |
694 |
int i, ii, j, v, t, n, n1, sum; |
695 |
int js, nis; |
696 |
n = 32 * 32 * 32; |
697 |
n1 = n - 1; |
698 |
nis = 0; |
699 |
i = 0; |
700 |
ii = 0; |
701 |
sum = 0; |
702 |
v = 0; |
703 |
for (j=0; j < len; j++, s++) { |
704 |
ii <<= 5; |
705 |
if (*s<=32) { |
706 |
i=0; |
707 |
continue; |
708 |
} |
709 |
ii |= *s - 'A'; //assume uppercase! |
710 |
ii &= n1; |
711 |
i++; |
712 |
if (i >= dustword) { |
713 |
for (js=0; js < nis && iis[js] != ii; js++) ; |
714 |
if (js == nis) { |
715 |
iis[nis] = ii; |
716 |
counts[ii] = 0; |
717 |
nis++; |
718 |
} |
719 |
if ((t = counts[ii]) > 0) { |
720 |
sum += t; |
721 |
v = 10 * sum / j; |
722 |
if (mv < v) { |
723 |
mv = v; |
724 |
iv = ivv; |
725 |
jv = j; |
726 |
} |
727 |
} |
728 |
counts[ii]++; |
729 |
} |
730 |
} |
731 |
} |
732 |
|
733 |
int wo(int len, const char* s, int* beg, int* end) { |
734 |
int i, l1; |
735 |
l1 = len - dustword + 1; |
736 |
if (l1 < 0) { |
737 |
*beg = 0; |
738 |
*end = len - 1; |
739 |
return 0; |
740 |
} |
741 |
mv = 0; |
742 |
iv = 0; |
743 |
jv = 0; |
744 |
for (i=0; i < l1; i++) { |
745 |
wo1(len-i, s+i, i); |
746 |
} |
747 |
*beg = iv; |
748 |
*end = iv + jv; |
749 |
return mv; |
750 |
} |
751 |
|
752 |
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
753 |
int i, j, l, a, b, v; |
754 |
if (cutoff==0) cutoff=dustcutoff; |
755 |
a=0;b=0; |
756 |
//GMessage("Dust cutoff=%d\n", cutoff); |
757 |
for (i=0; i < seqlen; i += dustwindow2) { |
758 |
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
759 |
v = wo(l, seq+i, &a, &b); |
760 |
if (v > cutoff) { |
761 |
//for (j = a; j <= b && j < dustwindow2; j++) { |
762 |
for (j = a; j <= b; j++) { |
763 |
seqmsk[i+j]='N';//could be made lowercase instead |
764 |
} |
765 |
} |
766 |
} |
767 |
//return first; |
768 |
} |
769 |
}; |
770 |
|
771 |
static DNADuster duster; |
772 |
|
773 |
int dust(GStr& rseq) { |
774 |
char* seq=Gstrdup(rseq.chars()); |
775 |
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
776 |
//check the number of Ns: |
777 |
int ncount=0; |
778 |
for (int i=0;i<rseq.length();i++) { |
779 |
if (seq[i]=='N') ncount++; |
780 |
} |
781 |
GFREE(seq); |
782 |
return ncount; |
783 |
} |
784 |
|
785 |
struct SLocScore { |
786 |
int pos; |
787 |
int score; |
788 |
SLocScore(int p=0,int s=0) { |
789 |
pos=p; |
790 |
score=s; |
791 |
} |
792 |
void set(int p, int s) { |
793 |
pos=p; |
794 |
score=s; |
795 |
} |
796 |
void add(int p, int add) { |
797 |
pos=p; |
798 |
score+=add; |
799 |
} |
800 |
}; |
801 |
|
802 |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
803 |
if (!doPolyTrim) return false; |
804 |
int rlen=seq.length(); |
805 |
l5=0; |
806 |
l3=rlen-1; |
807 |
int32 seedVal=*(int32*)poly_seed; |
808 |
char polyChar=poly_seed[0]; |
809 |
//assumes N trimming was already done |
810 |
//so a poly match should be very close to the end of the read |
811 |
// -- find the initial match (seed) |
812 |
int lmin=GMAX((rlen-16), 0); |
813 |
int li; |
814 |
for (li=rlen-4;li>lmin;li--) { |
815 |
if (seedVal==*(int*)&(seq[li])) { |
816 |
break; |
817 |
} |
818 |
} |
819 |
if (li<=lmin) return false; |
820 |
//seed found, try to extend it both ways |
821 |
//extend right |
822 |
int ri=li+3; |
823 |
SLocScore loc(ri, poly_m_score<<2); |
824 |
SLocScore maxloc(loc); |
825 |
//extend right |
826 |
while (ri<rlen-1) { |
827 |
ri++; |
828 |
if (seq[ri]==polyChar) { |
829 |
loc.add(ri,poly_m_score); |
830 |
} |
831 |
else if (seq[ri]=='N') { |
832 |
loc.add(ri,0); |
833 |
} |
834 |
else { //mismatch |
835 |
loc.add(ri,poly_mis_score); |
836 |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
837 |
} |
838 |
if (maxloc.score<=loc.score) { |
839 |
maxloc=loc; |
840 |
} |
841 |
} |
842 |
ri=maxloc.pos; |
843 |
if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end |
844 |
//ri = right boundary for the poly match |
845 |
//extend left |
846 |
loc.set(li, maxloc.score); |
847 |
maxloc.pos=li; |
848 |
while (li>0) { |
849 |
li--; |
850 |
if (seq[li]==polyChar) { |
851 |
loc.add(li,poly_m_score); |
852 |
} |
853 |
else if (seq[li]=='N') { |
854 |
loc.add(li,0); |
855 |
} |
856 |
else { //mismatch |
857 |
loc.add(li,poly_mis_score); |
858 |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
859 |
} |
860 |
if (maxloc.score<=loc.score) { |
861 |
maxloc=loc; |
862 |
} |
863 |
} |
864 |
li=maxloc.pos; |
865 |
if ((maxloc.score==poly_minScore && ri==rlen-1) || |
866 |
(maxloc.score>poly_minScore && ri>=rlen-3) || |
867 |
(maxloc.score>(poly_minScore*3) && ri>=rlen-8)) { |
868 |
//trimming this li-ri match at 3' end |
869 |
l3=li-1; |
870 |
if (l3<0) l3=0; |
871 |
return true; |
872 |
} |
873 |
return false; |
874 |
} |
875 |
|
876 |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
877 |
if (!doPolyTrim) return false; |
878 |
int rlen=seq.length(); |
879 |
l5=0; |
880 |
l3=rlen-1; |
881 |
int32 seedVal=*(int32*)poly_seed; |
882 |
char polyChar=poly_seed[0]; |
883 |
//assumes N trimming was already done |
884 |
//so a poly match should be very close to the end of the read |
885 |
// -- find the initial match (seed) |
886 |
int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds |
887 |
int li; |
888 |
for (li=0;li<=lmax;li++) { |
889 |
if (seedVal==*(int*)&(seq[li])) { |
890 |
break; |
891 |
} |
892 |
} |
893 |
if (li>lmax) return false; |
894 |
//seed found, try to extend it both ways |
895 |
//extend left |
896 |
int ri=li+3; //save rightmost base of the seed |
897 |
SLocScore loc(li, poly_m_score<<2); |
898 |
SLocScore maxloc(loc); |
899 |
while (li>0) { |
900 |
li--; |
901 |
if (seq[li]==polyChar) { |
902 |
loc.add(li,poly_m_score); |
903 |
} |
904 |
else if (seq[li]=='N') { |
905 |
loc.add(li,0); |
906 |
} |
907 |
else { //mismatch |
908 |
loc.add(li,poly_mis_score); |
909 |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
910 |
} |
911 |
if (maxloc.score<=loc.score) { |
912 |
maxloc=loc; |
913 |
} |
914 |
} |
915 |
li=maxloc.pos; |
916 |
if (li>5) return false; //no trimming wanted, too far from 5' end |
917 |
//li = right boundary for the poly match |
918 |
|
919 |
//extend right |
920 |
loc.set(ri, maxloc.score); |
921 |
maxloc.pos=ri; |
922 |
while (ri<rlen-1) { |
923 |
ri++; |
924 |
if (seq[ri]==polyChar) { |
925 |
loc.add(ri,poly_m_score); |
926 |
} |
927 |
else if (seq[ri]=='N') { |
928 |
loc.add(ri,0); |
929 |
} |
930 |
else { //mismatch |
931 |
loc.add(ri,poly_mis_score); |
932 |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
933 |
} |
934 |
if (maxloc.score<=loc.score) { |
935 |
maxloc=loc; |
936 |
} |
937 |
} |
938 |
ri=maxloc.pos; |
939 |
if ((maxloc.score==poly_minScore && li==0) || |
940 |
(maxloc.score>poly_minScore && li<2) |
941 |
|| (maxloc.score>(poly_minScore*3) && li<8)) { |
942 |
//adjust l5 to reflect this trimming of 5' end |
943 |
l5=ri+1; |
944 |
if (l5>rlen-1) l5=rlen-1; |
945 |
return true; |
946 |
} |
947 |
return false; |
948 |
} |
949 |
|
950 |
bool trim_adaptor3(GStr& seq, int&l5, int &l3) { |
951 |
if (adaptors3.Count()==0) return false; |
952 |
int rlen=seq.length(); |
953 |
l5=0; |
954 |
l3=rlen-1; |
955 |
bool trimmed=false; |
956 |
GStr wseq(seq); |
957 |
int wlen=rlen; |
958 |
GXSeqData seqdata; |
959 |
int numruns=revCompl ? 2 : 1; |
960 |
GList<GXAlnInfo> bestalns(true, true, false); |
961 |
for (int ai=0;ai<adaptors3.Count();ai++) { |
962 |
for (int r=0;r<numruns;r++) { |
963 |
if (r) { |
964 |
seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(), |
965 |
adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen); |
966 |
} |
967 |
else { |
968 |
seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(), |
969 |
adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen); |
970 |
} |
971 |
GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86); |
972 |
if (aln) { |
973 |
if (aln->strong) { |
974 |
trimmed=true; |
975 |
bestalns.Add(aln); |
976 |
break; //will check the rest next time |
977 |
} |
978 |
else bestalns.Add(aln); |
979 |
} |
980 |
}//forward and reverse adaptors |
981 |
if (trimmed) break; //will check the rest in the next cycle |
982 |
}//for each 3' adaptor |
983 |
if (bestalns.Count()>0) { |
984 |
GXAlnInfo* aln=bestalns[0]; |
985 |
if (aln->sl-1 > wlen-aln->sr) { |
986 |
//keep left side |
987 |
l3-=(wlen-aln->sl+1); |
988 |
if (l3<0) l3=0; |
989 |
} |
990 |
else { //keep right side |
991 |
l5+=aln->sr; |
992 |
if (l5>=rlen) l5=rlen-1; |
993 |
} |
994 |
//delete aln; |
995 |
//if (l3-l5+1<min_read_len) return true; |
996 |
wseq=seq.substr(l5,l3-l5+1); |
997 |
wlen=wseq.length(); |
998 |
return true; //break the loops here to report a good find |
999 |
} |
1000 |
return false; |
1001 |
} |
1002 |
|
1003 |
bool trim_adaptor5(GStr& seq, int&l5, int &l3) { |
1004 |
if (adaptors5.Count()==0) return false; |
1005 |
int rlen=seq.length(); |
1006 |
l5=0; |
1007 |
l3=rlen-1; |
1008 |
bool trimmed=false; |
1009 |
GStr wseq(seq); |
1010 |
int wlen=rlen; |
1011 |
GXSeqData seqdata; |
1012 |
int numruns=revCompl ? 2 : 1; |
1013 |
GList<GXAlnInfo> bestalns(true, true, false); |
1014 |
for (int ai=0;ai<adaptors5.Count();ai++) { |
1015 |
for (int r=0;r<numruns;r++) { |
1016 |
if (r) { |
1017 |
seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(), |
1018 |
adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen); |
1019 |
} |
1020 |
else { |
1021 |
seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(), |
1022 |
adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen); |
1023 |
} |
1024 |
GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90); |
1025 |
if (aln) { |
1026 |
if (aln->strong) { |
1027 |
trimmed=true; |
1028 |
bestalns.Add(aln); |
1029 |
break; //will check the rest next time |
1030 |
} |
1031 |
else bestalns.Add(aln); |
1032 |
} |
1033 |
} //forward and reverse? |
1034 |
if (trimmed) break; //will check the rest in the next cycle |
1035 |
}//for each 5' adaptor |
1036 |
if (bestalns.Count()>0) { |
1037 |
GXAlnInfo* aln=bestalns[0]; |
1038 |
if (aln->sl-1 > wlen-aln->sr) { |
1039 |
//keep left side |
1040 |
l3-=(wlen-aln->sl+1); |
1041 |
if (l3<0) l3=0; |
1042 |
} |
1043 |
else { //keep right side |
1044 |
l5+=aln->sr; |
1045 |
if (l5>=rlen) l5=rlen-1; |
1046 |
} |
1047 |
//delete aln; |
1048 |
//if (l3-l5+1<min_read_len) return true; |
1049 |
wseq=seq.substr(l5,l3-l5+1); |
1050 |
wlen=wseq.length(); |
1051 |
return true; //break the loops here to report a good find |
1052 |
} |
1053 |
return false; |
1054 |
} |
1055 |
|
1056 |
//convert qvs to/from phred64 from/to phread33 |
1057 |
void convertPhred(GStr& q) { |
1058 |
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
1059 |
} |
1060 |
|
1061 |
void convertPhred(char* q, int len) { |
1062 |
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1063 |
} |
1064 |
|
1065 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1066 |
GStr& rname, GStr& rinfo, GStr& infname) { |
1067 |
rseq=""; |
1068 |
rqv=""; |
1069 |
rname=""; |
1070 |
rinfo=""; |
1071 |
if (fq.eof()) return false; |
1072 |
char* l=fq.getLine(); |
1073 |
while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines |
1074 |
if (l==NULL) return false; |
1075 |
/* if (rawFormat) { |
1076 |
//TODO: implement raw qseq parsing here |
1077 |
//if (raw type=N) then continue; //skip invalid/bad records |
1078 |
} //raw qseq format |
1079 |
else { // FASTQ or FASTA */ |
1080 |
isfasta=(l[0]=='>'); |
1081 |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1082 |
infname.chars(), l); |
1083 |
GStr s(l); |
1084 |
rname=&(l[1]); |
1085 |
for (int i=0;i<rname.length();i++) |
1086 |
if (rname[i]<=' ') { |
1087 |
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1088 |
rname.cut(i); |
1089 |
break; |
1090 |
} |
1091 |
//now get the sequence |
1092 |
if ((l=fq.getLine())==NULL) |
1093 |
GError("Error: unexpected EOF after header for read %s (%s)\n", |
1094 |
rname.chars(), infname.chars()); |
1095 |
rseq=l; //this must be the DNA line |
1096 |
while ((l=fq.getLine())!=NULL) { |
1097 |
//seq can span multiple lines |
1098 |
if (l[0]=='>' || l[0]=='+') { |
1099 |
fq.pushBack(); |
1100 |
break; // |
1101 |
} |
1102 |
rseq+=l; |
1103 |
} //check for multi-line seq |
1104 |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
1105 |
if ((l=fq.getLine())==NULL) |
1106 |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
1107 |
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
1108 |
if ((l=fq.getLine())==NULL) |
1109 |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
1110 |
rqv=l; |
1111 |
//if (rqv.length()!=rseq.length()) |
1112 |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
1113 |
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
1114 |
rqv+=l; //append to qv string |
1115 |
} |
1116 |
}// fastq |
1117 |
// } //<-- FASTA or FASTQ |
1118 |
rseq.upper(); |
1119 |
return true; |
1120 |
} |
1121 |
|
1122 |
#ifdef GDEBUG |
1123 |
void showTrim(GStr& s, int l5, int l3) { |
1124 |
if (l5>0 || l3==0) { |
1125 |
color_bg(c_red); |
1126 |
} |
1127 |
for (int i=0;i<s.length()-1;i++) { |
1128 |
if (i && i==l5) color_resetbg(); |
1129 |
fprintf(stderr, "%c", s[i]); |
1130 |
if (i && i==l3) color_bg(c_red); |
1131 |
} |
1132 |
fprintf(stderr, "%c", s[s.length()-1]); |
1133 |
color_reset(); |
1134 |
fprintf(stderr, "\n"); |
1135 |
} |
1136 |
#endif |
1137 |
|
1138 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) { |
1139 |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1140 |
// and a trash code if it was trashed |
1141 |
l5=0; |
1142 |
l3=rseq.length()-1; |
1143 |
#ifdef GDEBUG |
1144 |
//rseq.reverse(); |
1145 |
GMessage(">%s\n", rname.chars()); |
1146 |
GMessage("%s\n",rseq.chars()); |
1147 |
#endif |
1148 |
if (l3-l5+1<min_read_len) { |
1149 |
return 's'; |
1150 |
} |
1151 |
GStr wseq(rseq.chars()); |
1152 |
GStr wqv(rqv.chars()); |
1153 |
int w5=l5; |
1154 |
int w3=l3; |
1155 |
//first do the q-based trimming |
1156 |
if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming |
1157 |
if (w3-w5+1<min_read_len) { |
1158 |
return 'Q'; //invalid read |
1159 |
} |
1160 |
//-- keep only the w5..w3 range |
1161 |
l5=w5; |
1162 |
l3=w3; |
1163 |
wseq=wseq.substr(w5, w3-w5+1); |
1164 |
if (!wqv.is_empty()) |
1165 |
wqv=wqv.substr(w5, w3-w5+1); |
1166 |
} //qv trimming |
1167 |
// N-trimming on the remaining read seq |
1168 |
if (ntrim(wseq, w5, w3)) { |
1169 |
//GMessage("before: %s\n",wseq.chars()); |
1170 |
//GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1); |
1171 |
l5+=w5; |
1172 |
l3-=(wseq.length()-1-w3); |
1173 |
if (w3-w5+1<min_read_len) { |
1174 |
return 'N'; //to be trashed |
1175 |
} |
1176 |
//-- keep only the w5..w3 range |
1177 |
wseq=wseq.substr(w5, w3-w5+1); |
1178 |
if (!wqv.is_empty()) |
1179 |
wqv=wqv.substr(w5, w3-w5+1); |
1180 |
w5=0; |
1181 |
w3=wseq.length()-1; |
1182 |
} |
1183 |
char trim_code; |
1184 |
//clean the more dirty end first - 3' |
1185 |
int prev_w5=0; |
1186 |
int prev_w3=0; |
1187 |
bool w3upd=true; |
1188 |
bool w5upd=true; |
1189 |
do { |
1190 |
trim_code=0; |
1191 |
if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) { |
1192 |
trim_code='A'; |
1193 |
} |
1194 |
else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) { |
1195 |
trim_code='T'; |
1196 |
} |
1197 |
else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) { |
1198 |
trim_code='A'; |
1199 |
} |
1200 |
else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) { |
1201 |
trim_code='T'; |
1202 |
} |
1203 |
else if (trim_adaptor5(wseq, w5, w3)) { |
1204 |
trim_code='5'; |
1205 |
} |
1206 |
else if (trim_adaptor3(wseq, w5, w3)) { |
1207 |
trim_code='3'; |
1208 |
} |
1209 |
if (trim_code) { |
1210 |
w3upd=(w3!=prev_w3); |
1211 |
w5upd=(w5!=prev_w5); |
1212 |
if (w3upd) prev_w3=w3; |
1213 |
if (w5upd) prev_w5=w5; |
1214 |
#ifdef GDEBUG |
1215 |
GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3); |
1216 |
showTrim(wseq, w5, w3); |
1217 |
#endif |
1218 |
int trimlen=wseq.length()-(w3-w5+1); |
1219 |
num_trimmed3++; |
1220 |
if (trimlen<min_trimmed3) |
1221 |
min_trimmed3=trimlen; |
1222 |
l5+=w5; |
1223 |
l3-=(wseq.length()-1-w3); |
1224 |
if (w3-w5+1<min_read_len) { |
1225 |
return trim_code; |
1226 |
} |
1227 |
//-- keep only the w5..w3 range |
1228 |
wseq=wseq.substr(w5, w3-w5+1); |
1229 |
if (!wqv.is_empty()) |
1230 |
wqv=wqv.substr(w5, w3-w5+1); |
1231 |
}//trimming at 3' end |
1232 |
} while (trim_code); |
1233 |
|
1234 |
if (doCollapse) { |
1235 |
//keep read for later |
1236 |
FqDupRec* dr=dhash.Find(wseq.chars()); |
1237 |
if (dr==NULL) { //new entry |
1238 |
//if (prefix.is_empty()) |
1239 |
dhash.Add(wseq.chars(), |
1240 |
new FqDupRec(&wqv, rname.chars())); |
1241 |
//else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length())); |
1242 |
} |
1243 |
else |
1244 |
dr->add(wqv); |
1245 |
} //collapsing duplicates |
1246 |
else { //not collapsing duplicates |
1247 |
//apply the dust filter now |
1248 |
if (doDust) { |
1249 |
int dustbases=dust(wseq); |
1250 |
if (dustbases>(wseq.length()>>1)) { |
1251 |
return 'D'; |
1252 |
} |
1253 |
} |
1254 |
} //not collapsing duplicates |
1255 |
return (l5>0 || l3<rseq.length()-1) ? 1 : 0; |
1256 |
} |
1257 |
|
1258 |
void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) { |
1259 |
//GMessage("printing Header..%c%s\n",recmarker, rname.chars()); |
1260 |
if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars()); |
1261 |
else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars()); |
1262 |
} |
1263 |
|
1264 |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) { |
1265 |
outcounter++; |
1266 |
bool asFasta=(rqv.is_empty() || fastaOutput); |
1267 |
if (asFasta) { |
1268 |
if (prefix.is_empty()) { |
1269 |
printHeader(f_out, '>',rname,rinfo); |
1270 |
fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now |
1271 |
} |
1272 |
else { |
1273 |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1274 |
rseq.chars()); |
1275 |
} |
1276 |
} |
1277 |
else { //fastq |
1278 |
if (convert_phred) convertPhred(rqv); |
1279 |
if (prefix.is_empty()) { |
1280 |
printHeader(f_out, '@', rname, rinfo); |
1281 |
fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars()); |
1282 |
} |
1283 |
else |
1284 |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1285 |
rseq.chars(),rqv.chars() ); |
1286 |
} |
1287 |
} |
1288 |
|
1289 |
void trash_report(char trashcode, GStr& rname, FILE* freport) { |
1290 |
if (freport==NULL || trashcode<=' ') return; |
1291 |
if (trashcode=='3' || trashcode=='5') { |
1292 |
fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode); |
1293 |
} |
1294 |
else { |
1295 |
fprintf(freport, "%s\t%c\n",rname.chars(),trashcode); |
1296 |
} |
1297 |
//tcounter++; |
1298 |
} |
1299 |
|
1300 |
GStr getFext(GStr& s, int* xpos=NULL) { |
1301 |
//s must be a filename without a path |
1302 |
GStr r(""); |
1303 |
if (xpos!=NULL) *xpos=0; |
1304 |
if (s.is_empty() || s=="-") return r; |
1305 |
int p=s.rindex('.'); |
1306 |
int d=s.rindex('/'); |
1307 |
if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r; |
1308 |
r=s.substr(p+1); |
1309 |
if (xpos!=NULL) *xpos=p+1; |
1310 |
r.lower(); |
1311 |
return r; |
1312 |
} |
1313 |
|
1314 |
void baseFileName(GStr& fname) { |
1315 |
//remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) . |
1316 |
int xpos=0; |
1317 |
GStr fext=getFext(fname, &xpos); |
1318 |
if (xpos<=1) return; |
1319 |
bool extdel=false; |
1320 |
GStr f2; |
1321 |
if (fext=="z" || fext=="zip") { |
1322 |
extdel=true; |
1323 |
} |
1324 |
else if (fext.length()>=2) { |
1325 |
f2=fext.substr(0,2); |
1326 |
extdel=(f2=="gz" || f2=="bz"); |
1327 |
} |
1328 |
if (extdel) { |
1329 |
fname.cut(xpos-1); |
1330 |
fext=getFext(fname, &xpos); |
1331 |
if (xpos<=1) return; |
1332 |
} |
1333 |
extdel=false; |
1334 |
if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") { |
1335 |
extdel=true; |
1336 |
} |
1337 |
else if (fext.length()>=2) { |
1338 |
extdel=(fext.substr(0,2)=="fa"); |
1339 |
} |
1340 |
if (extdel) fname.cut(xpos-1); |
1341 |
GStr fncp(fname); |
1342 |
fncp.lower(); |
1343 |
fncp.chomp("seq"); |
1344 |
fncp.chomp("sequence"); |
1345 |
fncp.trimR("_."); |
1346 |
if (fncp.length()<fname.length()) fname.cut(fncp.length()); |
1347 |
} |
1348 |
|
1349 |
FILE* prepOutFile(GStr& infname, GStr& pocmd) { |
1350 |
FILE* f_out=NULL; |
1351 |
GStr fname(getFileName(infname.chars())); |
1352 |
//eliminate known extensions |
1353 |
baseFileName(fname); |
1354 |
if (outsuffix.is_empty() || outsuffix=="-") { return stdout; } |
1355 |
else if (pocmd.is_empty()) { |
1356 |
GStr oname(fname); |
1357 |
oname.append('.'); |
1358 |
oname.append(outsuffix); |
1359 |
f_out=fopen(oname.chars(),"w"); |
1360 |
if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars()); |
1361 |
} |
1362 |
else { |
1363 |
GStr oname(">"); |
1364 |
oname.append(fname); |
1365 |
oname.append('.'); |
1366 |
oname.append(outsuffix); |
1367 |
pocmd.append(oname); |
1368 |
f_out=popen(pocmd.chars(), "w"); |
1369 |
if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars()); |
1370 |
} |
1371 |
return f_out; |
1372 |
} |
1373 |
|
1374 |
void guess_unzip(GStr& fname, GStr& picmd) { |
1375 |
GStr fext=getFext(fname); |
1376 |
if (fext=="gz" || fext=="gzip" || fext=="z") { |
1377 |
picmd="gzip -cd "; |
1378 |
} |
1379 |
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
1380 |
picmd="bzip2 -cd "; |
1381 |
} |
1382 |
} |
1383 |
|
1384 |
void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) { |
1385 |
//TODO: prepare CASeqData here, and collect hexamers as well |
1386 |
if (seq.is_empty() || seq=="-" || |
1387 |
seq=="N/A" || seq==".") return; |
1388 |
|
1389 |
CASeqData adata(revCompl); |
1390 |
int idx=adaptors.Add(adata); |
1391 |
if (idx<0) GError("Error: failed to add adaptor!\n"); |
1392 |
adaptors[idx].trim_type=trim_type; |
1393 |
adaptors[idx].update(seq.chars()); |
1394 |
} |
1395 |
|
1396 |
|
1397 |
int loadAdaptors(const char* fname) { |
1398 |
GLineReader lr(fname); |
1399 |
char* l; |
1400 |
while ((l=lr.nextLine())!=NULL) { |
1401 |
if (lr.length()<=3 || l[0]=='#') continue; |
1402 |
if ( l[0]==' ' || l[0]=='\t' || l[0]==',' || |
1403 |
l[0]==';'|| l[0]==':' ) { |
1404 |
int i=1; |
1405 |
while (l[i]!=0 && isspace(l[i])) { |
1406 |
i++; |
1407 |
} |
1408 |
if (l[i]!=0) { |
1409 |
GStr s(&(l[i])); |
1410 |
#ifdef GDEBUG |
1411 |
//s.reverse(); |
1412 |
#endif |
1413 |
addAdaptor(adaptors3, s, galn_TrimRight); |
1414 |
continue; |
1415 |
} |
1416 |
} |
1417 |
else { |
1418 |
GStr s(l); |
1419 |
s.startTokenize("\t ;,:"); |
1420 |
GStr a5,a3; |
1421 |
if (s.nextToken(a5)) |
1422 |
s.nextToken(a3); |
1423 |
else continue; //no tokens on this line |
1424 |
GAlnTrimType ttype5=galn_TrimLeft; |
1425 |
a5.upper(); |
1426 |
a3.upper(); |
1427 |
if (a3.is_empty() || a3==a5 || a3=="=") { |
1428 |
a3.clear(); |
1429 |
ttype5=galn_TrimEither; |
1430 |
} |
1431 |
#ifdef GDEBUG |
1432 |
// a5.reverse(); |
1433 |
// a3.reverse(); |
1434 |
#endif |
1435 |
addAdaptor(adaptors5, a5, ttype5); |
1436 |
addAdaptor(adaptors3, a3, galn_TrimRight); |
1437 |
} |
1438 |
} |
1439 |
return adaptors5.Count()+adaptors3.Count(); |
1440 |
} |
1441 |
|
1442 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1443 |
GStr& s, GStr& infname, GStr& infname2) { |
1444 |
// uses outsuffix to generate output file names and open file handles as needed |
1445 |
infname=""; |
1446 |
infname2=""; |
1447 |
f_in=NULL; |
1448 |
f_in2=NULL; |
1449 |
f_out=NULL; |
1450 |
f_out2=NULL; |
1451 |
//analyze outsuffix intent |
1452 |
GStr pocmd; |
1453 |
if (outsuffix=="-") { |
1454 |
f_out=stdout; |
1455 |
} |
1456 |
else { |
1457 |
GStr ox=getFext(outsuffix); |
1458 |
if (ox.length()>2) ox=ox.substr(0,2); |
1459 |
if (ox=="gz") pocmd="gzip -9 -c "; |
1460 |
else if (ox=="bz") pocmd="bzip2 -9 -c "; |
1461 |
} |
1462 |
if (s=="-") { |
1463 |
f_in=stdin; |
1464 |
infname="stdin"; |
1465 |
f_out=prepOutFile(infname, pocmd); |
1466 |
return; |
1467 |
} // streaming from stdin |
1468 |
s.startTokenize(",:"); |
1469 |
s.nextToken(infname); |
1470 |
bool paired=s.nextToken(infname2); |
1471 |
if (fileExists(infname.chars())==0) |
1472 |
GError("Error: cannot find file %s!\n",infname.chars()); |
1473 |
GStr fname(getFileName(infname.chars())); |
1474 |
GStr picmd; |
1475 |
guess_unzip(fname, picmd); |
1476 |
if (picmd.is_empty()) { |
1477 |
f_in=fopen(infname.chars(), "r"); |
1478 |
if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars()); |
1479 |
} |
1480 |
else { |
1481 |
picmd.append(infname); |
1482 |
f_in=popen(picmd.chars(), "r"); |
1483 |
if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1484 |
} |
1485 |
if (f_out==stdout) { |
1486 |
if (paired) GError("Error: output suffix required for paired reads\n"); |
1487 |
return; |
1488 |
} |
1489 |
f_out=prepOutFile(infname, pocmd); |
1490 |
if (!paired) return; |
1491 |
if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n"); |
1492 |
// ---- paired reads:------------- |
1493 |
if (fileExists(infname2.chars())==0) |
1494 |
GError("Error: cannot find file %s!\n",infname2.chars()); |
1495 |
picmd=""; |
1496 |
GStr fname2(getFileName(infname2.chars())); |
1497 |
guess_unzip(fname2, picmd); |
1498 |
if (picmd.is_empty()) { |
1499 |
f_in2=fopen(infname2.chars(), "r"); |
1500 |
if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars()); |
1501 |
} |
1502 |
else { |
1503 |
picmd.append(infname2); |
1504 |
f_in2=popen(picmd.chars(), "r"); |
1505 |
if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1506 |
} |
1507 |
f_out2=prepOutFile(infname2, pocmd); |
1508 |
} |