ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 189
Committed: Fri Feb 17 23:08:53 2012 UTC (7 years, 5 months ago) by gpertea
File size: 48914 byte(s)
Log Message:
added -a option to control the minimum exact adaptor match to be trimmed at the ends

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7 #include <inttypes.h>
8
9 #define USAGE "Usage:\n\
10 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_match>]\\\n\
11 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 [-a <min_poly>] [-A] <input.fq>[,<input_mates.fq>\n\
14 \n\
15 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 for low complexity and collapse duplicate reads.\n\
17 If read pairs should be trimmed and kept together (i.e. without discarding\n\
18 one read in a pair), the two file names should be given delimited by a comma\n\
19 or a colon character\n\
20 \n\
21 Options:\n\
22 -n rename all the reads using the <prefix> followed by a read counter;\n\
23 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24 the read duplication count\n\
25 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
26 file(s) named <input>.<outsuffix> which will be created in the \n\
27 current (working) directory; (writes to stdout if -o- is given);\n\
28 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 -f file with adaptor sequences to trim, each line having this format:\n\
30 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
32 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 -3 trim the given adaptor sequence at the 3' end of each read\n\
34 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 -a minimum exact match to adaptor sequence to trim at the end (6)\n\
36 -A disable polyA/T trimming (enabled by default)\n\
37 -y minimum length of poly-A/T run to remove (6)\n\
38 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
39 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
40 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
41 -l minimum \"clean\" length after trimming that a read must have\n\
42 in order to pass the filter (default: 16)\n\
43 -r write a simple \"trash report\" file listing the discarded reads with a\n\
44 one letter code for the reason why they were trashed\n\
45 -D apply a low-complexity (dust) filter and discard any read that has over\n\
46 50%% of its length detected as low complexity\n\
47 -C collapse duplicate reads and add a prefix with their count to the read\n\
48 name (e.g. for microRNAs)\n\
49 -p input is phred64/phred33 (use -p64 or -p33)\n\
50 -Q convert quality values to the other Phred qv type\n\
51 -V be verbose when done (print summary counts)\n\
52 "
53
54 //-z for -o option, the output stream(s) will be first piped into the given\n
55 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
56
57
58 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
59
60 //For paired reads sequencing:
61 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
62 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
63 //FILE* f_out=NULL; //stdout if not provided
64 //FILE* f_out2=NULL; //for paired reads
65 //FILE* f_in=NULL; //input fastq (stdin if not provided)
66 //FILE* f_in2=NULL; //for paired reads
67
68 FILE* freport=NULL;
69
70 bool debug=false;
71 bool verbose=false;
72 bool doCollapse=false;
73 bool doDust=false;
74 bool doPolyTrim=true;
75 bool fastaOutput=false;
76 bool trashReport=false;
77 bool revCompl=false; //also reverse complement adaptor sequences
78 int min_read_len=16;
79 double max_perc_N=7.0;
80 int dust_cutoff=16;
81 bool isfasta=false;
82 bool convert_phred=false;
83 GStr outsuffix; // -o
84 GStr prefix;
85 GStr zcmd;
86 char isACGT[256];
87
88 uint num_trimN=0; //reads trimmed by N%
89 uint num_trimQ=0; //reads trimmed by qv threshold
90 uint num_trimV=0; //reads trimmed by adapter match
91 uint num_trimA=0; //reads trimmed by polyA
92 uint num_trimT=0; //reads trimmed by polyT
93 uint num_trim5=0; //number of reads trimmed at 5' end
94 uint num_trim3=0; //number of reads trimmed at 3' end
95
96 uint64 b_totalIn=0; //total number of input bases
97 uint64 b_totalN=0; //total number of undetermined bases found in input bases
98
99 uint64 b_trimN=0; //total number of bases trimmed due to N-trim
100 uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
101 uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
102 uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
103 uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
104 uint64 b_trim5=0; //total bases trimmed on the 5' side
105 uint64 b_trim3=0; //total bases trimmed on the 3' side
106 //int min_trimmed5=INT_MAX;
107 //int min_trimmed3=INT_MAX;
108
109 int qvtrim_qmin=0;
110 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
111 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
112 int qv_cvtadd=0; //could be -31 or +31
113
114 // adaptor matching metrics -- for X-drop ungapped extension
115 //const int match_reward=2;
116 //const int mismatch_penalty=3;
117 int match_reward=1;
118 int mismatch_penalty=5;
119 int Xdrop=16;
120 int minEndAdapter=6;
121
122
123 const int poly_m_score=2; //match score
124 const int poly_mis_score=-3; //mismatch
125 const int poly_dropoff_score=7;
126 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
127
128 const char *polyA_seed="AAAA";
129 const char *polyT_seed="TTTT";
130
131 struct CASeqData {
132 //positional data for every possible hexamer in an adaptor
133 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
134 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
135 GStr seq; //actual adaptor sequence data
136 GStr seqr; //reverse complement sequence
137 int amlen; //fraction of adaptor length matching that's
138 //enough to consider the alignment
139 GAlnTrimType trim_type;
140 bool use_reverse;
141 CASeqData(bool rev=false):seq(),seqr(),
142 amlen(0), use_reverse(rev) {
143 trim_type=galn_None; //should be updated later!
144 for (int i=0;i<4096;i++) {
145 pz[i]=NULL;
146 pzr[i]=NULL;
147 }
148 }
149
150 void update(const char* s) {
151 seq=s;
152 table6mers(seq.chars(), seq.length(), pz);
153 amlen=calc_safelen(seq.length());
154 if (!use_reverse) return;
155 //reverse complement
156 seqr=s;
157 int slen=seq.length();
158 for (int i=0;i<slen;i++)
159 seqr[i]=ntComplement(seq[slen-i-1]);
160 table6mers(seqr.chars(), seqr.length(), pzr);
161 }
162
163 ~CASeqData() {
164 for (int i=0;i<4096;i++) {
165 delete pz[i];
166 delete pzr[i];
167 }
168 }
169 };
170
171 GVec<CASeqData> adaptors5;
172 GVec<CASeqData> adaptors3;
173
174 CGreedyAlignData* gxmem_l=NULL;
175 CGreedyAlignData* gxmem_r=NULL;
176
177 // element in dhash:
178 class FqDupRec {
179 public:
180 int count; //how many of these reads are the same
181 int len; //length of qv
182 char* firstname; //optional, only if we want to keep the original read names
183 char* qv;
184 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
185 len=0;
186 qv=NULL;
187 firstname=NULL;
188 count=0;
189 if (qstr!=NULL) {
190 qv=Gstrdup(qstr->chars());
191 len=qstr->length();
192 count++;
193 }
194 if (rname!=NULL) firstname=Gstrdup(rname);
195 }
196 ~FqDupRec() {
197 GFREE(qv);
198 GFREE(firstname);
199 }
200 void add(GStr& d) { //collapse another record into this one
201 if (d.length()!=len)
202 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
203 count++;
204 for (int i=0;i<len;i++)
205 qv[i]=(qv[i]+d[i])/2;
206 }
207 };
208
209 void openfw(FILE* &f, GArgs& args, char opt) {
210 GStr s=args.getOpt(opt);
211 if (!s.is_empty()) {
212 if (s=='-') f=stdout;
213 else {
214 f=fopen(s.chars(),"w");
215 if (f==NULL) GError("Error creating file: %s\n", s.chars());
216 }
217 }
218 }
219
220 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
221 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
222
223 GHash<FqDupRec> dhash; //hash to keep track of duplicates
224
225 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
226 int loadAdaptors(const char* fname);
227
228 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
229 GStr& s, GStr& infname, GStr& infname2);
230 // uses outsuffix to generate output file names and open file handles as needed
231
232 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
233 void trash_report(char trashcode, GStr& rname, FILE* freport);
234
235 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
236 GStr& rname, GStr& rinfo, GStr& infname);
237
238 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
239 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
240
241 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
242 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
243 int dust(GStr& seq);
244 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
245 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
246 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
247 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
248
249 void convertPhred(char* q, int len);
250 void convertPhred(GStr& q);
251
252 int main(int argc, char * const argv[]) {
253 GArgs args(argc, argv, "MATCH=MISMATCH=XDROP=YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:y:");
254 int e;
255 if ((e=args.isError())>0) {
256 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
257 exit(224);
258 }
259 debug=(args.getOpt('Y')!=NULL);
260 verbose=(args.getOpt('V')!=NULL);
261 convert_phred=(args.getOpt('Q')!=NULL);
262 doCollapse=(args.getOpt('C')!=NULL);
263 doDust=(args.getOpt('D')!=NULL);
264 revCompl=(args.getOpt('R')!=NULL);
265 if (args.getOpt('A')) doPolyTrim=false;
266 /*
267 rawFormat=(args.getOpt('R')!=NULL);
268 if (rawFormat) {
269 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
270 }
271 */
272 prefix=args.getOpt('n');
273 GStr s=args.getOpt('l');
274 if (!s.is_empty())
275 min_read_len=s.asInt();
276 s=args.getOpt('m');
277 if (!s.is_empty())
278 max_perc_N=s.asDouble();
279 s=args.getOpt('d');
280 if (!s.is_empty()) {
281 dust_cutoff=s.asInt();
282 doDust=true;
283 }
284 s=args.getOpt('q');
285 if (!s.is_empty()) {
286 qvtrim_qmin=s.asInt();
287 }
288 s=args.getOpt('t');
289 if (!s.is_empty()) {
290 qvtrim_max=s.asInt();
291 }
292 s=args.getOpt("MATCH");
293 if (!s.is_empty())
294 match_reward=s.asInt();
295 s=args.getOpt("MISMATCH");
296 if (!s.is_empty())
297 mismatch_penalty=s.asInt();
298 s=args.getOpt("XDROP");
299 if (!s.is_empty())
300 Xdrop=s.asInt();
301
302 s=args.getOpt('p');
303 if (!s.is_empty()) {
304 int v=s.asInt();
305 if (v==33) {
306 qv_phredtype=33;
307 qv_cvtadd=31;
308 }
309 else if (v==64) {
310 qv_phredtype=64;
311 qv_cvtadd=-31;
312 }
313 else
314 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
315 }
316 memset((void*)isACGT, 0, 256);
317 isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
318 isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
319 s=args.getOpt('f');
320 if (!s.is_empty()) {
321 loadAdaptors(s.chars());
322 }
323 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
324 s=args.getOpt('5');
325 if (!s.is_empty()) {
326 if (fileAdaptors)
327 GError("Error: options -5 and -f cannot be used together!\n");
328 s.upper();
329 addAdaptor(adaptors5, s, galn_TrimLeft);
330 }
331 s=args.getOpt('3');
332 if (!s.is_empty()) {
333 if (fileAdaptors)
334 GError("Error: options -3 and -f cannot be used together!\n");
335 s.upper();
336 addAdaptor(adaptors3, s, galn_TrimRight);
337 }
338 s=args.getOpt('y');
339 if (!s.is_empty()) {
340 int minmatch=s.asInt();
341 if (minmatch>2)
342 poly_minScore=minmatch*poly_m_score;
343 else GMessage("Warning: invalid -y option, ignored.\n");
344 }
345 s=args.getOpt('a');
346 if (!s.is_empty()) {
347 int minmatch=s.asInt();
348 if (minmatch>2)
349 minEndAdapter=minmatch;
350 else GMessage("Warning: invalid -a option, ignored.\n");
351 }
352
353
354 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
355 else outsuffix="-";
356 trashReport= (args.getOpt('r')!=NULL);
357 int fcount=args.startNonOpt();
358 if (fcount==0) {
359 GMessage(USAGE);
360 exit(224);
361 }
362 if (fcount>1 && doCollapse) {
363 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
364 }
365 //openfw(f_out, args, 'o');
366 //if (f_out==NULL) f_out=stdout;
367 if (trashReport)
368 openfw(freport, args, 'r');
369 char* infile=NULL;
370
371 if (adaptors5.Count()>0)
372 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
373 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
374 if (adaptors3.Count()>0)
375 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
376
377 while ((infile=args.nextNonOpt())!=NULL) {
378 //for each input file
379 int incounter=0; //counter for input reads
380 int outcounter=0; //counter for output reads
381 int trash_s=0; //too short from the get go
382 int trash_Q=0;
383 int trash_N=0;
384 int trash_D=0;
385 int trash_poly=0;
386 int trash_V=0;
387 num_trimN=0;
388 num_trimQ=0;
389 num_trimV=0;
390 num_trimA=0;
391 num_trimT=0;
392 num_trim5=0;
393 num_trim3=0;
394
395 b_totalIn=0;
396 b_totalN=0;
397 b_trimN=0;
398 b_trimQ=0;
399 b_trimV=0;
400 b_trimA=0;
401 b_trimT=0;
402 b_trim5=0;
403 b_trim3=0;
404
405 s=infile;
406 GStr infname;
407 GStr infname2;
408 FILE* f_in=NULL;
409 FILE* f_in2=NULL;
410 FILE* f_out=NULL;
411 FILE* f_out2=NULL;
412 bool paired_reads=false;
413 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
414 GLineReader fq(f_in);
415 GLineReader* fq2=NULL;
416 if (f_in2!=NULL) {
417 fq2=new GLineReader(f_in2);
418 paired_reads=true;
419 }
420 GStr rseq, rqv, seqid, seqinfo;
421 GStr rseq2, rqv2, seqid2, seqinfo2;
422 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
423 incounter++;
424 int a5=0, a3=0, b5=0, b3=0;
425 char tcode=0, tcode2=0;
426 tcode=process_read(seqid, rseq, rqv, a5, a3);
427 if (a5>0) {
428 b_trim5+=a5;
429 num_trim5++;
430 }
431 if (a3<rseq.length()-1) {
432 b_trim3+=rseq.length()-a3-1;
433 num_trim3++;
434 }
435 if (fq2!=NULL) {
436 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
437 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
438 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
439 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
440 }
441 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
442 if (b5>0) {
443 b_trim5+=b5;
444 num_trim5++;
445 }
446 if (b3<rseq2.length()-1) {
447 b_trim3+=rseq2.length()-b3-1;
448 num_trim3++;
449 }
450 //decide what to do with this pair and print rseq2 if the pair makes it
451 if (tcode>1 && tcode2<=1) {
452 //"untrash" rseq
453 if (a3-a5+1<min_read_len) {
454 a5=1;
455 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
456 }
457 tcode=1;
458 }
459 else if (tcode<=1 && tcode2>1) {
460 //"untrash" rseq2
461 if (b3-b5+1<min_read_len) {
462 b5=1;
463 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
464 }
465 tcode2=1;
466 }
467 if (tcode<=1) { //trimmed or left intact -- write it!
468 if (tcode>0) {
469 rseq2=rseq2.substr(b5,b3-b5+1);
470 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
471 }
472 int nocounter=0;
473 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
474 }
475 } //pair read
476 if (tcode>1) { //trashed
477 #ifdef GDEBUG
478 GMessage(" !!!!TRASH code = %c\n",tcode);
479 #endif
480 if (tcode=='s') trash_s++;
481 else if (tcode=='A' || tcode=='T') trash_poly++;
482 else if (tcode=='Q') trash_Q++;
483 else if (tcode=='N') trash_N++;
484 else if (tcode=='D') trash_D++;
485 else if (tcode=='V') trash_V++;
486 //else if (tcode=='5') trash_A5++;
487 if (trashReport) trash_report(tcode, seqid, freport);
488 }
489 else if (!doCollapse) { //write it
490 if (tcode>0) {
491 rseq=rseq.substr(a5,a3-a5+1);
492 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
493 }
494 #ifdef GDEBUG
495 GMessage(" After trimming:\n");
496 GMessage("%s\n",rseq.chars());
497 #endif
498 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
499 }
500 } //for each fastq record
501 delete fq2;
502 FRCLOSE(f_in);
503 FRCLOSE(f_in2);
504 if (doCollapse) {
505 outcounter=0;
506 int maxdup_count=1;
507 char* maxdup_seq=NULL;
508 dhash.startIterate();
509 FqDupRec* qd=NULL;
510 char* seq=NULL;
511 while ((qd=dhash.NextData(seq))!=NULL) {
512 GStr rseq(seq);
513 //do the dusting here
514 if (doDust) {
515 int dustbases=dust(rseq);
516 if (dustbases>(rseq.length()>>1)) {
517 if (trashReport && qd->firstname!=NULL) {
518 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
519 }
520 trash_D+=qd->count;
521 continue;
522 }
523 }
524 outcounter++;
525 if (qd->count>maxdup_count) {
526 maxdup_count=qd->count;
527 maxdup_seq=seq;
528 }
529 if (isfasta) {
530 if (prefix.is_empty()) {
531 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
532 rseq.chars());
533 }
534 else { //use custom read name
535 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
536 qd->count, rseq.chars());
537 }
538 }
539 else { //fastq format
540 if (convert_phred) convertPhred(qd->qv, qd->len);
541 if (prefix.is_empty()) {
542 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
543 rseq.chars(), qd->qv);
544 }
545 else { //use custom read name
546 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
547 qd->count, rseq.chars(), qd->qv);
548 }
549 }
550 }//for each element of dhash
551 if (maxdup_count>1) {
552 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
553 }
554 } //collapse entries
555 if (verbose) {
556 if (paired_reads) {
557 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
558 GMessage("Number of input pairs :%9u\n", incounter);
559 GMessage(" Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
560 }
561 else {
562 GMessage(">Input file : %s\n", infname.chars());
563 GMessage("Number of input reads :%9d\n", incounter);
564 GMessage(" Output reads :%9d (%u trashed)\n", outcounter, incounter-outcounter);
565 }
566 GMessage("--------------- Read trimming: ------------------\n");
567 if (num_trim5)
568 GMessage(" 5' trimmed :%9u\n", num_trim5);
569 if (num_trim3)
570 GMessage(" 3' trimmed :%9u\n", num_trim3);
571 if (num_trimQ)
572 GMessage(" q.v. trimmed :%9u\n", num_trimQ);
573 if (num_trimN)
574 GMessage(" N trimmed :%9u\n", num_trimN);
575 if (num_trimT)
576 GMessage(" poly-T trimmed :%9u\n", num_trimT);
577 if (num_trimA)
578 GMessage(" poly-A trimmed :%9u\n", num_trimA);
579 if (num_trimV)
580 GMessage(" Adapter trimmed :%9u\n", num_trimV);
581 GMessage("------------------------------------\n");
582 if (trash_s>0)
583 GMessage("Trashed by initial len:%9d\n", trash_s);
584 if (trash_N>0)
585 GMessage(" Trashed by N%%:%9d\n", trash_N);
586 if (trash_Q>0)
587 GMessage("Trashed by low quality:%9d\n", trash_Q);
588 if (trash_poly>0)
589 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
590 if (trash_V>0)
591 GMessage(" Trashed by adaptor:%9d\n", trash_V);
592 GMessage("\n--------------- Base counts --------------------\n");
593 GMessage(" Input bases :%12llu\n", b_totalIn);
594 double percN=100.0* ((double)b_totalN/(double)b_totalIn);
595 GMessage(" N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
596 GMessage(" trimmed from 5':%12llu\n", b_trim5);
597 GMessage(" trimmed from 3':%12llu\n", b_trim3);
598 GMessage("\n");
599 if (b_trimQ)
600 GMessage(" q.v. trimmed :%12llu\n", b_trimQ);
601 if (b_trimN)
602 GMessage(" N trimmed :%12llu\n", b_trimN);
603 if (b_trimT)
604 GMessage(" poly-T trimmed :%12llu\n", b_trimT);
605 if (b_trimA)
606 GMessage(" poly-A trimmed :%12llu\n", b_trimA);
607 if (b_trimV)
608 GMessage(" Adapter trimmed :%12llu\n", b_trimV);
609
610 }
611 if (trashReport) {
612 FWCLOSE(freport);
613 }
614 FWCLOSE(f_out);
615 FWCLOSE(f_out2);
616 } //while each input file
617 delete gxmem_l;
618 delete gxmem_r;
619 //getc(stdin);
620 }
621
622 class NData {
623 public:
624 int NPos[1024]; //there should be no reads longer than 1K ?
625 int NCount;
626 int end5;
627 int end3;
628 int seqlen;
629 double perc_N; //percentage of Ns in end5..end3 range only!
630 const char* seq;
631 bool valid;
632 NData() {
633 seqlen=0;
634 NCount=0;
635 end5=0;
636 end3=0;
637 seq=NULL;
638 perc_N=0;
639 valid=true;
640 }
641 void init(GStr& rseq) {
642 NCount=0;
643 perc_N=0;
644 seqlen=rseq.length();
645 seq=rseq.chars();
646 end5=1;
647 end3=seqlen;
648 for (int i=0;i<seqlen;i++)
649 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
650 NPos[NCount]=i;
651 NCount++;
652 }
653 perc_N=(NCount*100.0)/seqlen;
654 valid=true;
655 }
656 void N_calc() { //only in the region after trimming
657 int n=0;
658 for (int i=end3-1;i<end5;i++) {
659 if (seq[i]=='N') n++;
660 }
661 perc_N=(n*100.0)/(end5-end3+1);
662 }
663 };
664
665 static NData feat;
666 int perc_lenN=12; // incremental distance from ends, in percentage of
667 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
668
669 void N_analyze(int l5, int l3, int p5, int p3) {
670 /* assumes feat was filled properly */
671 int old_dif, t5,t3,v;
672 if (l3<l5+2 || p5>p3 ) {
673 feat.end5=l5+1;
674 feat.end3=l3+1;
675 return;
676 }
677
678 t5=feat.NPos[p5]-l5;
679 t3=l3-feat.NPos[p3];
680 old_dif=p3-p5;
681 v=(int)((((double)(l3-l5))*perc_lenN)/100);
682 if (v>20) v=20; /* enforce N-search limit for very long reads */
683 if (t5 < v ) {
684 l5=feat.NPos[p5]+1;
685 p5++;
686 }
687 if (t3 < v) {
688 l3=feat.NPos[p3]-1;
689 p3--;
690 }
691 /* restNs=p3-p5; number of Ns in the new CLR */
692 if (p3-p5==old_dif) { /* no change, return */
693 /* don't trim if this may shorten the read too much */
694 //if (l5-l3<min_read_len) return;
695 feat.end5=l5+1;
696 feat.end3=l3+1;
697 return;
698 }
699 else
700 N_analyze(l5,l3, p5,p3);
701 }
702
703
704 bool qtrim(GStr& qvs, int &l5, int &l3) {
705 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
706 if (qv_phredtype==0) {
707 //try to guess the Phred type
708 int vmin=256, vmax=0;
709 for (int i=0;i<qvs.length();i++) {
710 if (vmin>qvs[i]) vmin=qvs[i];
711 if (vmax<qvs[i]) vmax=qvs[i];
712 }
713 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
714 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
715 if (qv_phredtype==0) {
716 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
717 }
718 if (verbose)
719 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
720 } //guessing Phred type
721 for (l3=qvs.length()-1;l3>2;l3--) {
722 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
723 }
724 //just in case, check also the 5' the end (?)
725 for (l5=0;l5<qvs.length()-3;l5++) {
726 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
727 }
728 if (qvtrim_max>0) {
729 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
730 if (l5>qvtrim_max) l5=qvtrim_max;
731 }
732 return (l5>0 || l3<qvs.length()-1);
733 }
734
735 bool ntrim(GStr& rseq, int &l5, int &l3) {
736 //count Ns in the sequence, trim N-rich ends
737 feat.init(rseq);
738 l5=feat.end5-1;
739 l3=feat.end3-1;
740 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
741 if (l5==feat.end5-1 && l3==feat.end3-1) {
742 if (feat.perc_N>max_perc_N) {
743 feat.valid=false;
744 l3=l5+1;
745 return true;
746 }
747 else {
748 return false; //nothing changed
749 }
750 }
751 l5=feat.end5-1;
752 l3=feat.end3-1;
753 if (l3-l5+1<min_read_len) {
754 feat.valid=false;
755 return true;
756 }
757 feat.N_calc();
758
759 if (feat.perc_N>max_perc_N) {
760 feat.valid=false;
761 l3=l5+1;
762 return true;
763 }
764 return true;
765 }
766
767 //--------------- dust functions ----------------
768 class DNADuster {
769 public:
770 int dustword;
771 int dustwindow;
772 int dustwindow2;
773 int dustcutoff;
774 int mv, iv, jv;
775 int counts[32*32*32];
776 int iis[32*32*32];
777 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
778 dustword=wordsize;
779 dustwindow=winsize;
780 dustwindow2 = (winsize>>1);
781 dustcutoff=cutoff;
782 mv=0;
783 iv=0;
784 jv=0;
785 }
786 void setWindowSize(int value) {
787 dustwindow = value;
788 dustwindow2 = (dustwindow >> 1);
789 }
790 void setWordSize(int value) {
791 dustword=value;
792 }
793 void wo1(int len, const char* s, int ivv) {
794 int i, ii, j, v, t, n, n1, sum;
795 int js, nis;
796 n = 32 * 32 * 32;
797 n1 = n - 1;
798 nis = 0;
799 i = 0;
800 ii = 0;
801 sum = 0;
802 v = 0;
803 for (j=0; j < len; j++, s++) {
804 ii <<= 5;
805 if (*s<=32) {
806 i=0;
807 continue;
808 }
809 ii |= *s - 'A'; //assume uppercase!
810 ii &= n1;
811 i++;
812 if (i >= dustword) {
813 for (js=0; js < nis && iis[js] != ii; js++) ;
814 if (js == nis) {
815 iis[nis] = ii;
816 counts[ii] = 0;
817 nis++;
818 }
819 if ((t = counts[ii]) > 0) {
820 sum += t;
821 v = 10 * sum / j;
822 if (mv < v) {
823 mv = v;
824 iv = ivv;
825 jv = j;
826 }
827 }
828 counts[ii]++;
829 }
830 }
831 }
832
833 int wo(int len, const char* s, int* beg, int* end) {
834 int i, l1;
835 l1 = len - dustword + 1;
836 if (l1 < 0) {
837 *beg = 0;
838 *end = len - 1;
839 return 0;
840 }
841 mv = 0;
842 iv = 0;
843 jv = 0;
844 for (i=0; i < l1; i++) {
845 wo1(len-i, s+i, i);
846 }
847 *beg = iv;
848 *end = iv + jv;
849 return mv;
850 }
851
852 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
853 int i, j, l, a, b, v;
854 if (cutoff==0) cutoff=dustcutoff;
855 a=0;b=0;
856 //GMessage("Dust cutoff=%d\n", cutoff);
857 for (i=0; i < seqlen; i += dustwindow2) {
858 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
859 v = wo(l, seq+i, &a, &b);
860 if (v > cutoff) {
861 //for (j = a; j <= b && j < dustwindow2; j++) {
862 for (j = a; j <= b; j++) {
863 seqmsk[i+j]='N';//could be made lowercase instead
864 }
865 }
866 }
867 //return first;
868 }
869 };
870
871 static DNADuster duster;
872
873 int dust(GStr& rseq) {
874 char* seq=Gstrdup(rseq.chars());
875 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
876 //check the number of Ns:
877 int ncount=0;
878 for (int i=0;i<rseq.length();i++) {
879 if (seq[i]=='N') ncount++;
880 }
881 GFREE(seq);
882 return ncount;
883 }
884
885 struct SLocScore {
886 int pos;
887 int score;
888 SLocScore(int p=0,int s=0) {
889 pos=p;
890 score=s;
891 }
892 void set(int p, int s) {
893 pos=p;
894 score=s;
895 }
896 void add(int p, int add) {
897 pos=p;
898 score+=add;
899 }
900 };
901
902 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
903 if (!doPolyTrim) return false;
904 int rlen=seq.length();
905 l5=0;
906 l3=rlen-1;
907 int32 seedVal=*(int32*)poly_seed;
908 char polyChar=poly_seed[0];
909 //assumes N trimming was already done
910 //so a poly match should be very close to the end of the read
911 // -- find the initial match (seed)
912 int lmin=GMAX((rlen-16), 0);
913 int li;
914 for (li=rlen-4;li>lmin;li--) {
915 if (seedVal==*(int*)&(seq[li])) {
916 break;
917 }
918 }
919 if (li<=lmin) return false;
920 //seed found, try to extend it both ways
921 //extend right
922 int ri=li+3;
923 SLocScore loc(ri, poly_m_score<<2);
924 SLocScore maxloc(loc);
925 //extend right
926 while (ri<rlen-1) {
927 ri++;
928 if (seq[ri]==polyChar) {
929 loc.add(ri,poly_m_score);
930 }
931 else if (seq[ri]=='N') {
932 loc.add(ri,0);
933 }
934 else { //mismatch
935 loc.add(ri,poly_mis_score);
936 if (maxloc.score-loc.score>poly_dropoff_score) break;
937 }
938 if (maxloc.score<=loc.score) {
939 maxloc=loc;
940 }
941 }
942 ri=maxloc.pos;
943 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
944 //ri = right boundary for the poly match
945 //extend left
946 loc.set(li, maxloc.score);
947 maxloc.pos=li;
948 while (li>0) {
949 li--;
950 if (seq[li]==polyChar) {
951 loc.add(li,poly_m_score);
952 }
953 else if (seq[li]=='N') {
954 loc.add(li,0);
955 }
956 else { //mismatch
957 loc.add(li,poly_mis_score);
958 if (maxloc.score-loc.score>poly_dropoff_score) break;
959 }
960 if (maxloc.score<=loc.score) {
961 maxloc=loc;
962 }
963 }
964 li=maxloc.pos;
965 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
966 (maxloc.score>poly_minScore && ri>=rlen-3) ||
967 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
968 //trimming this li-ri match at 3' end
969 l3=li-1;
970 if (l3<0) l3=0;
971 return true;
972 }
973 return false;
974 }
975
976 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
977 if (!doPolyTrim) return false;
978 int rlen=seq.length();
979 l5=0;
980 l3=rlen-1;
981 int32 seedVal=*(int32*)poly_seed;
982 char polyChar=poly_seed[0];
983 //assumes N trimming was already done
984 //so a poly match should be very close to the end of the read
985 // -- find the initial match (seed)
986 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
987 int li;
988 for (li=0;li<=lmax;li++) {
989 if (seedVal==*(int*)&(seq[li])) {
990 break;
991 }
992 }
993 if (li>lmax) return false;
994 //seed found, try to extend it both ways
995 //extend left
996 int ri=li+3; //save rightmost base of the seed
997 SLocScore loc(li, poly_m_score<<2);
998 SLocScore maxloc(loc);
999 while (li>0) {
1000 li--;
1001 if (seq[li]==polyChar) {
1002 loc.add(li,poly_m_score);
1003 }
1004 else if (seq[li]=='N') {
1005 loc.add(li,0);
1006 }
1007 else { //mismatch
1008 loc.add(li,poly_mis_score);
1009 if (maxloc.score-loc.score>poly_dropoff_score) break;
1010 }
1011 if (maxloc.score<=loc.score) {
1012 maxloc=loc;
1013 }
1014 }
1015 li=maxloc.pos;
1016 if (li>5) return false; //no trimming wanted, too far from 5' end
1017 //li = right boundary for the poly match
1018
1019 //extend right
1020 loc.set(ri, maxloc.score);
1021 maxloc.pos=ri;
1022 while (ri<rlen-1) {
1023 ri++;
1024 if (seq[ri]==polyChar) {
1025 loc.add(ri,poly_m_score);
1026 }
1027 else if (seq[ri]=='N') {
1028 loc.add(ri,0);
1029 }
1030 else { //mismatch
1031 loc.add(ri,poly_mis_score);
1032 if (maxloc.score-loc.score>poly_dropoff_score) break;
1033 }
1034 if (maxloc.score<=loc.score) {
1035 maxloc=loc;
1036 }
1037 }
1038 ri=maxloc.pos;
1039 if ((maxloc.score==poly_minScore && li==0) ||
1040 (maxloc.score>poly_minScore && li<2)
1041 || (maxloc.score>(poly_minScore*3) && li<8)) {
1042 //adjust l5 to reflect this trimming of 5' end
1043 l5=ri+1;
1044 if (l5>rlen-1) l5=rlen-1;
1045 return true;
1046 }
1047 return false;
1048 }
1049
1050 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1051 if (adaptors3.Count()==0) return false;
1052 int rlen=seq.length();
1053 l5=0;
1054 l3=rlen-1;
1055 bool trimmed=false;
1056 GStr wseq(seq);
1057 int wlen=rlen;
1058 GXSeqData seqdata;
1059 int numruns=revCompl ? 2 : 1;
1060 GList<GXAlnInfo> bestalns(true, true, false);
1061 for (int ai=0;ai<adaptors3.Count();ai++) {
1062 for (int r=0;r<numruns;r++) {
1063 if (r) {
1064 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1065 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1066 }
1067 else {
1068 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1069 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1070 }
1071 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, minEndAdapter, gxmem_r, 86);
1072 if (aln) {
1073 if (aln->strong) {
1074 trimmed=true;
1075 bestalns.Add(aln);
1076 break; //will check the rest next time
1077 }
1078 else bestalns.Add(aln);
1079 }
1080 }//forward and reverse adaptors
1081 if (trimmed) break; //will check the rest in the next cycle
1082 }//for each 3' adaptor
1083 if (bestalns.Count()>0) {
1084 GXAlnInfo* aln=bestalns[0];
1085 if (aln->sl-1 > wlen-aln->sr) {
1086 //keep left side
1087 l3-=(wlen-aln->sl+1);
1088 if (l3<0) l3=0;
1089 }
1090 else { //keep right side
1091 l5+=aln->sr;
1092 if (l5>=rlen) l5=rlen-1;
1093 }
1094 //delete aln;
1095 //if (l3-l5+1<min_read_len) return true;
1096 wseq=seq.substr(l5,l3-l5+1);
1097 wlen=wseq.length();
1098 return true; //break the loops here to report a good find
1099 }
1100 return false;
1101 }
1102
1103 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1104 if (adaptors5.Count()==0) return false;
1105 int rlen=seq.length();
1106 l5=0;
1107 l3=rlen-1;
1108 bool trimmed=false;
1109 GStr wseq(seq);
1110 int wlen=rlen;
1111 GXSeqData seqdata;
1112 int numruns=revCompl ? 2 : 1;
1113 GList<GXAlnInfo> bestalns(true, true, false);
1114 for (int ai=0;ai<adaptors5.Count();ai++) {
1115 for (int r=0;r<numruns;r++) {
1116 if (r) {
1117 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1118 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1119 }
1120 else {
1121 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1122 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1123 }
1124 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type,
1125 minEndAdapter, gxmem_l, 90);
1126 if (aln) {
1127 if (aln->strong) {
1128 trimmed=true;
1129 bestalns.Add(aln);
1130 break; //will check the rest next time
1131 }
1132 else bestalns.Add(aln);
1133 }
1134 } //forward and reverse?
1135 if (trimmed) break; //will check the rest in the next cycle
1136 }//for each 5' adaptor
1137 if (bestalns.Count()>0) {
1138 GXAlnInfo* aln=bestalns[0];
1139 if (aln->sl-1 > wlen-aln->sr) {
1140 //keep left side
1141 l3-=(wlen-aln->sl+1);
1142 if (l3<0) l3=0;
1143 }
1144 else { //keep right side
1145 l5+=aln->sr;
1146 if (l5>=rlen) l5=rlen-1;
1147 }
1148 //delete aln;
1149 //if (l3-l5+1<min_read_len) return true;
1150 wseq=seq.substr(l5,l3-l5+1);
1151 wlen=wseq.length();
1152 return true; //break the loops here to report a good find
1153 }
1154 return false;
1155 }
1156
1157 //convert qvs to/from phred64 from/to phread33
1158 void convertPhred(GStr& q) {
1159 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1160 }
1161
1162 void convertPhred(char* q, int len) {
1163 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1164 }
1165
1166 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1167 GStr& rname, GStr& rinfo, GStr& infname) {
1168 rseq="";
1169 rqv="";
1170 rname="";
1171 rinfo="";
1172 if (fq.eof()) return false;
1173 char* l=fq.getLine();
1174 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1175 if (l==NULL) return false;
1176 /* if (rawFormat) {
1177 //TODO: implement raw qseq parsing here
1178 //if (raw type=N) then continue; //skip invalid/bad records
1179 } //raw qseq format
1180 else { // FASTQ or FASTA */
1181 isfasta=(l[0]=='>');
1182 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1183 infname.chars(), l);
1184 GStr s(l);
1185 rname=&(l[1]);
1186 for (int i=0;i<rname.length();i++)
1187 if (rname[i]<=' ') {
1188 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1189 rname.cut(i);
1190 break;
1191 }
1192 //now get the sequence
1193 if ((l=fq.getLine())==NULL)
1194 GError("Error: unexpected EOF after header for read %s (%s)\n",
1195 rname.chars(), infname.chars());
1196 rseq=l; //this must be the DNA line
1197 while ((l=fq.getLine())!=NULL) {
1198 //seq can span multiple lines
1199 if (l[0]=='>' || l[0]=='+') {
1200 fq.pushBack();
1201 break; //
1202 }
1203 rseq+=l;
1204 } //check for multi-line seq
1205 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1206 if ((l=fq.getLine())==NULL)
1207 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1208 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1209 if ((l=fq.getLine())==NULL)
1210 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1211 rqv=l;
1212 //if (rqv.length()!=rseq.length())
1213 // GError("Error: qv len != seq len for %s\n", rname.chars());
1214 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1215 rqv+=l; //append to qv string
1216 }
1217 }// fastq
1218 // } //<-- FASTA or FASTQ
1219 rseq.upper();
1220 return true;
1221 }
1222
1223 #ifdef GDEBUG
1224 void showTrim(GStr& s, int l5, int l3) {
1225 if (l5>0 || l3==0) {
1226 color_bg(c_red);
1227 }
1228 for (int i=0;i<s.length()-1;i++) {
1229 if (i && i==l5) color_resetbg();
1230 fprintf(stderr, "%c", s[i]);
1231 if (i && i==l3) color_bg(c_red);
1232 }
1233 fprintf(stderr, "%c", s[s.length()-1]);
1234 color_reset();
1235 fprintf(stderr, "\n");
1236 }
1237 #endif
1238
1239 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1240 //returns 0 if the read was untouched, 1 if it was just trimmed
1241 // and a trash code if it was trashed
1242 l5=0;
1243 l3=rseq.length()-1;
1244 #ifdef GDEBUG
1245 //rseq.reverse();
1246 GMessage(">%s\n", rname.chars());
1247 GMessage("%s\n",rseq.chars());
1248 #endif
1249 if (l3-l5+1<min_read_len) {
1250 return 's';
1251 }
1252 GStr wseq(rseq.chars());
1253 GStr wqv(rqv.chars());
1254 int w5=l5;
1255 int w3=l3;
1256 //count Ns
1257 b_totalIn+=rseq.length();
1258 for (int i=0;i<rseq.length();i++) {
1259 if (isACGT[(int)rseq[i]]==0) b_totalN++;
1260 }
1261
1262 //first do the q-based trimming
1263 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1264 b_trimQ+=(w5-l5)+(l3-w3);
1265 num_trimQ++;
1266 if (w3-w5+1<min_read_len) {
1267 return 'Q'; //invalid read
1268 }
1269 //-- keep only the w5..w3 range
1270 l5=w5;
1271 l3=w3;
1272 wseq=wseq.substr(w5, w3-w5+1);
1273 if (!wqv.is_empty())
1274 wqv=wqv.substr(w5, w3-w5+1);
1275 } //qv trimming
1276 // N-trimming on the remaining read seq
1277 if (ntrim(wseq, w5, w3)) {
1278 //Note: ntrim resets w5 and w3
1279 //GMessage("before: %s\n",wseq.chars());
1280 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1281 int trim3=(wseq.length()-1-w3);
1282 b_trimN+=w5+trim3;
1283 num_trimN++;
1284 l5+=w5;
1285 l3-=trim3;
1286 if (w3-w5+1<min_read_len) {
1287 return 'N'; //to be trashed
1288 }
1289 //-- keep only the w5..w3 range
1290 wseq=wseq.substr(w5, w3-w5+1);
1291 if (!wqv.is_empty())
1292 wqv=wqv.substr(w5, w3-w5+1);
1293 w5=0;
1294 w3=wseq.length()-1;
1295 }
1296 char trim_code;
1297 //clean the more dirty end first - 3'
1298 bool trimmedA=false;
1299 bool trimmedT=false;
1300 bool trimmedV=false;
1301
1302 int prev_w5=0;
1303 int prev_w3=0;
1304 bool w3upd=true;
1305 bool w5upd=true;
1306 do {
1307 trim_code=0;
1308 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1309 trim_code='A';
1310 b_trimA+=(w5+(wseq.length()-1-w3));
1311 if (!trimmedA) { num_trimA++; trimmedA=true; }
1312 }
1313 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1314 trim_code='T';
1315 b_trimT+=(w5+(wseq.length()-1-w3));
1316 if (!trimmedT) { num_trimT++; trimmedT=true; }
1317 }
1318 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1319 trim_code='A';
1320 b_trimA+=(w5+(wseq.length()-1-w3));
1321 if (!trimmedA) { num_trimA++; trimmedA=true; }
1322 }
1323 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1324 trim_code='T';
1325 b_trimT+=(w5+(wseq.length()-1-w3));
1326 if (!trimmedT) { num_trimT++; trimmedT=true; }
1327 }
1328 else if (trim_adaptor5(wseq, w5, w3)) {
1329 trim_code='V';
1330 b_trimV+=(w5+(wseq.length()-1-w3));
1331 if (!trimmedV) { num_trimV++; trimmedV=true; }
1332 }
1333 else if (trim_adaptor3(wseq, w5, w3)) {
1334 trim_code='V';
1335 b_trimV+=(w5+(wseq.length()-1-w3));
1336 if (!trimmedV) { num_trimV++; trimmedV=true; }
1337 }
1338 if (trim_code) {
1339 w3upd=(w3!=prev_w3);
1340 w5upd=(w5!=prev_w5);
1341 if (w3upd) prev_w3=w3;
1342 if (w5upd) prev_w5=w5;
1343 #ifdef GDEBUG
1344 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1345 showTrim(wseq, w5, w3);
1346 #endif
1347 //int trimlen=wseq.length()-(w3-w5+1);
1348 l5+=w5;
1349 l3-=(wseq.length()-1-w3);
1350 if (w3-w5+1<min_read_len) {
1351 return trim_code;
1352 }
1353 //-- keep only the w5..w3 range
1354 wseq=wseq.substr(w5, w3-w5+1);
1355 if (!wqv.is_empty())
1356 wqv=wqv.substr(w5, w3-w5+1);
1357 }//trimming at 3' end
1358 } while (trim_code);
1359 if (doCollapse) {
1360 //keep read for later
1361 FqDupRec* dr=dhash.Find(wseq.chars());
1362 if (dr==NULL) { //new entry
1363 //if (prefix.is_empty())
1364 dhash.Add(wseq.chars(),
1365 new FqDupRec(&wqv, rname.chars()));
1366 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1367 }
1368 else
1369 dr->add(wqv);
1370 } //collapsing duplicates
1371 else { //not collapsing duplicates
1372 //apply the dust filter now
1373 if (doDust) {
1374 int dustbases=dust(wseq);
1375 if (dustbases>(wseq.length()>>1)) {
1376 return 'D';
1377 }
1378 }
1379 } //not collapsing duplicates
1380 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1381 }
1382
1383 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1384 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1385 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1386 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1387 }
1388
1389 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1390 outcounter++;
1391 bool asFasta=(rqv.is_empty() || fastaOutput);
1392 if (asFasta) {
1393 if (prefix.is_empty()) {
1394 printHeader(f_out, '>',rname,rinfo);
1395 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1396 }
1397 else {
1398 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1399 rseq.chars());
1400 }
1401 }
1402 else { //fastq
1403 if (convert_phred) convertPhred(rqv);
1404 if (prefix.is_empty()) {
1405 printHeader(f_out, '@', rname, rinfo);
1406 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1407 }
1408 else
1409 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1410 rseq.chars(),rqv.chars() );
1411 }
1412 }
1413
1414 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1415 if (freport==NULL || trashcode<=' ') return;
1416 if (trashcode=='3' || trashcode=='5') {
1417 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1418 }
1419 else {
1420 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1421 }
1422 //tcounter++;
1423 }
1424
1425 GStr getFext(GStr& s, int* xpos=NULL) {
1426 //s must be a filename without a path
1427 GStr r("");
1428 if (xpos!=NULL) *xpos=0;
1429 if (s.is_empty() || s=="-") return r;
1430 int p=s.rindex('.');
1431 int d=s.rindex('/');
1432 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1433 r=s.substr(p+1);
1434 if (xpos!=NULL) *xpos=p+1;
1435 r.lower();
1436 return r;
1437 }
1438
1439 void baseFileName(GStr& fname) {
1440 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1441 int xpos=0;
1442 GStr fext=getFext(fname, &xpos);
1443 if (xpos<=1) return;
1444 bool extdel=false;
1445 GStr f2;
1446 if (fext=="z" || fext=="zip") {
1447 extdel=true;
1448 }
1449 else if (fext.length()>=2) {
1450 f2=fext.substr(0,2);
1451 extdel=(f2=="gz" || f2=="bz");
1452 }
1453 if (extdel) {
1454 fname.cut(xpos-1);
1455 fext=getFext(fname, &xpos);
1456 if (xpos<=1) return;
1457 }
1458 extdel=false;
1459 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1460 extdel=true;
1461 }
1462 else if (fext.length()>=2) {
1463 extdel=(fext.substr(0,2)=="fa");
1464 }
1465 if (extdel) fname.cut(xpos-1);
1466 GStr fncp(fname);
1467 fncp.lower();
1468 fncp.chomp("seq");
1469 fncp.chomp("sequence");
1470 fncp.trimR("_.");
1471 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1472 }
1473
1474 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1475 FILE* f_out=NULL;
1476 GStr fname(getFileName(infname.chars()));
1477 //eliminate known extensions
1478 baseFileName(fname);
1479 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1480 else if (pocmd.is_empty()) {
1481 GStr oname(fname);
1482 oname.append('.');
1483 oname.append(outsuffix);
1484 f_out=fopen(oname.chars(),"w");
1485 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1486 }
1487 else {
1488 GStr oname(">");
1489 oname.append(fname);
1490 oname.append('.');
1491 oname.append(outsuffix);
1492 pocmd.append(oname);
1493 f_out=popen(pocmd.chars(), "w");
1494 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1495 }
1496 return f_out;
1497 }
1498
1499 void guess_unzip(GStr& fname, GStr& picmd) {
1500 GStr fext=getFext(fname);
1501 if (fext=="gz" || fext=="gzip" || fext=="z") {
1502 picmd="gzip -cd ";
1503 }
1504 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1505 picmd="bzip2 -cd ";
1506 }
1507 }
1508
1509 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1510 //TODO: prepare CASeqData here, and collect hexamers as well
1511 if (seq.is_empty() || seq=="-" ||
1512 seq=="N/A" || seq==".") return;
1513
1514 CASeqData adata(revCompl);
1515 int idx=adaptors.Add(adata);
1516 if (idx<0) GError("Error: failed to add adaptor!\n");
1517 adaptors[idx].trim_type=trim_type;
1518 adaptors[idx].update(seq.chars());
1519 }
1520
1521
1522 int loadAdaptors(const char* fname) {
1523 GLineReader lr(fname);
1524 char* l;
1525 while ((l=lr.nextLine())!=NULL) {
1526 if (lr.length()<=3 || l[0]=='#') continue;
1527 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1528 l[0]==';'|| l[0]==':' ) {
1529 int i=1;
1530 while (l[i]!=0 && isspace(l[i])) {
1531 i++;
1532 }
1533 if (l[i]!=0) {
1534 GStr s(&(l[i]));
1535 #ifdef GDEBUG
1536 //s.reverse();
1537 #endif
1538 addAdaptor(adaptors3, s, galn_TrimRight);
1539 continue;
1540 }
1541 }
1542 else {
1543 GStr s(l);
1544 s.startTokenize("\t ;,:");
1545 GStr a5,a3;
1546 if (s.nextToken(a5))
1547 s.nextToken(a3);
1548 else continue; //no tokens on this line
1549 GAlnTrimType ttype5=galn_TrimLeft;
1550 a5.upper();
1551 a3.upper();
1552 if (a3.is_empty() || a3==a5 || a3=="=") {
1553 a3.clear();
1554 ttype5=galn_TrimEither;
1555 }
1556 #ifdef GDEBUG
1557 // a5.reverse();
1558 // a3.reverse();
1559 #endif
1560 addAdaptor(adaptors5, a5, ttype5);
1561 addAdaptor(adaptors3, a3, galn_TrimRight);
1562 }
1563 }
1564 return adaptors5.Count()+adaptors3.Count();
1565 }
1566
1567 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1568 GStr& s, GStr& infname, GStr& infname2) {
1569 // uses outsuffix to generate output file names and open file handles as needed
1570 infname="";
1571 infname2="";
1572 f_in=NULL;
1573 f_in2=NULL;
1574 f_out=NULL;
1575 f_out2=NULL;
1576 //analyze outsuffix intent
1577 GStr pocmd;
1578 if (outsuffix=="-") {
1579 f_out=stdout;
1580 }
1581 else {
1582 GStr ox=getFext(outsuffix);
1583 if (ox.length()>2) ox=ox.substr(0,2);
1584 if (ox=="gz") pocmd="gzip -9 -c ";
1585 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1586 }
1587 if (s=="-") {
1588 f_in=stdin;
1589 infname="stdin";
1590 f_out=prepOutFile(infname, pocmd);
1591 return;
1592 } // streaming from stdin
1593 s.startTokenize(",:");
1594 s.nextToken(infname);
1595 bool paired=s.nextToken(infname2);
1596 if (fileExists(infname.chars())==0)
1597 GError("Error: cannot find file %s!\n",infname.chars());
1598 GStr fname(getFileName(infname.chars()));
1599 GStr picmd;
1600 guess_unzip(fname, picmd);
1601 if (picmd.is_empty()) {
1602 f_in=fopen(infname.chars(), "r");
1603 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1604 }
1605 else {
1606 picmd.append(infname);
1607 f_in=popen(picmd.chars(), "r");
1608 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1609 }
1610 if (f_out==stdout) {
1611 if (paired) GError("Error: output suffix required for paired reads\n");
1612 return;
1613 }
1614 f_out=prepOutFile(infname, pocmd);
1615 if (!paired) return;
1616 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1617 // ---- paired reads:-------------
1618 if (fileExists(infname2.chars())==0)
1619 GError("Error: cannot find file %s!\n",infname2.chars());
1620 picmd="";
1621 GStr fname2(getFileName(infname2.chars()));
1622 guess_unzip(fname2, picmd);
1623 if (picmd.is_empty()) {
1624 f_in2=fopen(infname2.chars(), "r");
1625 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1626 }
1627 else {
1628 picmd.append(infname2);
1629 f_in2=popen(picmd.chars(), "r");
1630 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1631 }
1632 f_out2=prepOutFile(infname2, pocmd);
1633 }