ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 76
Committed: Tue Sep 27 12:37:02 2011 UTC (8 years, 2 months ago) by gpertea
File size: 53160 byte(s)
Log Message:
u

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6
7 #define USAGE "Usage:\n\
8 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 <input.fq>[,<input_mates.fq>\n\
12 \n\
13 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 for low complexity and collapse duplicate reads.\n\
15 If read pairs should be trimmed and kept together (i.e. without discarding\n\
16 one read in a pair), the two file names should be given delimited by a comma\n\
17 or a colon character\n\
18 \n\
19 Options:\n\
20 -n rename all the reads using the <prefix> followed by a read counter;\n\
21 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22 the read duplication count\n\
23 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
24 file(s) named <input>.<outsuffix> which will be created in the \n\
25 current (working) directory; (writes to stdout if -o- is given);\n\
26 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 -f file with adapter sequences to trim, each line having this format:\n\
28 <5'-adapter-sequence> <3'-adapter-sequence>\n\
29 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
30 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31 -3 trim the given adapter sequence at the 3' end of each read\n\
32 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 -a minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37 -l minimum \"clean\" length after trimming that a read must have\n\
38 in order to pass the filter (default: 16)\n\
39 -r write a simple \"trash report\" file listing the discarded reads with a\n\
40 one letter code for the reason why they were trashed\n\
41 -D apply a low-complexity (dust) filter and discard any read that has over\n\
42 50% of its length detected as low complexity\n\
43 -C collapse duplicate reads and add a prefix with their count to the read\n\
44 name (e.g. for microRNAs)\n\
45 -p input is phred64/phred33 (use -p64 or -p33)\n\
46 -Q convert quality values to the other Phred qv type\n\
47 -V verbose processing\n\
48 "
49
50 //-z for -o option, the output stream(s) will be first piped into the given\n
51 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52
53
54 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55
56 //For paired reads sequencing:
57 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 //FILE* f_out=NULL; //stdout if not provided
60 //FILE* f_out2=NULL; //for paired reads
61 //FILE* f_in=NULL; //input fastq (stdin if not provided)
62 //FILE* f_in2=NULL; //for paired reads
63 FILE* freport=NULL;
64
65 bool debug=false;
66 bool verbose=false;
67 bool doCollapse=false;
68 bool doDust=false;
69 bool fastaOutput=false;
70 bool trashReport=false;
71 //bool rawFormat=false;
72 int min_read_len=16;
73 double max_perc_N=7.0;
74 int dust_cutoff=16;
75 bool isfasta=false;
76 bool convert_phred=false;
77 GStr outsuffix; // -o
78 //GStr adapter3;
79 //GStr adapter5;
80 GStr prefix;
81 GStr zcmd;
82 int num_trimmed5=0;
83 int num_trimmed3=0;
84 int num_trimmedN=0;
85 int num_trimmedQ=0;
86 int min_trimmed5=INT_MAX;
87 int min_trimmed3=INT_MAX;
88
89 int qvtrim_qmin=0;
90 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
91 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92 int qv_cvtadd=0; //could be -31 or +31
93
94 int a3len=0;
95 int a5len=0;
96 // adaptor matching metrics -- for extendMatch() function
97 const int a_m_score=2; //match score
98 const int a_mis_score=-3; //mismatch
99 const int a_dropoff_score=7;
100 int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
101 const int a_min_chain_score=15; //for gapped alignments
102
103 class CSegChain;
104
105 class CSegPair {
106 public:
107 GSeg a;
108 GSeg b; //the adapter segment
109 int score;
110 int flags;
111 CSegChain* chain;
112 CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
113 score=mscore;
114 if (score==0) score=a.len()*a_m_score;
115 flags=0;
116 chain=NULL;
117 }
118 int len() { return a.len(); }
119 bool operator==(CSegPair& d){
120 //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
121 //make equal even segments that are included into one another:
122 return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
123 }
124 bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
125 if (b.start==d.b.start) {
126 if (score==d.score) {
127 //just try to be consistent:
128 if (b.end==d.b.end) {
129 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
130 }
131 return (b.end>d.b.end);
132 }
133 else return (score<d.score);
134 }
135 return (b.start>d.b.start);
136 }
137 bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
138 /*if (b.start==d.b.start && b.end==d.b.end) {
139 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
140 }
141 return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
142 if (b.start==d.b.start) {
143 if (score==d.score) {
144 //just try to be consistent:
145 if (b.end==d.b.end) {
146 return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
147 }
148 return (b.end<d.b.end);
149 }
150 else return (score>d.score);
151 }
152 return (b.start<d.b.start);
153 }
154 };
155
156 int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
157 CSegPair& x = *(CSegPair *)sa;
158 CSegPair& y = *(CSegPair *)sb;
159 /*
160 if (x.b.end==y.b.end) {
161 if (x.b.start==y.b.start) {
162 if (x.a.end==y.a.end) {
163 if (x.a.start==y.a.start) return 0;
164 return ((x.a.start>y.a.start) ? -1 : 1);
165 }
166 else {
167 return ((x.a.end>y.a.end) ? -1 : 1);
168 }
169 }
170 else {
171 return ((x.b.start>y.b.start) ? -1 : 1);
172 }
173 }
174 else {
175 return ((x.b.end>y.b.end) ? -1 : 1);
176 }
177 */
178 if (x.b.end==y.b.end) {
179 if (x.score==y.score) {
180 if (x.b.start==y.b.start) {
181 if (x.a.end==y.a.end) {
182 if (x.a.start==y.a.start) return 0;
183 return ((x.a.start<y.a.start) ? -1 : 1);
184 }
185 else {
186 return ((x.a.end<y.a.end) ? -1 : 1);
187 }
188 }
189 else {
190 return ((x.b.start<y.b.start) ? -1 : 1);
191 }
192 } else return ((x.score>y.score) ? -1 : 1);
193 }
194 else {
195 return ((x.b.end>y.b.end) ? -1 : 1);
196 }
197
198 }
199
200 class CSegChain:public GList<CSegPair> {
201 public:
202 uint astart;
203 uint aend;
204 uint bstart;
205 uint bend;
206 int score;
207 bool endSort;
208 CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
209 //as SegPairs are inserted, they will be sorted by a.start coordinate
210 score=0;
211 astart=MAX_UINT;
212 aend=0;
213 bstart=MAX_UINT;
214 bend=0;
215 endSort=aln5;
216 if (aln5) { setSorted(cmpSegEnds); }
217 }
218 bool operator==(CSegChain& d) {
219 //return (score==d.score);
220 return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
221 }
222 bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
223 //return (score<d.score);
224 if (bstart==d.bstart && bend==d.bend) {
225 return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
226 }
227 return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
228 }
229 bool operator<(CSegChain& d) {
230 //return (score>d.score);
231 if (bstart==d.bstart && bend==d.bend) {
232 return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
233 }
234 return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
235 }
236 void addSegPair(CSegPair* segp) {
237 if (AddIfNew(segp)!=segp) return;
238 score+=segp->score;
239 if (astart>segp->a.start) astart=segp->a.start;
240 if (aend<segp->a.end) aend=segp->a.end;
241 if (bstart>segp->b.start) bstart=segp->b.start;
242 if (bend<segp->b.end) bend=segp->b.end;
243 }
244 //for building actual chains:
245 bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
246 int bgap=0;
247 int agap=0;
248 //if (endSort) {
249 if (bstart>segp->b.start) {
250 bgap = (int)(bstart-segp->b.end);
251 if (abs(bgap)>2) return false;
252 agap = (int)(astart-segp->a.end);
253 if (abs(agap)>2) return false;
254 }
255 else {
256 bgap = (int) (segp->b.start-bend);
257 if (abs(bgap)>2) return false;
258 agap = (int)(segp->a.start-aend);
259 if (abs(agap)>2) return false;
260 }
261 if (agap*bgap<0) return false;
262 addSegPair(segp);
263 score-=abs(agap)+abs(bgap);
264 return true;
265 }
266 };
267
268 // element in dhash:
269 class FqDupRec {
270 public:
271 int count; //how many of these reads are the same
272 int len; //length of qv
273 char* firstname; //optional, only if we want to keep the original read names
274 char* qv;
275 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
276 len=0;
277 qv=NULL;
278 firstname=NULL;
279 count=0;
280 if (qstr!=NULL) {
281 qv=Gstrdup(qstr->chars());
282 len=qstr->length();
283 count++;
284 }
285 if (rname!=NULL) firstname=Gstrdup(rname);
286 }
287 ~FqDupRec() {
288 GFREE(qv);
289 GFREE(firstname);
290 }
291 void add(GStr& d) { //collapse another record into this one
292 if (d.length()!=len)
293 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
294 count++;
295 for (int i=0;i<len;i++)
296 qv[i]=(qv[i]+d[i])/2;
297 }
298 };
299
300 void openfw(FILE* &f, GArgs& args, char opt) {
301 GStr s=args.getOpt(opt);
302 if (!s.is_empty()) {
303 if (s=='-') f=stdout;
304 else {
305 f=fopen(s.chars(),"w");
306 if (f==NULL) GError("Error creating file: %s\n", s.chars());
307 }
308 }
309 }
310
311 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
312 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
313
314 GHash<FqDupRec> dhash; //hash to keep track of duplicates
315
316 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
317 GStr& s, GStr& infname, GStr& infname2);
318 // uses outsuffix to generate output file names and open file handles as needed
319
320 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
321 void trash_report(char trashcode, GStr& rname, FILE* freport);
322
323 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
324 GStr& rname, GStr& rinfo, GStr& infname);
325
326 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
327 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
328
329 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
330 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
331 int dust(GStr& seq);
332 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
333 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
334
335 void convertPhred(char* q, int len);
336 void convertPhred(GStr& q);
337
338 int main(int argc, char * const argv[]) {
339 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
340 int e;
341 if ((e=args.isError())>0) {
342 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
343 exit(224);
344 }
345 debug=(args.getOpt('Y')!=NULL);
346 verbose=(args.getOpt('V')!=NULL);
347 convert_phred=(args.getOpt('Q')!=NULL);
348 doCollapse=(args.getOpt('C')!=NULL);
349 doDust=(args.getOpt('D')!=NULL);
350 /*
351 rawFormat=(args.getOpt('R')!=NULL);
352 if (rawFormat) {
353 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
354 }
355 */
356 prefix=args.getOpt('n');
357 GStr s=args.getOpt('l');
358 if (!s.is_empty())
359 min_read_len=s.asInt();
360 s=args.getOpt('m');
361 if (!s.is_empty())
362 max_perc_N=s.asDouble();
363 s=args.getOpt('d');
364 if (!s.is_empty()) {
365 dust_cutoff=s.asInt();
366 doDust=true;
367 }
368 s=args.getOpt('q');
369 if (!s.is_empty()) {
370 qvtrim_qmin=s.asInt();
371 }
372 s=args.getOpt('t');
373 if (!s.is_empty()) {
374 qvtrim_max=s.asInt();
375 }
376 s=args.getOpt('p');
377 if (!s.is_empty()) {
378 int v=s.asInt();
379 if (v==33) {
380 qv_phredtype=33;
381 qv_cvtadd=31;
382 }
383 else if (v==64) {
384 qv_phredtype=64;
385 qv_cvtadd=-31;
386 }
387 else
388 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
389 }
390 if (args.getOpt('3')!=NULL) {
391 adapter3=args.getOpt('3');
392 adapter3.upper();
393 a3len=adapter3.length();
394 }
395 if (args.getOpt('5')!=NULL) {
396 adapter5=args.getOpt('5');
397 adapter5.upper();
398 a5len=adapter5.length();
399 }
400 s=args.getOpt('a');
401 if (!s.is_empty()) {
402 int a_minmatch=s.asInt();
403 a_min_score=a_minmatch<<1;
404 }
405
406 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
407 else outsuffix="-";
408 trashReport= (args.getOpt('r')!=NULL);
409 int fcount=args.startNonOpt();
410 if (fcount==0) {
411 GMessage(USAGE);
412 exit(224);
413 }
414 if (fcount>1 && doCollapse) {
415 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
416 }
417 //openfw(f_out, args, 'o');
418 //if (f_out==NULL) f_out=stdout;
419 if (trashReport)
420 openfw(freport, args, 'r');
421 char* infile=NULL;
422 while ((infile=args.nextNonOpt())!=NULL) {
423 int incounter=0; //counter for input reads
424 int outcounter=0; //counter for output reads
425 int trash_s=0; //too short from the get go
426 int trash_Q=0;
427 int trash_N=0;
428 int trash_D=0;
429 int trash_A3=0;
430 int trash_A5=0;
431 s=infile;
432 GStr infname;
433 GStr infname2;
434 FILE* f_in=NULL;
435 FILE* f_in2=NULL;
436 FILE* f_out=NULL;
437 FILE* f_out2=NULL;
438 bool paired_reads=false;
439 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
440 GLineReader fq(f_in);
441 GLineReader* fq2=NULL;
442 if (f_in2!=NULL) {
443 fq2=new GLineReader(f_in2);
444 paired_reads=true;
445 }
446 GStr rseq, rqv, seqid, seqinfo;
447 GStr rseq2, rqv2, seqid2, seqinfo2;
448 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
449 incounter++;
450 int a5=0, a3=0, b5=0, b3=0;
451 char tcode=0, tcode2=0;
452 tcode=process_read(seqid, rseq, rqv, a5, a3);
453 //if (!doCollapse) {
454 if (fq2!=NULL) {
455 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
456 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
457 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
458 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
459 }
460 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
461 //decide what to do with this pair and print rseq2 if the pair makes it
462 if (tcode>1 && tcode2<=1) {
463 //"untrash" rseq
464 if (a3-a5+1<min_read_len) {
465 a5=1;
466 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
467 }
468 tcode=1;
469 }
470 else if (tcode<=1 && tcode2>1) {
471 //"untrash" rseq2
472 if (b3-b5+1<min_read_len) {
473 b5=1;
474 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
475 }
476 tcode2=1;
477 }
478 if (tcode<=1) { //trimmed or left intact -- write it!
479 if (tcode>0) {
480 rseq2=rseq2.substr(b5,b3-b5+1);
481 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
482 }
483 int nocounter=0;
484 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
485 }
486 } //paired read
487 // }
488 if (tcode>1) { //trashed
489 if (tcode=='s') trash_s++;
490 else if (tcode=='Q') trash_Q++;
491 else if (tcode=='N') trash_N++;
492 else if (tcode=='D') trash_D++;
493 else if (tcode=='3') trash_A3++;
494 else if (tcode=='5') trash_A5++;
495 if (trashReport) trash_report(tcode, seqid, freport);
496 }
497 else if (!doCollapse) { //write it
498 if (tcode>0) {
499 rseq=rseq.substr(a5,a3-a5+1);
500 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
501 }
502 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
503 }
504 } //for each fastq record
505 delete fq2;
506 FRCLOSE(f_in);
507 FRCLOSE(f_in2);
508 if (doCollapse) {
509 outcounter=0;
510 int maxdup_count=1;
511 char* maxdup_seq=NULL;
512 dhash.startIterate();
513 FqDupRec* qd=NULL;
514 char* seq=NULL;
515 while ((qd=dhash.NextData(seq))!=NULL) {
516 GStr rseq(seq);
517 //do the dusting here
518 if (doDust) {
519 int dustbases=dust(rseq);
520 if (dustbases>(rseq.length()>>1)) {
521 if (trashReport && qd->firstname!=NULL) {
522 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
523 }
524 trash_D+=qd->count;
525 continue;
526 }
527 }
528 outcounter++;
529 if (qd->count>maxdup_count) {
530 maxdup_count=qd->count;
531 maxdup_seq=seq;
532 }
533 if (isfasta) {
534 if (prefix.is_empty()) {
535 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
536 rseq.chars());
537 }
538 else { //use custom read name
539 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
540 qd->count, rseq.chars());
541 }
542 }
543 else { //fastq format
544 if (convert_phred) convertPhred(qd->qv, qd->len);
545 if (prefix.is_empty()) {
546 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
547 rseq.chars(), qd->qv);
548 }
549 else { //use custom read name
550 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
551 qd->count, rseq.chars(), qd->qv);
552 }
553 }
554 }//for each element of dhash
555 if (maxdup_count>1) {
556 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
557 }
558 } //collapse entries
559 if (verbose) {
560 if (paired_reads) {
561 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
562 GMessage("Number of input pairs :%9d\n", incounter);
563 GMessage(" Output pairs :%9d\n", outcounter);
564 }
565 else {
566 GMessage(">Input file : %s\n", infname.chars());
567 GMessage("Number of input reads :%9d\n", incounter);
568 GMessage(" Output reads :%9d\n", outcounter);
569 }
570 GMessage("------------------------------------\n");
571 if (num_trimmed5)
572 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
573 if (num_trimmed3)
574 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
575 GMessage("------------------------------------\n");
576 if (trash_s>0)
577 GMessage(" Trashed by length:%9d\n", trash_s);
578 if (trash_N>0)
579 GMessage(" Trashed by N%%:%9d\n", trash_N);
580 if (trash_Q>0)
581 GMessage("Trashed by low quality:%9d\n", trash_Q);
582 if (trash_A5>0)
583 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
584 if (trash_A3>0)
585 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
586 }
587 if (trashReport) {
588 FWCLOSE(freport);
589 }
590 FWCLOSE(f_out);
591 FWCLOSE(f_out2);
592 } //while each input file
593
594 //getc(stdin);
595 }
596
597 class NData {
598 public:
599 int NPos[1024]; //there should be no reads longer than 1K ?
600 int NCount;
601 int end5;
602 int end3;
603 int seqlen;
604 double perc_N; //percentage of Ns in end5..end3 range only!
605 const char* seq;
606 bool valid;
607 NData() {
608 NCount=0;
609 end5=0;
610 end3=0;
611 seq=NULL;
612 perc_N=0;
613 valid=true;
614 }
615 void init(GStr& rseq) {
616 NCount=0;
617 perc_N=0;
618 seqlen=rseq.length();
619 seq=rseq.chars();
620 end5=1;
621 end3=seqlen;
622 for (int i=0;i<seqlen;i++)
623 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
624 NPos[NCount]=i;
625 NCount++;
626 }
627 perc_N=(NCount*100.0)/seqlen;
628 valid=true;
629 }
630 void N_calc() { //only in the region after trimming
631 int n=0;
632 for (int i=end3-1;i<end5;i++) {
633 if (seq[i]=='N') n++;
634 }
635 perc_N=(n*100.0)/(end5-end3+1);
636 }
637 };
638
639 static NData feat;
640 int perc_lenN=12; // incremental distance from ends, in percentage of
641 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
642
643 void N_analyze(int l5, int l3, int p5, int p3) {
644 /* assumes feat was filled properly */
645 int old_dif, t5,t3,v;
646 if (l3<l5+2 || p5>p3 ) {
647 feat.end5=l5+1;
648 feat.end3=l3+1;
649 return;
650 }
651
652 t5=feat.NPos[p5]-l5;
653 t3=l3-feat.NPos[p3];
654 old_dif=p3-p5;
655 v=(int)((((double)(l3-l5))*perc_lenN)/100);
656 if (v>20) v=20; /* enforce N-search limit for very long reads */
657 if (t5 < v ) {
658 l5=feat.NPos[p5]+1;
659 p5++;
660 }
661 if (t3 < v) {
662 l3=feat.NPos[p3]-1;
663 p3--;
664 }
665 /* restNs=p3-p5; number of Ns in the new CLR */
666 if (p3-p5==old_dif) { /* no change, return */
667 /* don't trim if this may shorten the read too much */
668 //if (l5-l3<min_read_len) return;
669 feat.end5=l5+1;
670 feat.end3=l3+1;
671 return;
672 }
673 else
674 N_analyze(l5,l3, p5,p3);
675 }
676
677
678 bool qtrim(GStr& qvs, int &l5, int &l3) {
679 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
680 if (qv_phredtype==0) {
681 //try to guess the Phred type
682 int vmin=256, vmax=0;
683 for (int i=0;i<qvs.length();i++) {
684 if (vmin>qvs[i]) vmin=qvs[i];
685 if (vmax<qvs[i]) vmax=qvs[i];
686 }
687 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
688 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
689 if (qv_phredtype==0) {
690 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
691 }
692 if (verbose)
693 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
694 } //guessing Phred type
695 for (l3=qvs.length()-1;l3>2;l3--) {
696 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
697 }
698 //just in case, check also the 5' the end (?)
699 for (l5=0;l5<qvs.length()-3;l5++) {
700 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
701 }
702 if (qvtrim_max>0) {
703 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
704 if (l5>qvtrim_max) l5=qvtrim_max;
705 }
706 return (l5>0 || l3<qvs.length()-1);
707 }
708
709 bool ntrim(GStr& rseq, int &l5, int &l3) {
710 //count Ns in the sequence, trim N-rich ends
711 feat.init(rseq);
712 l5=feat.end5-1;
713 l3=feat.end3-1;
714 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
715 if (l5==feat.end5-1 && l3==feat.end3-1) {
716 if (feat.perc_N>max_perc_N) {
717 feat.valid=false;
718 l3=l5+1;
719 return true;
720 }
721 else {
722 return false; //nothing changed
723 }
724 }
725 l5=feat.end5-1;
726 l3=feat.end3-1;
727 if (l3-l5+1<min_read_len) {
728 feat.valid=false;
729 return true;
730 }
731 feat.N_calc();
732
733 if (feat.perc_N>max_perc_N) {
734 feat.valid=false;
735 l3=l5+1;
736 return true;
737 }
738 return true;
739 }
740
741 //--------------- dust functions ----------------
742 class DNADuster {
743 public:
744 int dustword;
745 int dustwindow;
746 int dustwindow2;
747 int dustcutoff;
748 int mv, iv, jv;
749 int counts[32*32*32];
750 int iis[32*32*32];
751 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
752 dustword=wordsize;
753 dustwindow=winsize;
754 dustwindow2 = (winsize>>1);
755 dustcutoff=cutoff;
756 mv=0;
757 iv=0;
758 jv=0;
759 }
760 void setWindowSize(int value) {
761 dustwindow = value;
762 dustwindow2 = (dustwindow >> 1);
763 }
764 void setWordSize(int value) {
765 dustword=value;
766 }
767 void wo1(int len, const char* s, int ivv) {
768 int i, ii, j, v, t, n, n1, sum;
769 int js, nis;
770 n = 32 * 32 * 32;
771 n1 = n - 1;
772 nis = 0;
773 i = 0;
774 ii = 0;
775 sum = 0;
776 v = 0;
777 for (j=0; j < len; j++, s++) {
778 ii <<= 5;
779 if (*s<=32) {
780 i=0;
781 continue;
782 }
783 ii |= *s - 'A'; //assume uppercase!
784 ii &= n1;
785 i++;
786 if (i >= dustword) {
787 for (js=0; js < nis && iis[js] != ii; js++) ;
788 if (js == nis) {
789 iis[nis] = ii;
790 counts[ii] = 0;
791 nis++;
792 }
793 if ((t = counts[ii]) > 0) {
794 sum += t;
795 v = 10 * sum / j;
796 if (mv < v) {
797 mv = v;
798 iv = ivv;
799 jv = j;
800 }
801 }
802 counts[ii]++;
803 }
804 }
805 }
806
807 int wo(int len, const char* s, int* beg, int* end) {
808 int i, l1;
809 l1 = len - dustword + 1;
810 if (l1 < 0) {
811 *beg = 0;
812 *end = len - 1;
813 return 0;
814 }
815 mv = 0;
816 iv = 0;
817 jv = 0;
818 for (i=0; i < l1; i++) {
819 wo1(len-i, s+i, i);
820 }
821 *beg = iv;
822 *end = iv + jv;
823 return mv;
824 }
825
826 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
827 int i, j, l, a, b, v;
828 if (cutoff==0) cutoff=dustcutoff;
829 a=0;b=0;
830 //GMessage("Dust cutoff=%d\n", cutoff);
831 for (i=0; i < seqlen; i += dustwindow2) {
832 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
833 v = wo(l, seq+i, &a, &b);
834 if (v > cutoff) {
835 //for (j = a; j <= b && j < dustwindow2; j++) {
836 for (j = a; j <= b; j++) {
837 seqmsk[i+j]='N';//could be made lowercase instead
838 }
839 }
840 }
841 //return first;
842 }
843 };
844
845 static DNADuster duster;
846
847 int dust(GStr& rseq) {
848 char* seq=Gstrdup(rseq.chars());
849 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
850 //check the number of Ns:
851 int ncount=0;
852 for (int i=0;i<rseq.length();i++) {
853 if (seq[i]=='N') ncount++;
854 }
855 GFREE(seq);
856 return ncount;
857 }
858
859
860 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
861 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
862 //check minimum score and
863 //for 3' adapter trimming:
864 // require that the right end of the alignment for either the adaptor OR the read must be
865 // < 3 distance from its right end
866 // for 5' adapter trimming:
867 // require that the left end of the alignment for either the adaptor OR the read must
868 // be at coordinate < 3 from start
869
870 bool extendMatch(const char* a, int alen, int ai,
871 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
872 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
873 #ifdef DEBUG
874 GStr dbg(b);
875 #endif
876 //if (debug) {
877 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
878 // }
879 int a_l=ai; //alignment coordinates on a
880 int a_r=ai+mlen-1;
881 int b_l=bi; //alignment coordinates on b
882 int b_r=bi+mlen-1;
883 int ai_maxscore=ai;
884 int bi_maxscore=bi;
885 int score=mlen*a_m_score;
886 int maxscore=score;
887 int mism5score=a_mis_score;
888 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
889 //try to extend to the left first, if possible
890 while (ai>0 && bi>0) {
891 ai--;
892 bi--;
893 score+= (a[ai]==b[bi])? a_m_score : mism5score;
894 if (score>maxscore) {
895 ai_maxscore=ai;
896 bi_maxscore=bi;
897 maxscore=score;
898 }
899 else if (maxscore-score>a_dropoff_score) break;
900 }
901 a_l=ai_maxscore;
902 b_l=bi_maxscore;
903 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
904 //now extend to the right
905 ai_maxscore=a_r;
906 bi_maxscore=b_r;
907 ai=a_r;
908 bi=b_r;
909 score=maxscore;
910 //sometimes there are extra AAAAs at the end of the read, ignore those
911 if (strcmp(&a[alen-4],"AAAA")==0) {
912 alen-=3;
913 while (a[alen-1]=='A' && alen>ai) alen--;
914 }
915 while (ai<alen-1 && bi<blen-1) {
916 ai++;
917 bi++;
918 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
919 if (a[ai]==b[bi]) { //match
920 score+=a_m_score;
921 if (ai>=alen-2) {
922 score+=a_m_score-(alen-ai-1);
923 }
924 }
925 else { //mismatch
926 score+=a_mis_score;
927 }
928 if (score>maxscore) {
929 ai_maxscore=ai;
930 bi_maxscore=bi;
931 maxscore=score;
932 }
933 else if (maxscore-score>a_dropoff_score) break;
934 }
935 a_r=ai_maxscore;
936 b_r=bi_maxscore;
937 int a_ovh3=alen-a_r-1;
938 int b_ovh3=blen-b_r-1;
939 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
940 int mmovh5=(a_l<b_l)? a_l : b_l;
941 //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
942 #ifdef DEBUG
943 if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
944 #endif
945 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
946 if (a_l<a_ovh3) {
947 //adapter closer to the left end (typical for 5' adapter)
948 l5=a_r+1;
949 l3=alen-1;
950 }
951 else {
952 //adapter matching at the right end (typical for 3' adapter)
953 l5=0;
954 l3=a_l-1;
955 }
956 return true;
957 }
958 else { //keep this segment pair for later (gapped alignment)
959 segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
960 //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
961 }
962 //do not trim:
963 l5=0;
964 l3=alen-1;
965 return false;
966 }
967
968 /*
969 int getWordValue(const char* s, int wlen) {
970 int r=0;
971 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
972 return r;
973 }
974 */
975 int get3mer_value(const char* s) {
976 return (s[0]<<16)+(s[1]<<8)+s[2];
977 }
978
979 int w3_match(int qv, const char* str, int slen, int start_index=0) {
980 if (start_index>=slen || start_index<0) return -1;
981 for (int i=start_index;i<slen-3;i++) {
982 int rv=get3mer_value(str+i);
983 if (rv==qv) return i;
984 }
985 return -1;
986 }
987
988 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
989 if (end_index>=slen) return -1;
990 if (end_index<0) end_index=slen-1;
991 for (int i=end_index-2;i>=0;i--) {
992 int rv=get3mer_value(str+i);
993 if (rv==qv) return i;
994 }
995 return -1;
996 }
997
998 int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
999 if (start_index>=slen || start_index<0) return -1;
1000 for (int i=start_index;i<slen-4;i++) {
1001 int32* rv=(int32*)(str+i);
1002 if (*rv==qv) return i;
1003 }
1004 return -1;
1005 }
1006
1007 int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1008 if (end_index>=slen) return -1;
1009 if (end_index<0) end_index=slen-1;
1010 for (int i=end_index-3;i>=0;i--) {
1011 int32* rv=(int32*)(str+i);
1012 if (*rv==qv) return i;
1013 }
1014 return -1;
1015 }
1016
1017 #ifdef DEBUG
1018 void dbgPrintChain(CSegChain& chain, const char* aseq) {
1019 GStr s(aseq);
1020 for (int i=0;i<chain.Count();i++) {
1021 CSegPair& seg=*chain[i];
1022 GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1023 s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1024 }
1025 }
1026 #endif
1027
1028 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1029 int rlen=seq.length();
1030 l5=0;
1031 l3=rlen-1;
1032 //first try a full match, we might get lucky
1033 int fi=-1;
1034 if ((fi=seq.index(adapter3))>=0) {
1035 if (fi<rlen-fi-a3len) {//match is closer to the right end
1036 l5=fi+a3len;
1037 l3=rlen-1;
1038 }
1039 else {
1040 l5=0;
1041 l3=fi-1;
1042 }
1043 return true;
1044 }
1045 #ifdef DEBUG
1046 if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1047 #endif
1048
1049 //also, for fast detection of other adapter-only reads that start past
1050 // the beginning of the adapter sequence, try to see if the first a3len-4
1051 // bases of the read are a substring of the adapter
1052 if (rlen>a3len-3) {
1053 GStr rstart=seq.substr(1,a3len-4);
1054 if ((fi=adapter3.index(rstart))>=0) {
1055 l3=rlen-1;
1056 l5=a3len-4;
1057 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1058 return true;
1059 }
1060 }
1061 CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1062 //check the easy cases - 11 bases exact match at the end
1063 int fdlen=11;
1064 if (a3len<16) {
1065 fdlen=a3len>>1;
1066 }
1067 if (fdlen>4) {
1068 //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1069 GStr rstart=seq.substr(-fdlen-3,fdlen);
1070 if ((fi=adapter3.index(rstart))>=0) {
1071 #ifdef DEBUG
1072 if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1073 #endif
1074 if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1075 adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1076 return true;
1077 }
1078 //another easy case: first 11 characters of the adaptor found as a substring of the read
1079 GStr bstr=adapter3.substr(0, fdlen);
1080 if ((fi=seq.rindex(bstr))>=0) {
1081 #ifdef DEBUG
1082 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1083 #endif
1084 if (extendMatch(seq.chars(), rlen, fi,
1085 adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1086 return true;
1087 }
1088 } //tried to match 11 bases first
1089
1090 //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1091 //-- only extend if the match is in the 3' (ending) region of the read
1092 int wordSize=3;
1093 int hlen=12;
1094 if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1095 int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1096 if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1097 imin=rlen-imin;
1098 for (int iw=0;iw<hlen;iw++) {
1099 //int32* qv=(int32*)(adapter3.chars()+iw);
1100 int qv=get3mer_value(adapter3.chars()+iw);
1101 fi=-1;
1102 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1103 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1104 //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1105
1106 #ifdef DEBUG
1107 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1108 #endif
1109 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1110 a3len, iw, wordSize, l5,l3, a3segs)) return true;
1111 fi--;
1112 if (fi<imin) break;
1113 }
1114 } //for each wmer in the first hlen bases of the adaptor
1115 /*
1116 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1117 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1118 if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1119 int hlen2=a3len-wordSize;
1120 //if (hlen2>a3len-4) hlen2=a3len-4;
1121 if (hlen2>hlen) {
1122 #ifdef DEBUG
1123 if (debug && a3segs.Count()>0) {
1124 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1125 }
1126 #endif
1127 for (int iw=hlen;iw<hlen2;iw++) {
1128 //int* qv=(int32 *)(adapter3.chars()+iw);
1129 int qv=get3mer_value(adapter3.chars()+iw);
1130 fi=-1;
1131 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1132 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1133 extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1134 a3len, iw, wordSize, l5,l3, a3segs);
1135 fi--;
1136 if (fi<imin) break;
1137 }
1138 } //for each wmer between hlen2 and hlen bases of the adaptor
1139 }
1140 //lastly, analyze collected a3segs for a possible gapped alignment:
1141 GList<CSegChain> segchains(false,true,false);
1142 #ifdef DEBUG
1143 if (debug && a3segs.Count()>0) {
1144 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1145 }
1146 #endif
1147 for (int i=0;i<a3segs.Count();i++) {
1148 if (a3segs[i]->chain==NULL) {
1149 if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1150 CSegChain* newchain=new CSegChain();
1151 newchain->setFreeItem(false);
1152 newchain->addSegPair(a3segs[i]);
1153 a3segs[i]->chain=newchain;
1154 segchains.Add(newchain); //just to free them when done
1155 }
1156 for (int j=i+1;j<a3segs.Count();j++) {
1157 CSegChain* chain=a3segs[i]->chain;
1158 if (chain->extendChain(a3segs[j])) {
1159 a3segs[j]->chain=chain;
1160 #ifdef DEBUG
1161 if (debug) dbgPrintChain(*chain, adapter3.chars());
1162 #endif
1163 //save time by checking here if the extended chain is already acceptable for trimming
1164 if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1165 l5=0;
1166 l3=chain->astart-2;
1167 #ifdef DEBUG
1168 if (debug && a3segs.Count()>0) {
1169 GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1170 }
1171 #endif
1172 return true;
1173 }
1174 } //chain can be extended
1175 }
1176 } //collect segment alignments into chains
1177 */
1178 return false; //no adapter parts found
1179 }
1180
1181 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1182 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1183 int rlen=seq.length();
1184 l5=0;
1185 l3=rlen-1;
1186 //try to see if adapter is fully included in the read
1187 int fi=-1;
1188 if ((fi=seq.index(adapter5))>=0) {
1189 if (fi<rlen-fi-a5len) {//match is closer to the right end
1190 l5=fi+a5len;
1191 l3=rlen-1;
1192 }
1193 else {
1194 l5=0;
1195 l3=fi-1;
1196 }
1197 return true;
1198 }
1199 #ifdef DEBUG
1200 if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1201 #endif
1202
1203 CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1204
1205 //try the easy way out first - look for an exact match of 11 bases
1206 int fdlen=11;
1207 if (a5len<16) {
1208 fdlen=a5len>>1;
1209 }
1210 if (fdlen>4) {
1211 GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1212 if ((fi=adapter5.index(rstart))>=0) {
1213 #ifdef DEBUG
1214 if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1215 #endif
1216 if (extendMatch(seq.chars(), rlen, 1,
1217 adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1218 return true;
1219 }
1220 //another easy case: last 11 characters of the adaptor found as a substring of the read
1221 GStr bstr=adapter5.substr(-fdlen);
1222 if ((fi=seq.index(bstr))>=0) {
1223 #ifdef DEBUG
1224 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1225 #endif
1226 if (extendMatch(seq.chars(), rlen, fi,
1227 adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1228 return true;
1229 }
1230 } //tried to matching at most 11 bases first
1231
1232 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1233 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1234 int wordSize=3;
1235 int hlen=12;
1236 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1237 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1238 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1239 for (int iw=0;iw<=hlen;iw++) {
1240 int apstart=a5len-iw-wordSize;
1241 fi=0;
1242 //int* qv=(int32 *)(adapter5.chars()+apstart);
1243 int qv=get3mer_value(adapter5.chars()+apstart);
1244 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1245 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1246 #ifdef DEBUG
1247 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1248 #endif
1249 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1250 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1251 fi++;
1252 if (fi>imax) break;
1253 }
1254 } //for each wmer in the last hlen bases of the adaptor
1255 /*
1256
1257 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1258 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1259 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1260 int hlen2=a5len-wordSize;
1261 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1262 #ifdef DEBUG
1263 if (debug && a5segs.Count()>0) {
1264 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1265 }
1266 #endif
1267 if (hlen2>hlen) {
1268 for (int iw=hlen+1;iw<=hlen2;iw++) {
1269 int apstart=a5len-iw-wordSize;
1270 fi=0;
1271 //int* qv=(int32 *)(adapter5.chars()+apstart);
1272 int qv=get3mer_value(adapter5.chars()+apstart);
1273 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1274 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1275 extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1276 a5len, apstart, wordSize, l5,l3, a5segs, true);
1277 fi++;
1278 if (fi>imax) break;
1279 }
1280 } //for each wmer between hlen2 and hlen bases of the adaptor
1281 }
1282 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1283 // lastly, analyze collected a5segs for a possible gapped alignment:
1284 GList<CSegChain> segchains(false,true,false);
1285 #ifdef DEBUG
1286 if (debug && a5segs.Count()>0) {
1287 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1288 }
1289 #endif
1290 for (int i=0;i<a5segs.Count();i++) {
1291 if (a5segs[i]->chain==NULL) {
1292 if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1293 CSegChain* newchain=new CSegChain(true);
1294 newchain->setFreeItem(false);
1295 newchain->addSegPair(a5segs[i]);
1296 a5segs[i]->chain=newchain;
1297 segchains.Add(newchain); //just to free them when done
1298 }
1299 for (int j=i+1;j<a5segs.Count();j++) {
1300 CSegChain* chain=a5segs[i]->chain;
1301 if (chain->extendChain(a5segs[j])) {
1302 a5segs[j]->chain=chain;
1303 #ifdef DEBUG
1304 if (debug) dbgPrintChain(*chain, adapter5.chars());
1305 #endif
1306 //save time by checking here if the extended chain is already acceptable for trimming
1307 if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1308 l5=chain->aend;
1309 l3=rlen-1;
1310 return true;
1311 }
1312 } //chain can be extended
1313 }
1314 } //collect segment alignments into chains
1315 */
1316 return false; //no adapter parts found
1317 }
1318
1319 //convert qvs to/from phred64 from/to phread33
1320 void convertPhred(GStr& q) {
1321 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1322 }
1323
1324 void convertPhred(char* q, int len) {
1325 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1326 }
1327
1328 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1329 GStr& rname, GStr& rinfo, GStr& infname) {
1330 rseq="";
1331 rqv="";
1332 rname="";
1333 rinfo="";
1334 if (fq.eof()) return false;
1335 char* l=fq.getLine();
1336 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1337 if (l==NULL) return false;
1338 /* if (rawFormat) {
1339 //TODO: implement raw qseq parsing here
1340 //if (raw type=N) then continue; //skip invalid/bad records
1341 } //raw qseq format
1342 else { // FASTQ or FASTA */
1343 isfasta=(l[0]=='>');
1344 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1345 infname.chars(), l);
1346 GStr s(l);
1347 rname=&(l[1]);
1348 for (int i=0;i<rname.length();i++)
1349 if (rname[i]<=' ') {
1350 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1351 rname.cut(i);
1352 break;
1353 }
1354 //now get the sequence
1355 if ((l=fq.getLine())==NULL)
1356 GError("Error: unexpected EOF after header for read %s (%s)\n",
1357 rname.chars(), infname.chars());
1358 rseq=l; //this must be the DNA line
1359 while ((l=fq.getLine())!=NULL) {
1360 //seq can span multiple lines
1361 if (l[0]=='>' || l[0]=='+') {
1362 fq.pushBack();
1363 break; //
1364 }
1365 rseq+=l;
1366 } //check for multi-line seq
1367 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1368 if ((l=fq.getLine())==NULL)
1369 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1370 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1371 if ((l=fq.getLine())==NULL)
1372 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1373 rqv=l;
1374 //if (rqv.length()!=rseq.length())
1375 // GError("Error: qv len != seq len for %s\n", rname.chars());
1376 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1377 rqv+=l; //append to qv string
1378 }
1379 }// fastq
1380 // } //<-- FASTA or FASTQ
1381 rseq.upper(); //TODO: what if we care about masking?
1382 return true;
1383 }
1384
1385 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1386 //returns 0 if the read was untouched, 1 if it was just trimmed
1387 // and a trash code if it was trashed
1388 l5=0;
1389 l3=rseq.length()-1;
1390 if (l3-l5+1<min_read_len) {
1391 return 's';
1392 }
1393 GStr wseq(rseq.chars());
1394 GStr wqv(rqv.chars());
1395 int w5=l5;
1396 int w3=l3;
1397 //first do the q-based trimming
1398 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1399 if (w3-w5+1<min_read_len) {
1400 return 'Q'; //invalid read
1401 }
1402 //-- keep only the w5..w3 range
1403 l5=w5;
1404 l3=w3;
1405 wseq=wseq.substr(w5, w3-w5+1);
1406 if (!wqv.is_empty())
1407 wqv=wqv.substr(w5, w3-w5+1);
1408 } //qv trimming
1409 // N-trimming on the remaining read seq
1410 if (ntrim(wseq, w5, w3)) {
1411 //GMessage("before: %s\n",wseq.chars());
1412 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1413 l5+=w5;
1414 l3-=(wseq.length()-1-w3);
1415 if (w3-w5+1<min_read_len) {
1416 return 'N'; //to be trashed
1417 }
1418 //-- keep only the w5..w3 range
1419 wseq=wseq.substr(w5, w3-w5+1);
1420 if (!wqv.is_empty())
1421 wqv=wqv.substr(w5, w3-w5+1);
1422 w5=0;
1423 w3=wseq.length()-1;
1424 }
1425 if (a3len>0) {
1426 if (trim_adapter3(wseq, w5, w3)) {
1427 int trimlen=wseq.length()-(w3-w5+1);
1428 num_trimmed3++;
1429 if (trimlen<min_trimmed3)
1430 min_trimmed3=trimlen;
1431 l5+=w5;
1432 l3-=(wseq.length()-1-w3);
1433 if (w3-w5+1<min_read_len) {
1434 return '3';
1435 }
1436 //-- keep only the w5..w3 range
1437 wseq=wseq.substr(w5, w3-w5+1);
1438 if (!wqv.is_empty())
1439 wqv=wqv.substr(w5, w3-w5+1);
1440 }//some adapter was trimmed
1441 } //adapter trimming
1442 if (a5len>0) {
1443 if (trim_adapter5(wseq, w5, w3)) {
1444 int trimlen=wseq.length()-(w3-w5+1);
1445 num_trimmed5++;
1446 if (trimlen<min_trimmed5)
1447 min_trimmed5=trimlen;
1448 l5+=w5;
1449 l3-=(wseq.length()-1-w3);
1450 if (w3-w5+1<min_read_len) {
1451 return '5';
1452 }
1453 //-- keep only the w5..w3 range
1454 wseq=wseq.substr(w5, w3-w5+1);
1455 if (!wqv.is_empty())
1456 wqv=wqv.substr(w5, w3-w5+1);
1457 }//some adapter was trimmed
1458 } //adapter trimming
1459 if (doCollapse) {
1460 //keep read for later
1461 FqDupRec* dr=dhash.Find(wseq.chars());
1462 if (dr==NULL) { //new entry
1463 //if (prefix.is_empty())
1464 dhash.Add(wseq.chars(),
1465 new FqDupRec(&wqv, rname.chars()));
1466 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1467 }
1468 else
1469 dr->add(wqv);
1470 } //collapsing duplicates
1471 else { //not collapsing duplicates
1472 //apply the dust filter now
1473 if (doDust) {
1474 int dustbases=dust(wseq);
1475 if (dustbases>(wseq.length()>>1)) {
1476 return 'D';
1477 }
1478 }
1479 } //not collapsing duplicates
1480 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1481 }
1482
1483 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1484 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1485 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1486 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1487 }
1488
1489 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1490 outcounter++;
1491 bool asFasta=(rqv.is_empty() || fastaOutput);
1492 if (asFasta) {
1493 if (prefix.is_empty()) {
1494 printHeader(f_out, '>',rname,rinfo);
1495 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1496 }
1497 else {
1498 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1499 rseq.chars());
1500 }
1501 }
1502 else { //fastq
1503 if (convert_phred) convertPhred(rqv);
1504 if (prefix.is_empty()) {
1505 printHeader(f_out, '@', rname, rinfo);
1506 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1507 }
1508 else
1509 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1510 rseq.chars(),rqv.chars() );
1511 }
1512 }
1513
1514 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1515 if (freport==NULL || trashcode<=' ') return;
1516 if (trashcode=='3' || trashcode=='5') {
1517 fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1518 }
1519 else {
1520 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1521 }
1522 //tcounter++;
1523 }
1524
1525 GStr getFext(GStr& s, int* xpos=NULL) {
1526 //s must be a filename without a path
1527 GStr r("");
1528 if (xpos!=NULL) *xpos=0;
1529 if (s.is_empty() || s=="-") return r;
1530 int p=s.rindex('.');
1531 int d=s.rindex('/');
1532 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1533 r=s.substr(p+1);
1534 if (xpos!=NULL) *xpos=p+1;
1535 r.lower();
1536 return r;
1537 }
1538
1539 void baseFileName(GStr& fname) {
1540 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1541 int xpos=0;
1542 GStr fext=getFext(fname, &xpos);
1543 if (xpos<=1) return;
1544 bool extdel=false;
1545 GStr f2;
1546 if (fext=="z" || fext=="zip") {
1547 extdel=true;
1548 }
1549 else if (fext.length()>=2) {
1550 f2=fext.substr(0,2);
1551 extdel=(f2=="gz" || f2=="bz");
1552 }
1553 if (extdel) {
1554 fname.cut(xpos-1);
1555 fext=getFext(fname, &xpos);
1556 if (xpos<=1) return;
1557 }
1558 extdel=false;
1559 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1560 extdel=true;
1561 }
1562 else if (fext.length()>=2) {
1563 extdel=(fext.substr(0,2)=="fa");
1564 }
1565 if (extdel) fname.cut(xpos-1);
1566 GStr fncp(fname);
1567 fncp.lower();
1568 fncp.chomp("seq");
1569 fncp.chomp("sequence");
1570 fncp.trimR("_.");
1571 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1572 }
1573
1574 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1575 FILE* f_out=NULL;
1576 GStr fname(getFileName(infname.chars()));
1577 //eliminate known extensions
1578 baseFileName(fname);
1579 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1580 else if (pocmd.is_empty()) {
1581 GStr oname(fname);
1582 oname.append('.');
1583 oname.append(outsuffix);
1584 f_out=fopen(oname.chars(),"w");
1585 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1586 }
1587 else {
1588 GStr oname(">");
1589 oname.append(fname);
1590 oname.append('.');
1591 oname.append(outsuffix);
1592 pocmd.append(oname);
1593 f_out=popen(pocmd.chars(), "w");
1594 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1595 }
1596 return f_out;
1597 }
1598
1599 void guess_unzip(GStr& fname, GStr& picmd) {
1600 GStr fext=getFext(fname);
1601 if (fext=="gz" || fext=="gzip" || fext=="z") {
1602 picmd="gzip -cd ";
1603 }
1604 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1605 picmd="bzip2 -cd ";
1606 }
1607 }
1608
1609 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1610 GStr& s, GStr& infname, GStr& infname2) {
1611 // uses outsuffix to generate output file names and open file handles as needed
1612 infname="";
1613 infname2="";
1614 f_in=NULL;
1615 f_in2=NULL;
1616 f_out=NULL;
1617 f_out2=NULL;
1618 //analyze outsuffix intent
1619 GStr pocmd;
1620 if (outsuffix=="-") {
1621 f_out=stdout;
1622 }
1623 else {
1624 GStr ox=getFext(outsuffix);
1625 if (ox.length()>2) ox=ox.substr(0,2);
1626 if (ox=="gz") pocmd="gzip -9 -c ";
1627 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1628 }
1629 if (s=="-") {
1630 f_in=stdin;
1631 infname="stdin";
1632 f_out=prepOutFile(infname, pocmd);
1633 return;
1634 } // streaming from stdin
1635 s.startTokenize(",:");
1636 s.nextToken(infname);
1637 bool paired=s.nextToken(infname2);
1638 if (fileExists(infname.chars())==0)
1639 GError("Error: cannot find file %s!\n",infname.chars());
1640 GStr fname(getFileName(infname.chars()));
1641 GStr picmd;
1642 guess_unzip(fname, picmd);
1643 if (picmd.is_empty()) {
1644 f_in=fopen(infname.chars(), "r");
1645 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1646 }
1647 else {
1648 picmd.append(infname);
1649 f_in=popen(picmd.chars(), "r");
1650 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1651 }
1652 if (f_out==stdout) {
1653 if (paired) GError("Error: output suffix required for paired reads\n");
1654 return;
1655 }
1656 f_out=prepOutFile(infname, pocmd);
1657 if (!paired) return;
1658 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1659 // ---- paired reads:-------------
1660 if (fileExists(infname2.chars())==0)
1661 GError("Error: cannot find file %s!\n",infname2.chars());
1662 picmd="";
1663 GStr fname2(getFileName(infname2.chars()));
1664 guess_unzip(fname2, picmd);
1665 if (picmd.is_empty()) {
1666 f_in2=fopen(infname2.chars(), "r");
1667 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1668 }
1669 else {
1670 picmd.append(infname2);
1671 f_in2=popen(picmd.chars(), "r");
1672 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1673 }
1674 f_out2=prepOutFile(infname2, pocmd);
1675 }