ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 12
Committed: Wed Jan 26 19:05:05 2011 UTC (8 years, 9 months ago) by gpertea
File size: 41915 byte(s)
Log Message:
added qv-threshold trimming

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5
6 #define USAGE "Usage:\n\
7 fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
8 [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
9 [-Q] <input.fq>\n\
10 \n\
11 Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
12 for low complexity and collapse duplicate reads\n\
13 \n\
14 Options:\n\
15 -n rename all the reads using the <prefix> followed by a read counter;\n\
16 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
17 the read duplication count\n\
18 -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
19 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
20 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
21 -3 trim the given adapter sequence at the 3' end of each read\n\
22 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
23 -q trim bases with quality value lower than <minq> (starting the 3' end)\n\
24 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
25 -l minimum \"clean\" length after trimming that a read must have\n\
26 in order to pass the filter (default: 16)\n\
27 -r write a simple \"trash report\" file listing the discarded reads with a\n\
28 one letter code for the reason why they were trashed\n\
29 -D apply a low-complexity (dust) filter and discard any read that has over\n\
30 50% of its length detected as low complexity\n\
31 -C collapse duplicate reads and add a prefix with their count to the read\n\
32 name (e.g. for microRNAs)\n\
33 -p input is phred64/phred33 (use -p64 or -p33)\n\
34 -Q convert quality values to the other Phred qv type\n\
35 -V verbose processing\n\
36 "
37 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
38
39 //For pair ends sequencing:
40 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
41 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
42 FILE* f_out=NULL; //stdout if not provided
43 FILE* f_in=NULL; //input fastq (stdin if not provided)
44 FILE* freport=NULL;
45 bool debug=false;
46 bool verbose=false;
47 bool doCollapse=false;
48 bool doDust=false;
49 bool trashReport=false;
50 //bool rawFormat=false;
51 int min_read_len=16;
52 double max_perc_N=7.0;
53 int dust_cutoff=16;
54 bool isfasta=false;
55 bool convert_phred=false;
56 GStr prefix;
57 GStr adapter3;
58 GStr adapter5;
59
60 int qvtrim_min=0;
61
62 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
63 int qv_cvtadd=0; //could be -31 or +31
64
65 int a3len=0;
66 int a5len=0;
67 // adaptor matching metrics -- for extendMatch() function
68 const int a_m_score=2; //match score
69 const int a_mis_score=-3; //mismatch
70 const int a_dropoff_score=7;
71 const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
72 const int a_min_chain_score=15; //for gapped alignments
73
74 class CSegChain;
75
76 class CSegPair {
77 public:
78 GSeg a;
79 GSeg b; //the adapter segment
80 int score;
81 int flags;
82 CSegChain* chain;
83 CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
84 score=mscore;
85 if (score==0) score=a.len()*a_m_score;
86 flags=0;
87 chain=NULL;
88 }
89 int len() { return a.len(); }
90 bool operator==(CSegPair& d){
91 //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
92 //make equal even segments that are included into one another:
93 return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
94 }
95 bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
96 if (b.start==d.b.start) {
97 if (score==d.score) {
98 //just try to be consistent:
99 if (b.end==d.b.end) {
100 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
101 }
102 return (b.end>d.b.end);
103 }
104 else return (score<d.score);
105 }
106 return (b.start>d.b.start);
107 }
108 bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
109 /*if (b.start==d.b.start && b.end==d.b.end) {
110 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
111 }
112 return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
113 if (b.start==d.b.start) {
114 if (score==d.score) {
115 //just try to be consistent:
116 if (b.end==d.b.end) {
117 return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
118 }
119 return (b.end<d.b.end);
120 }
121 else return (score>d.score);
122 }
123 return (b.start<d.b.start);
124 }
125 };
126
127 int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
128 CSegPair& x = *(CSegPair *)sa;
129 CSegPair& y = *(CSegPair *)sb;
130 /*
131 if (x.b.end==y.b.end) {
132 if (x.b.start==y.b.start) {
133 if (x.a.end==y.a.end) {
134 if (x.a.start==y.a.start) return 0;
135 return ((x.a.start>y.a.start) ? -1 : 1);
136 }
137 else {
138 return ((x.a.end>y.a.end) ? -1 : 1);
139 }
140 }
141 else {
142 return ((x.b.start>y.b.start) ? -1 : 1);
143 }
144 }
145 else {
146 return ((x.b.end>y.b.end) ? -1 : 1);
147 }
148 */
149 if (x.b.end==y.b.end) {
150 if (x.score==y.score) {
151 if (x.b.start==y.b.start) {
152 if (x.a.end==y.a.end) {
153 if (x.a.start==y.a.start) return 0;
154 return ((x.a.start<y.a.start) ? -1 : 1);
155 }
156 else {
157 return ((x.a.end<y.a.end) ? -1 : 1);
158 }
159 }
160 else {
161 return ((x.b.start<y.b.start) ? -1 : 1);
162 }
163 } else return ((x.score>y.score) ? -1 : 1);
164 }
165 else {
166 return ((x.b.end>y.b.end) ? -1 : 1);
167 }
168
169 }
170
171 class CSegChain:public GList<CSegPair> {
172 public:
173 uint astart;
174 uint aend;
175 uint bstart;
176 uint bend;
177 int score;
178 bool endSort;
179 CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
180 //as SegPairs are inserted, they will be sorted by a.start coordinate
181 score=0;
182 astart=MAX_UINT;
183 aend=0;
184 bstart=MAX_UINT;
185 bend=0;
186 endSort=aln5;
187 if (aln5) { setSorted(cmpSegEnds); }
188 }
189 bool operator==(CSegChain& d) {
190 //return (score==d.score);
191 return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
192 }
193 bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
194 //return (score<d.score);
195 if (bstart==d.bstart && bend==d.bend) {
196 return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
197 }
198 return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
199 }
200 bool operator<(CSegChain& d) {
201 //return (score>d.score);
202 if (bstart==d.bstart && bend==d.bend) {
203 return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
204 }
205 return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
206 }
207 void addSegPair(CSegPair* segp) {
208 if (AddIfNew(segp)!=segp) return;
209 score+=segp->score;
210 if (astart>segp->a.start) astart=segp->a.start;
211 if (aend<segp->a.end) aend=segp->a.end;
212 if (bstart>segp->b.start) bstart=segp->b.start;
213 if (bend<segp->b.end) bend=segp->b.end;
214 }
215 //for building actual chains:
216 bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
217 int bgap=0;
218 int agap=0;
219 //if (endSort) {
220 if (bstart>segp->b.start) {
221 bgap = (int)(bstart-segp->b.end);
222 if (abs(bgap)>2) return false;
223 agap = (int)(astart-segp->a.end);
224 if (abs(agap)>2) return false;
225 }
226 else {
227 bgap = (int) (segp->b.start-bend);
228 if (abs(bgap)>2) return false;
229 agap = (int)(segp->a.start-aend);
230 if (abs(agap)>2) return false;
231 }
232 if (agap*bgap<0) return false;
233 addSegPair(segp);
234 score-=abs(agap)+abs(bgap);
235 return true;
236 }
237 };
238
239 // element in dhash:
240 class FqDupRec {
241 public:
242 int count; //how many of these reads are the same
243 int len; //length of qv
244 char* firstname; //optional, only if we want to keep the original read names
245 char* qv;
246 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
247 len=0;
248 qv=NULL;
249 firstname=NULL;
250 count=0;
251 if (qstr!=NULL) {
252 qv=Gstrdup(qstr->chars());
253 len=qstr->length();
254 count++;
255 }
256 if (rname!=NULL) firstname=Gstrdup(rname);
257 }
258 ~FqDupRec() {
259 GFREE(qv);
260 GFREE(firstname);
261 }
262 void add(GStr& d) { //collapse another record into this one
263 if (d.length()!=len)
264 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
265 count++;
266 for (int i=0;i<len;i++)
267 qv[i]=(qv[i]+d[i])/2;
268 }
269 };
270
271 void openfw(FILE* &f, GArgs& args, char opt) {
272 GStr s=args.getOpt(opt);
273 if (!s.is_empty()) {
274 if (s=='-') f=stdout;
275 else {
276 f=fopen(s,"w");
277 if (f==NULL) GError("Error creating file: %s\n", s.chars());
278 }
279 }
280 }
281
282 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
283 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
284
285 GHash<FqDupRec> dhash; //hash to keep track of duplicates
286
287 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
288 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
289 int dust(GStr& seq);
290 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
291 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
292
293 void convertPhred(char* q, int len);
294 void convertPhred(GStr& q);
295
296 int main(int argc, char * const argv[]) {
297 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
298 int e;
299 int icounter=0; //counter for input reads
300 int outcounter=0; //counter for output reads
301 if ((e=args.isError())>0) {
302 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
303 exit(224);
304 }
305 debug=(args.getOpt('Y')!=NULL);
306 debug=(args.getOpt('V')!=NULL);
307 convert_phred=(args.getOpt('Q')!=NULL);
308 doCollapse=(args.getOpt('C')!=NULL);
309 doDust=(args.getOpt('D')!=NULL);
310 /*
311 rawFormat=(args.getOpt('R')!=NULL);
312 if (rawFormat) {
313 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
314 }
315 */
316 prefix=args.getOpt('n');
317 GStr s=args.getOpt('l');
318 if (!s.is_empty())
319 min_read_len=s.asInt();
320 s=args.getOpt('m');
321 if (!s.is_empty())
322 max_perc_N=s.asDouble();
323 s=args.getOpt('d');
324 if (!s.is_empty()) {
325 dust_cutoff=s.asInt();
326 doDust=true;
327 }
328 s=args.getOpt('q');
329 if (!s.is_empty()) {
330 qvtrim_min=s.asInt();
331 }
332 s=args.getOpt('p');
333 if (!s.is_empty()) {
334 int v=s.asInt();
335 if (v==33) {
336 qv_phredtype=33;
337 qv_cvtadd=31;
338 }
339 else if (v==64) {
340 qv_phredtype=64;
341 qv_cvtadd=-31;
342 }
343 else
344 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
345 }
346 if (args.getOpt('3')!=NULL) {
347 adapter3=args.getOpt('3');
348 adapter3.upper();
349 a3len=adapter3.length();
350 }
351 if (args.getOpt('5')!=NULL) {
352 adapter5=args.getOpt('5');
353 adapter5.upper();
354 a5len=adapter5.length();
355 }
356 trashReport= (args.getOpt('r')!=NULL);
357 if (args.startNonOpt()==0) {
358 GMessage(USAGE);
359 exit(224);
360 }
361
362 openfw(f_out, args, 'o');
363 if (f_out==NULL) f_out=stdout;
364 if (trashReport)
365 openfw(freport, args, 'r');
366 char* infile=NULL;
367 while ((infile=args.nextNonOpt())!=NULL) {
368 GStr infname(infile);
369 if (strcmp(infile,"-")==0) {
370 f_in=stdin; infname="stdin"; }
371 else {
372 f_in=fopen(infile,"r");
373 if (f_in==NULL)
374 GError("Cannot open input file %s!\n",infile);
375 }
376 GLineReader fq(f_in);
377 char* l=NULL;
378 while ((l=fq.getLine())!=NULL) {
379 GStr rname; //current read name
380 GStr rseq; //current read sequence
381 GStr rqv; //current read quality values
382 GStr s;
383 /* if (rawFormat) {
384 //TODO: implement qseq parsing here
385 //if (raw type=N) then continue; //skip invalid/bad records
386
387 } //raw qseq format
388 else { // FASTQ or FASTA */
389 isfasta=(l[0]=='>');
390 if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
391 s=l;
392 rname=&(l[1]);
393 icounter++;
394 for (int i=0;i<rname.length();i++)
395 if (rname[i]<=' ') { rname.cut(i); break; }
396 //now get the sequence
397 if ((l=fq.getLine())==NULL)
398 GError("Error: unexpected EOF after header for %s\n",rname.chars());
399 rseq=l; //this must be the DNA line
400 while ((l=fq.getLine())!=NULL) {
401 //seq can span multiple lines
402 if (l[0]=='>' || l[0]=='+') {
403 fq.pushBack();
404 break; //
405 }
406 rseq+=l;
407 } //check for multi-line seq
408 if (!isfasta) { //reading fastq quality values, which can also be multi-line
409 if ((l=fq.getLine())==NULL)
410 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
411 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
412 if ((l=fq.getLine())==NULL)
413 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
414 rqv=l;
415 //if (rqv.length()!=rseq.length())
416 // GError("Error: qv len != seq len for %s\n", rname.chars());
417 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
418 rqv+=l; //append to qv string
419 }
420 }// fastq
421 // } //<-- FASTA or FASTQ
422 rseq.upper();
423 int l5=0;
424 int l3=rseq.length()-1;
425 if (l3-l5+1<min_read_len) {
426 if (trashReport) {
427 fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
428 }
429 continue;
430 }
431 if (ntrim(rseq, l5, l3)) { // N-trimming
432 //GMessage("before: %s\n",rseq.chars());
433 //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434 if (l3-l5+1<min_read_len) {
435 if (trashReport) {
436 fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
437 }
438 continue; //invalid read
439 }
440 //-- keep only the l5..l3 range
441 rseq=rseq.substr(l5, l3-l5+1);
442 if (!rqv.is_empty())
443 rqv=rqv.substr(l5, l3-l5+1);
444 l5=0;
445 l3=rseq.length()-1;
446 }
447 if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448 if (l3-l5+1<min_read_len) {
449 if (trashReport) {
450 fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451 }
452 continue; //invalid read
453 }
454 //-- keep only the l5..l3 range
455 rseq=rseq.substr(l5, l3-l5+1);
456 if (!rqv.is_empty())
457 rqv=rqv.substr(l5, l3-l5+1);
458 } //qv trimming
459 if (a3len>0) {
460 if (trim_adapter3(rseq, l5, l3)) {
461 if (l3-l5+1<min_read_len) {
462 if (trashReport) {
463 fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
464 }
465 continue;
466 }
467 //-- keep only the l5..l3 range
468 rseq=rseq.substr(l5, l3-l5+1);
469 if (!rqv.is_empty())
470 rqv=rqv.substr(l5, l3-l5+1);
471 }//some adapter was trimmed
472 } //adapter trimming
473 if (a5len>0) {
474 if (trim_adapter5(rseq, l5, l3)) {
475 if (l3-l5+1<min_read_len) {
476 if (trashReport) {
477 fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
478 }
479 continue;
480 }
481 //-- keep only the l5..l3 range
482 rseq=rseq.substr(l5, l3-l5+1);
483 if (!rqv.is_empty())
484 rqv=rqv.substr(l5, l3-l5+1);
485 }//some adapter was trimmed
486 } //adapter trimming
487 if (doCollapse) {
488 //keep read for later
489 FqDupRec* dr=dhash.Find(rseq.chars());
490 if (dr==NULL) { //new entry
491 //if (prefix.is_empty())
492 dhash.Add(rseq.chars(),
493 new FqDupRec(&rqv, rname.chars()));
494 //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
495 }
496 else
497 dr->add(rqv);
498 } //collapsing duplicates
499 else { //not collapsing duplicates
500 //do the dust filter now
501 if (doDust) {
502 int dustbases=dust(rseq);
503 if (dustbases>(rseq.length()>>1)) {
504 if (trashReport) {
505 fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
506 }
507 continue;
508 }
509 }
510 //print this record here
511 outcounter++;
512 if (isfasta) {
513 if (prefix.is_empty())
514 fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
515 else
516 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
517 rseq.chars());
518 }
519 else { //fastq
520 if (convert_phred) convertPhred(rqv);
521 if (prefix.is_empty())
522 fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523 else
524 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
525 rseq.chars(),rqv.chars() );
526 }
527 } //not collapsing duplicates
528 } //for each fastq record
529 } //while each input file
530 FRCLOSE(f_in);
531 if (doCollapse) {
532 outcounter=0;
533 int maxdup_count=1;
534 char* maxdup_seq=NULL;
535 dhash.startIterate();
536 FqDupRec* qd=NULL;
537 char* seq=NULL;
538 while ((qd=dhash.NextData(seq))!=NULL) {
539 GStr rseq(seq);
540 //do the dusting here
541 if (doDust) {
542 int dustbases=dust(rseq);
543 if (dustbases>(rseq.length()>>1)) {
544 if (trashReport && qd->firstname!=NULL) {
545 fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
546 }
547 continue;
548 }
549 }
550 outcounter++;
551 if (qd->count>maxdup_count) {
552 maxdup_count=qd->count;
553 maxdup_seq=seq;
554 }
555 if (isfasta) {
556 if (prefix.is_empty()) {
557 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
558 rseq.chars());
559 }
560 else { //use custom read name
561 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
562 qd->count, rseq.chars());
563 }
564 }
565 else { //fastq format
566 if (convert_phred) convertPhred(qd->qv, qd->len);
567 if (prefix.is_empty()) {
568 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569 rseq.chars(), qd->qv);
570 }
571 else { //use custom read name
572 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
573 qd->count, rseq.chars(), qd->qv);
574 }
575 }
576 }//for each element of dhash
577 if (maxdup_count>1) {
578 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
579 }
580 } //report collapsed dhash entries
581 GMessage("Number of input reads: %9d\n", icounter);
582 GMessage(" Output records: %9d\n", outcounter);
583 if (trashReport) {
584 FWCLOSE(freport);
585 }
586
587 FWCLOSE(f_out);
588 //getc(stdin);
589 }
590
591 class NData {
592 public:
593 int NPos[1024]; //there should be no reads longer than 1K ?
594 int NCount;
595 int end5;
596 int end3;
597 int seqlen;
598 double perc_N; //percentage of Ns in end5..end3 range only!
599 const char* seq;
600 bool valid;
601 NData() {
602 NCount=0;
603 end5=0;
604 end3=0;
605 seq=NULL;
606 perc_N=0;
607 valid=true;
608 }
609 void init(GStr& rseq) {
610 NCount=0;
611 perc_N=0;
612 seqlen=rseq.length();
613 seq=rseq.chars();
614 end5=1;
615 end3=seqlen;
616 for (int i=0;i<seqlen;i++)
617 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618 NPos[NCount]=i;
619 NCount++;
620 }
621 perc_N=(NCount*100.0)/seqlen;
622 valid=true;
623 }
624 void N_calc() { //only in the region after trimming
625 int n=0;
626 for (int i=end3-1;i<end5;i++) {
627 if (seq[i]=='N') n++;
628 }
629 perc_N=(n*100.0)/(end5-end3+1);
630 }
631 };
632
633 static NData feat;
634 int perc_lenN=12; // incremental distance from ends, in percentage of
635 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
636
637 void N_analyze(int l5, int l3, int p5, int p3) {
638 /* assumes feat was filled properly */
639 int old_dif, t5,t3,v;
640 if (l3<l5+2 || p5>p3 ) {
641 feat.end5=l5+1;
642 feat.end3=l3+1;
643 return;
644 }
645
646 t5=feat.NPos[p5]-l5;
647 t3=l3-feat.NPos[p3];
648 old_dif=p3-p5;
649 v=(int)((((double)(l3-l5))*perc_lenN)/100);
650 if (v>20) v=20; /* enforce N-search limit for very long reads */
651 if (t5 < v ) {
652 l5=feat.NPos[p5]+1;
653 p5++;
654 }
655 if (t3 < v) {
656 l3=feat.NPos[p3]-1;
657 p3--;
658 }
659 /* restNs=p3-p5; number of Ns in the new CLR */
660 if (p3-p5==old_dif) { /* no change, return */
661 /* don't trim if this may shorten the read too much */
662 //if (l5-l3<min_read_len) return;
663 feat.end5=l5+1;
664 feat.end3=l3+1;
665 return;
666 }
667 else
668 N_analyze(l5,l3, p5,p3);
669 }
670
671
672 bool qtrim(GStr& qvs, int &l5, int &l3) {
673 if (qvtrim_min==0 || qvs.is_empty()) return false;
674 if (qv_phredtype==0) {
675 //try to guess the Phred type
676 int vmin=256, vmax=0;
677 for (int i=0;i<qvs.length();i++) {
678 if (vmin>qvs[i]) vmin=qvs[i];
679 if (vmax<qvs[i]) vmax=qvs[i];
680 }
681 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683 if (qv_phredtype==0) {
684 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685 }
686 } //guessing the Phred type
687 for (l3=qvs.length()-1;l3>2;l3--) {
688 if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
689 }
690 //just in case, check also the 5' the end (?)
691 for (l5=0;l5<qvs.length()-3;l5++) {
692 if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
693 }
694 return (l5>0 || l3<qvs.length()-1);
695 }
696
697 bool ntrim(GStr& rseq, int &l5, int &l3) {
698 //count Ns in the sequence, trim N-rich ends
699 feat.init(rseq);
700 l5=feat.end5-1;
701 l3=feat.end3-1;
702 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
703 if (l5==feat.end5-1 && l3==feat.end3-1) {
704 if (feat.perc_N>max_perc_N) {
705 feat.valid=false;
706 l3=l5+1;
707 return true;
708 }
709 else {
710 return false; //nothing changed
711 }
712 }
713 l5=feat.end5-1;
714 l3=feat.end3-1;
715 if (l3-l5+1<min_read_len) {
716 feat.valid=false;
717 return true;
718 }
719 feat.N_calc();
720
721 if (feat.perc_N>max_perc_N) {
722 feat.valid=false;
723 l3=l5+1;
724 return true;
725 }
726 return true;
727 }
728
729 //--------------- dust functions ----------------
730 class DNADuster {
731 public:
732 int dustword;
733 int dustwindow;
734 int dustwindow2;
735 int dustcutoff;
736 int mv, iv, jv;
737 int counts[32*32*32];
738 int iis[32*32*32];
739 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
740 dustword=wordsize;
741 dustwindow=winsize;
742 dustwindow2 = (winsize>>1);
743 dustcutoff=cutoff;
744 mv=0;
745 iv=0;
746 jv=0;
747 }
748 void setWindowSize(int value) {
749 dustwindow = value;
750 dustwindow2 = (dustwindow >> 1);
751 }
752 void setWordSize(int value) {
753 dustword=value;
754 }
755 void wo1(int len, const char* s, int ivv) {
756 int i, ii, j, v, t, n, n1, sum;
757 int js, nis;
758 n = 32 * 32 * 32;
759 n1 = n - 1;
760 nis = 0;
761 i = 0;
762 ii = 0;
763 sum = 0;
764 v = 0;
765 for (j=0; j < len; j++, s++) {
766 ii <<= 5;
767 if (*s<=32) {
768 i=0;
769 continue;
770 }
771 ii |= *s - 'A'; //assume uppercase!
772 ii &= n1;
773 i++;
774 if (i >= dustword) {
775 for (js=0; js < nis && iis[js] != ii; js++) ;
776 if (js == nis) {
777 iis[nis] = ii;
778 counts[ii] = 0;
779 nis++;
780 }
781 if ((t = counts[ii]) > 0) {
782 sum += t;
783 v = 10 * sum / j;
784 if (mv < v) {
785 mv = v;
786 iv = ivv;
787 jv = j;
788 }
789 }
790 counts[ii]++;
791 }
792 }
793 }
794
795 int wo(int len, const char* s, int* beg, int* end) {
796 int i, l1;
797 l1 = len - dustword + 1;
798 if (l1 < 0) {
799 *beg = 0;
800 *end = len - 1;
801 return 0;
802 }
803 mv = 0;
804 iv = 0;
805 jv = 0;
806 for (i=0; i < l1; i++) {
807 wo1(len-i, s+i, i);
808 }
809 *beg = iv;
810 *end = iv + jv;
811 return mv;
812 }
813
814 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
815 int i, j, l, a, b, v;
816 if (cutoff==0) cutoff=dustcutoff;
817 a=0;b=0;
818 //GMessage("Dust cutoff=%d\n", cutoff);
819 for (i=0; i < seqlen; i += dustwindow2) {
820 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
821 v = wo(l, seq+i, &a, &b);
822 if (v > cutoff) {
823 //for (j = a; j <= b && j < dustwindow2; j++) {
824 for (j = a; j <= b; j++) {
825 seqmsk[i+j]='N';//could be made lowercase instead
826 }
827 }
828 }
829 //return first;
830 }
831 };
832
833 static DNADuster duster;
834
835 int dust(GStr& rseq) {
836 char* seq=Gstrdup(rseq.chars());
837 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
838 //check the number of Ns:
839 int ncount=0;
840 for (int i=0;i<rseq.length();i++) {
841 if (seq[i]=='N') ncount++;
842 }
843 GFREE(seq);
844 return ncount;
845 }
846
847
848 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
849 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
850 //check minimum score and
851 //for 3' adapter trimming:
852 // require that the right end of the alignment for either the adaptor OR the read must be
853 // < 3 distance from its right end
854 // for 5' adapter trimming:
855 // require that the left end of the alignment for either the adaptor OR the read must
856 // be at coordinate < 3 from start
857
858 bool extendMatch(const char* a, int alen, int ai,
859 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
860 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
861 #ifdef DEBUG
862 GStr dbg(b);
863 #endif
864 //if (debug) {
865 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
866 // }
867 int a_l=ai; //alignment coordinates on a
868 int a_r=ai+mlen-1;
869 int b_l=bi; //alignment coordinates on b
870 int b_r=bi+mlen-1;
871 int ai_maxscore=ai;
872 int bi_maxscore=bi;
873 int score=mlen*a_m_score;
874 int maxscore=score;
875 int mism5score=a_mis_score;
876 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
877 //try to extend to the left first, if possible
878 while (ai>0 && bi>0) {
879 ai--;
880 bi--;
881 score+= (a[ai]==b[bi])? a_m_score : mism5score;
882 if (score>maxscore) {
883 ai_maxscore=ai;
884 bi_maxscore=bi;
885 maxscore=score;
886 }
887 else if (maxscore-score>a_dropoff_score) break;
888 }
889 a_l=ai_maxscore;
890 b_l=bi_maxscore;
891 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
892 //now extend to the right
893 ai_maxscore=a_r;
894 bi_maxscore=b_r;
895 ai=a_r;
896 bi=b_r;
897 score=maxscore;
898 //sometimes there are extra AAAAs at the end of the read, ignore those
899 if (strcmp(&a[alen-4],"AAAA")==0) {
900 alen-=3;
901 while (a[alen-1]=='A' && alen>ai) alen--;
902 }
903 while (ai<alen-1 && bi<blen-1) {
904 ai++;
905 bi++;
906 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
907 if (a[ai]==b[bi]) { //match
908 score+=a_m_score;
909 if (ai>=alen-2) {
910 score+=a_m_score-(alen-ai-1);
911 }
912 }
913 else { //mismatch
914 score+=a_mis_score;
915 }
916 if (score>maxscore) {
917 ai_maxscore=ai;
918 bi_maxscore=bi;
919 maxscore=score;
920 }
921 else if (maxscore-score>a_dropoff_score) break;
922 }
923 a_r=ai_maxscore;
924 b_r=bi_maxscore;
925 int a_ovh3=alen-a_r-1;
926 int b_ovh3=blen-b_r-1;
927 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
928 int mmovh5=(a_l<b_l)? a_l : b_l;
929 //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
930 #ifdef DEBUG
931 if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
932 #endif
933 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
934 if (a_l<a_ovh3) {
935 //adapter closer to the left end (typical for 5' adapter)
936 l5=a_r+1;
937 l3=alen-1;
938 }
939 else {
940 //adapter matching at the right end (typical for 3' adapter)
941 l5=0;
942 l3=a_l-1;
943 }
944 return true;
945 }
946 else { //keep this segment pair for later (gapped alignment)
947 segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
948 //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
949 }
950 //do not trim:
951 l5=0;
952 l3=alen-1;
953 return false;
954 }
955
956 /*
957 int getWordValue(const char* s, int wlen) {
958 int r=0;
959 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
960 return r;
961 }
962 */
963 int get3mer_value(const char* s) {
964 return (s[0]<<16)+(s[1]<<8)+s[2];
965 }
966
967 int w3_match(int qv, const char* str, int slen, int start_index=0) {
968 if (start_index>=slen || start_index<0) return -1;
969 for (int i=start_index;i<slen-3;i++) {
970 int rv=get3mer_value(str+i);
971 if (rv==qv) return i;
972 }
973 return -1;
974 }
975
976 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
977 if (end_index>=slen) return -1;
978 if (end_index<0) end_index=slen-1;
979 for (int i=end_index-2;i>=0;i--) {
980 int rv=get3mer_value(str+i);
981 if (rv==qv) return i;
982 }
983 return -1;
984 }
985
986 int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
987 if (start_index>=slen || start_index<0) return -1;
988 for (int i=start_index;i<slen-4;i++) {
989 int32* rv=(int32*)(str+i);
990 if (*rv==qv) return i;
991 }
992 return -1;
993 }
994
995 int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
996 if (end_index>=slen) return -1;
997 if (end_index<0) end_index=slen-1;
998 for (int i=end_index-3;i>=0;i--) {
999 int32* rv=(int32*)(str+i);
1000 if (*rv==qv) return i;
1001 }
1002 return -1;
1003 }
1004
1005 #ifdef DEBUG
1006 void dbgPrintChain(CSegChain& chain, const char* aseq) {
1007 GStr s(aseq);
1008 for (int i=0;i<chain.Count();i++) {
1009 CSegPair& seg=*chain[i];
1010 GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1011 s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1012 }
1013 }
1014 #endif
1015
1016 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1017 int rlen=seq.length();
1018 l5=0;
1019 l3=rlen-1;
1020 //first try a full match, we might get lucky
1021 int fi=-1;
1022 if ((fi=seq.index(adapter3))>=0) {
1023 if (fi<rlen-fi-a3len) {//match is closer to the right end
1024 l5=fi+a3len;
1025 l3=rlen-1;
1026 }
1027 else {
1028 l5=0;
1029 l3=fi-1;
1030 }
1031 return true;
1032 }
1033 #ifdef DEBUG
1034 if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1035 #endif
1036
1037 //also, for fast detection of other adapter-only reads that start past
1038 // the beginning of the adapter sequence, try to see if the first a3len-4
1039 // bases of the read are a substring of the adapter
1040 if (rlen>a3len-3) {
1041 GStr rstart=seq.substr(1,a3len-4);
1042 if ((fi=adapter3.index(rstart))>=0) {
1043 l3=rlen-1;
1044 l5=a3len-4;
1045 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1046 return true;
1047 }
1048 }
1049 CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1050 //check the easy cases - 11 bases exact match at the end
1051 int fdlen=11;
1052 if (a3len<16) {
1053 fdlen=a3len>>1;
1054 }
1055 if (fdlen>4) {
1056 //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1057 GStr rstart=seq.substr(-fdlen-3,fdlen);
1058 if ((fi=adapter3.index(rstart))>=0) {
1059 #ifdef DEBUG
1060 if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1061 #endif
1062 if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1063 adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1064 return true;
1065 }
1066 //another easy case: first 11 characters of the adaptor found as a substring of the read
1067 GStr bstr=adapter3.substr(0, fdlen);
1068 if ((fi=seq.rindex(bstr))>=0) {
1069 #ifdef DEBUG
1070 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1071 #endif
1072 if (extendMatch(seq.chars(), rlen, fi,
1073 adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1074 return true;
1075 }
1076 } //tried to match 11 bases first
1077
1078 //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1079 //-- only extend if the match is in the 3' (ending) region of the read
1080 int wordSize=3;
1081 int hlen=12;
1082 if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1083 int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1084 if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1085 imin=rlen-imin;
1086 for (int iw=0;iw<hlen;iw++) {
1087 //int32* qv=(int32*)(adapter3.chars()+iw);
1088 int qv=get3mer_value(adapter3.chars()+iw);
1089 fi=-1;
1090 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1091 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1092 //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1093
1094 #ifdef DEBUG
1095 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1096 #endif
1097 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1098 a3len, iw, wordSize, l5,l3, a3segs)) return true;
1099 fi--;
1100 if (fi<imin) break;
1101 }
1102 } //for each wmer in the first hlen bases of the adaptor
1103 /*
1104 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1105 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1106 if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1107 int hlen2=a3len-wordSize;
1108 //if (hlen2>a3len-4) hlen2=a3len-4;
1109 if (hlen2>hlen) {
1110 #ifdef DEBUG
1111 if (debug && a3segs.Count()>0) {
1112 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1113 }
1114 #endif
1115 for (int iw=hlen;iw<hlen2;iw++) {
1116 //int* qv=(int32 *)(adapter3.chars()+iw);
1117 int qv=get3mer_value(adapter3.chars()+iw);
1118 fi=-1;
1119 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1120 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1121 extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1122 a3len, iw, wordSize, l5,l3, a3segs);
1123 fi--;
1124 if (fi<imin) break;
1125 }
1126 } //for each wmer between hlen2 and hlen bases of the adaptor
1127 }
1128 //lastly, analyze collected a3segs for a possible gapped alignment:
1129 GList<CSegChain> segchains(false,true,false);
1130 #ifdef DEBUG
1131 if (debug && a3segs.Count()>0) {
1132 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1133 }
1134 #endif
1135 for (int i=0;i<a3segs.Count();i++) {
1136 if (a3segs[i]->chain==NULL) {
1137 if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1138 CSegChain* newchain=new CSegChain();
1139 newchain->setFreeItem(false);
1140 newchain->addSegPair(a3segs[i]);
1141 a3segs[i]->chain=newchain;
1142 segchains.Add(newchain); //just to free them when done
1143 }
1144 for (int j=i+1;j<a3segs.Count();j++) {
1145 CSegChain* chain=a3segs[i]->chain;
1146 if (chain->extendChain(a3segs[j])) {
1147 a3segs[j]->chain=chain;
1148 #ifdef DEBUG
1149 if (debug) dbgPrintChain(*chain, adapter3.chars());
1150 #endif
1151 //save time by checking here if the extended chain is already acceptable for trimming
1152 if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1153 l5=0;
1154 l3=chain->astart-2;
1155 #ifdef DEBUG
1156 if (debug && a3segs.Count()>0) {
1157 GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1158 }
1159 #endif
1160 return true;
1161 }
1162 } //chain can be extended
1163 }
1164 } //collect segment alignments into chains
1165 */
1166 return false; //no adapter parts found
1167 }
1168
1169 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1170 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1171 int rlen=seq.length();
1172 l5=0;
1173 l3=rlen-1;
1174 //try to see if adapter is fully included in the read
1175 int fi=-1;
1176 if ((fi=seq.index(adapter5))>=0) {
1177 if (fi<rlen-fi-a5len) {//match is closer to the right end
1178 l5=fi+a5len;
1179 l3=rlen-1;
1180 }
1181 else {
1182 l5=0;
1183 l3=fi-1;
1184 }
1185 return true;
1186 }
1187 #ifdef DEBUG
1188 if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1189 #endif
1190
1191 CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1192
1193 //try the easy way out first - look for an exact match of 11 bases
1194 int fdlen=11;
1195 if (a5len<16) {
1196 fdlen=a5len>>1;
1197 }
1198 if (fdlen>4) {
1199 GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1200 if ((fi=adapter5.index(rstart))>=0) {
1201 #ifdef DEBUG
1202 if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1203 #endif
1204 if (extendMatch(seq.chars(), rlen, 1,
1205 adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1206 return true;
1207 }
1208 //another easy case: last 11 characters of the adaptor found as a substring of the read
1209 GStr bstr=adapter5.substr(-fdlen);
1210 if ((fi=seq.index(bstr))>=0) {
1211 #ifdef DEBUG
1212 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1213 #endif
1214 if (extendMatch(seq.chars(), rlen, fi,
1215 adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1216 return true;
1217 }
1218 } //tried to matching at most 11 bases first
1219
1220 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1221 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1222 int wordSize=3;
1223 int hlen=12;
1224 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1225 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1226 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1227 for (int iw=0;iw<=hlen;iw++) {
1228 int apstart=a5len-iw-wordSize;
1229 fi=0;
1230 //int* qv=(int32 *)(adapter5.chars()+apstart);
1231 int qv=get3mer_value(adapter5.chars()+apstart);
1232 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1233 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1234 #ifdef DEBUG
1235 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1236 #endif
1237 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1238 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1239 fi++;
1240 if (fi>imax) break;
1241 }
1242 } //for each wmer in the last hlen bases of the adaptor
1243 /*
1244
1245 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1246 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1247 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1248 int hlen2=a5len-wordSize;
1249 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1250 #ifdef DEBUG
1251 if (debug && a5segs.Count()>0) {
1252 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1253 }
1254 #endif
1255 if (hlen2>hlen) {
1256 for (int iw=hlen+1;iw<=hlen2;iw++) {
1257 int apstart=a5len-iw-wordSize;
1258 fi=0;
1259 //int* qv=(int32 *)(adapter5.chars()+apstart);
1260 int qv=get3mer_value(adapter5.chars()+apstart);
1261 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1262 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1263 extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1264 a5len, apstart, wordSize, l5,l3, a5segs, true);
1265 fi++;
1266 if (fi>imax) break;
1267 }
1268 } //for each wmer between hlen2 and hlen bases of the adaptor
1269 }
1270 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1271 // lastly, analyze collected a5segs for a possible gapped alignment:
1272 GList<CSegChain> segchains(false,true,false);
1273 #ifdef DEBUG
1274 if (debug && a5segs.Count()>0) {
1275 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1276 }
1277 #endif
1278 for (int i=0;i<a5segs.Count();i++) {
1279 if (a5segs[i]->chain==NULL) {
1280 if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1281 CSegChain* newchain=new CSegChain(true);
1282 newchain->setFreeItem(false);
1283 newchain->addSegPair(a5segs[i]);
1284 a5segs[i]->chain=newchain;
1285 segchains.Add(newchain); //just to free them when done
1286 }
1287 for (int j=i+1;j<a5segs.Count();j++) {
1288 CSegChain* chain=a5segs[i]->chain;
1289 if (chain->extendChain(a5segs[j])) {
1290 a5segs[j]->chain=chain;
1291 #ifdef DEBUG
1292 if (debug) dbgPrintChain(*chain, adapter5.chars());
1293 #endif
1294 //save time by checking here if the extended chain is already acceptable for trimming
1295 if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1296 l5=chain->aend;
1297 l3=rlen-1;
1298 return true;
1299 }
1300 } //chain can be extended
1301 }
1302 } //collect segment alignments into chains
1303 */
1304 return false; //no adapter parts found
1305 }
1306
1307 //convert qvs to/from phred64 from/to phread33
1308 void convertPhred(GStr& q) {
1309 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1310 }
1311
1312 void convertPhred(char* q, int len) {
1313 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1314 }