ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 10
Committed: Tue Mar 23 17:46:56 2010 UTC (9 years, 8 months ago) by gpertea
File size: 24465 byte(s)
Log Message:
bugfix: prevent crash with -o option is not given

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5
6 #define USAGE "Usage:\n\
7 fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
8 [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 \n\
10 Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
11 optionally collapse duplicates in the input fastq data.\n\
12 \n\
13 Options:\n\
14 -n rename all the reads using the <prefix> followed by a read counter;\n\
15 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
16 the read duplication count\n\
17 -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
18 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
19 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
20 -3 trim the given adapter sequence at the 3' end of each read\n\
21 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
22 -l minimum \"clean\" length at the high quality 5'end that a read must\n\
23 have in order to pass the filter (default: 16)\n\
24 -r write a simple \"trash report\" file listing the discarded reads with a\n\
25 one letter code for the reason why they were trashed\n\
26 -C collapse duplicate reads and add a prefix with their count to the read\n\
27 name\n\
28 -D apply a low-complexity (dust) filter and discard any read that has at\n\
29 least half of it masked by this\n\
30 -Q quality values in the input data are interpreted as phred64, convert\n\
31 them to phred33\n\
32 "
33 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
34 FILE* f_out=NULL; //stdout if not provided
35 FILE* f_in=NULL; //input fastq (stdin if not provided)
36 FILE* freport=NULL;
37 bool debug=false;
38 bool doCollapse=false;
39 bool doDust=false;
40 bool trashReport=false;
41 bool rawFormat=false;
42 int min_read_len=16;
43 int dust_cutoff=16;
44 bool isfasta=false;
45 bool phred64=false;
46 GStr prefix;
47 GStr adapter3;
48 GStr adapter5;
49 int a3len=0;
50 int a5len=0;
51 // adaptor matching metrics -- for extendMatch() function
52 static const int a_m_score=2; //match score
53 static const int a_mis_score=-3; //mismatch
54 static const int a_dropoff_score=7;
55 static const int a_min_score=7;
56
57 // element in dhash:
58 class FqDupRec {
59 public:
60 int count; //how many of these reads are the same
61 int len; //length of qv
62 char* firstname; //optional, only if we want to keep the original read names
63 char* qv;
64 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
65 len=0;
66 qv=NULL;
67 firstname=NULL;
68 count=0;
69 if (qstr!=NULL) {
70 qv=Gstrdup(qstr->chars());
71 len=qstr->length();
72 count++;
73 }
74 if (rname!=NULL) firstname=Gstrdup(rname);
75 }
76 ~FqDupRec() {
77 GFREE(qv);
78 GFREE(firstname);
79 }
80 void add(GStr& d) { //collapse another record into this one
81 if (d.length()!=len)
82 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
83 count++;
84 for (int i=0;i<len;i++)
85 qv[i]=(qv[i]+d[i])/2;
86 }
87 };
88
89 void openfw(FILE* &f, GArgs& args, char opt) {
90 GStr s=args.getOpt(opt);
91 if (!s.is_empty()) {
92 if (s=='-') f=stdout;
93 else {
94 f=fopen(s,"w");
95 if (f==NULL) GError("Error creating file: %s\n", s.chars());
96 }
97 }
98 }
99
100 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
101 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
102
103 GHash<FqDupRec> dhash; //hash to keep track of duplicates
104
105 bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
106
107 int dust(GStr& seq);
108 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
109 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
110
111 void convertQ64(char* q, int len);
112 void convertQ64(GStr& q);
113
114 int main(int argc, char * const argv[]) {
115 GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
116 int e;
117 int icounter=0; //counter for input reads
118 int outcounter=0; //counter for output reads
119 if ((e=args.isError())>0) {
120 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
121 exit(224);
122 }
123 debug=(args.getOpt('Y')!=NULL);
124 phred64=(args.getOpt('Q')!=NULL);
125 doCollapse=(args.getOpt('C')!=NULL);
126 doDust=(args.getOpt('D')!=NULL);
127 rawFormat=(args.getOpt('R')!=NULL);
128 if (rawFormat) {
129 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
130 }
131 prefix=args.getOpt('n');
132 GStr s=args.getOpt('l');
133 if (!s.is_empty())
134 min_read_len=s.asInt();
135 s=args.getOpt('d');
136 if (!s.is_empty()) {
137 dust_cutoff=s.asInt();
138 doDust=true;
139 }
140
141 if (args.getOpt('3')!=NULL) {
142 adapter3=args.getOpt('3');
143 adapter3.upper();
144 a3len=adapter3.length();
145 }
146 if (args.getOpt('5')!=NULL) {
147 adapter5=args.getOpt('5');
148 adapter5.upper();
149 a5len=adapter5.length();
150 }
151 trashReport= (args.getOpt('r')!=NULL);
152 if (args.startNonOpt()==0) {
153 GMessage(USAGE);
154 exit(224);
155 }
156
157 openfw(f_out, args, 'o');
158 if (f_out==NULL) f_out=stdout;
159 if (trashReport)
160 openfw(freport, args, 'r');
161 char* infile=NULL;
162 while ((infile=args.nextNonOpt())!=NULL) {
163 GStr infname(infile);
164 if (strcmp(infile,"-")==0) {
165 f_in=stdin; infname="stdin"; }
166 else {
167 f_in=fopen(infile,"r");
168 if (f_in==NULL)
169 GError("Cannot open input file %s!\n",infile);
170 }
171 GLineReader fq(f_in);
172 char* l=NULL;
173 while ((l=fq.getLine())!=NULL) {
174 GStr rname; //current read name
175 GStr rseq; //current read sequence
176 GStr rqv; //current read quality values
177 GStr s;
178 if (rawFormat) {
179 //TODO: implement qseq parsing here
180 //if (raw type=N) then continue; //skip invalid/bad records
181
182 } //raw qseq format
183 else { // FASTQ or FASTA
184 isfasta=(l[0]=='>');
185 if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
186 s=l;
187 rname=&(l[1]);
188 icounter++;
189 //GMessage("readname=%s\n",rname.chars());
190 for (int i=0;i<rname.length();i++)
191 if (rname[i]<=' ') { rname.cut(i); break; }
192 //now get the sequence
193 if ((l=fq.getLine())==NULL)
194 GError("Error: unexpected EOF after header for %s\n",rname.chars());
195 rseq=l; //this must be the DNA line
196 if (isfasta) {
197 while ((l=fq.getLine())!=NULL) {
198 //fasta can have multiple sequence lines
199 if (l[0]=='>') {
200 fq.pushBack();
201 break; //
202 }
203 rseq+=l;
204 }
205 } //multi-line fasta file
206 if (!isfasta) {
207 if ((l=fq.getLine())==NULL)
208 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
209 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
210 if ((l=fq.getLine())==NULL)
211 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
212 rqv=l;
213 if (rqv.length()!=rseq.length())
214 GError("Error: qv len != seq len for %s\n", rname.chars());
215 }
216 } //<-- FASTA or FASTQ
217 rseq.upper();
218 int l5=0;
219 int l3=rseq.length()-1;
220 if (l3-l5+1<min_read_len) {
221 if (trashReport) {
222 fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 }
224 continue;
225 }
226 if (ntrim(rseq, rqv, l5, l3)) {
227 if (l3-l5+1<min_read_len) {
228 if (trashReport) {
229 fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 }
231 continue; //invalid read
232 }
233 //-- keep only the l5..l3 range
234 rseq=rseq.substr(l5, l3-l5+1);
235 if (!rqv.is_empty())
236 rqv=rqv.substr(l5, l3-l5+1);
237 }
238
239 if (a3len>0) {
240 if (trim_adapter3(rseq, l5, l3)) {
241 if (l3-l5+1<min_read_len) {
242 if (trashReport) {
243 fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 }
245 continue;
246 }
247 //-- keep only the l5..l3 range
248 rseq=rseq.substr(l5, l3-l5+1);
249 if (!rqv.is_empty())
250 rqv=rqv.substr(l5, l3-l5+1);
251 }//some adapter was trimmed
252 } //adapter trimming
253 if (a5len>0) {
254 if (trim_adapter5(rseq, l5, l3)) {
255 if (l3-l5+1<min_read_len) {
256 if (trashReport) {
257 fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 }
259 continue;
260 }
261 //-- keep only the l5..l3 range
262 rseq=rseq.substr(l5, l3-l5+1);
263 if (!rqv.is_empty())
264 rqv=rqv.substr(l5, l3-l5+1);
265 }//some adapter was trimmed
266 } //adapter trimming
267 if (doCollapse) {
268 //keep read for later
269 FqDupRec* dr=dhash.Find(rseq.chars());
270 if (dr==NULL) { //new entry
271 //if (prefix.is_empty())
272 dhash.Add(rseq.chars(),
273 new FqDupRec(&rqv, rname.chars()));
274 //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
275 }
276 else
277 dr->add(rqv);
278 } //collapsing duplicates
279 else { //not collapsing duplicates
280 //do the dust filter now
281 if (doDust) {
282 int dustbases=dust(rseq);
283 if (dustbases>(rseq.length()>>1)) {
284 if (trashReport) {
285 fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
286 }
287 continue;
288 }
289 }
290 //print this record here
291 outcounter++;
292 if (isfasta) {
293 if (prefix.is_empty())
294 fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 else
296 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 rseq.chars());
298 }
299 else { //fastq
300 if (phred64) convertQ64(rqv);
301 if (prefix.is_empty())
302 fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 else
304 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 rseq.chars(),rqv.chars() );
306 }
307 } //not collapsing duplicates
308 } //for each fastq record
309 } //while each input file
310 FRCLOSE(f_in);
311 if (doCollapse) {
312 outcounter=0;
313 int maxdup_count=1;
314 char* maxdup_seq=NULL;
315 dhash.startIterate();
316 FqDupRec* qd=NULL;
317 char* seq=NULL;
318 while ((qd=dhash.NextData(seq))!=NULL) {
319 GStr rseq(seq);
320 //do the dusting here
321 if (doDust) {
322 int dustbases=dust(rseq);
323 if (dustbases>(rseq.length()>>1)) {
324 if (trashReport && qd->firstname!=NULL) {
325 fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 }
327 continue;
328 }
329 }
330 outcounter++;
331 if (qd->count>maxdup_count) {
332 maxdup_count=qd->count;
333 maxdup_seq=seq;
334 }
335 if (isfasta) {
336 if (prefix.is_empty()) {
337 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
338 rseq.chars());
339 }
340 else { //use custom read name
341 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
342 qd->count, rseq.chars());
343 }
344 }
345 else { //fastq format
346 if (phred64) convertQ64(qd->qv, qd->len);
347 if (prefix.is_empty()) {
348 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
349 rseq.chars(), qd->qv);
350 }
351 else { //use custom read name
352 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 qd->count, rseq.chars(), qd->qv);
354 }
355 }
356 }//for each element of dhash
357 if (maxdup_count>1) {
358 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 }
360 } //report collapsed dhash entries
361 GMessage("Number of input reads: %9d\n", icounter);
362 GMessage(" Output records: %9d\n", outcounter);
363 if (trashReport) {
364 FWCLOSE(freport);
365 }
366
367 FWCLOSE(f_out);
368 }
369
370 class NData {
371 public:
372 int NPos[1024]; //there should be no reads longer than 1K ?
373 int NCount;
374 int end5;
375 int end3;
376 int seqlen;
377 double perc_N; //percentage of Ns in end5..end3 range only!
378 const char* seq;
379 bool valid;
380 NData() {
381 NCount=0;
382 end5=0;
383 end3=0;
384 seq=NULL;
385 perc_N=0;
386 valid=true;
387 }
388 void init(GStr& rseq) {
389 NCount=0;
390 perc_N=0;
391 seqlen=rseq.length();
392 seq=rseq.chars();
393 end5=1;
394 end3=seqlen;
395 for (int i=0;i<seqlen;i++)
396 if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
397 NPos[NCount]=i;
398 NCount++;
399 }
400 perc_N=(NCount*100.0)/seqlen;
401 valid=true;
402 }
403 void N_calc() { //only in the region after trimming
404 int n=0;
405 for (int i=end3-1;i<end5;i++) {
406 if (seq[i]=='N') n++;
407 }
408 perc_N=(n*100.0)/(end5-end3+1);
409 }
410 };
411
412 static NData feat;
413 int perc_lenN=12; // incremental distance from ends, in percentage of
414 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
415
416 void N_analyze(int l5, int l3, int p5, int p3) {
417 /* assumes feat was filled properly */
418 int old_dif, t5,t3,v;
419 if (l3-l5<min_read_len || p5>p3 ) {
420 feat.end5=l5+1;
421 feat.end3=l3+1;
422 return;
423 }
424
425 t5=feat.NPos[p5]-l5;
426 t3=l3-feat.NPos[p3];
427 old_dif=p3-p5;
428 v=(int)((((double)(l3-l5))*perc_lenN)/100);
429 if (v>20) v=20; /* enforce N-search limit for very long reads */
430 if (t5 < v ) {
431 l5=feat.NPos[p5]+1;
432 p5++;
433 }
434 if (t3 < v) {
435 l3=feat.NPos[p3]-1;
436 p3--;
437 }
438 /* restNs=p3-p5; number of Ns in the new CLR */
439 if (p3-p5==old_dif) { /* no change, return */
440 /* don't trim if this may shorten the read too much */
441 //if (l5-l3<min_read_len) return;
442 feat.end5=l5+1;
443 feat.end3=l3+1;
444 return;
445 }
446 else
447 N_analyze(l5,l3, p5,p3);
448 }
449
450
451 bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
452 //count Ns in the sequence
453 feat.init(rseq);
454 l5=feat.end5-1;
455 l3=feat.end3-1;
456 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
457 if (l5==feat.end5-1 && l3==feat.end3-1)
458 return false; //nothing changed
459 l5=feat.end5-1;
460 l3=feat.end3-1;
461 if (l3-l5+1<min_read_len) {
462 feat.valid=false;
463 return true;
464 }
465 feat.N_calc();
466 if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
467 feat.valid=false;
468 l3=l5+1;
469 return true;
470 }
471 return true;
472 }
473
474 //--------------- dust functions ----------------
475 class DNADuster {
476 public:
477 int dustword;
478 int dustwindow;
479 int dustwindow2;
480 int dustcutoff;
481 int mv, iv, jv;
482 int counts[32*32*32];
483 int iis[32*32*32];
484 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
485 dustword=wordsize;
486 dustwindow=winsize;
487 dustwindow2 = (winsize>>1);
488 dustcutoff=cutoff;
489 mv=0;
490 iv=0;
491 jv=0;
492 }
493 void setWindowSize(int value) {
494 dustwindow = value;
495 dustwindow2 = (dustwindow >> 1);
496 }
497 void setWordSize(int value) {
498 dustword=value;
499 }
500 void wo1(int len, const char* s, int ivv) {
501 int i, ii, j, v, t, n, n1, sum;
502 int js, nis;
503 n = 32 * 32 * 32;
504 n1 = n - 1;
505 nis = 0;
506 i = 0;
507 ii = 0;
508 sum = 0;
509 v = 0;
510 for (j=0; j < len; j++, s++) {
511 ii <<= 5;
512 if (*s<=32) {
513 i=0;
514 continue;
515 }
516 ii |= *s - 'A'; //assume uppercase!
517 ii &= n1;
518 i++;
519 if (i >= dustword) {
520 for (js=0; js < nis && iis[js] != ii; js++) ;
521 if (js == nis) {
522 iis[nis] = ii;
523 counts[ii] = 0;
524 nis++;
525 }
526 if ((t = counts[ii]) > 0) {
527 sum += t;
528 v = 10 * sum / j;
529 if (mv < v) {
530 mv = v;
531 iv = ivv;
532 jv = j;
533 }
534 }
535 counts[ii]++;
536 }
537 }
538 }
539
540 int wo(int len, const char* s, int* beg, int* end) {
541 int i, l1;
542 l1 = len - dustword + 1;
543 if (l1 < 0) {
544 *beg = 0;
545 *end = len - 1;
546 return 0;
547 }
548 mv = 0;
549 iv = 0;
550 jv = 0;
551 for (i=0; i < l1; i++) {
552 wo1(len-i, s+i, i);
553 }
554 *beg = iv;
555 *end = iv + jv;
556 return mv;
557 }
558
559 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
560 int i, j, l, a, b, v;
561 if (cutoff==0) cutoff=dustcutoff;
562 a=0;b=0;
563 //GMessage("Dust cutoff=%d\n", cutoff);
564 for (i=0; i < seqlen; i += dustwindow2) {
565 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
566 v = wo(l, seq+i, &a, &b);
567 if (v > cutoff) {
568 //for (j = a; j <= b && j < dustwindow2; j++) {
569 for (j = a; j <= b; j++) {
570 seqmsk[i+j]='N';//could be made lowercase instead
571 }
572 }
573 }
574 //return first;
575 }
576 };
577
578 static DNADuster duster;
579
580 int dust(GStr& rseq) {
581 char* seq=Gstrdup(rseq.chars());
582 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
583 //check the number of Ns:
584 int ncount=0;
585 for (int i=0;i<rseq.length();i++) {
586 if (seq[i]=='N') ncount++;
587 }
588 GFREE(seq);
589 return ncount;
590 }
591
592 // ------------------ adapter matching - triplet matching
593 //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
594 // or the last 9 nucleotides for the 5' adapter
595 //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
596 // check minimum score and
597 // for 3' adapter trimming:
598 // require that the right end of the alignment for either the adaptor OR the read must be
599 // < 3 distance from its right end
600 // for 5' adapter trimming:
601 // require that the left end of the alignment for either the adaptor OR the read must
602 // be at coordinate 0
603
604 bool extendMatch(const char* a, int alen, int ai,
605 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
606 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
607 //if (debug)
608 // GStr dbg(b);
609 //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
610 int a_l=ai; //alignment coordinates on a
611 int a_r=ai+mlen-1;
612 int b_l=bi; //alignment coordinates on b
613 int b_r=bi+mlen-1;
614 int ai_maxscore=ai;
615 int bi_maxscore=bi;
616 int score=mlen*a_m_score;
617 int maxscore=score;
618 int mism5score=a_mis_score;
619 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
620 //try to extend to the left first, if possible
621 while (ai>0 && bi>0) {
622 ai--;
623 bi--;
624 score+= (a[ai]==b[bi])? a_m_score : mism5score;
625 if (score>maxscore) {
626 ai_maxscore=ai;
627 bi_maxscore=bi;
628 maxscore=score;
629 }
630 else if (maxscore-score>a_dropoff_score) break;
631 }
632 a_l=ai_maxscore;
633 b_l=bi_maxscore;
634 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
635 //now extend to the right
636 ai_maxscore=a_r;
637 bi_maxscore=b_r;
638 ai=a_r;
639 bi=b_r;
640 score=maxscore;
641 //sometimes there are extra AAAAs at the end of the read, ignore those
642 if (strcmp(&a[alen-4],"AAAA")==0) {
643 alen-=3;
644 while (a[alen-1]=='A' && alen>ai) alen--;
645 }
646 while (ai<alen-1 && bi<blen-1) {
647 ai++;
648 bi++;
649 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
650 if (a[ai]==b[bi]) { //match
651 score+=a_m_score;
652 if (ai>=alen-2) {
653 score+=a_m_score-(alen-ai-1);
654 }
655 }
656 else { //mismatch
657 score+=a_mis_score;
658 }
659 if (score>maxscore) {
660 ai_maxscore=ai;
661 bi_maxscore=bi;
662 maxscore=score;
663 }
664 else if (maxscore-score>a_dropoff_score) break;
665 }
666 a_r=ai_maxscore;
667 b_r=bi_maxscore;
668 int a_ovh3=alen-a_r-1;
669 int b_ovh3=blen-b_r-1;
670 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
671 int mmovh5=(a_l<b_l)? a_l : b_l;
672 //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
673 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
674 if (a_l<a_ovh3) {
675 //adapter closer to the left end (typical for 5' adapter)
676 l5=a_r+1;
677 l3=alen-1;
678 }
679 else {
680 //adapter matching at the right end (typical for 3' adapter)
681 l5=0;
682 l3=a_l-1;
683 }
684 return true;
685 }
686 return false;
687 }
688
689 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
690 int rlen=seq.length();
691 l5=0;
692 l3=rlen-1;
693 //first try a full match, we might get lucky
694 int fi=-1;
695 if ((fi=seq.index(adapter3))>=0) {
696 if (fi<rlen-fi-a3len) {//match is closer to the right end
697 l5=fi+a3len;
698 l3=rlen-1;
699 }
700 else {
701 l5=0;
702 l3=fi-1;
703 }
704 return true;
705 }
706 //also, for fast detection of other adapter-only reads that start past
707 // the beginning of the adapter sequence, try to see if the first a3len-4
708 // characters of the read are a substring of the adapter
709 GStr rstart=seq.substr(0,a3len-4);
710 if ((fi=adapter3.index(rstart))>=0) {
711 l3=rlen-1;
712 l5=a3len-4;
713 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
714 return true;
715 }
716
717 //another easy case: first half of the adaptor matches
718 int a3hlen=a3len>>1;
719 GStr ahalf=adapter3.substr(0, a3hlen);
720 if ((fi=seq.index(ahalf))>=0) {
721 extendMatch(seq.chars(), rlen, fi,
722 adapter3.chars(), a3len, 0, a3hlen, l5,l3);
723 return true;
724 }
725 //no easy cases, so let's do the word hashing
726 for (int iw=0;iw<6;iw++) {
727 GStr aword=adapter3.substr(iw,3);
728 if ((fi=seq.index(aword))>=0 && rlen-fi>3) {
729 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
730 a3len, iw, 3, l5,l3)) return true;
731 }
732 }
733 return false; //no adapter parts found
734 }
735
736 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
737 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
738 int rlen=seq.length();
739 l5=0;
740 l3=rlen-1;
741 //try to see if adapter is fully included in the read
742 int fi=-1;
743 if ((fi=seq.index(adapter5))>=0) {
744 if (fi<rlen-fi-a5len) {//match is closer to the right end
745 l5=fi+a5len;
746 l3=rlen-1;
747 }
748 else {
749 l5=0;
750 l3=fi-1;
751 }
752 return true;
753 }
754 //for fast detection of adapter-rich reads, check if the first 12
755 //characters of the read are a substring of the adapter
756 GStr rstart=seq.substr(1,12);
757 if ((fi=adapter5.index(rstart))>=0) {
758 //l3=rlen-1;
759 //l5=a5len-4;
760 //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
761 //return true;
762 //if (debug) GMessage(" first 12char of the read match adaptor!\n");
763 extendMatch(seq.chars(), rlen, 1,
764 adapter5.chars(), a5len, fi, 12, l5,l3, true);
765 return true;
766 }
767
768 //another easy case: last 12 characters of the adaptor found as a substring of the read
769 int aplen=12;
770 int apstart=a5len-aplen-2;
771 if (apstart<0) { apstart=0; aplen=a5len-1; }
772 GStr bstr=adapter5.substr(apstart, aplen);
773 if ((fi=seq.index(bstr))>=0) {
774 //if (debug) GMessage(" last 12char of adaptor match the read!\n");
775 extendMatch(seq.chars(), rlen, fi,
776 adapter5.chars(), a5len, apstart, aplen, l5,l3,true);
777 return true;
778 }
779 //no easy cases, find a triplet match as a seed for alignment extension
780 //find triplets at the right end of the adapter
781 for (int iw=0;iw<6;iw++) {
782 apstart=a5len-iw-4;
783 GStr aword=adapter5.substr(apstart,3);
784 if ((fi=seq.index(aword))>=0) {
785 //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
786 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
787 a5len, apstart, 3, l5,l3,true)) {
788 return true;
789 }
790 }
791 }
792 return false; //no adapter parts found
793 }
794
795 //conversion of phred64 to phread33
796 void convertQ64(GStr& q) {
797 for (int i=0;i<q.length();i++) q[i]-=31;
798 }
799
800 void convertQ64(char* q, int len) {
801 for (int i=0;i<len;i++) q[i]-=31;
802 }