ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 9
Committed: Tue Mar 23 17:12:14 2010 UTC (9 years, 6 months ago) by gpertea
File size: 24432 byte(s)
Log Message:
fixed bug causing overtrimming

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5    
6     #define USAGE "Usage:\n\
7     fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
8     [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9     \n\
10     Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
11     optionally collapse duplicates in the input fastq data.\n\
12     \n\
13     Options:\n\
14     -n rename all the reads using the <prefix> followed by a read counter;\n\
15     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
16     the read duplication count\n\
17     -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
18     -5 trim the given adapter or primer sequence at the 5' end of each read\n\
19     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
20     -3 trim the given adapter sequence at the 3' end of each read\n\
21     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
22     -l minimum \"clean\" length at the high quality 5'end that a read must\n\
23     have in order to pass the filter (default: 16)\n\
24     -r write a simple \"trash report\" file listing the discarded reads with a\n\
25     one letter code for the reason why they were trashed\n\
26     -C collapse duplicate reads and add a prefix with their count to the read\n\
27     name\n\
28     -D apply a low-complexity (dust) filter and discard any read that has at\n\
29     least half of it masked by this\n\
30     -Q quality values in the input data are interpreted as phred64, convert\n\
31     them to phred33\n\
32     "
33     // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
34     FILE* f_out=NULL; //stdout if not provided
35     FILE* f_in=NULL; //input fastq (stdin if not provided)
36     FILE* freport=NULL;
37     bool debug=false;
38     bool doCollapse=false;
39     bool doDust=false;
40     bool trashReport=false;
41     bool rawFormat=false;
42     int min_read_len=16;
43     int dust_cutoff=16;
44     bool isfasta=false;
45     bool phred64=false;
46     GStr prefix;
47     GStr adapter3;
48     GStr adapter5;
49     int a3len=0;
50     int a5len=0;
51     // adaptor matching metrics -- for extendMatch() function
52     static const int a_m_score=2; //match score
53     static const int a_mis_score=-3; //mismatch
54     static const int a_dropoff_score=7;
55     static const int a_min_score=7;
56    
57     // element in dhash:
58     class FqDupRec {
59     public:
60     int count; //how many of these reads are the same
61     int len; //length of qv
62     char* firstname; //optional, only if we want to keep the original read names
63     char* qv;
64     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
65     len=0;
66     qv=NULL;
67     firstname=NULL;
68     count=0;
69     if (qstr!=NULL) {
70     qv=Gstrdup(qstr->chars());
71     len=qstr->length();
72     count++;
73     }
74     if (rname!=NULL) firstname=Gstrdup(rname);
75     }
76     ~FqDupRec() {
77     GFREE(qv);
78     GFREE(firstname);
79     }
80     void add(GStr& d) { //collapse another record into this one
81     if (d.length()!=len)
82     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
83     count++;
84     for (int i=0;i<len;i++)
85     qv[i]=(qv[i]+d[i])/2;
86     }
87     };
88    
89     void openfw(FILE* &f, GArgs& args, char opt) {
90     GStr s=args.getOpt(opt);
91     if (!s.is_empty()) {
92     if (s=='-') f=stdout;
93     else {
94     f=fopen(s,"w");
95     if (f==NULL) GError("Error creating file: %s\n", s.chars());
96     }
97     }
98     }
99    
100     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
101     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
102    
103     GHash<FqDupRec> dhash; //hash to keep track of duplicates
104    
105     bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
106    
107     int dust(GStr& seq);
108     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
109     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
110    
111     void convertQ64(char* q, int len);
112     void convertQ64(GStr& q);
113    
114     int main(int argc, char * const argv[]) {
115 gpertea 9 GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
116 gpertea 4 int e;
117     int icounter=0; //counter for input reads
118     int outcounter=0; //counter for output reads
119     if ((e=args.isError())>0) {
120     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
121     exit(224);
122     }
123 gpertea 9 debug=(args.getOpt('Y')!=NULL);
124 gpertea 4 phred64=(args.getOpt('Q')!=NULL);
125     doCollapse=(args.getOpt('C')!=NULL);
126     doDust=(args.getOpt('D')!=NULL);
127     rawFormat=(args.getOpt('R')!=NULL);
128     if (rawFormat) {
129     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
130     }
131     prefix=args.getOpt('n');
132     GStr s=args.getOpt('l');
133     if (!s.is_empty())
134     min_read_len=s.asInt();
135     s=args.getOpt('d');
136     if (!s.is_empty()) {
137     dust_cutoff=s.asInt();
138     doDust=true;
139     }
140    
141     if (args.getOpt('3')!=NULL) {
142     adapter3=args.getOpt('3');
143     adapter3.upper();
144     a3len=adapter3.length();
145     }
146     if (args.getOpt('5')!=NULL) {
147     adapter5=args.getOpt('5');
148     adapter5.upper();
149     a5len=adapter5.length();
150     }
151     trashReport= (args.getOpt('r')!=NULL);
152     if (args.startNonOpt()==0) {
153     GMessage(USAGE);
154     exit(224);
155     }
156    
157     openfw(f_out, args, 'o');
158     if (trashReport)
159     openfw(freport, args, 'r');
160     char* infile=NULL;
161     while ((infile=args.nextNonOpt())!=NULL) {
162     GStr infname(infile);
163     if (strcmp(infile,"-")==0) {
164     f_in=stdin; infname="stdin"; }
165     else {
166     f_in=fopen(infile,"r");
167     if (f_in==NULL)
168     GError("Cannot open input file %s!\n",infile);
169     }
170     GLineReader fq(f_in);
171     char* l=NULL;
172     while ((l=fq.getLine())!=NULL) {
173     GStr rname; //current read name
174     GStr rseq; //current read sequence
175     GStr rqv; //current read quality values
176     GStr s;
177     if (rawFormat) {
178     //TODO: implement qseq parsing here
179     //if (raw type=N) then continue; //skip invalid/bad records
180    
181     } //raw qseq format
182     else { // FASTQ or FASTA
183     isfasta=(l[0]=='>');
184     if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
185     s=l;
186     rname=&(l[1]);
187     icounter++;
188     //GMessage("readname=%s\n",rname.chars());
189     for (int i=0;i<rname.length();i++)
190     if (rname[i]<=' ') { rname.cut(i); break; }
191     //now get the sequence
192     if ((l=fq.getLine())==NULL)
193     GError("Error: unexpected EOF after header for %s\n",rname.chars());
194     rseq=l; //this must be the DNA line
195     if (isfasta) {
196     while ((l=fq.getLine())!=NULL) {
197     //fasta can have multiple sequence lines
198     if (l[0]=='>') {
199     fq.pushBack();
200     break; //
201     }
202     rseq+=l;
203     }
204     } //multi-line fasta file
205     if (!isfasta) {
206     if ((l=fq.getLine())==NULL)
207     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
208     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
209     if ((l=fq.getLine())==NULL)
210     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
211     rqv=l;
212     if (rqv.length()!=rseq.length())
213     GError("Error: qv len != seq len for %s\n", rname.chars());
214     }
215     } //<-- FASTA or FASTQ
216     rseq.upper();
217     int l5=0;
218     int l3=rseq.length()-1;
219     if (l3-l5+1<min_read_len) {
220     if (trashReport) {
221     fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
222     }
223     continue;
224     }
225     if (ntrim(rseq, rqv, l5, l3)) {
226     if (l3-l5+1<min_read_len) {
227     if (trashReport) {
228     fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
229     }
230     continue; //invalid read
231     }
232     //-- keep only the l5..l3 range
233     rseq=rseq.substr(l5, l3-l5+1);
234     if (!rqv.is_empty())
235     rqv=rqv.substr(l5, l3-l5+1);
236     }
237    
238     if (a3len>0) {
239     if (trim_adapter3(rseq, l5, l3)) {
240     if (l3-l5+1<min_read_len) {
241     if (trashReport) {
242     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
243     }
244     continue;
245     }
246     //-- keep only the l5..l3 range
247     rseq=rseq.substr(l5, l3-l5+1);
248     if (!rqv.is_empty())
249     rqv=rqv.substr(l5, l3-l5+1);
250     }//some adapter was trimmed
251     } //adapter trimming
252     if (a5len>0) {
253     if (trim_adapter5(rseq, l5, l3)) {
254     if (l3-l5+1<min_read_len) {
255     if (trashReport) {
256     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
257     }
258     continue;
259     }
260     //-- keep only the l5..l3 range
261     rseq=rseq.substr(l5, l3-l5+1);
262     if (!rqv.is_empty())
263     rqv=rqv.substr(l5, l3-l5+1);
264     }//some adapter was trimmed
265     } //adapter trimming
266     if (doCollapse) {
267     //keep read for later
268     FqDupRec* dr=dhash.Find(rseq.chars());
269     if (dr==NULL) { //new entry
270     //if (prefix.is_empty())
271     dhash.Add(rseq.chars(),
272     new FqDupRec(&rqv, rname.chars()));
273     //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
274     }
275     else
276     dr->add(rqv);
277     } //collapsing duplicates
278     else { //not collapsing duplicates
279     //do the dust filter now
280     if (doDust) {
281     int dustbases=dust(rseq);
282     if (dustbases>(rseq.length()>>1)) {
283     if (trashReport) {
284     fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
285     }
286     continue;
287     }
288     }
289     //print this record here
290     outcounter++;
291     if (isfasta) {
292     if (prefix.is_empty())
293     fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
294     else
295     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
296     rseq.chars());
297     }
298     else { //fastq
299     if (phred64) convertQ64(rqv);
300     if (prefix.is_empty())
301     fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
302     else
303     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
304     rseq.chars(),rqv.chars() );
305     }
306     } //not collapsing duplicates
307     } //for each fastq record
308     } //while each input file
309     FRCLOSE(f_in);
310     if (doCollapse) {
311     outcounter=0;
312     int maxdup_count=1;
313     char* maxdup_seq=NULL;
314     dhash.startIterate();
315     FqDupRec* qd=NULL;
316     char* seq=NULL;
317     while ((qd=dhash.NextData(seq))!=NULL) {
318     GStr rseq(seq);
319     //do the dusting here
320     if (doDust) {
321     int dustbases=dust(rseq);
322     if (dustbases>(rseq.length()>>1)) {
323     if (trashReport && qd->firstname!=NULL) {
324     fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
325     }
326     continue;
327     }
328     }
329     outcounter++;
330     if (qd->count>maxdup_count) {
331     maxdup_count=qd->count;
332     maxdup_seq=seq;
333     }
334     if (isfasta) {
335     if (prefix.is_empty()) {
336     fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count,
337     rseq.chars());
338     }
339     else { //use custom read name
340     fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
341     qd->count, rseq.chars());
342     }
343     }
344     else { //fastq format
345     if (phred64) convertQ64(qd->qv, qd->len);
346     if (prefix.is_empty()) {
347     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
348     rseq.chars(), qd->qv);
349     }
350     else { //use custom read name
351     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
352     qd->count, rseq.chars(), qd->qv);
353     }
354     }
355     }//for each element of dhash
356     if (maxdup_count>1) {
357     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
358     }
359     } //report collapsed dhash entries
360     GMessage("Number of input reads: %9d\n", icounter);
361     GMessage(" Output records: %9d\n", outcounter);
362     if (trashReport) {
363     FWCLOSE(freport);
364     }
365    
366     FWCLOSE(f_out);
367     }
368    
369     class NData {
370     public:
371     int NPos[1024]; //there should be no reads longer than 1K ?
372     int NCount;
373     int end5;
374     int end3;
375     int seqlen;
376     double perc_N; //percentage of Ns in end5..end3 range only!
377     const char* seq;
378     bool valid;
379     NData() {
380     NCount=0;
381     end5=0;
382     end3=0;
383     seq=NULL;
384     perc_N=0;
385     valid=true;
386     }
387     void init(GStr& rseq) {
388     NCount=0;
389     perc_N=0;
390     seqlen=rseq.length();
391     seq=rseq.chars();
392     end5=1;
393     end3=seqlen;
394     for (int i=0;i<seqlen;i++)
395     if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
396     NPos[NCount]=i;
397     NCount++;
398     }
399     perc_N=(NCount*100.0)/seqlen;
400     valid=true;
401     }
402     void N_calc() { //only in the region after trimming
403     int n=0;
404     for (int i=end3-1;i<end5;i++) {
405     if (seq[i]=='N') n++;
406     }
407     perc_N=(n*100.0)/(end5-end3+1);
408     }
409     };
410    
411     static NData feat;
412     int perc_lenN=12; // incremental distance from ends, in percentage of
413     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
414    
415     void N_analyze(int l5, int l3, int p5, int p3) {
416     /* assumes feat was filled properly */
417     int old_dif, t5,t3,v;
418     if (l3-l5<min_read_len || p5>p3 ) {
419     feat.end5=l5+1;
420     feat.end3=l3+1;
421     return;
422     }
423    
424     t5=feat.NPos[p5]-l5;
425     t3=l3-feat.NPos[p3];
426     old_dif=p3-p5;
427     v=(int)((((double)(l3-l5))*perc_lenN)/100);
428     if (v>20) v=20; /* enforce N-search limit for very long reads */
429     if (t5 < v ) {
430     l5=feat.NPos[p5]+1;
431     p5++;
432     }
433     if (t3 < v) {
434     l3=feat.NPos[p3]-1;
435     p3--;
436     }
437     /* restNs=p3-p5; number of Ns in the new CLR */
438     if (p3-p5==old_dif) { /* no change, return */
439     /* don't trim if this may shorten the read too much */
440     //if (l5-l3<min_read_len) return;
441     feat.end5=l5+1;
442     feat.end3=l3+1;
443     return;
444     }
445     else
446     N_analyze(l5,l3, p5,p3);
447     }
448    
449    
450     bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
451     //count Ns in the sequence
452     feat.init(rseq);
453     l5=feat.end5-1;
454     l3=feat.end3-1;
455     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
456     if (l5==feat.end5-1 && l3==feat.end3-1)
457     return false; //nothing changed
458     l5=feat.end5-1;
459     l3=feat.end3-1;
460     if (l3-l5+1<min_read_len) {
461     feat.valid=false;
462     return true;
463     }
464     feat.N_calc();
465     if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
466     feat.valid=false;
467     l3=l5+1;
468     return true;
469     }
470     return true;
471     }
472    
473     //--------------- dust functions ----------------
474     class DNADuster {
475     public:
476     int dustword;
477     int dustwindow;
478     int dustwindow2;
479     int dustcutoff;
480     int mv, iv, jv;
481     int counts[32*32*32];
482     int iis[32*32*32];
483     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
484     dustword=wordsize;
485     dustwindow=winsize;
486     dustwindow2 = (winsize>>1);
487     dustcutoff=cutoff;
488     mv=0;
489     iv=0;
490     jv=0;
491     }
492     void setWindowSize(int value) {
493     dustwindow = value;
494     dustwindow2 = (dustwindow >> 1);
495     }
496     void setWordSize(int value) {
497     dustword=value;
498     }
499     void wo1(int len, const char* s, int ivv) {
500     int i, ii, j, v, t, n, n1, sum;
501     int js, nis;
502     n = 32 * 32 * 32;
503     n1 = n - 1;
504     nis = 0;
505     i = 0;
506     ii = 0;
507     sum = 0;
508     v = 0;
509     for (j=0; j < len; j++, s++) {
510     ii <<= 5;
511     if (*s<=32) {
512     i=0;
513     continue;
514     }
515     ii |= *s - 'A'; //assume uppercase!
516     ii &= n1;
517     i++;
518     if (i >= dustword) {
519     for (js=0; js < nis && iis[js] != ii; js++) ;
520     if (js == nis) {
521     iis[nis] = ii;
522     counts[ii] = 0;
523     nis++;
524     }
525     if ((t = counts[ii]) > 0) {
526     sum += t;
527     v = 10 * sum / j;
528     if (mv < v) {
529     mv = v;
530     iv = ivv;
531     jv = j;
532     }
533     }
534     counts[ii]++;
535     }
536     }
537     }
538    
539     int wo(int len, const char* s, int* beg, int* end) {
540     int i, l1;
541     l1 = len - dustword + 1;
542     if (l1 < 0) {
543     *beg = 0;
544     *end = len - 1;
545     return 0;
546     }
547     mv = 0;
548     iv = 0;
549     jv = 0;
550     for (i=0; i < l1; i++) {
551     wo1(len-i, s+i, i);
552     }
553     *beg = iv;
554     *end = iv + jv;
555     return mv;
556     }
557    
558     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
559     int i, j, l, a, b, v;
560     if (cutoff==0) cutoff=dustcutoff;
561     a=0;b=0;
562     //GMessage("Dust cutoff=%d\n", cutoff);
563     for (i=0; i < seqlen; i += dustwindow2) {
564     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
565     v = wo(l, seq+i, &a, &b);
566     if (v > cutoff) {
567     //for (j = a; j <= b && j < dustwindow2; j++) {
568     for (j = a; j <= b; j++) {
569     seqmsk[i+j]='N';//could be made lowercase instead
570     }
571     }
572     }
573     //return first;
574     }
575     };
576    
577     static DNADuster duster;
578    
579     int dust(GStr& rseq) {
580     char* seq=Gstrdup(rseq.chars());
581     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
582     //check the number of Ns:
583     int ncount=0;
584     for (int i=0;i<rseq.length();i++) {
585     if (seq[i]=='N') ncount++;
586     }
587     GFREE(seq);
588     return ncount;
589     }
590    
591     // ------------------ adapter matching - triplet matching
592     //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
593     // or the last 9 nucleotides for the 5' adapter
594     //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
595     // check minimum score and
596     // for 3' adapter trimming:
597     // require that the right end of the alignment for either the adaptor OR the read must be
598     // < 3 distance from its right end
599     // for 5' adapter trimming:
600     // require that the left end of the alignment for either the adaptor OR the read must
601     // be at coordinate 0
602    
603     bool extendMatch(const char* a, int alen, int ai,
604 gpertea 9 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
605 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
606 gpertea 9 //if (debug)
607     // GStr dbg(b);
608 gpertea 4 //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
609     int a_l=ai; //alignment coordinates on a
610     int a_r=ai+mlen-1;
611     int b_l=bi; //alignment coordinates on b
612     int b_r=bi+mlen-1;
613     int ai_maxscore=ai;
614     int bi_maxscore=bi;
615     int score=mlen*a_m_score;
616     int maxscore=score;
617 gpertea 9 int mism5score=a_mis_score;
618     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
619 gpertea 4 //try to extend to the left first, if possible
620     while (ai>0 && bi>0) {
621     ai--;
622     bi--;
623     score+= (a[ai]==b[bi])? a_m_score : mism5score;
624     if (score>maxscore) {
625     ai_maxscore=ai;
626     bi_maxscore=bi;
627     maxscore=score;
628     }
629     else if (maxscore-score>a_dropoff_score) break;
630     }
631     a_l=ai_maxscore;
632     b_l=bi_maxscore;
633 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
634 gpertea 4 //now extend to the right
635     ai_maxscore=a_r;
636     bi_maxscore=b_r;
637     ai=a_r;
638     bi=b_r;
639     score=maxscore;
640     //sometimes there are extra AAAAs at the end of the read, ignore those
641     if (strcmp(&a[alen-4],"AAAA")==0) {
642     alen-=3;
643     while (a[alen-1]=='A' && alen>ai) alen--;
644     }
645     while (ai<alen-1 && bi<blen-1) {
646     ai++;
647     bi++;
648     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
649     if (a[ai]==b[bi]) { //match
650     score+=a_m_score;
651     if (ai>=alen-2) {
652     score+=a_m_score-(alen-ai-1);
653     }
654     }
655     else { //mismatch
656     score+=a_mis_score;
657     }
658     if (score>maxscore) {
659     ai_maxscore=ai;
660     bi_maxscore=bi;
661     maxscore=score;
662     }
663     else if (maxscore-score>a_dropoff_score) break;
664     }
665     a_r=ai_maxscore;
666     b_r=bi_maxscore;
667 gpertea 9 int a_ovh3=alen-a_r-1;
668     int b_ovh3=blen-b_r-1;
669     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
670     int mmovh5=(a_l<b_l)? a_l : b_l;
671     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
672     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
673     if (a_l<a_ovh3) {
674 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
675     l5=a_r+1;
676     l3=alen-1;
677     }
678     else {
679     //adapter matching at the right end (typical for 3' adapter)
680     l5=0;
681     l3=a_l-1;
682     }
683     return true;
684     }
685     return false;
686     }
687    
688     bool trim_adapter3(GStr& seq, int&l5, int &l3) {
689     int rlen=seq.length();
690     l5=0;
691     l3=rlen-1;
692     //first try a full match, we might get lucky
693     int fi=-1;
694     if ((fi=seq.index(adapter3))>=0) {
695     if (fi<rlen-fi-a3len) {//match is closer to the right end
696     l5=fi+a3len;
697     l3=rlen-1;
698     }
699     else {
700     l5=0;
701     l3=fi-1;
702     }
703     return true;
704     }
705     //also, for fast detection of other adapter-only reads that start past
706     // the beginning of the adapter sequence, try to see if the first a3len-4
707     // characters of the read are a substring of the adapter
708     GStr rstart=seq.substr(0,a3len-4);
709     if ((fi=adapter3.index(rstart))>=0) {
710     l3=rlen-1;
711     l5=a3len-4;
712     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
713     return true;
714     }
715    
716     //another easy case: first half of the adaptor matches
717     int a3hlen=a3len>>1;
718     GStr ahalf=adapter3.substr(0, a3hlen);
719     if ((fi=seq.index(ahalf))>=0) {
720     extendMatch(seq.chars(), rlen, fi,
721     adapter3.chars(), a3len, 0, a3hlen, l5,l3);
722     return true;
723     }
724     //no easy cases, so let's do the word hashing
725     for (int iw=0;iw<6;iw++) {
726     GStr aword=adapter3.substr(iw,3);
727     if ((fi=seq.index(aword))>=0 && rlen-fi>3) {
728     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
729     a3len, iw, 3, l5,l3)) return true;
730     }
731     }
732     return false; //no adapter parts found
733     }
734    
735     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
736 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
737 gpertea 4 int rlen=seq.length();
738     l5=0;
739     l3=rlen-1;
740     //try to see if adapter is fully included in the read
741     int fi=-1;
742     if ((fi=seq.index(adapter5))>=0) {
743     if (fi<rlen-fi-a5len) {//match is closer to the right end
744     l5=fi+a5len;
745     l3=rlen-1;
746     }
747     else {
748     l5=0;
749     l3=fi-1;
750     }
751     return true;
752     }
753     //for fast detection of adapter-rich reads, check if the first 12
754     //characters of the read are a substring of the adapter
755     GStr rstart=seq.substr(1,12);
756     if ((fi=adapter5.index(rstart))>=0) {
757     //l3=rlen-1;
758     //l5=a5len-4;
759     //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
760     //return true;
761 gpertea 9 //if (debug) GMessage(" first 12char of the read match adaptor!\n");
762 gpertea 4 extendMatch(seq.chars(), rlen, 1,
763 gpertea 9 adapter5.chars(), a5len, fi, 12, l5,l3, true);
764 gpertea 4 return true;
765     }
766    
767     //another easy case: last 12 characters of the adaptor found as a substring of the read
768     int aplen=12;
769     int apstart=a5len-aplen-2;
770     if (apstart<0) { apstart=0; aplen=a5len-1; }
771     GStr bstr=adapter5.substr(apstart, aplen);
772     if ((fi=seq.index(bstr))>=0) {
773 gpertea 9 //if (debug) GMessage(" last 12char of adaptor match the read!\n");
774 gpertea 4 extendMatch(seq.chars(), rlen, fi,
775 gpertea 9 adapter5.chars(), a5len, apstart, aplen, l5,l3,true);
776 gpertea 4 return true;
777     }
778     //no easy cases, find a triplet match as a seed for alignment extension
779     //find triplets at the right end of the adapter
780     for (int iw=0;iw<6;iw++) {
781     apstart=a5len-iw-4;
782     GStr aword=adapter5.substr(apstart,3);
783     if ((fi=seq.index(aword))>=0) {
784 gpertea 9 //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
785 gpertea 4 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
786 gpertea 9 a5len, apstart, 3, l5,l3,true)) {
787 gpertea 4 return true;
788     }
789     }
790     }
791     return false; //no adapter parts found
792     }
793    
794     //conversion of phred64 to phread33
795     void convertQ64(GStr& q) {
796     for (int i=0;i<q.length();i++) q[i]-=31;
797     }
798    
799     void convertQ64(char* q, int len) {
800     for (int i=0;i<len;i++) q[i]-=31;
801     }