ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 11
Committed: Tue Mar 23 18:03:12 2010 UTC (9 years, 7 months ago) by gpertea
File size: 24503 byte(s)
Log Message:
update l5,l3 to defaults if no trimming is performed

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5    
6     #define USAGE "Usage:\n\
7     fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
8     [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9     \n\
10     Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
11     optionally collapse duplicates in the input fastq data.\n\
12     \n\
13     Options:\n\
14     -n rename all the reads using the <prefix> followed by a read counter;\n\
15     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
16     the read duplication count\n\
17     -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
18     -5 trim the given adapter or primer sequence at the 5' end of each read\n\
19     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
20     -3 trim the given adapter sequence at the 3' end of each read\n\
21     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
22     -l minimum \"clean\" length at the high quality 5'end that a read must\n\
23     have in order to pass the filter (default: 16)\n\
24     -r write a simple \"trash report\" file listing the discarded reads with a\n\
25     one letter code for the reason why they were trashed\n\
26     -C collapse duplicate reads and add a prefix with their count to the read\n\
27     name\n\
28     -D apply a low-complexity (dust) filter and discard any read that has at\n\
29     least half of it masked by this\n\
30     -Q quality values in the input data are interpreted as phred64, convert\n\
31     them to phred33\n\
32     "
33     // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
34     FILE* f_out=NULL; //stdout if not provided
35     FILE* f_in=NULL; //input fastq (stdin if not provided)
36     FILE* freport=NULL;
37     bool debug=false;
38     bool doCollapse=false;
39     bool doDust=false;
40     bool trashReport=false;
41     bool rawFormat=false;
42     int min_read_len=16;
43     int dust_cutoff=16;
44     bool isfasta=false;
45     bool phred64=false;
46     GStr prefix;
47     GStr adapter3;
48     GStr adapter5;
49     int a3len=0;
50     int a5len=0;
51     // adaptor matching metrics -- for extendMatch() function
52     static const int a_m_score=2; //match score
53     static const int a_mis_score=-3; //mismatch
54     static const int a_dropoff_score=7;
55     static const int a_min_score=7;
56    
57     // element in dhash:
58     class FqDupRec {
59     public:
60     int count; //how many of these reads are the same
61     int len; //length of qv
62     char* firstname; //optional, only if we want to keep the original read names
63     char* qv;
64     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
65     len=0;
66     qv=NULL;
67     firstname=NULL;
68     count=0;
69     if (qstr!=NULL) {
70     qv=Gstrdup(qstr->chars());
71     len=qstr->length();
72     count++;
73     }
74     if (rname!=NULL) firstname=Gstrdup(rname);
75     }
76     ~FqDupRec() {
77     GFREE(qv);
78     GFREE(firstname);
79     }
80     void add(GStr& d) { //collapse another record into this one
81     if (d.length()!=len)
82     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
83     count++;
84     for (int i=0;i<len;i++)
85     qv[i]=(qv[i]+d[i])/2;
86     }
87     };
88    
89     void openfw(FILE* &f, GArgs& args, char opt) {
90     GStr s=args.getOpt(opt);
91     if (!s.is_empty()) {
92     if (s=='-') f=stdout;
93     else {
94     f=fopen(s,"w");
95     if (f==NULL) GError("Error creating file: %s\n", s.chars());
96     }
97     }
98     }
99    
100     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
101     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
102    
103     GHash<FqDupRec> dhash; //hash to keep track of duplicates
104    
105     bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
106    
107     int dust(GStr& seq);
108     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
109     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
110    
111     void convertQ64(char* q, int len);
112     void convertQ64(GStr& q);
113    
114     int main(int argc, char * const argv[]) {
115 gpertea 9 GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
116 gpertea 4 int e;
117     int icounter=0; //counter for input reads
118     int outcounter=0; //counter for output reads
119     if ((e=args.isError())>0) {
120     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
121     exit(224);
122     }
123 gpertea 9 debug=(args.getOpt('Y')!=NULL);
124 gpertea 4 phred64=(args.getOpt('Q')!=NULL);
125     doCollapse=(args.getOpt('C')!=NULL);
126     doDust=(args.getOpt('D')!=NULL);
127     rawFormat=(args.getOpt('R')!=NULL);
128     if (rawFormat) {
129     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
130     }
131     prefix=args.getOpt('n');
132     GStr s=args.getOpt('l');
133     if (!s.is_empty())
134     min_read_len=s.asInt();
135     s=args.getOpt('d');
136     if (!s.is_empty()) {
137     dust_cutoff=s.asInt();
138     doDust=true;
139     }
140    
141     if (args.getOpt('3')!=NULL) {
142     adapter3=args.getOpt('3');
143     adapter3.upper();
144     a3len=adapter3.length();
145     }
146     if (args.getOpt('5')!=NULL) {
147     adapter5=args.getOpt('5');
148     adapter5.upper();
149     a5len=adapter5.length();
150     }
151     trashReport= (args.getOpt('r')!=NULL);
152     if (args.startNonOpt()==0) {
153     GMessage(USAGE);
154     exit(224);
155     }
156    
157     openfw(f_out, args, 'o');
158 gpertea 10 if (f_out==NULL) f_out=stdout;
159 gpertea 4 if (trashReport)
160     openfw(freport, args, 'r');
161     char* infile=NULL;
162     while ((infile=args.nextNonOpt())!=NULL) {
163     GStr infname(infile);
164     if (strcmp(infile,"-")==0) {
165     f_in=stdin; infname="stdin"; }
166     else {
167     f_in=fopen(infile,"r");
168     if (f_in==NULL)
169     GError("Cannot open input file %s!\n",infile);
170     }
171     GLineReader fq(f_in);
172     char* l=NULL;
173     while ((l=fq.getLine())!=NULL) {
174     GStr rname; //current read name
175     GStr rseq; //current read sequence
176     GStr rqv; //current read quality values
177     GStr s;
178     if (rawFormat) {
179     //TODO: implement qseq parsing here
180     //if (raw type=N) then continue; //skip invalid/bad records
181    
182     } //raw qseq format
183     else { // FASTQ or FASTA
184     isfasta=(l[0]=='>');
185     if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
186     s=l;
187     rname=&(l[1]);
188     icounter++;
189     //GMessage("readname=%s\n",rname.chars());
190     for (int i=0;i<rname.length();i++)
191     if (rname[i]<=' ') { rname.cut(i); break; }
192     //now get the sequence
193     if ((l=fq.getLine())==NULL)
194     GError("Error: unexpected EOF after header for %s\n",rname.chars());
195     rseq=l; //this must be the DNA line
196     if (isfasta) {
197     while ((l=fq.getLine())!=NULL) {
198     //fasta can have multiple sequence lines
199     if (l[0]=='>') {
200     fq.pushBack();
201     break; //
202     }
203     rseq+=l;
204     }
205     } //multi-line fasta file
206     if (!isfasta) {
207     if ((l=fq.getLine())==NULL)
208     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
209     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
210     if ((l=fq.getLine())==NULL)
211     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
212     rqv=l;
213     if (rqv.length()!=rseq.length())
214     GError("Error: qv len != seq len for %s\n", rname.chars());
215     }
216     } //<-- FASTA or FASTQ
217     rseq.upper();
218     int l5=0;
219     int l3=rseq.length()-1;
220     if (l3-l5+1<min_read_len) {
221     if (trashReport) {
222     fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223     }
224     continue;
225     }
226     if (ntrim(rseq, rqv, l5, l3)) {
227     if (l3-l5+1<min_read_len) {
228     if (trashReport) {
229     fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230     }
231     continue; //invalid read
232     }
233     //-- keep only the l5..l3 range
234     rseq=rseq.substr(l5, l3-l5+1);
235     if (!rqv.is_empty())
236     rqv=rqv.substr(l5, l3-l5+1);
237     }
238    
239     if (a3len>0) {
240     if (trim_adapter3(rseq, l5, l3)) {
241     if (l3-l5+1<min_read_len) {
242     if (trashReport) {
243     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244     }
245     continue;
246     }
247     //-- keep only the l5..l3 range
248     rseq=rseq.substr(l5, l3-l5+1);
249     if (!rqv.is_empty())
250     rqv=rqv.substr(l5, l3-l5+1);
251     }//some adapter was trimmed
252     } //adapter trimming
253     if (a5len>0) {
254     if (trim_adapter5(rseq, l5, l3)) {
255     if (l3-l5+1<min_read_len) {
256     if (trashReport) {
257     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258     }
259     continue;
260     }
261     //-- keep only the l5..l3 range
262     rseq=rseq.substr(l5, l3-l5+1);
263     if (!rqv.is_empty())
264     rqv=rqv.substr(l5, l3-l5+1);
265     }//some adapter was trimmed
266     } //adapter trimming
267     if (doCollapse) {
268     //keep read for later
269     FqDupRec* dr=dhash.Find(rseq.chars());
270     if (dr==NULL) { //new entry
271     //if (prefix.is_empty())
272     dhash.Add(rseq.chars(),
273     new FqDupRec(&rqv, rname.chars()));
274     //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
275     }
276     else
277     dr->add(rqv);
278     } //collapsing duplicates
279     else { //not collapsing duplicates
280     //do the dust filter now
281     if (doDust) {
282     int dustbases=dust(rseq);
283     if (dustbases>(rseq.length()>>1)) {
284     if (trashReport) {
285     fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
286     }
287     continue;
288     }
289     }
290     //print this record here
291     outcounter++;
292     if (isfasta) {
293     if (prefix.is_empty())
294     fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295     else
296     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297     rseq.chars());
298     }
299     else { //fastq
300     if (phred64) convertQ64(rqv);
301     if (prefix.is_empty())
302     fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303     else
304     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305     rseq.chars(),rqv.chars() );
306     }
307     } //not collapsing duplicates
308     } //for each fastq record
309     } //while each input file
310     FRCLOSE(f_in);
311     if (doCollapse) {
312     outcounter=0;
313     int maxdup_count=1;
314     char* maxdup_seq=NULL;
315     dhash.startIterate();
316     FqDupRec* qd=NULL;
317     char* seq=NULL;
318     while ((qd=dhash.NextData(seq))!=NULL) {
319     GStr rseq(seq);
320     //do the dusting here
321     if (doDust) {
322     int dustbases=dust(rseq);
323     if (dustbases>(rseq.length()>>1)) {
324     if (trashReport && qd->firstname!=NULL) {
325     fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326     }
327     continue;
328     }
329     }
330     outcounter++;
331     if (qd->count>maxdup_count) {
332     maxdup_count=qd->count;
333     maxdup_seq=seq;
334     }
335     if (isfasta) {
336     if (prefix.is_empty()) {
337 gpertea 10 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
338 gpertea 4 rseq.chars());
339     }
340     else { //use custom read name
341 gpertea 10 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
342 gpertea 4 qd->count, rseq.chars());
343     }
344     }
345     else { //fastq format
346     if (phred64) convertQ64(qd->qv, qd->len);
347     if (prefix.is_empty()) {
348     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
349     rseq.chars(), qd->qv);
350     }
351     else { //use custom read name
352     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353     qd->count, rseq.chars(), qd->qv);
354     }
355     }
356     }//for each element of dhash
357     if (maxdup_count>1) {
358     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359     }
360     } //report collapsed dhash entries
361     GMessage("Number of input reads: %9d\n", icounter);
362     GMessage(" Output records: %9d\n", outcounter);
363     if (trashReport) {
364     FWCLOSE(freport);
365     }
366    
367     FWCLOSE(f_out);
368     }
369    
370     class NData {
371     public:
372     int NPos[1024]; //there should be no reads longer than 1K ?
373     int NCount;
374     int end5;
375     int end3;
376     int seqlen;
377     double perc_N; //percentage of Ns in end5..end3 range only!
378     const char* seq;
379     bool valid;
380     NData() {
381     NCount=0;
382     end5=0;
383     end3=0;
384     seq=NULL;
385     perc_N=0;
386     valid=true;
387     }
388     void init(GStr& rseq) {
389     NCount=0;
390     perc_N=0;
391     seqlen=rseq.length();
392     seq=rseq.chars();
393     end5=1;
394     end3=seqlen;
395     for (int i=0;i<seqlen;i++)
396     if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
397     NPos[NCount]=i;
398     NCount++;
399     }
400     perc_N=(NCount*100.0)/seqlen;
401     valid=true;
402     }
403     void N_calc() { //only in the region after trimming
404     int n=0;
405     for (int i=end3-1;i<end5;i++) {
406     if (seq[i]=='N') n++;
407     }
408     perc_N=(n*100.0)/(end5-end3+1);
409     }
410     };
411    
412     static NData feat;
413     int perc_lenN=12; // incremental distance from ends, in percentage of
414     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
415    
416     void N_analyze(int l5, int l3, int p5, int p3) {
417     /* assumes feat was filled properly */
418     int old_dif, t5,t3,v;
419     if (l3-l5<min_read_len || p5>p3 ) {
420     feat.end5=l5+1;
421     feat.end3=l3+1;
422     return;
423     }
424    
425     t5=feat.NPos[p5]-l5;
426     t3=l3-feat.NPos[p3];
427     old_dif=p3-p5;
428     v=(int)((((double)(l3-l5))*perc_lenN)/100);
429     if (v>20) v=20; /* enforce N-search limit for very long reads */
430     if (t5 < v ) {
431     l5=feat.NPos[p5]+1;
432     p5++;
433     }
434     if (t3 < v) {
435     l3=feat.NPos[p3]-1;
436     p3--;
437     }
438     /* restNs=p3-p5; number of Ns in the new CLR */
439     if (p3-p5==old_dif) { /* no change, return */
440     /* don't trim if this may shorten the read too much */
441     //if (l5-l3<min_read_len) return;
442     feat.end5=l5+1;
443     feat.end3=l3+1;
444     return;
445     }
446     else
447     N_analyze(l5,l3, p5,p3);
448     }
449    
450    
451     bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
452     //count Ns in the sequence
453     feat.init(rseq);
454     l5=feat.end5-1;
455     l3=feat.end3-1;
456     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
457     if (l5==feat.end5-1 && l3==feat.end3-1)
458     return false; //nothing changed
459     l5=feat.end5-1;
460     l3=feat.end3-1;
461     if (l3-l5+1<min_read_len) {
462     feat.valid=false;
463     return true;
464     }
465     feat.N_calc();
466     if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
467     feat.valid=false;
468     l3=l5+1;
469     return true;
470     }
471     return true;
472     }
473    
474     //--------------- dust functions ----------------
475     class DNADuster {
476     public:
477     int dustword;
478     int dustwindow;
479     int dustwindow2;
480     int dustcutoff;
481     int mv, iv, jv;
482     int counts[32*32*32];
483     int iis[32*32*32];
484     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
485     dustword=wordsize;
486     dustwindow=winsize;
487     dustwindow2 = (winsize>>1);
488     dustcutoff=cutoff;
489     mv=0;
490     iv=0;
491     jv=0;
492     }
493     void setWindowSize(int value) {
494     dustwindow = value;
495     dustwindow2 = (dustwindow >> 1);
496     }
497     void setWordSize(int value) {
498     dustword=value;
499     }
500     void wo1(int len, const char* s, int ivv) {
501     int i, ii, j, v, t, n, n1, sum;
502     int js, nis;
503     n = 32 * 32 * 32;
504     n1 = n - 1;
505     nis = 0;
506     i = 0;
507     ii = 0;
508     sum = 0;
509     v = 0;
510     for (j=0; j < len; j++, s++) {
511     ii <<= 5;
512     if (*s<=32) {
513     i=0;
514     continue;
515     }
516     ii |= *s - 'A'; //assume uppercase!
517     ii &= n1;
518     i++;
519     if (i >= dustword) {
520     for (js=0; js < nis && iis[js] != ii; js++) ;
521     if (js == nis) {
522     iis[nis] = ii;
523     counts[ii] = 0;
524     nis++;
525     }
526     if ((t = counts[ii]) > 0) {
527     sum += t;
528     v = 10 * sum / j;
529     if (mv < v) {
530     mv = v;
531     iv = ivv;
532     jv = j;
533     }
534     }
535     counts[ii]++;
536     }
537     }
538     }
539    
540     int wo(int len, const char* s, int* beg, int* end) {
541     int i, l1;
542     l1 = len - dustword + 1;
543     if (l1 < 0) {
544     *beg = 0;
545     *end = len - 1;
546     return 0;
547     }
548     mv = 0;
549     iv = 0;
550     jv = 0;
551     for (i=0; i < l1; i++) {
552     wo1(len-i, s+i, i);
553     }
554     *beg = iv;
555     *end = iv + jv;
556     return mv;
557     }
558    
559     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
560     int i, j, l, a, b, v;
561     if (cutoff==0) cutoff=dustcutoff;
562     a=0;b=0;
563     //GMessage("Dust cutoff=%d\n", cutoff);
564     for (i=0; i < seqlen; i += dustwindow2) {
565     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
566     v = wo(l, seq+i, &a, &b);
567     if (v > cutoff) {
568     //for (j = a; j <= b && j < dustwindow2; j++) {
569     for (j = a; j <= b; j++) {
570     seqmsk[i+j]='N';//could be made lowercase instead
571     }
572     }
573     }
574     //return first;
575     }
576     };
577    
578     static DNADuster duster;
579    
580     int dust(GStr& rseq) {
581     char* seq=Gstrdup(rseq.chars());
582     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
583     //check the number of Ns:
584     int ncount=0;
585     for (int i=0;i<rseq.length();i++) {
586     if (seq[i]=='N') ncount++;
587     }
588     GFREE(seq);
589     return ncount;
590     }
591    
592     // ------------------ adapter matching - triplet matching
593     //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
594     // or the last 9 nucleotides for the 5' adapter
595     //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
596     // check minimum score and
597     // for 3' adapter trimming:
598     // require that the right end of the alignment for either the adaptor OR the read must be
599     // < 3 distance from its right end
600     // for 5' adapter trimming:
601     // require that the left end of the alignment for either the adaptor OR the read must
602     // be at coordinate 0
603    
604     bool extendMatch(const char* a, int alen, int ai,
605 gpertea 9 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
606 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
607 gpertea 9 //if (debug)
608     // GStr dbg(b);
609 gpertea 4 //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
610     int a_l=ai; //alignment coordinates on a
611     int a_r=ai+mlen-1;
612     int b_l=bi; //alignment coordinates on b
613     int b_r=bi+mlen-1;
614     int ai_maxscore=ai;
615     int bi_maxscore=bi;
616     int score=mlen*a_m_score;
617     int maxscore=score;
618 gpertea 9 int mism5score=a_mis_score;
619     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
620 gpertea 4 //try to extend to the left first, if possible
621     while (ai>0 && bi>0) {
622     ai--;
623     bi--;
624     score+= (a[ai]==b[bi])? a_m_score : mism5score;
625     if (score>maxscore) {
626     ai_maxscore=ai;
627     bi_maxscore=bi;
628     maxscore=score;
629     }
630     else if (maxscore-score>a_dropoff_score) break;
631     }
632     a_l=ai_maxscore;
633     b_l=bi_maxscore;
634 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
635 gpertea 4 //now extend to the right
636     ai_maxscore=a_r;
637     bi_maxscore=b_r;
638     ai=a_r;
639     bi=b_r;
640     score=maxscore;
641     //sometimes there are extra AAAAs at the end of the read, ignore those
642     if (strcmp(&a[alen-4],"AAAA")==0) {
643     alen-=3;
644     while (a[alen-1]=='A' && alen>ai) alen--;
645     }
646     while (ai<alen-1 && bi<blen-1) {
647     ai++;
648     bi++;
649     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
650     if (a[ai]==b[bi]) { //match
651     score+=a_m_score;
652     if (ai>=alen-2) {
653     score+=a_m_score-(alen-ai-1);
654     }
655     }
656     else { //mismatch
657     score+=a_mis_score;
658     }
659     if (score>maxscore) {
660     ai_maxscore=ai;
661     bi_maxscore=bi;
662     maxscore=score;
663     }
664     else if (maxscore-score>a_dropoff_score) break;
665     }
666     a_r=ai_maxscore;
667     b_r=bi_maxscore;
668 gpertea 9 int a_ovh3=alen-a_r-1;
669     int b_ovh3=blen-b_r-1;
670     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
671     int mmovh5=(a_l<b_l)? a_l : b_l;
672     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
673     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
674     if (a_l<a_ovh3) {
675 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
676     l5=a_r+1;
677     l3=alen-1;
678     }
679     else {
680     //adapter matching at the right end (typical for 3' adapter)
681     l5=0;
682     l3=a_l-1;
683     }
684     return true;
685     }
686 gpertea 11 //do not trim:
687     l5=0;
688     l3=alen-1;
689 gpertea 4 return false;
690     }
691    
692     bool trim_adapter3(GStr& seq, int&l5, int &l3) {
693     int rlen=seq.length();
694     l5=0;
695     l3=rlen-1;
696     //first try a full match, we might get lucky
697     int fi=-1;
698     if ((fi=seq.index(adapter3))>=0) {
699     if (fi<rlen-fi-a3len) {//match is closer to the right end
700     l5=fi+a3len;
701     l3=rlen-1;
702     }
703     else {
704     l5=0;
705     l3=fi-1;
706     }
707     return true;
708     }
709     //also, for fast detection of other adapter-only reads that start past
710     // the beginning of the adapter sequence, try to see if the first a3len-4
711     // characters of the read are a substring of the adapter
712     GStr rstart=seq.substr(0,a3len-4);
713     if ((fi=adapter3.index(rstart))>=0) {
714     l3=rlen-1;
715     l5=a3len-4;
716     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
717     return true;
718     }
719    
720     //another easy case: first half of the adaptor matches
721     int a3hlen=a3len>>1;
722     GStr ahalf=adapter3.substr(0, a3hlen);
723     if ((fi=seq.index(ahalf))>=0) {
724     extendMatch(seq.chars(), rlen, fi,
725     adapter3.chars(), a3len, 0, a3hlen, l5,l3);
726     return true;
727     }
728     //no easy cases, so let's do the word hashing
729     for (int iw=0;iw<6;iw++) {
730     GStr aword=adapter3.substr(iw,3);
731     if ((fi=seq.index(aword))>=0 && rlen-fi>3) {
732     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
733     a3len, iw, 3, l5,l3)) return true;
734     }
735     }
736     return false; //no adapter parts found
737     }
738    
739     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
740 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
741 gpertea 4 int rlen=seq.length();
742     l5=0;
743     l3=rlen-1;
744     //try to see if adapter is fully included in the read
745     int fi=-1;
746     if ((fi=seq.index(adapter5))>=0) {
747     if (fi<rlen-fi-a5len) {//match is closer to the right end
748     l5=fi+a5len;
749     l3=rlen-1;
750     }
751     else {
752     l5=0;
753     l3=fi-1;
754     }
755     return true;
756     }
757     //for fast detection of adapter-rich reads, check if the first 12
758     //characters of the read are a substring of the adapter
759     GStr rstart=seq.substr(1,12);
760     if ((fi=adapter5.index(rstart))>=0) {
761     //l3=rlen-1;
762     //l5=a5len-4;
763     //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
764     //return true;
765 gpertea 9 //if (debug) GMessage(" first 12char of the read match adaptor!\n");
766 gpertea 4 extendMatch(seq.chars(), rlen, 1,
767 gpertea 9 adapter5.chars(), a5len, fi, 12, l5,l3, true);
768 gpertea 4 return true;
769     }
770    
771     //another easy case: last 12 characters of the adaptor found as a substring of the read
772     int aplen=12;
773     int apstart=a5len-aplen-2;
774     if (apstart<0) { apstart=0; aplen=a5len-1; }
775     GStr bstr=adapter5.substr(apstart, aplen);
776     if ((fi=seq.index(bstr))>=0) {
777 gpertea 9 //if (debug) GMessage(" last 12char of adaptor match the read!\n");
778 gpertea 4 extendMatch(seq.chars(), rlen, fi,
779 gpertea 9 adapter5.chars(), a5len, apstart, aplen, l5,l3,true);
780 gpertea 4 return true;
781     }
782     //no easy cases, find a triplet match as a seed for alignment extension
783     //find triplets at the right end of the adapter
784     for (int iw=0;iw<6;iw++) {
785     apstart=a5len-iw-4;
786     GStr aword=adapter5.substr(apstart,3);
787     if ((fi=seq.index(aword))>=0) {
788 gpertea 9 //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
789 gpertea 4 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
790 gpertea 9 a5len, apstart, 3, l5,l3,true)) {
791 gpertea 4 return true;
792     }
793     }
794     }
795     return false; //no adapter parts found
796     }
797    
798     //conversion of phred64 to phread33
799     void convertQ64(GStr& q) {
800     for (int i=0;i<q.length();i++) q[i]-=31;
801     }
802    
803     void convertQ64(char* q, int len) {
804     for (int i=0;i<len;i++) q[i]-=31;
805     }