ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 13
Committed: Mon Jul 18 20:35:43 2011 UTC (8 years, 1 month ago) by gpertea
File size: 52599 byte(s)
Log Message:
sync with my local source

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 4
7     #define USAGE "Usage:\n\
8 gpertea 13 fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
9     [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
10     [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
11 gpertea 4 \n\
12 gpertea 13 Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
13 gpertea 12 for low complexity and collapse duplicate reads\n\
14 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
15     one read in a pair), the two file names should be given delimited by a comma\n\
16     or a colon character\n\
17 gpertea 4 \n\
18     Options:\n\
19     -n rename all the reads using the <prefix> followed by a read counter;\n\
20     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
21     the read duplication count\n\
22 gpertea 13 -o write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
23     which will be created in the current (working) directory\n\
24 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
25     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
26     -3 trim the given adapter sequence at the 3' end of each read\n\
27     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
28 gpertea 13 -a minimum bases to match to adaptor sequence (default 5)\n\
29     -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
30     -t for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
31 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
32     -l minimum \"clean\" length after trimming that a read must have\n\
33     in order to pass the filter (default: 16)\n\
34 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
35     one letter code for the reason why they were trashed\n\
36 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
37     50% of its length detected as low complexity\n\
38 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
39 gpertea 12 name (e.g. for microRNAs)\n\
40     -p input is phred64/phred33 (use -p64 or -p33)\n\
41     -Q convert quality values to the other Phred qv type\n\
42     -V verbose processing\n\
43 gpertea 4 "
44 gpertea 13
45     //-z for -o option, the output stream(s) will be first piped into the given\n
46     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
47    
48    
49 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
50 gpertea 12
51     //For pair ends sequencing:
52     //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
53     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
54 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
55     //FILE* f_out2=NULL; //for paired reads
56     //FILE* f_in=NULL; //input fastq (stdin if not provided)
57     //FILE* f_in2=NULL; //for paired reads
58 gpertea 4 FILE* freport=NULL;
59 gpertea 13
60 gpertea 4 bool debug=false;
61 gpertea 12 bool verbose=false;
62 gpertea 4 bool doCollapse=false;
63     bool doDust=false;
64 gpertea 13 bool fastaOutput=false;
65 gpertea 4 bool trashReport=false;
66 gpertea 12 //bool rawFormat=false;
67 gpertea 4 int min_read_len=16;
68 gpertea 12 double max_perc_N=7.0;
69 gpertea 4 int dust_cutoff=16;
70     bool isfasta=false;
71 gpertea 12 bool convert_phred=false;
72 gpertea 13 GStr outsuffix; // -o
73 gpertea 4 GStr adapter3;
74     GStr adapter5;
75 gpertea 13 GStr prefix;
76     GStr zcmd;
77     int num_trimmed5=0;
78     int num_trimmed3=0;
79     int num_trimmedN=0;
80     int num_trimmedQ=0;
81     int min_trimmed5=INT_MAX;
82     int min_trimmed3=INT_MAX;
83 gpertea 12
84 gpertea 13 int qvtrim_qmin=0;
85     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
86 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
87     int qv_cvtadd=0; //could be -31 or +31
88    
89 gpertea 4 int a3len=0;
90     int a5len=0;
91     // adaptor matching metrics -- for extendMatch() function
92 gpertea 12 const int a_m_score=2; //match score
93     const int a_mis_score=-3; //mismatch
94     const int a_dropoff_score=7;
95 gpertea 13 int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
96 gpertea 12 const int a_min_chain_score=15; //for gapped alignments
97 gpertea 4
98 gpertea 12 class CSegChain;
99    
100     class CSegPair {
101     public:
102     GSeg a;
103     GSeg b; //the adapter segment
104     int score;
105     int flags;
106     CSegChain* chain;
107     CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
108     score=mscore;
109     if (score==0) score=a.len()*a_m_score;
110     flags=0;
111     chain=NULL;
112     }
113     int len() { return a.len(); }
114     bool operator==(CSegPair& d){
115     //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
116     //make equal even segments that are included into one another:
117     return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
118     }
119     bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
120     if (b.start==d.b.start) {
121     if (score==d.score) {
122     //just try to be consistent:
123     if (b.end==d.b.end) {
124     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
125     }
126     return (b.end>d.b.end);
127     }
128     else return (score<d.score);
129     }
130     return (b.start>d.b.start);
131     }
132     bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
133     /*if (b.start==d.b.start && b.end==d.b.end) {
134     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
135     }
136     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
137     if (b.start==d.b.start) {
138     if (score==d.score) {
139     //just try to be consistent:
140     if (b.end==d.b.end) {
141     return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
142     }
143     return (b.end<d.b.end);
144     }
145     else return (score>d.score);
146     }
147     return (b.start<d.b.start);
148     }
149     };
150    
151     int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
152     CSegPair& x = *(CSegPair *)sa;
153     CSegPair& y = *(CSegPair *)sb;
154     /*
155     if (x.b.end==y.b.end) {
156     if (x.b.start==y.b.start) {
157     if (x.a.end==y.a.end) {
158     if (x.a.start==y.a.start) return 0;
159     return ((x.a.start>y.a.start) ? -1 : 1);
160     }
161     else {
162     return ((x.a.end>y.a.end) ? -1 : 1);
163     }
164     }
165     else {
166     return ((x.b.start>y.b.start) ? -1 : 1);
167     }
168     }
169     else {
170     return ((x.b.end>y.b.end) ? -1 : 1);
171     }
172     */
173     if (x.b.end==y.b.end) {
174     if (x.score==y.score) {
175     if (x.b.start==y.b.start) {
176     if (x.a.end==y.a.end) {
177     if (x.a.start==y.a.start) return 0;
178     return ((x.a.start<y.a.start) ? -1 : 1);
179     }
180     else {
181     return ((x.a.end<y.a.end) ? -1 : 1);
182     }
183     }
184     else {
185     return ((x.b.start<y.b.start) ? -1 : 1);
186     }
187     } else return ((x.score>y.score) ? -1 : 1);
188     }
189     else {
190     return ((x.b.end>y.b.end) ? -1 : 1);
191     }
192    
193     }
194    
195     class CSegChain:public GList<CSegPair> {
196     public:
197     uint astart;
198     uint aend;
199     uint bstart;
200     uint bend;
201     int score;
202     bool endSort;
203     CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
204     //as SegPairs are inserted, they will be sorted by a.start coordinate
205     score=0;
206     astart=MAX_UINT;
207     aend=0;
208     bstart=MAX_UINT;
209     bend=0;
210     endSort=aln5;
211     if (aln5) { setSorted(cmpSegEnds); }
212     }
213     bool operator==(CSegChain& d) {
214     //return (score==d.score);
215     return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
216     }
217     bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
218     //return (score<d.score);
219     if (bstart==d.bstart && bend==d.bend) {
220     return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
221     }
222     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
223     }
224     bool operator<(CSegChain& d) {
225     //return (score>d.score);
226     if (bstart==d.bstart && bend==d.bend) {
227     return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
228     }
229     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
230     }
231     void addSegPair(CSegPair* segp) {
232     if (AddIfNew(segp)!=segp) return;
233     score+=segp->score;
234     if (astart>segp->a.start) astart=segp->a.start;
235     if (aend<segp->a.end) aend=segp->a.end;
236     if (bstart>segp->b.start) bstart=segp->b.start;
237     if (bend<segp->b.end) bend=segp->b.end;
238     }
239     //for building actual chains:
240     bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
241     int bgap=0;
242     int agap=0;
243     //if (endSort) {
244     if (bstart>segp->b.start) {
245     bgap = (int)(bstart-segp->b.end);
246     if (abs(bgap)>2) return false;
247     agap = (int)(astart-segp->a.end);
248     if (abs(agap)>2) return false;
249     }
250     else {
251     bgap = (int) (segp->b.start-bend);
252     if (abs(bgap)>2) return false;
253     agap = (int)(segp->a.start-aend);
254     if (abs(agap)>2) return false;
255     }
256     if (agap*bgap<0) return false;
257     addSegPair(segp);
258     score-=abs(agap)+abs(bgap);
259     return true;
260     }
261     };
262    
263 gpertea 4 // element in dhash:
264     class FqDupRec {
265     public:
266     int count; //how many of these reads are the same
267     int len; //length of qv
268     char* firstname; //optional, only if we want to keep the original read names
269     char* qv;
270     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
271     len=0;
272     qv=NULL;
273     firstname=NULL;
274     count=0;
275     if (qstr!=NULL) {
276     qv=Gstrdup(qstr->chars());
277     len=qstr->length();
278     count++;
279     }
280     if (rname!=NULL) firstname=Gstrdup(rname);
281     }
282     ~FqDupRec() {
283     GFREE(qv);
284     GFREE(firstname);
285     }
286     void add(GStr& d) { //collapse another record into this one
287     if (d.length()!=len)
288     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
289     count++;
290     for (int i=0;i<len;i++)
291     qv[i]=(qv[i]+d[i])/2;
292     }
293     };
294    
295     void openfw(FILE* &f, GArgs& args, char opt) {
296     GStr s=args.getOpt(opt);
297     if (!s.is_empty()) {
298     if (s=='-') f=stdout;
299     else {
300 gpertea 13 f=fopen(s.chars(),"w");
301 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
302     }
303     }
304     }
305    
306     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
307     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
308    
309     GHash<FqDupRec> dhash; //hash to keep track of duplicates
310    
311 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
312     GStr& s, GStr& infname, GStr& infname2);
313     // uses outsuffix to generate output file names and open file handles as needed
314    
315     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
316     void trash_report(char trashcode, GStr& rname, FILE* freport);
317    
318     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
319     GStr& rname, GStr& rinfo, GStr& infname);
320    
321     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
322     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
323    
324 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
325     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
326 gpertea 4 int dust(GStr& seq);
327     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
328     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
329    
330 gpertea 12 void convertPhred(char* q, int len);
331     void convertPhred(GStr& q);
332 gpertea 4
333     int main(int argc, char * const argv[]) {
334 gpertea 13 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
335 gpertea 4 int e;
336     if ((e=args.isError())>0) {
337     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
338     exit(224);
339     }
340 gpertea 9 debug=(args.getOpt('Y')!=NULL);
341 gpertea 13 verbose=(args.getOpt('V')!=NULL);
342 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
343 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
344     doDust=(args.getOpt('D')!=NULL);
345 gpertea 12 /*
346 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
347     if (rawFormat) {
348     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
349     }
350 gpertea 12 */
351 gpertea 4 prefix=args.getOpt('n');
352     GStr s=args.getOpt('l');
353     if (!s.is_empty())
354     min_read_len=s.asInt();
355 gpertea 12 s=args.getOpt('m');
356     if (!s.is_empty())
357     max_perc_N=s.asDouble();
358 gpertea 4 s=args.getOpt('d');
359     if (!s.is_empty()) {
360     dust_cutoff=s.asInt();
361     doDust=true;
362     }
363 gpertea 12 s=args.getOpt('q');
364     if (!s.is_empty()) {
365 gpertea 13 qvtrim_qmin=s.asInt();
366 gpertea 12 }
367 gpertea 13 s=args.getOpt('t');
368     if (!s.is_empty()) {
369     qvtrim_max=s.asInt();
370     }
371 gpertea 12 s=args.getOpt('p');
372     if (!s.is_empty()) {
373     int v=s.asInt();
374     if (v==33) {
375     qv_phredtype=33;
376     qv_cvtadd=31;
377     }
378     else if (v==64) {
379     qv_phredtype=64;
380     qv_cvtadd=-31;
381     }
382     else
383     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
384     }
385 gpertea 4 if (args.getOpt('3')!=NULL) {
386     adapter3=args.getOpt('3');
387     adapter3.upper();
388     a3len=adapter3.length();
389     }
390     if (args.getOpt('5')!=NULL) {
391     adapter5=args.getOpt('5');
392     adapter5.upper();
393     a5len=adapter5.length();
394     }
395 gpertea 13 s=args.getOpt('a');
396     if (!s.is_empty()) {
397     int a_minmatch=s.asInt();
398     a_min_score=a_minmatch<<1;
399     }
400    
401     if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
402 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
403 gpertea 13 int fcount=args.startNonOpt();
404     if (fcount==0) {
405 gpertea 4 GMessage(USAGE);
406     exit(224);
407     }
408 gpertea 13 if (fcount>1 && doCollapse) {
409     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
410     }
411     //openfw(f_out, args, 'o');
412     //if (f_out==NULL) f_out=stdout;
413 gpertea 4 if (trashReport)
414     openfw(freport, args, 'r');
415     char* infile=NULL;
416     while ((infile=args.nextNonOpt())!=NULL) {
417 gpertea 13 int incounter=0; //counter for input reads
418     int outcounter=0; //counter for output reads
419     int trash_s=0; //too short from the get go
420     int trash_Q=0;
421     int trash_N=0;
422     int trash_D=0;
423     int trash_A3=0;
424     int trash_A5=0;
425     s=infile;
426     GStr infname;
427     GStr infname2;
428     FILE* f_in=NULL;
429     FILE* f_in2=NULL;
430     FILE* f_out=NULL;
431     FILE* f_out2=NULL;
432     bool paired_reads=false;
433     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
434     GLineReader fq(f_in);
435     GLineReader* fq2=NULL;
436     if (f_in2!=NULL) {
437     fq2=new GLineReader(f_in2);
438     paired_reads=true;
439     }
440     GStr rseq, rqv, seqid, seqinfo;
441     GStr rseq2, rqv2, seqid2, seqinfo2;
442     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
443     incounter++;
444     int a5=0, a3=0, b5=0, b3=0;
445     char tcode=0, tcode2=0;
446     tcode=process_read(seqid, rseq, rqv, a5, a3);
447     //if (!doCollapse) {
448     if (fq2!=NULL) {
449     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
450     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
451     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
452     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
453     }
454     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
455     //decide what to do with this pair and print rseq2 if the pair makes it
456     if (tcode>1 && tcode2<=1) {
457     //"untrash" rseq
458     if (a3-a5+1<min_read_len) {
459     a5=1;
460     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
461 gpertea 4 }
462 gpertea 13 tcode=1;
463     }
464     else if (tcode<=1 && tcode2>1) {
465     //"untrash" rseq2
466     if (b3-b5+1<min_read_len) {
467     b5=1;
468     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
469     }
470     tcode2=1;
471     }
472     if (tcode<=1) { //trimmed or left intact -- write it!
473     if (tcode>0) {
474     rseq2=rseq2.substr(b5,b3-b5+1);
475     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
476 gpertea 4 }
477 gpertea 13 int nocounter=0;
478     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
479     }
480     } //paired read
481     // }
482     if (tcode>1) { //trashed
483     if (tcode=='s') trash_s++;
484     else if (tcode=='Q') trash_Q++;
485     else if (tcode=='N') trash_N++;
486     else if (tcode=='D') trash_D++;
487     else if (tcode=='3') trash_A3++;
488     else if (tcode=='5') trash_A5++;
489     if (trashReport) trash_report(tcode, seqid, freport);
490     }
491     else if (!doCollapse) { //write it
492     if (tcode>0) {
493     rseq=rseq.substr(a5,a3-a5+1);
494     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
495     }
496     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
497     }
498     } //for each fastq record
499     delete fq2;
500     FRCLOSE(f_in);
501     FRCLOSE(f_in2);
502     if (doCollapse) {
503     outcounter=0;
504     int maxdup_count=1;
505     char* maxdup_seq=NULL;
506     dhash.startIterate();
507     FqDupRec* qd=NULL;
508     char* seq=NULL;
509     while ((qd=dhash.NextData(seq))!=NULL) {
510     GStr rseq(seq);
511     //do the dusting here
512     if (doDust) {
513     int dustbases=dust(rseq);
514     if (dustbases>(rseq.length()>>1)) {
515     if (trashReport && qd->firstname!=NULL) {
516     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
517 gpertea 4 }
518 gpertea 13 trash_D+=qd->count;
519     continue;
520     }
521     }
522     outcounter++;
523     if (qd->count>maxdup_count) {
524     maxdup_count=qd->count;
525     maxdup_seq=seq;
526     }
527     if (isfasta) {
528     if (prefix.is_empty()) {
529     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
530     rseq.chars());
531 gpertea 4 }
532 gpertea 13 else { //use custom read name
533     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
534     qd->count, rseq.chars());
535 gpertea 4 }
536 gpertea 13 }
537     else { //fastq format
538     if (convert_phred) convertPhred(qd->qv, qd->len);
539     if (prefix.is_empty()) {
540     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
541     rseq.chars(), qd->qv);
542 gpertea 4 }
543 gpertea 13 else { //use custom read name
544     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
545     qd->count, rseq.chars(), qd->qv);
546     }
547     }
548     }//for each element of dhash
549     if (maxdup_count>1) {
550     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
551 gpertea 4 }
552 gpertea 13 } //collapse entries
553     if (verbose) {
554     if (paired_reads) {
555     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
556     GMessage("Number of input pairs :%9d\n", incounter);
557     GMessage(" Output pairs :%9d\n", outcounter);
558     }
559     else {
560     GMessage(">Input file : %s\n", infname.chars());
561     GMessage("Number of input reads :%9d\n", incounter);
562     GMessage(" Output reads :%9d\n", outcounter);
563     }
564     GMessage("------------------------------------\n");
565     if (num_trimmed5)
566     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
567     if (num_trimmed3)
568     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
569     GMessage("------------------------------------\n");
570     if (trash_s>0)
571     GMessage(" Trashed by length:%9d\n", trash_s);
572     if (trash_N>0)
573     GMessage(" Trashed by N%%:%9d\n", trash_N);
574     if (trash_Q>0)
575     GMessage("Trashed by low quality:%9d\n", trash_Q);
576     if (trash_A5>0)
577     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
578     if (trash_A3>0)
579     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
580     }
581     if (trashReport) {
582     FWCLOSE(freport);
583 gpertea 4 }
584 gpertea 13 FWCLOSE(f_out);
585     FWCLOSE(f_out2);
586     } //while each input file
587 gpertea 4
588 gpertea 12 //getc(stdin);
589 gpertea 4 }
590    
591     class NData {
592     public:
593     int NPos[1024]; //there should be no reads longer than 1K ?
594     int NCount;
595     int end5;
596     int end3;
597     int seqlen;
598     double perc_N; //percentage of Ns in end5..end3 range only!
599     const char* seq;
600     bool valid;
601     NData() {
602     NCount=0;
603     end5=0;
604     end3=0;
605     seq=NULL;
606     perc_N=0;
607     valid=true;
608     }
609     void init(GStr& rseq) {
610     NCount=0;
611     perc_N=0;
612     seqlen=rseq.length();
613     seq=rseq.chars();
614     end5=1;
615     end3=seqlen;
616     for (int i=0;i<seqlen;i++)
617 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618 gpertea 4 NPos[NCount]=i;
619     NCount++;
620     }
621     perc_N=(NCount*100.0)/seqlen;
622     valid=true;
623     }
624     void N_calc() { //only in the region after trimming
625     int n=0;
626     for (int i=end3-1;i<end5;i++) {
627     if (seq[i]=='N') n++;
628     }
629     perc_N=(n*100.0)/(end5-end3+1);
630     }
631     };
632    
633     static NData feat;
634     int perc_lenN=12; // incremental distance from ends, in percentage of
635     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
636    
637     void N_analyze(int l5, int l3, int p5, int p3) {
638     /* assumes feat was filled properly */
639     int old_dif, t5,t3,v;
640 gpertea 12 if (l3<l5+2 || p5>p3 ) {
641 gpertea 4 feat.end5=l5+1;
642     feat.end3=l3+1;
643     return;
644     }
645    
646     t5=feat.NPos[p5]-l5;
647     t3=l3-feat.NPos[p3];
648     old_dif=p3-p5;
649     v=(int)((((double)(l3-l5))*perc_lenN)/100);
650     if (v>20) v=20; /* enforce N-search limit for very long reads */
651     if (t5 < v ) {
652     l5=feat.NPos[p5]+1;
653     p5++;
654     }
655     if (t3 < v) {
656     l3=feat.NPos[p3]-1;
657     p3--;
658     }
659     /* restNs=p3-p5; number of Ns in the new CLR */
660     if (p3-p5==old_dif) { /* no change, return */
661     /* don't trim if this may shorten the read too much */
662     //if (l5-l3<min_read_len) return;
663     feat.end5=l5+1;
664     feat.end3=l3+1;
665     return;
666     }
667     else
668     N_analyze(l5,l3, p5,p3);
669     }
670    
671    
672 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
673 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
674 gpertea 12 if (qv_phredtype==0) {
675     //try to guess the Phred type
676     int vmin=256, vmax=0;
677     for (int i=0;i<qvs.length();i++) {
678     if (vmin>qvs[i]) vmin=qvs[i];
679     if (vmax<qvs[i]) vmax=qvs[i];
680     }
681     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683     if (qv_phredtype==0) {
684     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685     }
686 gpertea 13 if (verbose)
687     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
688     } //guessing Phred type
689 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
690 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
691 gpertea 12 }
692     //just in case, check also the 5' the end (?)
693     for (l5=0;l5<qvs.length()-3;l5++) {
694 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
695 gpertea 12 }
696 gpertea 13 if (qvtrim_max>0) {
697     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
698     if (l5>qvtrim_max) l5=qvtrim_max;
699     }
700 gpertea 12 return (l5>0 || l3<qvs.length()-1);
701     }
702    
703     bool ntrim(GStr& rseq, int &l5, int &l3) {
704     //count Ns in the sequence, trim N-rich ends
705 gpertea 4 feat.init(rseq);
706     l5=feat.end5-1;
707     l3=feat.end3-1;
708     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
709 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
710     if (feat.perc_N>max_perc_N) {
711     feat.valid=false;
712     l3=l5+1;
713     return true;
714     }
715     else {
716     return false; //nothing changed
717     }
718     }
719 gpertea 4 l5=feat.end5-1;
720     l3=feat.end3-1;
721     if (l3-l5+1<min_read_len) {
722     feat.valid=false;
723     return true;
724     }
725     feat.N_calc();
726 gpertea 12
727     if (feat.perc_N>max_perc_N) {
728 gpertea 4 feat.valid=false;
729     l3=l5+1;
730     return true;
731     }
732     return true;
733     }
734    
735     //--------------- dust functions ----------------
736     class DNADuster {
737     public:
738     int dustword;
739     int dustwindow;
740     int dustwindow2;
741     int dustcutoff;
742     int mv, iv, jv;
743     int counts[32*32*32];
744     int iis[32*32*32];
745     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
746     dustword=wordsize;
747     dustwindow=winsize;
748     dustwindow2 = (winsize>>1);
749     dustcutoff=cutoff;
750     mv=0;
751     iv=0;
752     jv=0;
753     }
754     void setWindowSize(int value) {
755     dustwindow = value;
756     dustwindow2 = (dustwindow >> 1);
757     }
758     void setWordSize(int value) {
759     dustword=value;
760     }
761     void wo1(int len, const char* s, int ivv) {
762     int i, ii, j, v, t, n, n1, sum;
763     int js, nis;
764     n = 32 * 32 * 32;
765     n1 = n - 1;
766     nis = 0;
767     i = 0;
768     ii = 0;
769     sum = 0;
770     v = 0;
771     for (j=0; j < len; j++, s++) {
772     ii <<= 5;
773     if (*s<=32) {
774     i=0;
775     continue;
776     }
777     ii |= *s - 'A'; //assume uppercase!
778     ii &= n1;
779     i++;
780     if (i >= dustword) {
781     for (js=0; js < nis && iis[js] != ii; js++) ;
782     if (js == nis) {
783     iis[nis] = ii;
784     counts[ii] = 0;
785     nis++;
786     }
787     if ((t = counts[ii]) > 0) {
788     sum += t;
789     v = 10 * sum / j;
790     if (mv < v) {
791     mv = v;
792     iv = ivv;
793     jv = j;
794     }
795     }
796     counts[ii]++;
797     }
798     }
799     }
800    
801     int wo(int len, const char* s, int* beg, int* end) {
802     int i, l1;
803     l1 = len - dustword + 1;
804     if (l1 < 0) {
805     *beg = 0;
806     *end = len - 1;
807     return 0;
808     }
809     mv = 0;
810     iv = 0;
811     jv = 0;
812     for (i=0; i < l1; i++) {
813     wo1(len-i, s+i, i);
814     }
815     *beg = iv;
816     *end = iv + jv;
817     return mv;
818     }
819    
820     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
821     int i, j, l, a, b, v;
822     if (cutoff==0) cutoff=dustcutoff;
823     a=0;b=0;
824     //GMessage("Dust cutoff=%d\n", cutoff);
825     for (i=0; i < seqlen; i += dustwindow2) {
826     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
827     v = wo(l, seq+i, &a, &b);
828     if (v > cutoff) {
829     //for (j = a; j <= b && j < dustwindow2; j++) {
830     for (j = a; j <= b; j++) {
831     seqmsk[i+j]='N';//could be made lowercase instead
832     }
833     }
834     }
835     //return first;
836     }
837     };
838    
839     static DNADuster duster;
840    
841     int dust(GStr& rseq) {
842     char* seq=Gstrdup(rseq.chars());
843     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
844     //check the number of Ns:
845     int ncount=0;
846     for (int i=0;i<rseq.length();i++) {
847     if (seq[i]=='N') ncount++;
848     }
849     GFREE(seq);
850     return ncount;
851     }
852    
853 gpertea 12
854     // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
855     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
856     //check minimum score and
857     //for 3' adapter trimming:
858 gpertea 4 // require that the right end of the alignment for either the adaptor OR the read must be
859     // < 3 distance from its right end
860     // for 5' adapter trimming:
861     // require that the left end of the alignment for either the adaptor OR the read must
862 gpertea 12 // be at coordinate < 3 from start
863 gpertea 4
864     bool extendMatch(const char* a, int alen, int ai,
865 gpertea 12 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
866 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
867 gpertea 12 #ifdef DEBUG
868     GStr dbg(b);
869     #endif
870     //if (debug) {
871     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
872     // }
873 gpertea 4 int a_l=ai; //alignment coordinates on a
874     int a_r=ai+mlen-1;
875     int b_l=bi; //alignment coordinates on b
876     int b_r=bi+mlen-1;
877     int ai_maxscore=ai;
878     int bi_maxscore=bi;
879     int score=mlen*a_m_score;
880     int maxscore=score;
881 gpertea 9 int mism5score=a_mis_score;
882     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
883 gpertea 4 //try to extend to the left first, if possible
884     while (ai>0 && bi>0) {
885     ai--;
886     bi--;
887     score+= (a[ai]==b[bi])? a_m_score : mism5score;
888     if (score>maxscore) {
889     ai_maxscore=ai;
890     bi_maxscore=bi;
891     maxscore=score;
892     }
893     else if (maxscore-score>a_dropoff_score) break;
894     }
895     a_l=ai_maxscore;
896     b_l=bi_maxscore;
897 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
898 gpertea 4 //now extend to the right
899     ai_maxscore=a_r;
900     bi_maxscore=b_r;
901     ai=a_r;
902     bi=b_r;
903     score=maxscore;
904     //sometimes there are extra AAAAs at the end of the read, ignore those
905     if (strcmp(&a[alen-4],"AAAA")==0) {
906     alen-=3;
907     while (a[alen-1]=='A' && alen>ai) alen--;
908     }
909     while (ai<alen-1 && bi<blen-1) {
910     ai++;
911     bi++;
912     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
913     if (a[ai]==b[bi]) { //match
914     score+=a_m_score;
915     if (ai>=alen-2) {
916     score+=a_m_score-(alen-ai-1);
917     }
918     }
919     else { //mismatch
920     score+=a_mis_score;
921     }
922     if (score>maxscore) {
923     ai_maxscore=ai;
924     bi_maxscore=bi;
925     maxscore=score;
926     }
927     else if (maxscore-score>a_dropoff_score) break;
928     }
929     a_r=ai_maxscore;
930     b_r=bi_maxscore;
931 gpertea 9 int a_ovh3=alen-a_r-1;
932     int b_ovh3=blen-b_r-1;
933     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
934     int mmovh5=(a_l<b_l)? a_l : b_l;
935     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
936 gpertea 12 #ifdef DEBUG
937     if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
938     #endif
939 gpertea 9 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
940     if (a_l<a_ovh3) {
941 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
942     l5=a_r+1;
943     l3=alen-1;
944     }
945     else {
946     //adapter matching at the right end (typical for 3' adapter)
947     l5=0;
948     l3=a_l-1;
949     }
950     return true;
951     }
952 gpertea 12 else { //keep this segment pair for later (gapped alignment)
953     segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
954     //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
955     }
956 gpertea 11 //do not trim:
957     l5=0;
958     l3=alen-1;
959 gpertea 4 return false;
960     }
961    
962 gpertea 12 /*
963     int getWordValue(const char* s, int wlen) {
964     int r=0;
965     while (wlen--) { r+=(((int)s[wlen])<<wlen) }
966     return r;
967     }
968     */
969     int get3mer_value(const char* s) {
970     return (s[0]<<16)+(s[1]<<8)+s[2];
971     }
972    
973     int w3_match(int qv, const char* str, int slen, int start_index=0) {
974     if (start_index>=slen || start_index<0) return -1;
975     for (int i=start_index;i<slen-3;i++) {
976     int rv=get3mer_value(str+i);
977     if (rv==qv) return i;
978     }
979     return -1;
980     }
981    
982     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
983     if (end_index>=slen) return -1;
984     if (end_index<0) end_index=slen-1;
985     for (int i=end_index-2;i>=0;i--) {
986     int rv=get3mer_value(str+i);
987     if (rv==qv) return i;
988     }
989     return -1;
990     }
991    
992     int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
993     if (start_index>=slen || start_index<0) return -1;
994     for (int i=start_index;i<slen-4;i++) {
995     int32* rv=(int32*)(str+i);
996     if (*rv==qv) return i;
997     }
998     return -1;
999     }
1000    
1001     int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1002     if (end_index>=slen) return -1;
1003     if (end_index<0) end_index=slen-1;
1004     for (int i=end_index-3;i>=0;i--) {
1005     int32* rv=(int32*)(str+i);
1006     if (*rv==qv) return i;
1007     }
1008     return -1;
1009     }
1010    
1011     #ifdef DEBUG
1012     void dbgPrintChain(CSegChain& chain, const char* aseq) {
1013     GStr s(aseq);
1014     for (int i=0;i<chain.Count();i++) {
1015     CSegPair& seg=*chain[i];
1016     GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1017     s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1018     }
1019     }
1020     #endif
1021    
1022 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1023     int rlen=seq.length();
1024     l5=0;
1025     l3=rlen-1;
1026     //first try a full match, we might get lucky
1027     int fi=-1;
1028     if ((fi=seq.index(adapter3))>=0) {
1029     if (fi<rlen-fi-a3len) {//match is closer to the right end
1030     l5=fi+a3len;
1031     l3=rlen-1;
1032     }
1033     else {
1034     l5=0;
1035 gpertea 12 l3=fi-1;
1036 gpertea 4 }
1037     return true;
1038     }
1039 gpertea 12 #ifdef DEBUG
1040     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1041     #endif
1042    
1043 gpertea 4 //also, for fast detection of other adapter-only reads that start past
1044     // the beginning of the adapter sequence, try to see if the first a3len-4
1045 gpertea 12 // bases of the read are a substring of the adapter
1046     if (rlen>a3len-3) {
1047     GStr rstart=seq.substr(1,a3len-4);
1048     if ((fi=adapter3.index(rstart))>=0) {
1049     l3=rlen-1;
1050     l5=a3len-4;
1051     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1052     return true;
1053     }
1054     }
1055     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1056     //check the easy cases - 11 bases exact match at the end
1057     int fdlen=11;
1058     if (a3len<16) {
1059     fdlen=a3len>>1;
1060 gpertea 4 }
1061 gpertea 12 if (fdlen>4) {
1062     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1063     GStr rstart=seq.substr(-fdlen-3,fdlen);
1064     if ((fi=adapter3.index(rstart))>=0) {
1065     #ifdef DEBUG
1066     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1067     #endif
1068     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1069     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1070     return true;
1071     }
1072     //another easy case: first 11 characters of the adaptor found as a substring of the read
1073     GStr bstr=adapter3.substr(0, fdlen);
1074     if ((fi=seq.rindex(bstr))>=0) {
1075     #ifdef DEBUG
1076     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1077     #endif
1078     if (extendMatch(seq.chars(), rlen, fi,
1079     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1080     return true;
1081     }
1082     } //tried to match 11 bases first
1083    
1084     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1085     //-- only extend if the match is in the 3' (ending) region of the read
1086     int wordSize=3;
1087     int hlen=12;
1088     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1089     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1090     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1091     imin=rlen-imin;
1092     for (int iw=0;iw<hlen;iw++) {
1093     //int32* qv=(int32*)(adapter3.chars()+iw);
1094     int qv=get3mer_value(adapter3.chars()+iw);
1095     fi=-1;
1096     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1097     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1098     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1099    
1100     #ifdef DEBUG
1101     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1102     #endif
1103     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1104     a3len, iw, wordSize, l5,l3, a3segs)) return true;
1105     fi--;
1106     if (fi<imin) break;
1107     }
1108     } //for each wmer in the first hlen bases of the adaptor
1109     /*
1110     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1111     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1112     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1113     int hlen2=a3len-wordSize;
1114     //if (hlen2>a3len-4) hlen2=a3len-4;
1115     if (hlen2>hlen) {
1116     #ifdef DEBUG
1117     if (debug && a3segs.Count()>0) {
1118     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1119     }
1120     #endif
1121     for (int iw=hlen;iw<hlen2;iw++) {
1122     //int* qv=(int32 *)(adapter3.chars()+iw);
1123     int qv=get3mer_value(adapter3.chars()+iw);
1124     fi=-1;
1125     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1126     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1127     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1128     a3len, iw, wordSize, l5,l3, a3segs);
1129     fi--;
1130     if (fi<imin) break;
1131     }
1132     } //for each wmer between hlen2 and hlen bases of the adaptor
1133     }
1134     //lastly, analyze collected a3segs for a possible gapped alignment:
1135     GList<CSegChain> segchains(false,true,false);
1136     #ifdef DEBUG
1137     if (debug && a3segs.Count()>0) {
1138     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1139 gpertea 4 }
1140 gpertea 12 #endif
1141     for (int i=0;i<a3segs.Count();i++) {
1142     if (a3segs[i]->chain==NULL) {
1143     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1144     CSegChain* newchain=new CSegChain();
1145     newchain->setFreeItem(false);
1146     newchain->addSegPair(a3segs[i]);
1147     a3segs[i]->chain=newchain;
1148     segchains.Add(newchain); //just to free them when done
1149     }
1150     for (int j=i+1;j<a3segs.Count();j++) {
1151     CSegChain* chain=a3segs[i]->chain;
1152     if (chain->extendChain(a3segs[j])) {
1153     a3segs[j]->chain=chain;
1154     #ifdef DEBUG
1155     if (debug) dbgPrintChain(*chain, adapter3.chars());
1156     #endif
1157     //save time by checking here if the extended chain is already acceptable for trimming
1158     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1159     l5=0;
1160     l3=chain->astart-2;
1161     #ifdef DEBUG
1162     if (debug && a3segs.Count()>0) {
1163     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1164     }
1165     #endif
1166     return true;
1167     }
1168     } //chain can be extended
1169 gpertea 4 }
1170 gpertea 12 } //collect segment alignments into chains
1171     */
1172 gpertea 4 return false; //no adapter parts found
1173     }
1174    
1175     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1176 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1177 gpertea 4 int rlen=seq.length();
1178     l5=0;
1179     l3=rlen-1;
1180     //try to see if adapter is fully included in the read
1181     int fi=-1;
1182     if ((fi=seq.index(adapter5))>=0) {
1183     if (fi<rlen-fi-a5len) {//match is closer to the right end
1184     l5=fi+a5len;
1185     l3=rlen-1;
1186     }
1187     else {
1188     l5=0;
1189     l3=fi-1;
1190     }
1191     return true;
1192     }
1193 gpertea 12 #ifdef DEBUG
1194     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1195     #endif
1196    
1197     CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1198    
1199     //try the easy way out first - look for an exact match of 11 bases
1200     int fdlen=11;
1201     if (a5len<16) {
1202     fdlen=a5len>>1;
1203 gpertea 4 }
1204 gpertea 12 if (fdlen>4) {
1205     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1206     if ((fi=adapter5.index(rstart))>=0) {
1207     #ifdef DEBUG
1208     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1209     #endif
1210     if (extendMatch(seq.chars(), rlen, 1,
1211     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1212     return true;
1213     }
1214     //another easy case: last 11 characters of the adaptor found as a substring of the read
1215     GStr bstr=adapter5.substr(-fdlen);
1216     if ((fi=seq.index(bstr))>=0) {
1217     #ifdef DEBUG
1218     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1219     #endif
1220     if (extendMatch(seq.chars(), rlen, fi,
1221     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1222     return true;
1223     }
1224     } //tried to matching at most 11 bases first
1225    
1226     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1227     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1228     int wordSize=3;
1229     int hlen=12;
1230     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1231     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1232     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1233     for (int iw=0;iw<=hlen;iw++) {
1234     int apstart=a5len-iw-wordSize;
1235     fi=0;
1236     //int* qv=(int32 *)(adapter5.chars()+apstart);
1237     int qv=get3mer_value(adapter5.chars()+apstart);
1238     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1239     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1240     #ifdef DEBUG
1241     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1242     #endif
1243     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1244     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1245     fi++;
1246     if (fi>imax) break;
1247     }
1248     } //for each wmer in the last hlen bases of the adaptor
1249     /*
1250    
1251     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1252     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1253     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1254     int hlen2=a5len-wordSize;
1255     //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1256     #ifdef DEBUG
1257     if (debug && a5segs.Count()>0) {
1258     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1259     }
1260     #endif
1261     if (hlen2>hlen) {
1262     for (int iw=hlen+1;iw<=hlen2;iw++) {
1263     int apstart=a5len-iw-wordSize;
1264     fi=0;
1265     //int* qv=(int32 *)(adapter5.chars()+apstart);
1266     int qv=get3mer_value(adapter5.chars()+apstart);
1267     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1268     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1269     extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1270     a5len, apstart, wordSize, l5,l3, a5segs, true);
1271     fi++;
1272     if (fi>imax) break;
1273     }
1274     } //for each wmer between hlen2 and hlen bases of the adaptor
1275     }
1276     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1277     // lastly, analyze collected a5segs for a possible gapped alignment:
1278     GList<CSegChain> segchains(false,true,false);
1279     #ifdef DEBUG
1280     if (debug && a5segs.Count()>0) {
1281     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1282 gpertea 4 }
1283 gpertea 12 #endif
1284     for (int i=0;i<a5segs.Count();i++) {
1285     if (a5segs[i]->chain==NULL) {
1286     if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1287     CSegChain* newchain=new CSegChain(true);
1288     newchain->setFreeItem(false);
1289     newchain->addSegPair(a5segs[i]);
1290     a5segs[i]->chain=newchain;
1291     segchains.Add(newchain); //just to free them when done
1292     }
1293     for (int j=i+1;j<a5segs.Count();j++) {
1294     CSegChain* chain=a5segs[i]->chain;
1295     if (chain->extendChain(a5segs[j])) {
1296     a5segs[j]->chain=chain;
1297     #ifdef DEBUG
1298     if (debug) dbgPrintChain(*chain, adapter5.chars());
1299     #endif
1300     //save time by checking here if the extended chain is already acceptable for trimming
1301     if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1302     l5=chain->aend;
1303     l3=rlen-1;
1304     return true;
1305     }
1306     } //chain can be extended
1307 gpertea 4 }
1308 gpertea 12 } //collect segment alignments into chains
1309     */
1310 gpertea 4 return false; //no adapter parts found
1311     }
1312    
1313 gpertea 12 //convert qvs to/from phred64 from/to phread33
1314     void convertPhred(GStr& q) {
1315     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1316 gpertea 4 }
1317    
1318 gpertea 12 void convertPhred(char* q, int len) {
1319     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1320 gpertea 4 }
1321    
1322 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1323     GStr& rname, GStr& rinfo, GStr& infname) {
1324     rseq="";
1325     rqv="";
1326     rname="";
1327     rinfo="";
1328     if (fq.eof()) return false;
1329     char* l=fq.getLine();
1330     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1331     if (l==NULL) return false;
1332     /* if (rawFormat) {
1333     //TODO: implement raw qseq parsing here
1334     //if (raw type=N) then continue; //skip invalid/bad records
1335     } //raw qseq format
1336     else { // FASTQ or FASTA */
1337     isfasta=(l[0]=='>');
1338     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1339     infname.chars(), l);
1340     GStr s(l);
1341     rname=&(l[1]);
1342     for (int i=0;i<rname.length();i++)
1343     if (rname[i]<=' ') {
1344     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1345     rname.cut(i);
1346     break;
1347     }
1348     //now get the sequence
1349     if ((l=fq.getLine())==NULL)
1350     GError("Error: unexpected EOF after header for read %s (%s)\n",
1351     rname.chars(), infname.chars());
1352     rseq=l; //this must be the DNA line
1353     while ((l=fq.getLine())!=NULL) {
1354     //seq can span multiple lines
1355     if (l[0]=='>' || l[0]=='+') {
1356     fq.pushBack();
1357     break; //
1358     }
1359     rseq+=l;
1360     } //check for multi-line seq
1361     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1362     if ((l=fq.getLine())==NULL)
1363     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1364     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1365     if ((l=fq.getLine())==NULL)
1366     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1367     rqv=l;
1368     //if (rqv.length()!=rseq.length())
1369     // GError("Error: qv len != seq len for %s\n", rname.chars());
1370     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1371     rqv+=l; //append to qv string
1372     }
1373     }// fastq
1374     // } //<-- FASTA or FASTQ
1375     rseq.upper(); //TODO: what if we care about masking?
1376     return true;
1377     }
1378    
1379     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1380     //returns 0 if the read was untouched, 1 if it was just trimmed
1381     // and a trash code if it was trashed
1382     l5=0;
1383     l3=rseq.length()-1;
1384     if (l3-l5+1<min_read_len) {
1385     return 's';
1386     }
1387     GStr wseq(rseq.chars());
1388     GStr wqv(rqv.chars());
1389     int w5=l5;
1390     int w3=l3;
1391     //first do the q-based trimming
1392     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1393     if (w3-w5+1<min_read_len) {
1394     return 'Q'; //invalid read
1395     }
1396     //-- keep only the w5..w3 range
1397     l5=w5;
1398     l3=w3;
1399     wseq=wseq.substr(w5, w3-w5+1);
1400     if (!wqv.is_empty())
1401     wqv=wqv.substr(w5, w3-w5+1);
1402     } //qv trimming
1403     // N-trimming on the remaining read seq
1404     if (ntrim(wseq, w5, w3)) {
1405     //GMessage("before: %s\n",wseq.chars());
1406     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1407     l5+=w5;
1408     l3-=(wseq.length()-1-w3);
1409     if (w3-w5+1<min_read_len) {
1410     return 'N'; //to be trashed
1411     }
1412     //-- keep only the w5..w3 range
1413     wseq=wseq.substr(w5, w3-w5+1);
1414     if (!wqv.is_empty())
1415     wqv=wqv.substr(w5, w3-w5+1);
1416     w5=0;
1417     w3=wseq.length()-1;
1418     }
1419     if (a3len>0) {
1420     if (trim_adapter3(wseq, w5, w3)) {
1421     int trimlen=wseq.length()-(w3-w5+1);
1422     num_trimmed3++;
1423     if (trimlen<min_trimmed3)
1424     min_trimmed3=trimlen;
1425     l5+=w5;
1426     l3-=(wseq.length()-1-w3);
1427     if (w3-w5+1<min_read_len) {
1428     return '3';
1429     }
1430     //-- keep only the w5..w3 range
1431     wseq=wseq.substr(w5, w3-w5+1);
1432     if (!wqv.is_empty())
1433     wqv=wqv.substr(w5, w3-w5+1);
1434     }//some adapter was trimmed
1435     } //adapter trimming
1436     if (a5len>0) {
1437     if (trim_adapter5(wseq, w5, w3)) {
1438     int trimlen=wseq.length()-(w3-w5+1);
1439     num_trimmed5++;
1440     if (trimlen<min_trimmed5)
1441     min_trimmed5=trimlen;
1442     l5+=w5;
1443     l3-=(wseq.length()-1-w3);
1444     if (w3-w5+1<min_read_len) {
1445     return '5';
1446     }
1447     //-- keep only the w5..w3 range
1448     wseq=wseq.substr(w5, w3-w5+1);
1449     if (!wqv.is_empty())
1450     wqv=wqv.substr(w5, w3-w5+1);
1451     }//some adapter was trimmed
1452     } //adapter trimming
1453     if (doCollapse) {
1454     //keep read for later
1455     FqDupRec* dr=dhash.Find(wseq.chars());
1456     if (dr==NULL) { //new entry
1457     //if (prefix.is_empty())
1458     dhash.Add(wseq.chars(),
1459     new FqDupRec(&wqv, rname.chars()));
1460     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1461     }
1462     else
1463     dr->add(wqv);
1464     } //collapsing duplicates
1465     else { //not collapsing duplicates
1466     //apply the dust filter now
1467     if (doDust) {
1468     int dustbases=dust(wseq);
1469     if (dustbases>(wseq.length()>>1)) {
1470     return 'D';
1471     }
1472     }
1473     } //not collapsing duplicates
1474     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1475     }
1476    
1477     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1478     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1479     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1480     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1481     }
1482    
1483     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1484     outcounter++;
1485     bool asFasta=(rqv.is_empty() || fastaOutput);
1486     if (asFasta) {
1487     if (prefix.is_empty()) {
1488     printHeader(f_out, '>',rname,rinfo);
1489     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1490     }
1491     else {
1492     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1493     rseq.chars());
1494     }
1495     }
1496     else { //fastq
1497     if (convert_phred) convertPhred(rqv);
1498     if (prefix.is_empty()) {
1499     printHeader(f_out, '@', rname, rinfo);
1500     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1501     }
1502     else
1503     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1504     rseq.chars(),rqv.chars() );
1505     }
1506     }
1507    
1508     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1509     if (freport==NULL || trashcode<=' ') return;
1510     if (trashcode=='3' || trashcode=='5') {
1511     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1512     }
1513     else {
1514     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1515     }
1516     //tcounter++;
1517     }
1518    
1519     GStr getFext(GStr& s, int* xpos=NULL) {
1520     //s must be a filename without a path
1521     GStr r("");
1522     if (xpos!=NULL) *xpos=0;
1523     if (s.is_empty() || s=="-") return r;
1524     int p=s.rindex('.');
1525     int d=s.rindex('/');
1526     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1527     r=s.substr(p+1);
1528     if (xpos!=NULL) *xpos=p+1;
1529     r.lower();
1530     return r;
1531     }
1532    
1533     void baseFileName(GStr& fname) {
1534     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1535     int xpos=0;
1536     GStr fext=getFext(fname, &xpos);
1537     if (xpos<=1) return;
1538     bool extdel=false;
1539     GStr f2;
1540     if (fext=="z" || fext=="zip") {
1541     extdel=true;
1542     }
1543     else if (fext.length()>=2) {
1544     f2=fext.substr(0,2);
1545     extdel=(f2=="gz" || f2=="bz");
1546     }
1547     if (extdel) {
1548     fname.cut(xpos-1);
1549     fext=getFext(fname, &xpos);
1550     if (xpos<=1) return;
1551     }
1552     extdel=false;
1553     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1554     extdel=true;
1555     }
1556     else if (fext.length()>=2) {
1557     extdel=(fext.substr(0,2)=="fa");
1558     }
1559     if (extdel) fname.cut(xpos-1);
1560     GStr fncp(fname);
1561     fncp.lower();
1562     fncp.chomp("seq");
1563     fncp.chomp("sequence");
1564     fncp.trimR("_.");
1565     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1566     }
1567    
1568     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1569     FILE* f_out=NULL;
1570     GStr fname(getFileName(infname.chars()));
1571     //eliminate known extensions
1572     baseFileName(fname);
1573     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1574     else if (pocmd.is_empty()) {
1575     GStr oname(fname);
1576     oname.append('.');
1577     oname.append(outsuffix);
1578     f_out=fopen(oname.chars(),"w");
1579     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1580     }
1581     else {
1582     GStr oname(">");
1583     oname.append(fname);
1584     oname.append('.');
1585     oname.append(outsuffix);
1586     pocmd.append(oname);
1587     f_out=popen(pocmd.chars(), "w");
1588     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1589     }
1590     return f_out;
1591     }
1592    
1593     void guess_unzip(GStr& fname, GStr& picmd) {
1594     GStr fext=getFext(fname);
1595     if (fext=="gz" || fext=="gzip" || fext=="z") {
1596     picmd="gzip -cd ";
1597     }
1598     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1599     picmd="bzip2 -cd ";
1600     }
1601     }
1602    
1603     void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1604     GStr& s, GStr& infname, GStr& infname2) {
1605     // uses outsuffix to generate output file names and open file handles as needed
1606     infname="";
1607     infname2="";
1608     f_in=NULL;
1609     f_in2=NULL;
1610     f_out=NULL;
1611     f_out2=NULL;
1612     //analyze outsuffix intent
1613     GStr pocmd;
1614     GStr ox=getFext(outsuffix);
1615     if (ox.length()>2) ox=ox.substr(0,2);
1616     if (ox=="gz") pocmd="gzip -9 -c ";
1617     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1618     if (s=="-") {
1619     f_in=stdin;
1620     infname="stdin";
1621     f_out=prepOutFile(infname, pocmd);
1622     return;
1623     } // streaming from stdin
1624     s.startTokenize(",:");
1625     s.nextToken(infname);
1626     bool paired=s.nextToken(infname2);
1627     if (fileExists(infname.chars())==0)
1628     GError("Error: cannot find file %s!\n",infname.chars());
1629     GStr fname(getFileName(infname.chars()));
1630     GStr picmd;
1631     guess_unzip(fname, picmd);
1632     if (picmd.is_empty()) {
1633     f_in=fopen(infname.chars(), "r");
1634     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1635     }
1636     else {
1637     picmd.append(infname);
1638     f_in=popen(picmd.chars(), "r");
1639     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1640     }
1641     f_out=prepOutFile(infname, pocmd);
1642     if (!paired) return;
1643     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1644     // ---- paired reads:-------------
1645     if (fileExists(infname2.chars())==0)
1646     GError("Error: cannot find file %s!\n",infname2.chars());
1647     picmd="";
1648     GStr fname2(getFileName(infname2.chars()));
1649     guess_unzip(fname2, picmd);
1650     if (picmd.is_empty()) {
1651     f_in2=fopen(infname2.chars(), "r");
1652     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1653     }
1654     else {
1655     picmd.append(infname2);
1656     f_in2=popen(picmd.chars(), "r");
1657     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1658     }
1659     f_out2=prepOutFile(infname2, pocmd);
1660     }