ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 94
Committed: Tue Oct 4 21:27:35 2011 UTC (7 years, 10 months ago) by gpertea
File size: 48809 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 94 -A disable polyA trimming (enabled by default)\n\
35     -T enable polyT trimming (disabled by default)\n\
36     -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40     -l minimum \"clean\" length after trimming that a read must have\n\
41     in order to pass the filter (default: 16)\n\
42 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43     one letter code for the reason why they were trashed\n\
44 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45     50% of its length detected as low complexity\n\
46 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 gpertea 12 name (e.g. for microRNAs)\n\
48     -p input is phred64/phred33 (use -p64 or -p33)\n\
49     -Q convert quality values to the other Phred qv type\n\
50     -V verbose processing\n\
51 gpertea 4 "
52 gpertea 13
53     //-z for -o option, the output stream(s) will be first piped into the given\n
54     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55    
56    
57 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 gpertea 12
59 gpertea 76 //For paired reads sequencing:
60 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
63     //FILE* f_out2=NULL; //for paired reads
64     //FILE* f_in=NULL; //input fastq (stdin if not provided)
65     //FILE* f_in2=NULL; //for paired reads
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96     const int poly_m_score=2; //match score
97     const int poly_mis_score=-3; //mismatch
98 gpertea 88 const int poly_dropoff_score=7;
99 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 gpertea 4
101 gpertea 94 const char *polyA_seed="AAAA";
102     const char *polyT_seed="TTTT";
103 gpertea 12
104 gpertea 94 struct CAdapters {
105     GStr a5;
106     GStr a3;
107     CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 gpertea 12 }
109 gpertea 94 };
110 gpertea 12
111 gpertea 94 GPVec<CAdapters> adapters;
112 gpertea 12
113 gpertea 4 // element in dhash:
114     class FqDupRec {
115     public:
116     int count; //how many of these reads are the same
117     int len; //length of qv
118     char* firstname; //optional, only if we want to keep the original read names
119     char* qv;
120     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121     len=0;
122     qv=NULL;
123     firstname=NULL;
124     count=0;
125     if (qstr!=NULL) {
126     qv=Gstrdup(qstr->chars());
127     len=qstr->length();
128     count++;
129     }
130     if (rname!=NULL) firstname=Gstrdup(rname);
131     }
132     ~FqDupRec() {
133     GFREE(qv);
134     GFREE(firstname);
135     }
136     void add(GStr& d) { //collapse another record into this one
137     if (d.length()!=len)
138     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139     count++;
140     for (int i=0;i<len;i++)
141     qv[i]=(qv[i]+d[i])/2;
142     }
143     };
144    
145     void openfw(FILE* &f, GArgs& args, char opt) {
146     GStr s=args.getOpt(opt);
147     if (!s.is_empty()) {
148     if (s=='-') f=stdout;
149     else {
150 gpertea 13 f=fopen(s.chars(),"w");
151 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152     }
153     }
154     }
155    
156     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158    
159     GHash<FqDupRec> dhash; //hash to keep track of duplicates
160    
161 gpertea 94 int loadAdapters(const char* fname);
162    
163 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164     GStr& s, GStr& infname, GStr& infname2);
165     // uses outsuffix to generate output file names and open file handles as needed
166    
167     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168     void trash_report(char trashcode, GStr& rname, FILE* freport);
169    
170     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171     GStr& rname, GStr& rinfo, GStr& infname);
172    
173     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175    
176 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 gpertea 4 int dust(GStr& seq);
179 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183 gpertea 4
184 gpertea 12 void convertPhred(char* q, int len);
185     void convertPhred(GStr& q);
186 gpertea 4
187     int main(int argc, char * const argv[]) {
188 gpertea 76 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 gpertea 4 int e;
190     if ((e=args.isError())>0) {
191     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192     exit(224);
193     }
194 gpertea 9 debug=(args.getOpt('Y')!=NULL);
195 gpertea 13 verbose=(args.getOpt('V')!=NULL);
196 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
197 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
198     doDust=(args.getOpt('D')!=NULL);
199 gpertea 12 /*
200 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
201     if (rawFormat) {
202     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
203     }
204 gpertea 12 */
205 gpertea 4 prefix=args.getOpt('n');
206     GStr s=args.getOpt('l');
207     if (!s.is_empty())
208     min_read_len=s.asInt();
209 gpertea 12 s=args.getOpt('m');
210     if (!s.is_empty())
211     max_perc_N=s.asDouble();
212 gpertea 4 s=args.getOpt('d');
213     if (!s.is_empty()) {
214     dust_cutoff=s.asInt();
215     doDust=true;
216     }
217 gpertea 12 s=args.getOpt('q');
218     if (!s.is_empty()) {
219 gpertea 13 qvtrim_qmin=s.asInt();
220 gpertea 12 }
221 gpertea 13 s=args.getOpt('t');
222     if (!s.is_empty()) {
223     qvtrim_max=s.asInt();
224     }
225 gpertea 12 s=args.getOpt('p');
226     if (!s.is_empty()) {
227     int v=s.asInt();
228     if (v==33) {
229     qv_phredtype=33;
230     qv_cvtadd=31;
231     }
232     else if (v==64) {
233     qv_phredtype=64;
234     qv_cvtadd=-31;
235     }
236     else
237     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238     }
239 gpertea 94 s=args.getOpt('f');
240     if (!s.is_empty()) {
241     loadAdapters(s.chars());
242     }
243     bool fileAdapters=adapters.Count();
244     s=args.getOpt('5');
245     if (!s.is_empty()) {
246     if (fileAdapters)
247     GError("Error: options -5 and -f cannot be used together!\n");
248     s.upper();
249     adapters.Add(new CAdapters(s.chars()));
250 gpertea 4 }
251 gpertea 94 s=args.getOpt('3');
252     if (!s.is_empty()) {
253     if (fileAdapters)
254     GError("Error: options -3 and -f cannot be used together!\n");
255     s.upper();
256     if (adapters.Count()>0)
257     adapters[0]->a3=s.chars();
258     else adapters.Add(NULL, new CAdapters(s.chars()));
259 gpertea 4 }
260 gpertea 94 s=args.getOpt('y');
261 gpertea 13 if (!s.is_empty()) {
262 gpertea 94 int minmatch=s.asInt();
263     poly_minScore=minmatch*poly_m_score;
264 gpertea 13 }
265 gpertea 76
266 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 gpertea 76 else outsuffix="-";
268 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
269 gpertea 13 int fcount=args.startNonOpt();
270     if (fcount==0) {
271 gpertea 4 GMessage(USAGE);
272     exit(224);
273     }
274 gpertea 13 if (fcount>1 && doCollapse) {
275     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
276     }
277     //openfw(f_out, args, 'o');
278     //if (f_out==NULL) f_out=stdout;
279 gpertea 4 if (trashReport)
280     openfw(freport, args, 'r');
281     char* infile=NULL;
282     while ((infile=args.nextNonOpt())!=NULL) {
283 gpertea 13 int incounter=0; //counter for input reads
284     int outcounter=0; //counter for output reads
285     int trash_s=0; //too short from the get go
286     int trash_Q=0;
287     int trash_N=0;
288     int trash_D=0;
289     int trash_A3=0;
290     int trash_A5=0;
291     s=infile;
292     GStr infname;
293     GStr infname2;
294     FILE* f_in=NULL;
295     FILE* f_in2=NULL;
296     FILE* f_out=NULL;
297     FILE* f_out2=NULL;
298     bool paired_reads=false;
299     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
300     GLineReader fq(f_in);
301     GLineReader* fq2=NULL;
302     if (f_in2!=NULL) {
303     fq2=new GLineReader(f_in2);
304     paired_reads=true;
305     }
306     GStr rseq, rqv, seqid, seqinfo;
307     GStr rseq2, rqv2, seqid2, seqinfo2;
308     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
309     incounter++;
310     int a5=0, a3=0, b5=0, b3=0;
311     char tcode=0, tcode2=0;
312     tcode=process_read(seqid, rseq, rqv, a5, a3);
313     //if (!doCollapse) {
314     if (fq2!=NULL) {
315     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
316     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
317     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
318     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
319     }
320     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
321     //decide what to do with this pair and print rseq2 if the pair makes it
322     if (tcode>1 && tcode2<=1) {
323     //"untrash" rseq
324     if (a3-a5+1<min_read_len) {
325     a5=1;
326     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
327 gpertea 4 }
328 gpertea 13 tcode=1;
329     }
330     else if (tcode<=1 && tcode2>1) {
331     //"untrash" rseq2
332     if (b3-b5+1<min_read_len) {
333     b5=1;
334     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
335     }
336     tcode2=1;
337     }
338     if (tcode<=1) { //trimmed or left intact -- write it!
339     if (tcode>0) {
340     rseq2=rseq2.substr(b5,b3-b5+1);
341     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
342 gpertea 4 }
343 gpertea 13 int nocounter=0;
344     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
345     }
346     } //paired read
347     // }
348     if (tcode>1) { //trashed
349     if (tcode=='s') trash_s++;
350     else if (tcode=='Q') trash_Q++;
351     else if (tcode=='N') trash_N++;
352     else if (tcode=='D') trash_D++;
353     else if (tcode=='3') trash_A3++;
354     else if (tcode=='5') trash_A5++;
355     if (trashReport) trash_report(tcode, seqid, freport);
356     }
357     else if (!doCollapse) { //write it
358     if (tcode>0) {
359     rseq=rseq.substr(a5,a3-a5+1);
360     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
361     }
362     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
363     }
364     } //for each fastq record
365     delete fq2;
366     FRCLOSE(f_in);
367     FRCLOSE(f_in2);
368     if (doCollapse) {
369     outcounter=0;
370     int maxdup_count=1;
371     char* maxdup_seq=NULL;
372     dhash.startIterate();
373     FqDupRec* qd=NULL;
374     char* seq=NULL;
375     while ((qd=dhash.NextData(seq))!=NULL) {
376     GStr rseq(seq);
377     //do the dusting here
378     if (doDust) {
379     int dustbases=dust(rseq);
380     if (dustbases>(rseq.length()>>1)) {
381     if (trashReport && qd->firstname!=NULL) {
382     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
383 gpertea 4 }
384 gpertea 13 trash_D+=qd->count;
385     continue;
386     }
387     }
388     outcounter++;
389     if (qd->count>maxdup_count) {
390     maxdup_count=qd->count;
391     maxdup_seq=seq;
392     }
393     if (isfasta) {
394     if (prefix.is_empty()) {
395     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
396     rseq.chars());
397 gpertea 4 }
398 gpertea 13 else { //use custom read name
399     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
400     qd->count, rseq.chars());
401 gpertea 4 }
402 gpertea 13 }
403     else { //fastq format
404     if (convert_phred) convertPhred(qd->qv, qd->len);
405     if (prefix.is_empty()) {
406     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
407     rseq.chars(), qd->qv);
408 gpertea 4 }
409 gpertea 13 else { //use custom read name
410     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
411     qd->count, rseq.chars(), qd->qv);
412     }
413     }
414     }//for each element of dhash
415     if (maxdup_count>1) {
416     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
417 gpertea 4 }
418 gpertea 13 } //collapse entries
419     if (verbose) {
420     if (paired_reads) {
421     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
422     GMessage("Number of input pairs :%9d\n", incounter);
423     GMessage(" Output pairs :%9d\n", outcounter);
424     }
425     else {
426     GMessage(">Input file : %s\n", infname.chars());
427     GMessage("Number of input reads :%9d\n", incounter);
428     GMessage(" Output reads :%9d\n", outcounter);
429     }
430     GMessage("------------------------------------\n");
431     if (num_trimmed5)
432     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
433     if (num_trimmed3)
434     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
435     GMessage("------------------------------------\n");
436     if (trash_s>0)
437     GMessage(" Trashed by length:%9d\n", trash_s);
438     if (trash_N>0)
439     GMessage(" Trashed by N%%:%9d\n", trash_N);
440     if (trash_Q>0)
441     GMessage("Trashed by low quality:%9d\n", trash_Q);
442     if (trash_A5>0)
443     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
444     if (trash_A3>0)
445     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
446     }
447     if (trashReport) {
448     FWCLOSE(freport);
449 gpertea 4 }
450 gpertea 13 FWCLOSE(f_out);
451     FWCLOSE(f_out2);
452     } //while each input file
453 gpertea 4
454 gpertea 12 //getc(stdin);
455 gpertea 4 }
456    
457     class NData {
458     public:
459     int NPos[1024]; //there should be no reads longer than 1K ?
460     int NCount;
461     int end5;
462     int end3;
463     int seqlen;
464     double perc_N; //percentage of Ns in end5..end3 range only!
465     const char* seq;
466     bool valid;
467     NData() {
468     NCount=0;
469     end5=0;
470     end3=0;
471     seq=NULL;
472     perc_N=0;
473     valid=true;
474     }
475     void init(GStr& rseq) {
476     NCount=0;
477     perc_N=0;
478     seqlen=rseq.length();
479     seq=rseq.chars();
480     end5=1;
481     end3=seqlen;
482     for (int i=0;i<seqlen;i++)
483 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
484 gpertea 4 NPos[NCount]=i;
485     NCount++;
486     }
487     perc_N=(NCount*100.0)/seqlen;
488     valid=true;
489     }
490     void N_calc() { //only in the region after trimming
491     int n=0;
492     for (int i=end3-1;i<end5;i++) {
493     if (seq[i]=='N') n++;
494     }
495     perc_N=(n*100.0)/(end5-end3+1);
496     }
497     };
498    
499     static NData feat;
500     int perc_lenN=12; // incremental distance from ends, in percentage of
501     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
502    
503     void N_analyze(int l5, int l3, int p5, int p3) {
504     /* assumes feat was filled properly */
505     int old_dif, t5,t3,v;
506 gpertea 12 if (l3<l5+2 || p5>p3 ) {
507 gpertea 4 feat.end5=l5+1;
508     feat.end3=l3+1;
509     return;
510     }
511    
512     t5=feat.NPos[p5]-l5;
513     t3=l3-feat.NPos[p3];
514     old_dif=p3-p5;
515     v=(int)((((double)(l3-l5))*perc_lenN)/100);
516     if (v>20) v=20; /* enforce N-search limit for very long reads */
517     if (t5 < v ) {
518     l5=feat.NPos[p5]+1;
519     p5++;
520     }
521     if (t3 < v) {
522     l3=feat.NPos[p3]-1;
523     p3--;
524     }
525     /* restNs=p3-p5; number of Ns in the new CLR */
526     if (p3-p5==old_dif) { /* no change, return */
527     /* don't trim if this may shorten the read too much */
528     //if (l5-l3<min_read_len) return;
529     feat.end5=l5+1;
530     feat.end3=l3+1;
531     return;
532     }
533     else
534     N_analyze(l5,l3, p5,p3);
535     }
536    
537    
538 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
539 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
540 gpertea 12 if (qv_phredtype==0) {
541     //try to guess the Phred type
542     int vmin=256, vmax=0;
543     for (int i=0;i<qvs.length();i++) {
544     if (vmin>qvs[i]) vmin=qvs[i];
545     if (vmax<qvs[i]) vmax=qvs[i];
546     }
547     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
548     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
549     if (qv_phredtype==0) {
550     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
551     }
552 gpertea 13 if (verbose)
553     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
554     } //guessing Phred type
555 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
556 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
557 gpertea 12 }
558     //just in case, check also the 5' the end (?)
559     for (l5=0;l5<qvs.length()-3;l5++) {
560 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
561 gpertea 12 }
562 gpertea 13 if (qvtrim_max>0) {
563     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
564     if (l5>qvtrim_max) l5=qvtrim_max;
565     }
566 gpertea 12 return (l5>0 || l3<qvs.length()-1);
567     }
568    
569     bool ntrim(GStr& rseq, int &l5, int &l3) {
570     //count Ns in the sequence, trim N-rich ends
571 gpertea 4 feat.init(rseq);
572     l5=feat.end5-1;
573     l3=feat.end3-1;
574     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
575 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
576     if (feat.perc_N>max_perc_N) {
577     feat.valid=false;
578     l3=l5+1;
579     return true;
580     }
581     else {
582     return false; //nothing changed
583     }
584     }
585 gpertea 4 l5=feat.end5-1;
586     l3=feat.end3-1;
587     if (l3-l5+1<min_read_len) {
588     feat.valid=false;
589     return true;
590     }
591     feat.N_calc();
592 gpertea 12
593     if (feat.perc_N>max_perc_N) {
594 gpertea 4 feat.valid=false;
595     l3=l5+1;
596     return true;
597     }
598     return true;
599     }
600    
601     //--------------- dust functions ----------------
602     class DNADuster {
603     public:
604     int dustword;
605     int dustwindow;
606     int dustwindow2;
607     int dustcutoff;
608     int mv, iv, jv;
609     int counts[32*32*32];
610     int iis[32*32*32];
611     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
612     dustword=wordsize;
613     dustwindow=winsize;
614     dustwindow2 = (winsize>>1);
615     dustcutoff=cutoff;
616     mv=0;
617     iv=0;
618     jv=0;
619     }
620     void setWindowSize(int value) {
621     dustwindow = value;
622     dustwindow2 = (dustwindow >> 1);
623     }
624     void setWordSize(int value) {
625     dustword=value;
626     }
627     void wo1(int len, const char* s, int ivv) {
628     int i, ii, j, v, t, n, n1, sum;
629     int js, nis;
630     n = 32 * 32 * 32;
631     n1 = n - 1;
632     nis = 0;
633     i = 0;
634     ii = 0;
635     sum = 0;
636     v = 0;
637     for (j=0; j < len; j++, s++) {
638     ii <<= 5;
639     if (*s<=32) {
640     i=0;
641     continue;
642     }
643     ii |= *s - 'A'; //assume uppercase!
644     ii &= n1;
645     i++;
646     if (i >= dustword) {
647     for (js=0; js < nis && iis[js] != ii; js++) ;
648     if (js == nis) {
649     iis[nis] = ii;
650     counts[ii] = 0;
651     nis++;
652     }
653     if ((t = counts[ii]) > 0) {
654     sum += t;
655     v = 10 * sum / j;
656     if (mv < v) {
657     mv = v;
658     iv = ivv;
659     jv = j;
660     }
661     }
662     counts[ii]++;
663     }
664     }
665     }
666    
667     int wo(int len, const char* s, int* beg, int* end) {
668     int i, l1;
669     l1 = len - dustword + 1;
670     if (l1 < 0) {
671     *beg = 0;
672     *end = len - 1;
673     return 0;
674     }
675     mv = 0;
676     iv = 0;
677     jv = 0;
678     for (i=0; i < l1; i++) {
679     wo1(len-i, s+i, i);
680     }
681     *beg = iv;
682     *end = iv + jv;
683     return mv;
684     }
685    
686     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
687     int i, j, l, a, b, v;
688     if (cutoff==0) cutoff=dustcutoff;
689     a=0;b=0;
690     //GMessage("Dust cutoff=%d\n", cutoff);
691     for (i=0; i < seqlen; i += dustwindow2) {
692     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
693     v = wo(l, seq+i, &a, &b);
694     if (v > cutoff) {
695     //for (j = a; j <= b && j < dustwindow2; j++) {
696     for (j = a; j <= b; j++) {
697     seqmsk[i+j]='N';//could be made lowercase instead
698     }
699     }
700     }
701     //return first;
702     }
703     };
704    
705     static DNADuster duster;
706    
707     int dust(GStr& rseq) {
708     char* seq=Gstrdup(rseq.chars());
709     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
710     //check the number of Ns:
711     int ncount=0;
712     for (int i=0;i<rseq.length();i++) {
713     if (seq[i]=='N') ncount++;
714     }
715     GFREE(seq);
716     return ncount;
717     }
718    
719 gpertea 12 int get3mer_value(const char* s) {
720     return (s[0]<<16)+(s[1]<<8)+s[2];
721     }
722    
723     int w3_match(int qv, const char* str, int slen, int start_index=0) {
724     if (start_index>=slen || start_index<0) return -1;
725     for (int i=start_index;i<slen-3;i++) {
726     int rv=get3mer_value(str+i);
727     if (rv==qv) return i;
728     }
729     return -1;
730     }
731    
732     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733     if (end_index>=slen) return -1;
734     if (end_index<0) end_index=slen-1;
735     for (int i=end_index-2;i>=0;i--) {
736     int rv=get3mer_value(str+i);
737     if (rv==qv) return i;
738     }
739     return -1;
740     }
741    
742 gpertea 94 struct SLocScore {
743     int pos;
744     int score;
745     SLocScore(int p=0,int s=0) {
746     pos=p;
747     score=s;
748     }
749     void set(int p, int s) {
750     pos=p;
751     score=s;
752     }
753     void add(int p, int add) {
754     pos=p;
755     score+=add;
756     }
757     };
758 gpertea 12
759 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760     int rlen=seq.length();
761     l5=0;
762     l3=rlen-1;
763     int32 seedVal=*(int32*)poly_seed;
764     char polyChar=poly_seed[0];
765     //assumes N trimming was already done
766     //so a poly match should be very close to the end of the read
767     // -- find the initial match (seed)
768     int lmin=GMAX((rlen-12), 0);
769     int li;
770     for (li=rlen-4;li>lmin;li--) {
771     if (seedVal==*(int*)&(seq[li])) {
772     break;
773     }
774 gpertea 12 }
775 gpertea 94 if (li<=lmin) return false;
776     //seed found, try to extend it both ways
777     //extend right
778     int ri=li+3;
779     SLocScore loc(ri, poly_m_score<<2);
780     SLocScore maxloc(loc);
781     //extend right
782     while (ri<rlen-2) {
783     ri++;
784     if (seq[ri]==polyChar) {
785     loc.add(ri,poly_m_score);
786     }
787     else if (seq[ri]=='N') {
788     loc.add(ri,0);
789     }
790     else { //mismatch
791     loc.add(ri,poly_mis_score);
792     if (maxloc.score-loc.score>poly_dropoff_score) break;
793     }
794     if (maxloc.score<=loc.score) {
795     maxloc=loc;
796     }
797 gpertea 12 }
798 gpertea 94 ri=maxloc.pos;
799     if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800     //ri = right boundary for the poly match
801     //extend left
802     loc.set(li, maxloc.score);
803     maxloc.pos=li;
804     while (li>0) {
805     li--;
806     if (seq[li]==polyChar) {
807     loc.add(li,poly_m_score);
808     }
809     else if (seq[li]=='N') {
810     loc.add(li,0);
811     }
812     else { //mismatch
813     loc.add(li,poly_mis_score);
814     if (maxloc.score-loc.score>poly_dropoff_score) break;
815     }
816     if (maxloc.score<=loc.score) {
817     maxloc=loc;
818     }
819     }
820     if (maxloc.score>poly_minScore && ri>=rlen-3) {
821     l5=li;
822     l3=ri;
823     return true;
824     }
825     return false;
826 gpertea 12 }
827    
828 gpertea 94
829     bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 gpertea 88 int rlen=seq.length();
831     l5=0;
832     l3=rlen-1;
833 gpertea 94 int32 seedVal=*(int32*)poly_seed;
834     char polyChar=poly_seed[0];
835 gpertea 88 //assumes N trimming was already done
836     //so a poly match should be very close to the end of the read
837     // -- find the initial match (seed)
838 gpertea 94 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839     int li;
840     for (li=0;li<=lmax;li++) {
841     if (seedVal==*(int*)&(seq[li])) {
842 gpertea 88 break;
843     }
844     }
845 gpertea 94 if (li>lmax) return false;
846     //seed found, try to extend it both ways
847     //extend left
848     int ri=li+3; //save rightmost base of the seed
849     SLocScore loc(li, poly_m_score<<2);
850     SLocScore maxloc(loc);
851     while (li>0) {
852     li--;
853     if (seq[li]==polyChar) {
854     loc.add(li,poly_m_score);
855     }
856     else if (seq[li]=='N') {
857     loc.add(li,0);
858     }
859     else { //mismatch
860     loc.add(li,poly_mis_score);
861     if (maxloc.score-loc.score>poly_dropoff_score) break;
862     }
863     if (maxloc.score<=loc.score) {
864     maxloc=loc;
865     }
866 gpertea 88 }
867 gpertea 94 li=maxloc.pos;
868     if (li>3) return false; //no trimming wanted, too far from 5' end
869     //li = right boundary for the poly match
870    
871     //extend right
872     loc.set(ri, maxloc.score);
873     maxloc.pos=ri;
874     while (ri<rlen-2) {
875     ri++;
876     if (seq[ri]==polyChar) {
877     loc.add(ri,poly_m_score);
878     }
879     else if (seq[ri]=='N') {
880     loc.add(ri,0);
881     }
882     else { //mismatch
883     loc.add(ri,poly_mis_score);
884     if (maxloc.score-loc.score>poly_dropoff_score) break;
885     }
886     if (maxloc.score<=loc.score) {
887     maxloc=loc;
888     }
889     }
890    
891     if (maxloc.score>poly_minScore && li<=3) {
892     l5=li;
893     l3=ri;
894     return true;
895     }
896     return false;
897 gpertea 88 }
898    
899 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900     int rlen=seq.length();
901     l5=0;
902     l3=rlen-1;
903     //first try a full match, we might get lucky
904     int fi=-1;
905     if ((fi=seq.index(adapter3))>=0) {
906     if (fi<rlen-fi-a3len) {//match is closer to the right end
907     l5=fi+a3len;
908     l3=rlen-1;
909     }
910     else {
911     l5=0;
912 gpertea 12 l3=fi-1;
913 gpertea 4 }
914     return true;
915     }
916 gpertea 12 #ifdef DEBUG
917     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
918     #endif
919    
920 gpertea 4 //also, for fast detection of other adapter-only reads that start past
921     // the beginning of the adapter sequence, try to see if the first a3len-4
922 gpertea 12 // bases of the read are a substring of the adapter
923     if (rlen>a3len-3) {
924     GStr rstart=seq.substr(1,a3len-4);
925     if ((fi=adapter3.index(rstart))>=0) {
926     l3=rlen-1;
927     l5=a3len-4;
928     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929     return true;
930     }
931     }
932     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933     //check the easy cases - 11 bases exact match at the end
934     int fdlen=11;
935     if (a3len<16) {
936     fdlen=a3len>>1;
937 gpertea 4 }
938 gpertea 12 if (fdlen>4) {
939     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940     GStr rstart=seq.substr(-fdlen-3,fdlen);
941     if ((fi=adapter3.index(rstart))>=0) {
942     #ifdef DEBUG
943     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944     #endif
945     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
947     return true;
948     }
949     //another easy case: first 11 characters of the adaptor found as a substring of the read
950     GStr bstr=adapter3.substr(0, fdlen);
951     if ((fi=seq.rindex(bstr))>=0) {
952     #ifdef DEBUG
953     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
954     #endif
955     if (extendMatch(seq.chars(), rlen, fi,
956     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
957     return true;
958     }
959     } //tried to match 11 bases first
960    
961     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962     //-- only extend if the match is in the 3' (ending) region of the read
963     int wordSize=3;
964     int hlen=12;
965     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968     imin=rlen-imin;
969     for (int iw=0;iw<hlen;iw++) {
970     //int32* qv=(int32*)(adapter3.chars()+iw);
971     int qv=get3mer_value(adapter3.chars()+iw);
972     fi=-1;
973     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976    
977     #ifdef DEBUG
978     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979     #endif
980     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981     a3len, iw, wordSize, l5,l3, a3segs)) return true;
982     fi--;
983     if (fi<imin) break;
984     }
985     } //for each wmer in the first hlen bases of the adaptor
986     /*
987     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990     int hlen2=a3len-wordSize;
991     //if (hlen2>a3len-4) hlen2=a3len-4;
992     if (hlen2>hlen) {
993     #ifdef DEBUG
994     if (debug && a3segs.Count()>0) {
995     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
996     }
997     #endif
998     for (int iw=hlen;iw<hlen2;iw++) {
999     //int* qv=(int32 *)(adapter3.chars()+iw);
1000     int qv=get3mer_value(adapter3.chars()+iw);
1001     fi=-1;
1002     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005     a3len, iw, wordSize, l5,l3, a3segs);
1006     fi--;
1007     if (fi<imin) break;
1008     }
1009     } //for each wmer between hlen2 and hlen bases of the adaptor
1010     }
1011     //lastly, analyze collected a3segs for a possible gapped alignment:
1012     GList<CSegChain> segchains(false,true,false);
1013     #ifdef DEBUG
1014     if (debug && a3segs.Count()>0) {
1015     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1016 gpertea 4 }
1017 gpertea 12 #endif
1018     for (int i=0;i<a3segs.Count();i++) {
1019     if (a3segs[i]->chain==NULL) {
1020     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021     CSegChain* newchain=new CSegChain();
1022     newchain->setFreeItem(false);
1023     newchain->addSegPair(a3segs[i]);
1024     a3segs[i]->chain=newchain;
1025     segchains.Add(newchain); //just to free them when done
1026     }
1027     for (int j=i+1;j<a3segs.Count();j++) {
1028     CSegChain* chain=a3segs[i]->chain;
1029     if (chain->extendChain(a3segs[j])) {
1030     a3segs[j]->chain=chain;
1031     #ifdef DEBUG
1032     if (debug) dbgPrintChain(*chain, adapter3.chars());
1033     #endif
1034     //save time by checking here if the extended chain is already acceptable for trimming
1035     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036     l5=0;
1037     l3=chain->astart-2;
1038     #ifdef DEBUG
1039     if (debug && a3segs.Count()>0) {
1040     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041     }
1042     #endif
1043     return true;
1044     }
1045     } //chain can be extended
1046 gpertea 4 }
1047 gpertea 12 } //collect segment alignments into chains
1048     */
1049 gpertea 4 return false; //no adapter parts found
1050     }
1051    
1052     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1053 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1054 gpertea 4 int rlen=seq.length();
1055     l5=0;
1056     l3=rlen-1;
1057     //try to see if adapter is fully included in the read
1058     int fi=-1;
1059     if ((fi=seq.index(adapter5))>=0) {
1060     if (fi<rlen-fi-a5len) {//match is closer to the right end
1061     l5=fi+a5len;
1062     l3=rlen-1;
1063     }
1064     else {
1065     l5=0;
1066     l3=fi-1;
1067     }
1068     return true;
1069     }
1070 gpertea 12 #ifdef DEBUG
1071     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1072     #endif
1073    
1074     CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1075    
1076     //try the easy way out first - look for an exact match of 11 bases
1077     int fdlen=11;
1078     if (a5len<16) {
1079     fdlen=a5len>>1;
1080 gpertea 4 }
1081 gpertea 12 if (fdlen>4) {
1082     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1083     if ((fi=adapter5.index(rstart))>=0) {
1084     #ifdef DEBUG
1085     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1086     #endif
1087     if (extendMatch(seq.chars(), rlen, 1,
1088     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1089     return true;
1090     }
1091     //another easy case: last 11 characters of the adaptor found as a substring of the read
1092     GStr bstr=adapter5.substr(-fdlen);
1093     if ((fi=seq.index(bstr))>=0) {
1094     #ifdef DEBUG
1095     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1096     #endif
1097     if (extendMatch(seq.chars(), rlen, fi,
1098     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1099     return true;
1100     }
1101     } //tried to matching at most 11 bases first
1102    
1103     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1104     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1105     int wordSize=3;
1106     int hlen=12;
1107     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1108     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1109     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1110     for (int iw=0;iw<=hlen;iw++) {
1111     int apstart=a5len-iw-wordSize;
1112     fi=0;
1113     //int* qv=(int32 *)(adapter5.chars()+apstart);
1114     int qv=get3mer_value(adapter5.chars()+apstart);
1115     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1116     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1117     #ifdef DEBUG
1118     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1119     #endif
1120     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1121     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1122     fi++;
1123     if (fi>imax) break;
1124     }
1125     } //for each wmer in the last hlen bases of the adaptor
1126     /*
1127    
1128     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1129     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1130     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1131     int hlen2=a5len-wordSize;
1132     //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1133     #ifdef DEBUG
1134     if (debug && a5segs.Count()>0) {
1135     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1136     }
1137     #endif
1138     if (hlen2>hlen) {
1139     for (int iw=hlen+1;iw<=hlen2;iw++) {
1140     int apstart=a5len-iw-wordSize;
1141     fi=0;
1142     //int* qv=(int32 *)(adapter5.chars()+apstart);
1143     int qv=get3mer_value(adapter5.chars()+apstart);
1144     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1145     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1146     extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1147     a5len, apstart, wordSize, l5,l3, a5segs, true);
1148     fi++;
1149     if (fi>imax) break;
1150     }
1151     } //for each wmer between hlen2 and hlen bases of the adaptor
1152     }
1153     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1154     // lastly, analyze collected a5segs for a possible gapped alignment:
1155     GList<CSegChain> segchains(false,true,false);
1156     #ifdef DEBUG
1157     if (debug && a5segs.Count()>0) {
1158     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1159 gpertea 4 }
1160 gpertea 12 #endif
1161     for (int i=0;i<a5segs.Count();i++) {
1162     if (a5segs[i]->chain==NULL) {
1163     if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1164     CSegChain* newchain=new CSegChain(true);
1165     newchain->setFreeItem(false);
1166     newchain->addSegPair(a5segs[i]);
1167     a5segs[i]->chain=newchain;
1168     segchains.Add(newchain); //just to free them when done
1169     }
1170     for (int j=i+1;j<a5segs.Count();j++) {
1171     CSegChain* chain=a5segs[i]->chain;
1172     if (chain->extendChain(a5segs[j])) {
1173     a5segs[j]->chain=chain;
1174     #ifdef DEBUG
1175     if (debug) dbgPrintChain(*chain, adapter5.chars());
1176     #endif
1177     //save time by checking here if the extended chain is already acceptable for trimming
1178     if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1179     l5=chain->aend;
1180     l3=rlen-1;
1181     return true;
1182     }
1183     } //chain can be extended
1184 gpertea 4 }
1185 gpertea 12 } //collect segment alignments into chains
1186     */
1187 gpertea 4 return false; //no adapter parts found
1188     }
1189    
1190 gpertea 12 //convert qvs to/from phred64 from/to phread33
1191     void convertPhred(GStr& q) {
1192     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1193 gpertea 4 }
1194    
1195 gpertea 12 void convertPhred(char* q, int len) {
1196     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1197 gpertea 4 }
1198    
1199 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1200     GStr& rname, GStr& rinfo, GStr& infname) {
1201     rseq="";
1202     rqv="";
1203     rname="";
1204     rinfo="";
1205     if (fq.eof()) return false;
1206     char* l=fq.getLine();
1207     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1208     if (l==NULL) return false;
1209     /* if (rawFormat) {
1210     //TODO: implement raw qseq parsing here
1211     //if (raw type=N) then continue; //skip invalid/bad records
1212     } //raw qseq format
1213     else { // FASTQ or FASTA */
1214     isfasta=(l[0]=='>');
1215     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1216     infname.chars(), l);
1217     GStr s(l);
1218     rname=&(l[1]);
1219     for (int i=0;i<rname.length();i++)
1220     if (rname[i]<=' ') {
1221     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1222     rname.cut(i);
1223     break;
1224     }
1225     //now get the sequence
1226     if ((l=fq.getLine())==NULL)
1227     GError("Error: unexpected EOF after header for read %s (%s)\n",
1228     rname.chars(), infname.chars());
1229     rseq=l; //this must be the DNA line
1230     while ((l=fq.getLine())!=NULL) {
1231     //seq can span multiple lines
1232     if (l[0]=='>' || l[0]=='+') {
1233     fq.pushBack();
1234     break; //
1235     }
1236     rseq+=l;
1237     } //check for multi-line seq
1238     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1239     if ((l=fq.getLine())==NULL)
1240     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1241     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1242     if ((l=fq.getLine())==NULL)
1243     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1244     rqv=l;
1245     //if (rqv.length()!=rseq.length())
1246     // GError("Error: qv len != seq len for %s\n", rname.chars());
1247     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1248     rqv+=l; //append to qv string
1249     }
1250     }// fastq
1251     // } //<-- FASTA or FASTQ
1252 gpertea 94 rseq.upper();
1253 gpertea 13 return true;
1254     }
1255    
1256     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1257     //returns 0 if the read was untouched, 1 if it was just trimmed
1258     // and a trash code if it was trashed
1259     l5=0;
1260     l3=rseq.length()-1;
1261     if (l3-l5+1<min_read_len) {
1262     return 's';
1263     }
1264     GStr wseq(rseq.chars());
1265     GStr wqv(rqv.chars());
1266     int w5=l5;
1267     int w3=l3;
1268     //first do the q-based trimming
1269     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1270     if (w3-w5+1<min_read_len) {
1271     return 'Q'; //invalid read
1272     }
1273     //-- keep only the w5..w3 range
1274     l5=w5;
1275     l3=w3;
1276     wseq=wseq.substr(w5, w3-w5+1);
1277     if (!wqv.is_empty())
1278     wqv=wqv.substr(w5, w3-w5+1);
1279     } //qv trimming
1280     // N-trimming on the remaining read seq
1281     if (ntrim(wseq, w5, w3)) {
1282     //GMessage("before: %s\n",wseq.chars());
1283     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1284     l5+=w5;
1285     l3-=(wseq.length()-1-w3);
1286     if (w3-w5+1<min_read_len) {
1287     return 'N'; //to be trashed
1288     }
1289     //-- keep only the w5..w3 range
1290     wseq=wseq.substr(w5, w3-w5+1);
1291     if (!wqv.is_empty())
1292     wqv=wqv.substr(w5, w3-w5+1);
1293     w5=0;
1294     w3=wseq.length()-1;
1295     }
1296     if (a3len>0) {
1297     if (trim_adapter3(wseq, w5, w3)) {
1298     int trimlen=wseq.length()-(w3-w5+1);
1299     num_trimmed3++;
1300     if (trimlen<min_trimmed3)
1301     min_trimmed3=trimlen;
1302     l5+=w5;
1303     l3-=(wseq.length()-1-w3);
1304     if (w3-w5+1<min_read_len) {
1305     return '3';
1306     }
1307     //-- keep only the w5..w3 range
1308     wseq=wseq.substr(w5, w3-w5+1);
1309     if (!wqv.is_empty())
1310     wqv=wqv.substr(w5, w3-w5+1);
1311     }//some adapter was trimmed
1312     } //adapter trimming
1313     if (a5len>0) {
1314     if (trim_adapter5(wseq, w5, w3)) {
1315     int trimlen=wseq.length()-(w3-w5+1);
1316     num_trimmed5++;
1317     if (trimlen<min_trimmed5)
1318     min_trimmed5=trimlen;
1319     l5+=w5;
1320     l3-=(wseq.length()-1-w3);
1321     if (w3-w5+1<min_read_len) {
1322     return '5';
1323     }
1324     //-- keep only the w5..w3 range
1325     wseq=wseq.substr(w5, w3-w5+1);
1326     if (!wqv.is_empty())
1327     wqv=wqv.substr(w5, w3-w5+1);
1328     }//some adapter was trimmed
1329     } //adapter trimming
1330     if (doCollapse) {
1331     //keep read for later
1332     FqDupRec* dr=dhash.Find(wseq.chars());
1333     if (dr==NULL) { //new entry
1334     //if (prefix.is_empty())
1335     dhash.Add(wseq.chars(),
1336     new FqDupRec(&wqv, rname.chars()));
1337     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1338     }
1339     else
1340     dr->add(wqv);
1341     } //collapsing duplicates
1342     else { //not collapsing duplicates
1343     //apply the dust filter now
1344     if (doDust) {
1345     int dustbases=dust(wseq);
1346     if (dustbases>(wseq.length()>>1)) {
1347     return 'D';
1348     }
1349     }
1350     } //not collapsing duplicates
1351     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1352     }
1353    
1354     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1355     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1356     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1357     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1358     }
1359    
1360     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1361     outcounter++;
1362     bool asFasta=(rqv.is_empty() || fastaOutput);
1363     if (asFasta) {
1364     if (prefix.is_empty()) {
1365     printHeader(f_out, '>',rname,rinfo);
1366     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1367     }
1368     else {
1369     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1370     rseq.chars());
1371     }
1372     }
1373     else { //fastq
1374     if (convert_phred) convertPhred(rqv);
1375     if (prefix.is_empty()) {
1376     printHeader(f_out, '@', rname, rinfo);
1377     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1378     }
1379     else
1380     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1381     rseq.chars(),rqv.chars() );
1382     }
1383     }
1384    
1385     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1386     if (freport==NULL || trashcode<=' ') return;
1387     if (trashcode=='3' || trashcode=='5') {
1388     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1389     }
1390     else {
1391     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1392     }
1393     //tcounter++;
1394     }
1395    
1396     GStr getFext(GStr& s, int* xpos=NULL) {
1397     //s must be a filename without a path
1398     GStr r("");
1399     if (xpos!=NULL) *xpos=0;
1400     if (s.is_empty() || s=="-") return r;
1401     int p=s.rindex('.');
1402     int d=s.rindex('/');
1403     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1404     r=s.substr(p+1);
1405     if (xpos!=NULL) *xpos=p+1;
1406     r.lower();
1407     return r;
1408     }
1409    
1410     void baseFileName(GStr& fname) {
1411     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1412     int xpos=0;
1413     GStr fext=getFext(fname, &xpos);
1414     if (xpos<=1) return;
1415     bool extdel=false;
1416     GStr f2;
1417     if (fext=="z" || fext=="zip") {
1418     extdel=true;
1419     }
1420     else if (fext.length()>=2) {
1421     f2=fext.substr(0,2);
1422     extdel=(f2=="gz" || f2=="bz");
1423     }
1424     if (extdel) {
1425     fname.cut(xpos-1);
1426     fext=getFext(fname, &xpos);
1427     if (xpos<=1) return;
1428     }
1429     extdel=false;
1430     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1431     extdel=true;
1432     }
1433     else if (fext.length()>=2) {
1434     extdel=(fext.substr(0,2)=="fa");
1435     }
1436     if (extdel) fname.cut(xpos-1);
1437     GStr fncp(fname);
1438     fncp.lower();
1439     fncp.chomp("seq");
1440     fncp.chomp("sequence");
1441     fncp.trimR("_.");
1442     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1443     }
1444    
1445     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1446     FILE* f_out=NULL;
1447     GStr fname(getFileName(infname.chars()));
1448     //eliminate known extensions
1449     baseFileName(fname);
1450     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1451     else if (pocmd.is_empty()) {
1452     GStr oname(fname);
1453     oname.append('.');
1454     oname.append(outsuffix);
1455     f_out=fopen(oname.chars(),"w");
1456     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1457     }
1458     else {
1459     GStr oname(">");
1460     oname.append(fname);
1461     oname.append('.');
1462     oname.append(outsuffix);
1463     pocmd.append(oname);
1464     f_out=popen(pocmd.chars(), "w");
1465     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1466     }
1467     return f_out;
1468     }
1469    
1470     void guess_unzip(GStr& fname, GStr& picmd) {
1471     GStr fext=getFext(fname);
1472     if (fext=="gz" || fext=="gzip" || fext=="z") {
1473     picmd="gzip -cd ";
1474     }
1475     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1476     picmd="bzip2 -cd ";
1477     }
1478     }
1479    
1480 gpertea 94
1481     int loadAdapters(const char* fname) {
1482     GLineReader lr(fname);
1483     char* l;
1484     while ((l=lr.nextLine())!=NULL) {
1485     if (lr.length()<=3 || l[0]=='#') continue;
1486     char* s5;
1487     char* s3;
1488     if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') {
1489     int i=1;
1490     while (l[i]!=0 && isspace(l[i])) {
1491     i++;
1492     }
1493     if (l[i]!=0) {
1494     adapters.Add(new CAdapters(NULL, &(l[i])));
1495     continue;
1496     }
1497     }
1498     else {
1499     p=l;
1500     while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
1501     delpos++;
1502     s5.startTokenize("\t ;,:",);
1503     s5.nextToken(s3);
1504     }
1505     }
1506    
1507     }
1508    
1509 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1510     GStr& s, GStr& infname, GStr& infname2) {
1511     // uses outsuffix to generate output file names and open file handles as needed
1512     infname="";
1513     infname2="";
1514     f_in=NULL;
1515     f_in2=NULL;
1516     f_out=NULL;
1517     f_out2=NULL;
1518     //analyze outsuffix intent
1519     GStr pocmd;
1520 gpertea 76 if (outsuffix=="-") {
1521     f_out=stdout;
1522     }
1523     else {
1524     GStr ox=getFext(outsuffix);
1525     if (ox.length()>2) ox=ox.substr(0,2);
1526     if (ox=="gz") pocmd="gzip -9 -c ";
1527     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1528     }
1529 gpertea 13 if (s=="-") {
1530     f_in=stdin;
1531     infname="stdin";
1532     f_out=prepOutFile(infname, pocmd);
1533     return;
1534     } // streaming from stdin
1535     s.startTokenize(",:");
1536     s.nextToken(infname);
1537     bool paired=s.nextToken(infname2);
1538     if (fileExists(infname.chars())==0)
1539     GError("Error: cannot find file %s!\n",infname.chars());
1540     GStr fname(getFileName(infname.chars()));
1541     GStr picmd;
1542     guess_unzip(fname, picmd);
1543     if (picmd.is_empty()) {
1544     f_in=fopen(infname.chars(), "r");
1545     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1546     }
1547     else {
1548     picmd.append(infname);
1549     f_in=popen(picmd.chars(), "r");
1550     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1551     }
1552 gpertea 76 if (f_out==stdout) {
1553     if (paired) GError("Error: output suffix required for paired reads\n");
1554     return;
1555     }
1556 gpertea 13 f_out=prepOutFile(infname, pocmd);
1557     if (!paired) return;
1558     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1559     // ---- paired reads:-------------
1560     if (fileExists(infname2.chars())==0)
1561     GError("Error: cannot find file %s!\n",infname2.chars());
1562     picmd="";
1563     GStr fname2(getFileName(infname2.chars()));
1564     guess_unzip(fname2, picmd);
1565     if (picmd.is_empty()) {
1566     f_in2=fopen(infname2.chars(), "r");
1567     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1568     }
1569     else {
1570     picmd.append(infname2);
1571     f_in2=popen(picmd.chars(), "r");
1572     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1573     }
1574     f_out2=prepOutFile(infname2, pocmd);
1575     }