ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 177
Committed: Thu Feb 16 07:10:32 2012 UTC (7 years, 6 months ago) by gpertea
File size: 43339 byte(s)
Log Message:
collectSeeds unified

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 177 //const int match_reward=2;
98     //const int mismatch_penalty=3;
99 gpertea 103 const int match_reward=2;
100 gpertea 177 const int mismatch_penalty=4;
101     const int Xdrop=10;
102 gpertea 103
103 gpertea 94 const int poly_m_score=2; //match score
104     const int poly_mis_score=-3; //mismatch
105 gpertea 88 const int poly_dropoff_score=7;
106 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 gpertea 4
108 gpertea 94 const char *polyA_seed="AAAA";
109     const char *polyT_seed="TTTT";
110 gpertea 12
111 gpertea 170 struct CASeqData {
112     //positional data for every possible hexamer in an adapter
113 gpertea 172 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 gpertea 170 GStr seq; //actual adapter sequence data
116     GStr seqr; //reverse complement sequence
117 gpertea 175 int amlen; //fraction of adapter length matching that's
118     //enough to consider the alignment
119 gpertea 172 bool use_reverse;
120 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
121     amlen(0), use_reverse(rev) {
122 gpertea 172 for (int i=0;i<4096;i++) {
123     pz[i]=NULL;
124     pzr[i]=NULL;
125     }
126     }
127    
128     void update(const char* s) {
129 gpertea 175 seq=s;
130     table6mers(seq.chars(), seq.length(), pz);
131     amlen=iround(double(seq.length())*0.8);
132     if (amlen<12)
133     amlen=12;
134     if (!use_reverse) return;
135     //reverse complement
136     seqr=s;
137     int slen=seq.length();
138     for (int i=0;i<slen;i++)
139     seqr[i]=ntComplement(seq[slen-i-1]);
140     table6mers(seqr.chars(), seqr.length(), pzr);
141 gpertea 170 }
142    
143 gpertea 172 ~CASeqData() {
144     for (int i=0;i<4096;i++) {
145     delete pz[i];
146     delete pzr[i];
147 gpertea 170 }
148     }
149     };
150    
151     GVec<CASeqData> adapters5;
152     GVec<CASeqData> adapters3;
153    
154 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
155     CGreedyAlignData* gxmem_r=NULL;
156    
157 gpertea 4 // element in dhash:
158     class FqDupRec {
159     public:
160     int count; //how many of these reads are the same
161     int len; //length of qv
162     char* firstname; //optional, only if we want to keep the original read names
163     char* qv;
164     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
165     len=0;
166     qv=NULL;
167     firstname=NULL;
168     count=0;
169     if (qstr!=NULL) {
170     qv=Gstrdup(qstr->chars());
171     len=qstr->length();
172     count++;
173     }
174     if (rname!=NULL) firstname=Gstrdup(rname);
175     }
176     ~FqDupRec() {
177     GFREE(qv);
178     GFREE(firstname);
179     }
180     void add(GStr& d) { //collapse another record into this one
181     if (d.length()!=len)
182     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
183     count++;
184     for (int i=0;i<len;i++)
185     qv[i]=(qv[i]+d[i])/2;
186     }
187     };
188    
189     void openfw(FILE* &f, GArgs& args, char opt) {
190     GStr s=args.getOpt(opt);
191     if (!s.is_empty()) {
192     if (s=='-') f=stdout;
193     else {
194 gpertea 13 f=fopen(s.chars(),"w");
195 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
196     }
197     }
198     }
199    
200     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
201     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
202    
203     GHash<FqDupRec> dhash; //hash to keep track of duplicates
204    
205 gpertea 172 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
206 gpertea 94 int loadAdapters(const char* fname);
207    
208 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
209     GStr& s, GStr& infname, GStr& infname2);
210 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
211 gpertea 170
212 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
213     void trash_report(char trashcode, GStr& rname, FILE* freport);
214    
215 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
216 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
217    
218 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
219 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
220    
221 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
222     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
223 gpertea 4 int dust(GStr& seq);
224 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
225     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
226 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
227 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
228 gpertea 4
229 gpertea 12 void convertPhred(char* q, int len);
230     void convertPhred(GStr& q);
231 gpertea 4
232     int main(int argc, char * const argv[]) {
233 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
234 gpertea 4 int e;
235     if ((e=args.isError())>0) {
236     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
237     exit(224);
238     }
239 gpertea 9 debug=(args.getOpt('Y')!=NULL);
240 gpertea 13 verbose=(args.getOpt('V')!=NULL);
241 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
242 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
243     doDust=(args.getOpt('D')!=NULL);
244 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
245 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
246 gpertea 12 /*
247 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
248     if (rawFormat) {
249     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
250     }
251 gpertea 12 */
252 gpertea 4 prefix=args.getOpt('n');
253     GStr s=args.getOpt('l');
254 gpertea 170 if (!s.is_empty())
255 gpertea 4 min_read_len=s.asInt();
256 gpertea 12 s=args.getOpt('m');
257 gpertea 170 if (!s.is_empty())
258 gpertea 12 max_perc_N=s.asDouble();
259 gpertea 4 s=args.getOpt('d');
260     if (!s.is_empty()) {
261     dust_cutoff=s.asInt();
262     doDust=true;
263     }
264 gpertea 12 s=args.getOpt('q');
265     if (!s.is_empty()) {
266 gpertea 13 qvtrim_qmin=s.asInt();
267 gpertea 12 }
268 gpertea 13 s=args.getOpt('t');
269     if (!s.is_empty()) {
270     qvtrim_max=s.asInt();
271     }
272 gpertea 12 s=args.getOpt('p');
273     if (!s.is_empty()) {
274     int v=s.asInt();
275     if (v==33) {
276     qv_phredtype=33;
277     qv_cvtadd=31;
278     }
279     else if (v==64) {
280     qv_phredtype=64;
281     qv_cvtadd=-31;
282     }
283 gpertea 170 else
284 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
285     }
286 gpertea 94 s=args.getOpt('f');
287     if (!s.is_empty()) {
288     loadAdapters(s.chars());
289     }
290 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
291 gpertea 94 s=args.getOpt('5');
292     if (!s.is_empty()) {
293     if (fileAdapters)
294     GError("Error: options -5 and -f cannot be used together!\n");
295     s.upper();
296 gpertea 172 addAdapter(adapters5, s);
297 gpertea 4 }
298 gpertea 94 s=args.getOpt('3');
299     if (!s.is_empty()) {
300     if (fileAdapters)
301     GError("Error: options -3 and -f cannot be used together!\n");
302 gpertea 102 s.upper();
303 gpertea 172 addAdapter(adapters3, s);
304 gpertea 4 }
305 gpertea 94 s=args.getOpt('y');
306 gpertea 13 if (!s.is_empty()) {
307 gpertea 94 int minmatch=s.asInt();
308     poly_minScore=minmatch*poly_m_score;
309 gpertea 13 }
310 gpertea 170
311 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
312 gpertea 76 else outsuffix="-";
313 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
314 gpertea 13 int fcount=args.startNonOpt();
315     if (fcount==0) {
316 gpertea 4 GMessage(USAGE);
317     exit(224);
318     }
319 gpertea 13 if (fcount>1 && doCollapse) {
320     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
321     }
322     //openfw(f_out, args, 'o');
323     //if (f_out==NULL) f_out=stdout;
324 gpertea 4 if (trashReport)
325     openfw(freport, args, 'r');
326     char* infile=NULL;
327 gpertea 103
328     if (adapters5.Count()>0)
329 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
330     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
331 gpertea 103 if (adapters3.Count()>0)
332     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333    
334 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
335 gpertea 103 //for each input file
336 gpertea 13 int incounter=0; //counter for input reads
337     int outcounter=0; //counter for output reads
338     int trash_s=0; //too short from the get go
339     int trash_Q=0;
340     int trash_N=0;
341     int trash_D=0;
342 gpertea 103 int trash_poly=0;
343 gpertea 13 int trash_A3=0;
344     int trash_A5=0;
345     s=infile;
346     GStr infname;
347     GStr infname2;
348     FILE* f_in=NULL;
349     FILE* f_in2=NULL;
350     FILE* f_out=NULL;
351     FILE* f_out2=NULL;
352     bool paired_reads=false;
353     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
354     GLineReader fq(f_in);
355     GLineReader* fq2=NULL;
356     if (f_in2!=NULL) {
357     fq2=new GLineReader(f_in2);
358     paired_reads=true;
359     }
360     GStr rseq, rqv, seqid, seqinfo;
361     GStr rseq2, rqv2, seqid2, seqinfo2;
362     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
363     incounter++;
364     int a5=0, a3=0, b5=0, b3=0;
365     char tcode=0, tcode2=0;
366     tcode=process_read(seqid, rseq, rqv, a5, a3);
367 gpertea 103 if (fq2!=NULL) {
368 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
369     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
370     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
371     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
372     }
373     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
374     //decide what to do with this pair and print rseq2 if the pair makes it
375     if (tcode>1 && tcode2<=1) {
376     //"untrash" rseq
377     if (a3-a5+1<min_read_len) {
378     a5=1;
379     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
380 gpertea 4 }
381 gpertea 13 tcode=1;
382     }
383     else if (tcode<=1 && tcode2>1) {
384     //"untrash" rseq2
385     if (b3-b5+1<min_read_len) {
386     b5=1;
387     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
388     }
389     tcode2=1;
390     }
391     if (tcode<=1) { //trimmed or left intact -- write it!
392     if (tcode>0) {
393     rseq2=rseq2.substr(b5,b3-b5+1);
394     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
395 gpertea 4 }
396 gpertea 13 int nocounter=0;
397     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
398     }
399 gpertea 103 } //pair read
400 gpertea 13 if (tcode>1) { //trashed
401 gpertea 103 #ifdef GDEBUG
402 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
403 gpertea 103 #endif
404 gpertea 13 if (tcode=='s') trash_s++;
405 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
406 gpertea 13 else if (tcode=='Q') trash_Q++;
407     else if (tcode=='N') trash_N++;
408     else if (tcode=='D') trash_D++;
409     else if (tcode=='3') trash_A3++;
410     else if (tcode=='5') trash_A5++;
411     if (trashReport) trash_report(tcode, seqid, freport);
412     }
413     else if (!doCollapse) { //write it
414     if (tcode>0) {
415     rseq=rseq.substr(a5,a3-a5+1);
416     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
417     }
418 gpertea 103 #ifdef GDEBUG
419     GMessage(" After trimming:\n");
420 gpertea 105 GMessage("%s\n",rseq.chars());
421 gpertea 103 #endif
422 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
423     }
424     } //for each fastq record
425     delete fq2;
426     FRCLOSE(f_in);
427     FRCLOSE(f_in2);
428     if (doCollapse) {
429     outcounter=0;
430     int maxdup_count=1;
431     char* maxdup_seq=NULL;
432     dhash.startIterate();
433     FqDupRec* qd=NULL;
434     char* seq=NULL;
435     while ((qd=dhash.NextData(seq))!=NULL) {
436     GStr rseq(seq);
437     //do the dusting here
438     if (doDust) {
439     int dustbases=dust(rseq);
440     if (dustbases>(rseq.length()>>1)) {
441     if (trashReport && qd->firstname!=NULL) {
442     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
443 gpertea 4 }
444 gpertea 13 trash_D+=qd->count;
445     continue;
446     }
447     }
448     outcounter++;
449 gpertea 170 if (qd->count>maxdup_count) {
450 gpertea 13 maxdup_count=qd->count;
451     maxdup_seq=seq;
452     }
453     if (isfasta) {
454     if (prefix.is_empty()) {
455 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
456 gpertea 13 rseq.chars());
457 gpertea 4 }
458 gpertea 13 else { //use custom read name
459     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
460     qd->count, rseq.chars());
461 gpertea 4 }
462 gpertea 13 }
463     else { //fastq format
464     if (convert_phred) convertPhred(qd->qv, qd->len);
465     if (prefix.is_empty()) {
466 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
467 gpertea 13 rseq.chars(), qd->qv);
468 gpertea 4 }
469 gpertea 13 else { //use custom read name
470     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
471     qd->count, rseq.chars(), qd->qv);
472     }
473     }
474     }//for each element of dhash
475     if (maxdup_count>1) {
476     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
477 gpertea 4 }
478 gpertea 13 } //collapse entries
479     if (verbose) {
480     if (paired_reads) {
481     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
482     GMessage("Number of input pairs :%9d\n", incounter);
483     GMessage(" Output pairs :%9d\n", outcounter);
484     }
485     else {
486     GMessage(">Input file : %s\n", infname.chars());
487     GMessage("Number of input reads :%9d\n", incounter);
488     GMessage(" Output reads :%9d\n", outcounter);
489     }
490     GMessage("------------------------------------\n");
491     if (num_trimmed5)
492     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
493     if (num_trimmed3)
494     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
495     GMessage("------------------------------------\n");
496     if (trash_s>0)
497     GMessage(" Trashed by length:%9d\n", trash_s);
498     if (trash_N>0)
499     GMessage(" Trashed by N%%:%9d\n", trash_N);
500     if (trash_Q>0)
501     GMessage("Trashed by low quality:%9d\n", trash_Q);
502 gpertea 103 if (trash_poly>0)
503     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
504 gpertea 13 if (trash_A5>0)
505     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
506     if (trash_A3>0)
507     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
508     }
509     if (trashReport) {
510     FWCLOSE(freport);
511 gpertea 4 }
512 gpertea 13 FWCLOSE(f_out);
513     FWCLOSE(f_out2);
514     } //while each input file
515 gpertea 103 delete gxmem_l;
516     delete gxmem_r;
517 gpertea 12 //getc(stdin);
518 gpertea 4 }
519    
520     class NData {
521     public:
522     int NPos[1024]; //there should be no reads longer than 1K ?
523     int NCount;
524     int end5;
525     int end3;
526     int seqlen;
527     double perc_N; //percentage of Ns in end5..end3 range only!
528     const char* seq;
529     bool valid;
530     NData() {
531 gpertea 170 seqlen=0;
532 gpertea 4 NCount=0;
533     end5=0;
534     end3=0;
535     seq=NULL;
536     perc_N=0;
537     valid=true;
538     }
539     void init(GStr& rseq) {
540     NCount=0;
541     perc_N=0;
542     seqlen=rseq.length();
543     seq=rseq.chars();
544     end5=1;
545     end3=seqlen;
546     for (int i=0;i<seqlen;i++)
547 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
548 gpertea 4 NPos[NCount]=i;
549     NCount++;
550     }
551     perc_N=(NCount*100.0)/seqlen;
552     valid=true;
553     }
554     void N_calc() { //only in the region after trimming
555     int n=0;
556     for (int i=end3-1;i<end5;i++) {
557     if (seq[i]=='N') n++;
558     }
559     perc_N=(n*100.0)/(end5-end3+1);
560     }
561     };
562 gpertea 170
563 gpertea 4 static NData feat;
564     int perc_lenN=12; // incremental distance from ends, in percentage of
565     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
566 gpertea 170
567 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
568     /* assumes feat was filled properly */
569     int old_dif, t5,t3,v;
570 gpertea 12 if (l3<l5+2 || p5>p3 ) {
571 gpertea 4 feat.end5=l5+1;
572     feat.end3=l3+1;
573 gpertea 170 return;
574 gpertea 4 }
575    
576     t5=feat.NPos[p5]-l5;
577     t3=l3-feat.NPos[p3];
578     old_dif=p3-p5;
579     v=(int)((((double)(l3-l5))*perc_lenN)/100);
580 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
581 gpertea 4 if (t5 < v ) {
582     l5=feat.NPos[p5]+1;
583     p5++;
584     }
585     if (t3 < v) {
586     l3=feat.NPos[p3]-1;
587     p3--;
588     }
589     /* restNs=p3-p5; number of Ns in the new CLR */
590     if (p3-p5==old_dif) { /* no change, return */
591     /* don't trim if this may shorten the read too much */
592     //if (l5-l3<min_read_len) return;
593     feat.end5=l5+1;
594     feat.end3=l3+1;
595     return;
596     }
597 gpertea 170 else
598 gpertea 4 N_analyze(l5,l3, p5,p3);
599     }
600    
601    
602 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
603 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
604 gpertea 12 if (qv_phredtype==0) {
605     //try to guess the Phred type
606     int vmin=256, vmax=0;
607     for (int i=0;i<qvs.length();i++) {
608     if (vmin>qvs[i]) vmin=qvs[i];
609     if (vmax<qvs[i]) vmax=qvs[i];
610     }
611     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
612     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
613     if (qv_phredtype==0) {
614     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
615     }
616 gpertea 13 if (verbose)
617     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
618     } //guessing Phred type
619 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
620 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
621 gpertea 12 }
622     //just in case, check also the 5' the end (?)
623     for (l5=0;l5<qvs.length()-3;l5++) {
624 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
625 gpertea 12 }
626 gpertea 13 if (qvtrim_max>0) {
627     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
628     if (l5>qvtrim_max) l5=qvtrim_max;
629     }
630 gpertea 12 return (l5>0 || l3<qvs.length()-1);
631     }
632    
633     bool ntrim(GStr& rseq, int &l5, int &l3) {
634     //count Ns in the sequence, trim N-rich ends
635 gpertea 4 feat.init(rseq);
636     l5=feat.end5-1;
637     l3=feat.end3-1;
638 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
639 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
640     if (feat.perc_N>max_perc_N) {
641     feat.valid=false;
642     l3=l5+1;
643     return true;
644     }
645     else {
646     return false; //nothing changed
647     }
648     }
649 gpertea 4 l5=feat.end5-1;
650     l3=feat.end3-1;
651     if (l3-l5+1<min_read_len) {
652     feat.valid=false;
653     return true;
654     }
655     feat.N_calc();
656 gpertea 170
657 gpertea 12 if (feat.perc_N>max_perc_N) {
658 gpertea 4 feat.valid=false;
659     l3=l5+1;
660     return true;
661     }
662     return true;
663     }
664    
665     //--------------- dust functions ----------------
666     class DNADuster {
667     public:
668 gpertea 170 int dustword;
669     int dustwindow;
670     int dustwindow2;
671 gpertea 4 int dustcutoff;
672     int mv, iv, jv;
673     int counts[32*32*32];
674     int iis[32*32*32];
675     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
676     dustword=wordsize;
677     dustwindow=winsize;
678     dustwindow2 = (winsize>>1);
679     dustcutoff=cutoff;
680     mv=0;
681     iv=0;
682     jv=0;
683     }
684     void setWindowSize(int value) {
685     dustwindow = value;
686     dustwindow2 = (dustwindow >> 1);
687     }
688     void setWordSize(int value) {
689     dustword=value;
690     }
691     void wo1(int len, const char* s, int ivv) {
692     int i, ii, j, v, t, n, n1, sum;
693     int js, nis;
694     n = 32 * 32 * 32;
695     n1 = n - 1;
696     nis = 0;
697     i = 0;
698     ii = 0;
699     sum = 0;
700     v = 0;
701     for (j=0; j < len; j++, s++) {
702     ii <<= 5;
703     if (*s<=32) {
704     i=0;
705     continue;
706     }
707     ii |= *s - 'A'; //assume uppercase!
708     ii &= n1;
709     i++;
710     if (i >= dustword) {
711     for (js=0; js < nis && iis[js] != ii; js++) ;
712     if (js == nis) {
713     iis[nis] = ii;
714     counts[ii] = 0;
715     nis++;
716     }
717     if ((t = counts[ii]) > 0) {
718     sum += t;
719     v = 10 * sum / j;
720     if (mv < v) {
721     mv = v;
722     iv = ivv;
723     jv = j;
724     }
725     }
726     counts[ii]++;
727     }
728     }
729     }
730    
731     int wo(int len, const char* s, int* beg, int* end) {
732     int i, l1;
733     l1 = len - dustword + 1;
734     if (l1 < 0) {
735     *beg = 0;
736     *end = len - 1;
737     return 0;
738     }
739     mv = 0;
740     iv = 0;
741     jv = 0;
742     for (i=0; i < l1; i++) {
743     wo1(len-i, s+i, i);
744     }
745     *beg = iv;
746     *end = iv + jv;
747     return mv;
748     }
749    
750     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
751     int i, j, l, a, b, v;
752     if (cutoff==0) cutoff=dustcutoff;
753     a=0;b=0;
754     //GMessage("Dust cutoff=%d\n", cutoff);
755     for (i=0; i < seqlen; i += dustwindow2) {
756     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
757     v = wo(l, seq+i, &a, &b);
758     if (v > cutoff) {
759     //for (j = a; j <= b && j < dustwindow2; j++) {
760     for (j = a; j <= b; j++) {
761     seqmsk[i+j]='N';//could be made lowercase instead
762     }
763     }
764     }
765 gpertea 170 //return first;
766 gpertea 4 }
767     };
768    
769     static DNADuster duster;
770    
771     int dust(GStr& rseq) {
772     char* seq=Gstrdup(rseq.chars());
773     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
774     //check the number of Ns:
775     int ncount=0;
776     for (int i=0;i<rseq.length();i++) {
777     if (seq[i]=='N') ncount++;
778     }
779     GFREE(seq);
780     return ncount;
781     }
782    
783 gpertea 94 struct SLocScore {
784     int pos;
785     int score;
786     SLocScore(int p=0,int s=0) {
787     pos=p;
788     score=s;
789     }
790     void set(int p, int s) {
791     pos=p;
792     score=s;
793     }
794     void add(int p, int add) {
795     pos=p;
796     score+=add;
797     }
798     };
799 gpertea 12
800 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
801 gpertea 103 if (!doPolyTrim) return false;
802 gpertea 94 int rlen=seq.length();
803     l5=0;
804     l3=rlen-1;
805     int32 seedVal=*(int32*)poly_seed;
806     char polyChar=poly_seed[0];
807     //assumes N trimming was already done
808     //so a poly match should be very close to the end of the read
809     // -- find the initial match (seed)
810 gpertea 108 int lmin=GMAX((rlen-16), 0);
811 gpertea 94 int li;
812     for (li=rlen-4;li>lmin;li--) {
813     if (seedVal==*(int*)&(seq[li])) {
814     break;
815     }
816 gpertea 12 }
817 gpertea 94 if (li<=lmin) return false;
818     //seed found, try to extend it both ways
819     //extend right
820     int ri=li+3;
821     SLocScore loc(ri, poly_m_score<<2);
822     SLocScore maxloc(loc);
823     //extend right
824 gpertea 170 while (ri<rlen-1) {
825 gpertea 94 ri++;
826     if (seq[ri]==polyChar) {
827     loc.add(ri,poly_m_score);
828     }
829     else if (seq[ri]=='N') {
830     loc.add(ri,0);
831     }
832     else { //mismatch
833     loc.add(ri,poly_mis_score);
834     if (maxloc.score-loc.score>poly_dropoff_score) break;
835     }
836     if (maxloc.score<=loc.score) {
837     maxloc=loc;
838     }
839 gpertea 12 }
840 gpertea 94 ri=maxloc.pos;
841 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
842 gpertea 94 //ri = right boundary for the poly match
843     //extend left
844     loc.set(li, maxloc.score);
845     maxloc.pos=li;
846     while (li>0) {
847     li--;
848     if (seq[li]==polyChar) {
849     loc.add(li,poly_m_score);
850     }
851     else if (seq[li]=='N') {
852     loc.add(li,0);
853     }
854     else { //mismatch
855     loc.add(li,poly_mis_score);
856     if (maxloc.score-loc.score>poly_dropoff_score) break;
857     }
858     if (maxloc.score<=loc.score) {
859     maxloc=loc;
860     }
861     }
862 gpertea 105 li=maxloc.pos;
863 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
864     (maxloc.score>poly_minScore && ri>=rlen-3) ||
865     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
866 gpertea 105 //trimming this li-ri match at 3' end
867     l3=li-1;
868     if (l3<0) l3=0;
869 gpertea 94 return true;
870     }
871     return false;
872 gpertea 12 }
873    
874 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
875 gpertea 103 if (!doPolyTrim) return false;
876 gpertea 88 int rlen=seq.length();
877     l5=0;
878     l3=rlen-1;
879 gpertea 94 int32 seedVal=*(int32*)poly_seed;
880     char polyChar=poly_seed[0];
881 gpertea 88 //assumes N trimming was already done
882     //so a poly match should be very close to the end of the read
883     // -- find the initial match (seed)
884 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
885 gpertea 94 int li;
886     for (li=0;li<=lmax;li++) {
887     if (seedVal==*(int*)&(seq[li])) {
888 gpertea 88 break;
889     }
890     }
891 gpertea 94 if (li>lmax) return false;
892     //seed found, try to extend it both ways
893     //extend left
894     int ri=li+3; //save rightmost base of the seed
895     SLocScore loc(li, poly_m_score<<2);
896     SLocScore maxloc(loc);
897     while (li>0) {
898     li--;
899     if (seq[li]==polyChar) {
900     loc.add(li,poly_m_score);
901     }
902     else if (seq[li]=='N') {
903     loc.add(li,0);
904     }
905     else { //mismatch
906     loc.add(li,poly_mis_score);
907     if (maxloc.score-loc.score>poly_dropoff_score) break;
908     }
909     if (maxloc.score<=loc.score) {
910     maxloc=loc;
911     }
912 gpertea 88 }
913 gpertea 94 li=maxloc.pos;
914 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
915 gpertea 94 //li = right boundary for the poly match
916    
917     //extend right
918     loc.set(ri, maxloc.score);
919     maxloc.pos=ri;
920 gpertea 170 while (ri<rlen-1) {
921 gpertea 94 ri++;
922     if (seq[ri]==polyChar) {
923     loc.add(ri,poly_m_score);
924     }
925     else if (seq[ri]=='N') {
926     loc.add(ri,0);
927     }
928     else { //mismatch
929     loc.add(ri,poly_mis_score);
930     if (maxloc.score-loc.score>poly_dropoff_score) break;
931     }
932     if (maxloc.score<=loc.score) {
933     maxloc=loc;
934     }
935     }
936 gpertea 105 ri=maxloc.pos;
937 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
938     (maxloc.score>poly_minScore && li<2)
939 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
940 gpertea 105 //adjust l5 to reflect this trimming of 5' end
941     l5=ri+1;
942     if (l5>rlen-1) l5=rlen-1;
943 gpertea 94 return true;
944     }
945     return false;
946 gpertea 88 }
947    
948 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
949 gpertea 103 if (adapters3.Count()==0) return false;
950 gpertea 4 int rlen=seq.length();
951     l5=0;
952     l3=rlen-1;
953 gpertea 103 bool trimmed=false;
954 gpertea 175 GStr wseq(seq);
955 gpertea 103 int wlen=rlen;
956 gpertea 175 GXSeqData seqdata;
957 gpertea 103 for (int ai=0;ai<adapters3.Count();ai++) {
958 gpertea 175 seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
959     adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
960     GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
961     if (bestaln) {
962 gpertea 103 trimmed=true;
963 gpertea 175 //keep unmatched region on the left OR right (the longer one)
964     if (bestaln->sl > wlen-bestaln->sr) {
965     //keep left side
966     l3-=(wlen-bestaln->sl+1);
967     if (l3<0) l3=0;
968     }
969     else { //keep right side
970     l5+=bestaln->sr;
971     if (l5>=rlen) l5=rlen-1;
972     }
973     delete bestaln;
974 gpertea 103 if (l3-l5+1<min_read_len) return true;
975     wseq=seq.substr(l5,l3-l5+1);
976     wlen=wseq.length();
977     }
978     }//for each 5' adapter
979     return trimmed;
980 gpertea 4 }
981    
982     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
983 gpertea 103 if (adapters5.Count()==0) return false;
984 gpertea 4 int rlen=seq.length();
985     l5=0;
986     l3=rlen-1;
987 gpertea 102 bool trimmed=false;
988 gpertea 175 GStr wseq(seq);
989 gpertea 103 int wlen=rlen;
990 gpertea 175 GXSeqData seqdata;
991 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
992 gpertea 175 seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
993     adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
994     GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
995     if (bestaln) {
996 gpertea 103 trimmed=true;
997 gpertea 177 if (bestaln->sl-1 > wlen-bestaln->sr) {
998 gpertea 175 //keep left side
999     l3-=(wlen-bestaln->sl+1);
1000     if (l3<0) l3=0;
1001     }
1002     else { //keep right side
1003     l5+=bestaln->sr;
1004     if (l5>=rlen) l5=rlen-1;
1005     }
1006     delete bestaln;
1007 gpertea 103 if (l3-l5+1<min_read_len) return true;
1008     wseq=seq.substr(l5,l3-l5+1);
1009     wlen=wseq.length();
1010     }
1011 gpertea 102 }//for each 5' adapter
1012     return trimmed;
1013 gpertea 98 }
1014 gpertea 4
1015 gpertea 170 //convert qvs to/from phred64 from/to phread33
1016 gpertea 12 void convertPhred(GStr& q) {
1017     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1018 gpertea 4 }
1019    
1020 gpertea 12 void convertPhred(char* q, int len) {
1021     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1022 gpertea 4 }
1023    
1024 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1025 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1026     rseq="";
1027     rqv="";
1028     rname="";
1029     rinfo="";
1030     if (fq.eof()) return false;
1031     char* l=fq.getLine();
1032     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1033     if (l==NULL) return false;
1034     /* if (rawFormat) {
1035     //TODO: implement raw qseq parsing here
1036     //if (raw type=N) then continue; //skip invalid/bad records
1037     } //raw qseq format
1038     else { // FASTQ or FASTA */
1039     isfasta=(l[0]=='>');
1040 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1041 gpertea 13 infname.chars(), l);
1042     GStr s(l);
1043     rname=&(l[1]);
1044     for (int i=0;i<rname.length();i++)
1045 gpertea 170 if (rname[i]<=' ') {
1046     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1047     rname.cut(i);
1048     break;
1049 gpertea 13 }
1050     //now get the sequence
1051 gpertea 170 if ((l=fq.getLine())==NULL)
1052 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1053     rname.chars(), infname.chars());
1054     rseq=l; //this must be the DNA line
1055     while ((l=fq.getLine())!=NULL) {
1056     //seq can span multiple lines
1057     if (l[0]=='>' || l[0]=='+') {
1058 gpertea 170 fq.pushBack();
1059 gpertea 13 break; //
1060     }
1061     rseq+=l;
1062 gpertea 170 } //check for multi-line seq
1063 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1064     if ((l=fq.getLine())==NULL)
1065     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1066     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1067 gpertea 170 if ((l=fq.getLine())==NULL)
1068 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1069     rqv=l;
1070 gpertea 170 //if (rqv.length()!=rseq.length())
1071 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1072     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1073     rqv+=l; //append to qv string
1074     }
1075     }// fastq
1076     // } //<-- FASTA or FASTQ
1077 gpertea 94 rseq.upper();
1078 gpertea 13 return true;
1079     }
1080    
1081 gpertea 105 #ifdef GDEBUG
1082     void showTrim(GStr& s, int l5, int l3) {
1083 gpertea 177 if (l5>0 || l3==0) {
1084 gpertea 105 color_bg(c_red);
1085     }
1086     for (int i=0;i<s.length()-1;i++) {
1087     if (i && i==l5) color_resetbg();
1088     fprintf(stderr, "%c", s[i]);
1089 gpertea 177 if (i && i==l3) color_bg(c_red);
1090 gpertea 105 }
1091     fprintf(stderr, "%c", s[s.length()-1]);
1092     color_reset();
1093     fprintf(stderr, "\n");
1094     }
1095     #endif
1096    
1097 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1098 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1099 gpertea 13 // and a trash code if it was trashed
1100     l5=0;
1101     l3=rseq.length()-1;
1102 gpertea 103 #ifdef GDEBUG
1103 gpertea 108 //rseq.reverse();
1104 gpertea 103 GMessage(">%s\n", rname.chars());
1105 gpertea 105 GMessage("%s\n",rseq.chars());
1106 gpertea 103 #endif
1107 gpertea 13 if (l3-l5+1<min_read_len) {
1108     return 's';
1109     }
1110     GStr wseq(rseq.chars());
1111     GStr wqv(rqv.chars());
1112     int w5=l5;
1113     int w3=l3;
1114     //first do the q-based trimming
1115     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1116     if (w3-w5+1<min_read_len) {
1117     return 'Q'; //invalid read
1118     }
1119     //-- keep only the w5..w3 range
1120     l5=w5;
1121     l3=w3;
1122     wseq=wseq.substr(w5, w3-w5+1);
1123     if (!wqv.is_empty())
1124     wqv=wqv.substr(w5, w3-w5+1);
1125     } //qv trimming
1126     // N-trimming on the remaining read seq
1127     if (ntrim(wseq, w5, w3)) {
1128     //GMessage("before: %s\n",wseq.chars());
1129     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1130     l5+=w5;
1131     l3-=(wseq.length()-1-w3);
1132     if (w3-w5+1<min_read_len) {
1133     return 'N'; //to be trashed
1134     }
1135     //-- keep only the w5..w3 range
1136     wseq=wseq.substr(w5, w3-w5+1);
1137     if (!wqv.is_empty())
1138     wqv=wqv.substr(w5, w3-w5+1);
1139     w5=0;
1140     w3=wseq.length()-1;
1141     }
1142 gpertea 103 char trim_code;
1143     do {
1144     trim_code=0;
1145     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1146     trim_code='A';
1147     }
1148     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1149     trim_code='T';
1150     }
1151     else if (trim_adapter5(wseq, w5, w3)) {
1152     trim_code='5';
1153     }
1154     if (trim_code) {
1155 gpertea 105 #ifdef GDEBUG
1156     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1157     showTrim(wseq, w5, w3);
1158     #endif
1159 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1160 gpertea 103 num_trimmed5++;
1161     if (trimlen<min_trimmed5)
1162     min_trimmed5=trimlen;
1163 gpertea 13 l5+=w5;
1164     l3-=(wseq.length()-1-w3);
1165     if (w3-w5+1<min_read_len) {
1166 gpertea 103 return trim_code;
1167 gpertea 13 }
1168     //-- keep only the w5..w3 range
1169     wseq=wseq.substr(w5, w3-w5+1);
1170     if (!wqv.is_empty())
1171     wqv=wqv.substr(w5, w3-w5+1);
1172 gpertea 103 }// trimmed at 5' end
1173     } while (trim_code);
1174    
1175     do {
1176     trim_code=0;
1177     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1178     trim_code='A';
1179     }
1180     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1181     trim_code='T';
1182     }
1183     else if (trim_adapter3(wseq, w5, w3)) {
1184     trim_code='3';
1185     }
1186     if (trim_code) {
1187 gpertea 105 #ifdef GDEBUG
1188     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1189     showTrim(wseq, w5, w3);
1190     #endif
1191 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1192 gpertea 103 num_trimmed3++;
1193 gpertea 170 if (trimlen<min_trimmed3)
1194 gpertea 103 min_trimmed3=trimlen;
1195 gpertea 13 l5+=w5;
1196     l3-=(wseq.length()-1-w3);
1197     if (w3-w5+1<min_read_len) {
1198 gpertea 103 return trim_code;
1199 gpertea 13 }
1200     //-- keep only the w5..w3 range
1201     wseq=wseq.substr(w5, w3-w5+1);
1202     if (!wqv.is_empty())
1203     wqv=wqv.substr(w5, w3-w5+1);
1204 gpertea 103 }//trimming at 3' end
1205     } while (trim_code);
1206    
1207    
1208 gpertea 13 if (doCollapse) {
1209     //keep read for later
1210     FqDupRec* dr=dhash.Find(wseq.chars());
1211     if (dr==NULL) { //new entry
1212 gpertea 170 //if (prefix.is_empty())
1213     dhash.Add(wseq.chars(),
1214 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1215     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1216     }
1217     else
1218     dr->add(wqv);
1219     } //collapsing duplicates
1220     else { //not collapsing duplicates
1221     //apply the dust filter now
1222     if (doDust) {
1223     int dustbases=dust(wseq);
1224     if (dustbases>(wseq.length()>>1)) {
1225     return 'D';
1226     }
1227     }
1228     } //not collapsing duplicates
1229     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1230     }
1231    
1232     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1233     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1234     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1235     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1236     }
1237    
1238     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1239     outcounter++;
1240     bool asFasta=(rqv.is_empty() || fastaOutput);
1241     if (asFasta) {
1242     if (prefix.is_empty()) {
1243     printHeader(f_out, '>',rname,rinfo);
1244     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1245     }
1246     else {
1247 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1248 gpertea 13 rseq.chars());
1249     }
1250     }
1251     else { //fastq
1252     if (convert_phred) convertPhred(rqv);
1253     if (prefix.is_empty()) {
1254     printHeader(f_out, '@', rname, rinfo);
1255     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1256     }
1257     else
1258 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1259 gpertea 13 rseq.chars(),rqv.chars() );
1260     }
1261     }
1262    
1263     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1264     if (freport==NULL || trashcode<=' ') return;
1265     if (trashcode=='3' || trashcode=='5') {
1266 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1267 gpertea 13 }
1268     else {
1269     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1270     }
1271     //tcounter++;
1272     }
1273    
1274     GStr getFext(GStr& s, int* xpos=NULL) {
1275     //s must be a filename without a path
1276     GStr r("");
1277     if (xpos!=NULL) *xpos=0;
1278     if (s.is_empty() || s=="-") return r;
1279     int p=s.rindex('.');
1280     int d=s.rindex('/');
1281     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1282     r=s.substr(p+1);
1283     if (xpos!=NULL) *xpos=p+1;
1284     r.lower();
1285     return r;
1286     }
1287    
1288     void baseFileName(GStr& fname) {
1289     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1290     int xpos=0;
1291     GStr fext=getFext(fname, &xpos);
1292     if (xpos<=1) return;
1293     bool extdel=false;
1294     GStr f2;
1295     if (fext=="z" || fext=="zip") {
1296     extdel=true;
1297     }
1298     else if (fext.length()>=2) {
1299     f2=fext.substr(0,2);
1300     extdel=(f2=="gz" || f2=="bz");
1301     }
1302     if (extdel) {
1303     fname.cut(xpos-1);
1304     fext=getFext(fname, &xpos);
1305     if (xpos<=1) return;
1306     }
1307     extdel=false;
1308     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1309     extdel=true;
1310     }
1311     else if (fext.length()>=2) {
1312     extdel=(fext.substr(0,2)=="fa");
1313     }
1314     if (extdel) fname.cut(xpos-1);
1315     GStr fncp(fname);
1316     fncp.lower();
1317     fncp.chomp("seq");
1318     fncp.chomp("sequence");
1319     fncp.trimR("_.");
1320     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1321     }
1322    
1323     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1324     FILE* f_out=NULL;
1325     GStr fname(getFileName(infname.chars()));
1326     //eliminate known extensions
1327     baseFileName(fname);
1328     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1329     else if (pocmd.is_empty()) {
1330     GStr oname(fname);
1331     oname.append('.');
1332     oname.append(outsuffix);
1333     f_out=fopen(oname.chars(),"w");
1334     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1335     }
1336     else {
1337     GStr oname(">");
1338     oname.append(fname);
1339     oname.append('.');
1340     oname.append(outsuffix);
1341     pocmd.append(oname);
1342     f_out=popen(pocmd.chars(), "w");
1343     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1344     }
1345     return f_out;
1346     }
1347    
1348     void guess_unzip(GStr& fname, GStr& picmd) {
1349     GStr fext=getFext(fname);
1350     if (fext=="gz" || fext=="gzip" || fext=="z") {
1351     picmd="gzip -cd ";
1352     }
1353     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1354     picmd="bzip2 -cd ";
1355     }
1356     }
1357    
1358 gpertea 170 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1359     //TODO: prepare CASeqData here, and collect hexamers as well
1360 gpertea 172 CASeqData adata(revCompl);
1361     int idx=adapters.Add(adata);
1362     if (idx<0) GError("Error: failed to add adaptor!\n");
1363     adapters[idx].update(seq.chars());
1364 gpertea 170 }
1365    
1366    
1367 gpertea 94 int loadAdapters(const char* fname) {
1368     GLineReader lr(fname);
1369     char* l;
1370     while ((l=lr.nextLine())!=NULL) {
1371     if (lr.length()<=3 || l[0]=='#') continue;
1372 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1373     l[0]==';'|| l[0]==':' ) {
1374 gpertea 94 int i=1;
1375     while (l[i]!=0 && isspace(l[i])) {
1376     i++;
1377     }
1378     if (l[i]!=0) {
1379 gpertea 102 GStr s(&(l[i]));
1380 gpertea 108 #ifdef GDEBUG
1381     //s.reverse();
1382     #endif
1383 gpertea 170 addAdapter(adapters3, s);
1384 gpertea 94 continue;
1385     }
1386     }
1387     else {
1388 gpertea 98 GStr s(l);
1389     s.startTokenize("\t ;,:");
1390     GStr a5,a3;
1391     if (s.nextToken(a5))
1392     s.nextToken(a3);
1393 gpertea 102 a5.upper();
1394     a3.upper();
1395 gpertea 108 #ifdef GDEBUG
1396     // a5.reverse();
1397     // a3.reverse();
1398     #endif
1399 gpertea 170 addAdapter(adapters5, a5);
1400     addAdapter(adapters3, a3);
1401 gpertea 94 }
1402     }
1403 gpertea 102 return adapters5.Count()+adapters3.Count();
1404 gpertea 94 }
1405    
1406 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1407     GStr& s, GStr& infname, GStr& infname2) {
1408 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1409     infname="";
1410     infname2="";
1411     f_in=NULL;
1412     f_in2=NULL;
1413     f_out=NULL;
1414     f_out2=NULL;
1415     //analyze outsuffix intent
1416     GStr pocmd;
1417 gpertea 76 if (outsuffix=="-") {
1418     f_out=stdout;
1419     }
1420     else {
1421     GStr ox=getFext(outsuffix);
1422     if (ox.length()>2) ox=ox.substr(0,2);
1423     if (ox=="gz") pocmd="gzip -9 -c ";
1424     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1425     }
1426 gpertea 13 if (s=="-") {
1427     f_in=stdin;
1428     infname="stdin";
1429     f_out=prepOutFile(infname, pocmd);
1430     return;
1431     } // streaming from stdin
1432     s.startTokenize(",:");
1433     s.nextToken(infname);
1434     bool paired=s.nextToken(infname2);
1435 gpertea 170 if (fileExists(infname.chars())==0)
1436 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1437     GStr fname(getFileName(infname.chars()));
1438     GStr picmd;
1439     guess_unzip(fname, picmd);
1440     if (picmd.is_empty()) {
1441     f_in=fopen(infname.chars(), "r");
1442     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1443     }
1444     else {
1445     picmd.append(infname);
1446     f_in=popen(picmd.chars(), "r");
1447     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1448     }
1449 gpertea 76 if (f_out==stdout) {
1450     if (paired) GError("Error: output suffix required for paired reads\n");
1451     return;
1452     }
1453 gpertea 13 f_out=prepOutFile(infname, pocmd);
1454     if (!paired) return;
1455     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1456     // ---- paired reads:-------------
1457 gpertea 170 if (fileExists(infname2.chars())==0)
1458 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1459     picmd="";
1460     GStr fname2(getFileName(infname2.chars()));
1461     guess_unzip(fname2, picmd);
1462     if (picmd.is_empty()) {
1463     f_in2=fopen(infname2.chars(), "r");
1464     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1465     }
1466     else {
1467     picmd.append(infname2);
1468     f_in2=popen(picmd.chars(), "r");
1469     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1470     }
1471     f_out2=prepOutFile(infname2, pocmd);
1472     }