ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 175
Committed: Thu Feb 16 03:46:59 2012 UTC (7 years, 8 months ago) by gpertea
File size: 43190 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 103 const int match_reward=2;
98     const int mismatch_penalty=3;
99     const int Xdrop=8;
100    
101 gpertea 94 const int poly_m_score=2; //match score
102     const int poly_mis_score=-3; //mismatch
103 gpertea 88 const int poly_dropoff_score=7;
104 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105 gpertea 4
106 gpertea 94 const char *polyA_seed="AAAA";
107     const char *polyT_seed="TTTT";
108 gpertea 12
109 gpertea 170 struct CASeqData {
110     //positional data for every possible hexamer in an adapter
111 gpertea 172 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 gpertea 170 GStr seq; //actual adapter sequence data
114     GStr seqr; //reverse complement sequence
115 gpertea 175 int amlen; //fraction of adapter length matching that's
116     //enough to consider the alignment
117 gpertea 172 bool use_reverse;
118 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
119     amlen(0), use_reverse(rev) {
120 gpertea 172 for (int i=0;i<4096;i++) {
121     pz[i]=NULL;
122     pzr[i]=NULL;
123     }
124     }
125    
126     void update(const char* s) {
127 gpertea 175 seq=s;
128     table6mers(seq.chars(), seq.length(), pz);
129     amlen=iround(double(seq.length())*0.8);
130     if (amlen<12)
131     amlen=12;
132     if (!use_reverse) return;
133     //reverse complement
134     seqr=s;
135     int slen=seq.length();
136     for (int i=0;i<slen;i++)
137     seqr[i]=ntComplement(seq[slen-i-1]);
138     table6mers(seqr.chars(), seqr.length(), pzr);
139 gpertea 170 }
140    
141 gpertea 172 ~CASeqData() {
142     for (int i=0;i<4096;i++) {
143     delete pz[i];
144     delete pzr[i];
145 gpertea 170 }
146     }
147     };
148    
149     GVec<CASeqData> adapters5;
150     GVec<CASeqData> adapters3;
151    
152 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
153     CGreedyAlignData* gxmem_r=NULL;
154    
155 gpertea 4 // element in dhash:
156     class FqDupRec {
157     public:
158     int count; //how many of these reads are the same
159     int len; //length of qv
160     char* firstname; //optional, only if we want to keep the original read names
161     char* qv;
162     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
163     len=0;
164     qv=NULL;
165     firstname=NULL;
166     count=0;
167     if (qstr!=NULL) {
168     qv=Gstrdup(qstr->chars());
169     len=qstr->length();
170     count++;
171     }
172     if (rname!=NULL) firstname=Gstrdup(rname);
173     }
174     ~FqDupRec() {
175     GFREE(qv);
176     GFREE(firstname);
177     }
178     void add(GStr& d) { //collapse another record into this one
179     if (d.length()!=len)
180     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
181     count++;
182     for (int i=0;i<len;i++)
183     qv[i]=(qv[i]+d[i])/2;
184     }
185     };
186    
187     void openfw(FILE* &f, GArgs& args, char opt) {
188     GStr s=args.getOpt(opt);
189     if (!s.is_empty()) {
190     if (s=='-') f=stdout;
191     else {
192 gpertea 13 f=fopen(s.chars(),"w");
193 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
194     }
195     }
196     }
197    
198     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
199     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
200    
201     GHash<FqDupRec> dhash; //hash to keep track of duplicates
202    
203 gpertea 172 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
204 gpertea 94 int loadAdapters(const char* fname);
205    
206 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
207     GStr& s, GStr& infname, GStr& infname2);
208 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
209 gpertea 170
210 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
211     void trash_report(char trashcode, GStr& rname, FILE* freport);
212    
213 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
214 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
215    
216 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
217 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
218    
219 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
220     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
221 gpertea 4 int dust(GStr& seq);
222 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
223     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
224 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
225 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
226 gpertea 4
227 gpertea 12 void convertPhred(char* q, int len);
228     void convertPhred(GStr& q);
229 gpertea 4
230     int main(int argc, char * const argv[]) {
231 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
232 gpertea 4 int e;
233     if ((e=args.isError())>0) {
234     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
235     exit(224);
236     }
237 gpertea 9 debug=(args.getOpt('Y')!=NULL);
238 gpertea 13 verbose=(args.getOpt('V')!=NULL);
239 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
240 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
241     doDust=(args.getOpt('D')!=NULL);
242 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
243 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
244 gpertea 12 /*
245 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
246     if (rawFormat) {
247     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
248     }
249 gpertea 12 */
250 gpertea 4 prefix=args.getOpt('n');
251     GStr s=args.getOpt('l');
252 gpertea 170 if (!s.is_empty())
253 gpertea 4 min_read_len=s.asInt();
254 gpertea 12 s=args.getOpt('m');
255 gpertea 170 if (!s.is_empty())
256 gpertea 12 max_perc_N=s.asDouble();
257 gpertea 4 s=args.getOpt('d');
258     if (!s.is_empty()) {
259     dust_cutoff=s.asInt();
260     doDust=true;
261     }
262 gpertea 12 s=args.getOpt('q');
263     if (!s.is_empty()) {
264 gpertea 13 qvtrim_qmin=s.asInt();
265 gpertea 12 }
266 gpertea 13 s=args.getOpt('t');
267     if (!s.is_empty()) {
268     qvtrim_max=s.asInt();
269     }
270 gpertea 12 s=args.getOpt('p');
271     if (!s.is_empty()) {
272     int v=s.asInt();
273     if (v==33) {
274     qv_phredtype=33;
275     qv_cvtadd=31;
276     }
277     else if (v==64) {
278     qv_phredtype=64;
279     qv_cvtadd=-31;
280     }
281 gpertea 170 else
282 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
283     }
284 gpertea 94 s=args.getOpt('f');
285     if (!s.is_empty()) {
286     loadAdapters(s.chars());
287     }
288 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
289 gpertea 94 s=args.getOpt('5');
290     if (!s.is_empty()) {
291     if (fileAdapters)
292     GError("Error: options -5 and -f cannot be used together!\n");
293     s.upper();
294 gpertea 172 addAdapter(adapters5, s);
295 gpertea 4 }
296 gpertea 94 s=args.getOpt('3');
297     if (!s.is_empty()) {
298     if (fileAdapters)
299     GError("Error: options -3 and -f cannot be used together!\n");
300 gpertea 102 s.upper();
301 gpertea 172 addAdapter(adapters3, s);
302 gpertea 4 }
303 gpertea 94 s=args.getOpt('y');
304 gpertea 13 if (!s.is_empty()) {
305 gpertea 94 int minmatch=s.asInt();
306     poly_minScore=minmatch*poly_m_score;
307 gpertea 13 }
308 gpertea 170
309 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
310 gpertea 76 else outsuffix="-";
311 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
312 gpertea 13 int fcount=args.startNonOpt();
313     if (fcount==0) {
314 gpertea 4 GMessage(USAGE);
315     exit(224);
316     }
317 gpertea 13 if (fcount>1 && doCollapse) {
318     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
319     }
320     //openfw(f_out, args, 'o');
321     //if (f_out==NULL) f_out=stdout;
322 gpertea 4 if (trashReport)
323     openfw(freport, args, 'r');
324     char* infile=NULL;
325 gpertea 103
326     if (adapters5.Count()>0)
327     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
328     if (adapters3.Count()>0)
329     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
330    
331 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
332 gpertea 103 //for each input file
333 gpertea 13 int incounter=0; //counter for input reads
334     int outcounter=0; //counter for output reads
335     int trash_s=0; //too short from the get go
336     int trash_Q=0;
337     int trash_N=0;
338     int trash_D=0;
339 gpertea 103 int trash_poly=0;
340 gpertea 13 int trash_A3=0;
341     int trash_A5=0;
342     s=infile;
343     GStr infname;
344     GStr infname2;
345     FILE* f_in=NULL;
346     FILE* f_in2=NULL;
347     FILE* f_out=NULL;
348     FILE* f_out2=NULL;
349     bool paired_reads=false;
350     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
351     GLineReader fq(f_in);
352     GLineReader* fq2=NULL;
353     if (f_in2!=NULL) {
354     fq2=new GLineReader(f_in2);
355     paired_reads=true;
356     }
357     GStr rseq, rqv, seqid, seqinfo;
358     GStr rseq2, rqv2, seqid2, seqinfo2;
359     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
360     incounter++;
361     int a5=0, a3=0, b5=0, b3=0;
362     char tcode=0, tcode2=0;
363     tcode=process_read(seqid, rseq, rqv, a5, a3);
364 gpertea 103 if (fq2!=NULL) {
365 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
366     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
367     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
368     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
369     }
370     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
371     //decide what to do with this pair and print rseq2 if the pair makes it
372     if (tcode>1 && tcode2<=1) {
373     //"untrash" rseq
374     if (a3-a5+1<min_read_len) {
375     a5=1;
376     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
377 gpertea 4 }
378 gpertea 13 tcode=1;
379     }
380     else if (tcode<=1 && tcode2>1) {
381     //"untrash" rseq2
382     if (b3-b5+1<min_read_len) {
383     b5=1;
384     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
385     }
386     tcode2=1;
387     }
388     if (tcode<=1) { //trimmed or left intact -- write it!
389     if (tcode>0) {
390     rseq2=rseq2.substr(b5,b3-b5+1);
391     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
392 gpertea 4 }
393 gpertea 13 int nocounter=0;
394     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
395     }
396 gpertea 103 } //pair read
397 gpertea 13 if (tcode>1) { //trashed
398 gpertea 103 #ifdef GDEBUG
399 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
400 gpertea 103 #endif
401 gpertea 13 if (tcode=='s') trash_s++;
402 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
403 gpertea 13 else if (tcode=='Q') trash_Q++;
404     else if (tcode=='N') trash_N++;
405     else if (tcode=='D') trash_D++;
406     else if (tcode=='3') trash_A3++;
407     else if (tcode=='5') trash_A5++;
408     if (trashReport) trash_report(tcode, seqid, freport);
409     }
410     else if (!doCollapse) { //write it
411     if (tcode>0) {
412     rseq=rseq.substr(a5,a3-a5+1);
413     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
414     }
415 gpertea 103 #ifdef GDEBUG
416     GMessage(" After trimming:\n");
417 gpertea 105 GMessage("%s\n",rseq.chars());
418 gpertea 103 #endif
419 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
420     }
421     } //for each fastq record
422     delete fq2;
423     FRCLOSE(f_in);
424     FRCLOSE(f_in2);
425     if (doCollapse) {
426     outcounter=0;
427     int maxdup_count=1;
428     char* maxdup_seq=NULL;
429     dhash.startIterate();
430     FqDupRec* qd=NULL;
431     char* seq=NULL;
432     while ((qd=dhash.NextData(seq))!=NULL) {
433     GStr rseq(seq);
434     //do the dusting here
435     if (doDust) {
436     int dustbases=dust(rseq);
437     if (dustbases>(rseq.length()>>1)) {
438     if (trashReport && qd->firstname!=NULL) {
439     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
440 gpertea 4 }
441 gpertea 13 trash_D+=qd->count;
442     continue;
443     }
444     }
445     outcounter++;
446 gpertea 170 if (qd->count>maxdup_count) {
447 gpertea 13 maxdup_count=qd->count;
448     maxdup_seq=seq;
449     }
450     if (isfasta) {
451     if (prefix.is_empty()) {
452 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
453 gpertea 13 rseq.chars());
454 gpertea 4 }
455 gpertea 13 else { //use custom read name
456     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
457     qd->count, rseq.chars());
458 gpertea 4 }
459 gpertea 13 }
460     else { //fastq format
461     if (convert_phred) convertPhred(qd->qv, qd->len);
462     if (prefix.is_empty()) {
463 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
464 gpertea 13 rseq.chars(), qd->qv);
465 gpertea 4 }
466 gpertea 13 else { //use custom read name
467     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
468     qd->count, rseq.chars(), qd->qv);
469     }
470     }
471     }//for each element of dhash
472     if (maxdup_count>1) {
473     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
474 gpertea 4 }
475 gpertea 13 } //collapse entries
476     if (verbose) {
477     if (paired_reads) {
478     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
479     GMessage("Number of input pairs :%9d\n", incounter);
480     GMessage(" Output pairs :%9d\n", outcounter);
481     }
482     else {
483     GMessage(">Input file : %s\n", infname.chars());
484     GMessage("Number of input reads :%9d\n", incounter);
485     GMessage(" Output reads :%9d\n", outcounter);
486     }
487     GMessage("------------------------------------\n");
488     if (num_trimmed5)
489     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
490     if (num_trimmed3)
491     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
492     GMessage("------------------------------------\n");
493     if (trash_s>0)
494     GMessage(" Trashed by length:%9d\n", trash_s);
495     if (trash_N>0)
496     GMessage(" Trashed by N%%:%9d\n", trash_N);
497     if (trash_Q>0)
498     GMessage("Trashed by low quality:%9d\n", trash_Q);
499 gpertea 103 if (trash_poly>0)
500     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
501 gpertea 13 if (trash_A5>0)
502     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
503     if (trash_A3>0)
504     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
505     }
506     if (trashReport) {
507     FWCLOSE(freport);
508 gpertea 4 }
509 gpertea 13 FWCLOSE(f_out);
510     FWCLOSE(f_out2);
511     } //while each input file
512 gpertea 103 delete gxmem_l;
513     delete gxmem_r;
514 gpertea 12 //getc(stdin);
515 gpertea 4 }
516    
517     class NData {
518     public:
519     int NPos[1024]; //there should be no reads longer than 1K ?
520     int NCount;
521     int end5;
522     int end3;
523     int seqlen;
524     double perc_N; //percentage of Ns in end5..end3 range only!
525     const char* seq;
526     bool valid;
527     NData() {
528 gpertea 170 seqlen=0;
529 gpertea 4 NCount=0;
530     end5=0;
531     end3=0;
532     seq=NULL;
533     perc_N=0;
534     valid=true;
535     }
536     void init(GStr& rseq) {
537     NCount=0;
538     perc_N=0;
539     seqlen=rseq.length();
540     seq=rseq.chars();
541     end5=1;
542     end3=seqlen;
543     for (int i=0;i<seqlen;i++)
544 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
545 gpertea 4 NPos[NCount]=i;
546     NCount++;
547     }
548     perc_N=(NCount*100.0)/seqlen;
549     valid=true;
550     }
551     void N_calc() { //only in the region after trimming
552     int n=0;
553     for (int i=end3-1;i<end5;i++) {
554     if (seq[i]=='N') n++;
555     }
556     perc_N=(n*100.0)/(end5-end3+1);
557     }
558     };
559 gpertea 170
560 gpertea 4 static NData feat;
561     int perc_lenN=12; // incremental distance from ends, in percentage of
562     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
563 gpertea 170
564 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
565     /* assumes feat was filled properly */
566     int old_dif, t5,t3,v;
567 gpertea 12 if (l3<l5+2 || p5>p3 ) {
568 gpertea 4 feat.end5=l5+1;
569     feat.end3=l3+1;
570 gpertea 170 return;
571 gpertea 4 }
572    
573     t5=feat.NPos[p5]-l5;
574     t3=l3-feat.NPos[p3];
575     old_dif=p3-p5;
576     v=(int)((((double)(l3-l5))*perc_lenN)/100);
577 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
578 gpertea 4 if (t5 < v ) {
579     l5=feat.NPos[p5]+1;
580     p5++;
581     }
582     if (t3 < v) {
583     l3=feat.NPos[p3]-1;
584     p3--;
585     }
586     /* restNs=p3-p5; number of Ns in the new CLR */
587     if (p3-p5==old_dif) { /* no change, return */
588     /* don't trim if this may shorten the read too much */
589     //if (l5-l3<min_read_len) return;
590     feat.end5=l5+1;
591     feat.end3=l3+1;
592     return;
593     }
594 gpertea 170 else
595 gpertea 4 N_analyze(l5,l3, p5,p3);
596     }
597    
598    
599 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
600 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
601 gpertea 12 if (qv_phredtype==0) {
602     //try to guess the Phred type
603     int vmin=256, vmax=0;
604     for (int i=0;i<qvs.length();i++) {
605     if (vmin>qvs[i]) vmin=qvs[i];
606     if (vmax<qvs[i]) vmax=qvs[i];
607     }
608     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
609     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
610     if (qv_phredtype==0) {
611     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
612     }
613 gpertea 13 if (verbose)
614     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
615     } //guessing Phred type
616 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
617 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
618 gpertea 12 }
619     //just in case, check also the 5' the end (?)
620     for (l5=0;l5<qvs.length()-3;l5++) {
621 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
622 gpertea 12 }
623 gpertea 13 if (qvtrim_max>0) {
624     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
625     if (l5>qvtrim_max) l5=qvtrim_max;
626     }
627 gpertea 12 return (l5>0 || l3<qvs.length()-1);
628     }
629    
630     bool ntrim(GStr& rseq, int &l5, int &l3) {
631     //count Ns in the sequence, trim N-rich ends
632 gpertea 4 feat.init(rseq);
633     l5=feat.end5-1;
634     l3=feat.end3-1;
635 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
636 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
637     if (feat.perc_N>max_perc_N) {
638     feat.valid=false;
639     l3=l5+1;
640     return true;
641     }
642     else {
643     return false; //nothing changed
644     }
645     }
646 gpertea 4 l5=feat.end5-1;
647     l3=feat.end3-1;
648     if (l3-l5+1<min_read_len) {
649     feat.valid=false;
650     return true;
651     }
652     feat.N_calc();
653 gpertea 170
654 gpertea 12 if (feat.perc_N>max_perc_N) {
655 gpertea 4 feat.valid=false;
656     l3=l5+1;
657     return true;
658     }
659     return true;
660     }
661    
662     //--------------- dust functions ----------------
663     class DNADuster {
664     public:
665 gpertea 170 int dustword;
666     int dustwindow;
667     int dustwindow2;
668 gpertea 4 int dustcutoff;
669     int mv, iv, jv;
670     int counts[32*32*32];
671     int iis[32*32*32];
672     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
673     dustword=wordsize;
674     dustwindow=winsize;
675     dustwindow2 = (winsize>>1);
676     dustcutoff=cutoff;
677     mv=0;
678     iv=0;
679     jv=0;
680     }
681     void setWindowSize(int value) {
682     dustwindow = value;
683     dustwindow2 = (dustwindow >> 1);
684     }
685     void setWordSize(int value) {
686     dustword=value;
687     }
688     void wo1(int len, const char* s, int ivv) {
689     int i, ii, j, v, t, n, n1, sum;
690     int js, nis;
691     n = 32 * 32 * 32;
692     n1 = n - 1;
693     nis = 0;
694     i = 0;
695     ii = 0;
696     sum = 0;
697     v = 0;
698     for (j=0; j < len; j++, s++) {
699     ii <<= 5;
700     if (*s<=32) {
701     i=0;
702     continue;
703     }
704     ii |= *s - 'A'; //assume uppercase!
705     ii &= n1;
706     i++;
707     if (i >= dustword) {
708     for (js=0; js < nis && iis[js] != ii; js++) ;
709     if (js == nis) {
710     iis[nis] = ii;
711     counts[ii] = 0;
712     nis++;
713     }
714     if ((t = counts[ii]) > 0) {
715     sum += t;
716     v = 10 * sum / j;
717     if (mv < v) {
718     mv = v;
719     iv = ivv;
720     jv = j;
721     }
722     }
723     counts[ii]++;
724     }
725     }
726     }
727    
728     int wo(int len, const char* s, int* beg, int* end) {
729     int i, l1;
730     l1 = len - dustword + 1;
731     if (l1 < 0) {
732     *beg = 0;
733     *end = len - 1;
734     return 0;
735     }
736     mv = 0;
737     iv = 0;
738     jv = 0;
739     for (i=0; i < l1; i++) {
740     wo1(len-i, s+i, i);
741     }
742     *beg = iv;
743     *end = iv + jv;
744     return mv;
745     }
746    
747     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
748     int i, j, l, a, b, v;
749     if (cutoff==0) cutoff=dustcutoff;
750     a=0;b=0;
751     //GMessage("Dust cutoff=%d\n", cutoff);
752     for (i=0; i < seqlen; i += dustwindow2) {
753     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
754     v = wo(l, seq+i, &a, &b);
755     if (v > cutoff) {
756     //for (j = a; j <= b && j < dustwindow2; j++) {
757     for (j = a; j <= b; j++) {
758     seqmsk[i+j]='N';//could be made lowercase instead
759     }
760     }
761     }
762 gpertea 170 //return first;
763 gpertea 4 }
764     };
765    
766     static DNADuster duster;
767    
768     int dust(GStr& rseq) {
769     char* seq=Gstrdup(rseq.chars());
770     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
771     //check the number of Ns:
772     int ncount=0;
773     for (int i=0;i<rseq.length();i++) {
774     if (seq[i]=='N') ncount++;
775     }
776     GFREE(seq);
777     return ncount;
778     }
779    
780 gpertea 94 struct SLocScore {
781     int pos;
782     int score;
783     SLocScore(int p=0,int s=0) {
784     pos=p;
785     score=s;
786     }
787     void set(int p, int s) {
788     pos=p;
789     score=s;
790     }
791     void add(int p, int add) {
792     pos=p;
793     score+=add;
794     }
795     };
796 gpertea 12
797 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
798 gpertea 103 if (!doPolyTrim) return false;
799 gpertea 94 int rlen=seq.length();
800     l5=0;
801     l3=rlen-1;
802     int32 seedVal=*(int32*)poly_seed;
803     char polyChar=poly_seed[0];
804     //assumes N trimming was already done
805     //so a poly match should be very close to the end of the read
806     // -- find the initial match (seed)
807 gpertea 108 int lmin=GMAX((rlen-16), 0);
808 gpertea 94 int li;
809     for (li=rlen-4;li>lmin;li--) {
810     if (seedVal==*(int*)&(seq[li])) {
811     break;
812     }
813 gpertea 12 }
814 gpertea 94 if (li<=lmin) return false;
815     //seed found, try to extend it both ways
816     //extend right
817     int ri=li+3;
818     SLocScore loc(ri, poly_m_score<<2);
819     SLocScore maxloc(loc);
820     //extend right
821 gpertea 170 while (ri<rlen-1) {
822 gpertea 94 ri++;
823     if (seq[ri]==polyChar) {
824     loc.add(ri,poly_m_score);
825     }
826     else if (seq[ri]=='N') {
827     loc.add(ri,0);
828     }
829     else { //mismatch
830     loc.add(ri,poly_mis_score);
831     if (maxloc.score-loc.score>poly_dropoff_score) break;
832     }
833     if (maxloc.score<=loc.score) {
834     maxloc=loc;
835     }
836 gpertea 12 }
837 gpertea 94 ri=maxloc.pos;
838 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
839 gpertea 94 //ri = right boundary for the poly match
840     //extend left
841     loc.set(li, maxloc.score);
842     maxloc.pos=li;
843     while (li>0) {
844     li--;
845     if (seq[li]==polyChar) {
846     loc.add(li,poly_m_score);
847     }
848     else if (seq[li]=='N') {
849     loc.add(li,0);
850     }
851     else { //mismatch
852     loc.add(li,poly_mis_score);
853     if (maxloc.score-loc.score>poly_dropoff_score) break;
854     }
855     if (maxloc.score<=loc.score) {
856     maxloc=loc;
857     }
858     }
859 gpertea 105 li=maxloc.pos;
860 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
861     (maxloc.score>poly_minScore && ri>=rlen-3) ||
862     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
863 gpertea 105 //trimming this li-ri match at 3' end
864     l3=li-1;
865     if (l3<0) l3=0;
866 gpertea 94 return true;
867     }
868     return false;
869 gpertea 12 }
870    
871 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
872 gpertea 103 if (!doPolyTrim) return false;
873 gpertea 88 int rlen=seq.length();
874     l5=0;
875     l3=rlen-1;
876 gpertea 94 int32 seedVal=*(int32*)poly_seed;
877     char polyChar=poly_seed[0];
878 gpertea 88 //assumes N trimming was already done
879     //so a poly match should be very close to the end of the read
880     // -- find the initial match (seed)
881 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
882 gpertea 94 int li;
883     for (li=0;li<=lmax;li++) {
884     if (seedVal==*(int*)&(seq[li])) {
885 gpertea 88 break;
886     }
887     }
888 gpertea 94 if (li>lmax) return false;
889     //seed found, try to extend it both ways
890     //extend left
891     int ri=li+3; //save rightmost base of the seed
892     SLocScore loc(li, poly_m_score<<2);
893     SLocScore maxloc(loc);
894     while (li>0) {
895     li--;
896     if (seq[li]==polyChar) {
897     loc.add(li,poly_m_score);
898     }
899     else if (seq[li]=='N') {
900     loc.add(li,0);
901     }
902     else { //mismatch
903     loc.add(li,poly_mis_score);
904     if (maxloc.score-loc.score>poly_dropoff_score) break;
905     }
906     if (maxloc.score<=loc.score) {
907     maxloc=loc;
908     }
909 gpertea 88 }
910 gpertea 94 li=maxloc.pos;
911 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
912 gpertea 94 //li = right boundary for the poly match
913    
914     //extend right
915     loc.set(ri, maxloc.score);
916     maxloc.pos=ri;
917 gpertea 170 while (ri<rlen-1) {
918 gpertea 94 ri++;
919     if (seq[ri]==polyChar) {
920     loc.add(ri,poly_m_score);
921     }
922     else if (seq[ri]=='N') {
923     loc.add(ri,0);
924     }
925     else { //mismatch
926     loc.add(ri,poly_mis_score);
927     if (maxloc.score-loc.score>poly_dropoff_score) break;
928     }
929     if (maxloc.score<=loc.score) {
930     maxloc=loc;
931     }
932     }
933 gpertea 105 ri=maxloc.pos;
934 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
935     (maxloc.score>poly_minScore && li<2)
936 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
937 gpertea 105 //adjust l5 to reflect this trimming of 5' end
938     l5=ri+1;
939     if (l5>rlen-1) l5=rlen-1;
940 gpertea 94 return true;
941     }
942     return false;
943 gpertea 88 }
944    
945 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
946 gpertea 103 if (adapters3.Count()==0) return false;
947 gpertea 4 int rlen=seq.length();
948     l5=0;
949     l3=rlen-1;
950 gpertea 103 bool trimmed=false;
951 gpertea 175 GStr wseq(seq);
952 gpertea 103 int wlen=rlen;
953 gpertea 175 GXSeqData seqdata;
954 gpertea 103 for (int ai=0;ai<adapters3.Count();ai++) {
955 gpertea 175 seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
956     adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
957     GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
958     if (bestaln) {
959 gpertea 103 trimmed=true;
960 gpertea 175 //keep unmatched region on the left OR right (the longer one)
961     if (bestaln->sl > wlen-bestaln->sr) {
962     //keep left side
963     l3-=(wlen-bestaln->sl+1);
964     if (l3<0) l3=0;
965     }
966     else { //keep right side
967     l5+=bestaln->sr;
968     if (l5>=rlen) l5=rlen-1;
969     }
970     delete bestaln;
971 gpertea 103 if (l3-l5+1<min_read_len) return true;
972     wseq=seq.substr(l5,l3-l5+1);
973     wlen=wseq.length();
974     }
975     }//for each 5' adapter
976     return trimmed;
977 gpertea 4 }
978    
979     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
980 gpertea 103 if (adapters5.Count()==0) return false;
981 gpertea 4 int rlen=seq.length();
982     l5=0;
983     l3=rlen-1;
984 gpertea 102 bool trimmed=false;
985 gpertea 175 GStr wseq(seq);
986 gpertea 103 int wlen=rlen;
987 gpertea 175 GXSeqData seqdata;
988 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
989 gpertea 175 seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
990     adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
991     GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
992     if (bestaln) {
993 gpertea 103 trimmed=true;
994 gpertea 175 if (bestaln->sl > wlen-bestaln->sr) {
995     //keep left side
996     l3-=(wlen-bestaln->sl+1);
997     if (l3<0) l3=0;
998     }
999     else { //keep right side
1000     l5+=bestaln->sr;
1001     if (l5>=rlen) l5=rlen-1;
1002     }
1003     delete bestaln;
1004 gpertea 103 if (l3-l5+1<min_read_len) return true;
1005     wseq=seq.substr(l5,l3-l5+1);
1006     wlen=wseq.length();
1007     }
1008 gpertea 102 }//for each 5' adapter
1009     return trimmed;
1010 gpertea 98 }
1011 gpertea 4
1012 gpertea 170 //convert qvs to/from phred64 from/to phread33
1013 gpertea 12 void convertPhred(GStr& q) {
1014     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1015 gpertea 4 }
1016    
1017 gpertea 12 void convertPhred(char* q, int len) {
1018     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1019 gpertea 4 }
1020    
1021 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1022 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1023     rseq="";
1024     rqv="";
1025     rname="";
1026     rinfo="";
1027     if (fq.eof()) return false;
1028     char* l=fq.getLine();
1029     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1030     if (l==NULL) return false;
1031     /* if (rawFormat) {
1032     //TODO: implement raw qseq parsing here
1033     //if (raw type=N) then continue; //skip invalid/bad records
1034     } //raw qseq format
1035     else { // FASTQ or FASTA */
1036     isfasta=(l[0]=='>');
1037 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1038 gpertea 13 infname.chars(), l);
1039     GStr s(l);
1040     rname=&(l[1]);
1041     for (int i=0;i<rname.length();i++)
1042 gpertea 170 if (rname[i]<=' ') {
1043     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1044     rname.cut(i);
1045     break;
1046 gpertea 13 }
1047     //now get the sequence
1048 gpertea 170 if ((l=fq.getLine())==NULL)
1049 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1050     rname.chars(), infname.chars());
1051     rseq=l; //this must be the DNA line
1052     while ((l=fq.getLine())!=NULL) {
1053     //seq can span multiple lines
1054     if (l[0]=='>' || l[0]=='+') {
1055 gpertea 170 fq.pushBack();
1056 gpertea 13 break; //
1057     }
1058     rseq+=l;
1059 gpertea 170 } //check for multi-line seq
1060 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1061     if ((l=fq.getLine())==NULL)
1062     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1063     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1064 gpertea 170 if ((l=fq.getLine())==NULL)
1065 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1066     rqv=l;
1067 gpertea 170 //if (rqv.length()!=rseq.length())
1068 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1069     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1070     rqv+=l; //append to qv string
1071     }
1072     }// fastq
1073     // } //<-- FASTA or FASTQ
1074 gpertea 94 rseq.upper();
1075 gpertea 13 return true;
1076     }
1077    
1078 gpertea 105 #ifdef GDEBUG
1079     void showTrim(GStr& s, int l5, int l3) {
1080     if (l5>0) {
1081     color_bg(c_red);
1082     }
1083     for (int i=0;i<s.length()-1;i++) {
1084     if (i && i==l5) color_resetbg();
1085     fprintf(stderr, "%c", s[i]);
1086     if (i==l3) color_bg(c_red);
1087     }
1088     fprintf(stderr, "%c", s[s.length()-1]);
1089     color_reset();
1090     fprintf(stderr, "\n");
1091     }
1092     #endif
1093    
1094 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1095 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1096 gpertea 13 // and a trash code if it was trashed
1097     l5=0;
1098     l3=rseq.length()-1;
1099 gpertea 103 #ifdef GDEBUG
1100 gpertea 108 //rseq.reverse();
1101 gpertea 103 GMessage(">%s\n", rname.chars());
1102 gpertea 105 GMessage("%s\n",rseq.chars());
1103 gpertea 103 #endif
1104 gpertea 13 if (l3-l5+1<min_read_len) {
1105     return 's';
1106     }
1107     GStr wseq(rseq.chars());
1108     GStr wqv(rqv.chars());
1109     int w5=l5;
1110     int w3=l3;
1111     //first do the q-based trimming
1112     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1113     if (w3-w5+1<min_read_len) {
1114     return 'Q'; //invalid read
1115     }
1116     //-- keep only the w5..w3 range
1117     l5=w5;
1118     l3=w3;
1119     wseq=wseq.substr(w5, w3-w5+1);
1120     if (!wqv.is_empty())
1121     wqv=wqv.substr(w5, w3-w5+1);
1122     } //qv trimming
1123     // N-trimming on the remaining read seq
1124     if (ntrim(wseq, w5, w3)) {
1125     //GMessage("before: %s\n",wseq.chars());
1126     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1127     l5+=w5;
1128     l3-=(wseq.length()-1-w3);
1129     if (w3-w5+1<min_read_len) {
1130     return 'N'; //to be trashed
1131     }
1132     //-- keep only the w5..w3 range
1133     wseq=wseq.substr(w5, w3-w5+1);
1134     if (!wqv.is_empty())
1135     wqv=wqv.substr(w5, w3-w5+1);
1136     w5=0;
1137     w3=wseq.length()-1;
1138     }
1139 gpertea 103 char trim_code;
1140     do {
1141     trim_code=0;
1142     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1143     trim_code='A';
1144     }
1145     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1146     trim_code='T';
1147     }
1148     else if (trim_adapter5(wseq, w5, w3)) {
1149     trim_code='5';
1150     }
1151     if (trim_code) {
1152 gpertea 105 #ifdef GDEBUG
1153     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1154     showTrim(wseq, w5, w3);
1155     #endif
1156 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1157 gpertea 103 num_trimmed5++;
1158     if (trimlen<min_trimmed5)
1159     min_trimmed5=trimlen;
1160 gpertea 13 l5+=w5;
1161     l3-=(wseq.length()-1-w3);
1162     if (w3-w5+1<min_read_len) {
1163 gpertea 103 return trim_code;
1164 gpertea 13 }
1165     //-- keep only the w5..w3 range
1166     wseq=wseq.substr(w5, w3-w5+1);
1167     if (!wqv.is_empty())
1168     wqv=wqv.substr(w5, w3-w5+1);
1169 gpertea 103 }// trimmed at 5' end
1170     } while (trim_code);
1171    
1172     do {
1173     trim_code=0;
1174     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1175     trim_code='A';
1176     }
1177     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1178     trim_code='T';
1179     }
1180     else if (trim_adapter3(wseq, w5, w3)) {
1181     trim_code='3';
1182     }
1183     if (trim_code) {
1184 gpertea 105 #ifdef GDEBUG
1185     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1186     showTrim(wseq, w5, w3);
1187     #endif
1188 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1189 gpertea 103 num_trimmed3++;
1190 gpertea 170 if (trimlen<min_trimmed3)
1191 gpertea 103 min_trimmed3=trimlen;
1192 gpertea 13 l5+=w5;
1193     l3-=(wseq.length()-1-w3);
1194     if (w3-w5+1<min_read_len) {
1195 gpertea 103 return trim_code;
1196 gpertea 13 }
1197     //-- keep only the w5..w3 range
1198     wseq=wseq.substr(w5, w3-w5+1);
1199     if (!wqv.is_empty())
1200     wqv=wqv.substr(w5, w3-w5+1);
1201 gpertea 103 }//trimming at 3' end
1202     } while (trim_code);
1203    
1204    
1205 gpertea 13 if (doCollapse) {
1206     //keep read for later
1207     FqDupRec* dr=dhash.Find(wseq.chars());
1208     if (dr==NULL) { //new entry
1209 gpertea 170 //if (prefix.is_empty())
1210     dhash.Add(wseq.chars(),
1211 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1212     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1213     }
1214     else
1215     dr->add(wqv);
1216     } //collapsing duplicates
1217     else { //not collapsing duplicates
1218     //apply the dust filter now
1219     if (doDust) {
1220     int dustbases=dust(wseq);
1221     if (dustbases>(wseq.length()>>1)) {
1222     return 'D';
1223     }
1224     }
1225     } //not collapsing duplicates
1226     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1227     }
1228    
1229     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1230     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1231     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1232     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1233     }
1234    
1235     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1236     outcounter++;
1237     bool asFasta=(rqv.is_empty() || fastaOutput);
1238     if (asFasta) {
1239     if (prefix.is_empty()) {
1240     printHeader(f_out, '>',rname,rinfo);
1241     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1242     }
1243     else {
1244 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1245 gpertea 13 rseq.chars());
1246     }
1247     }
1248     else { //fastq
1249     if (convert_phred) convertPhred(rqv);
1250     if (prefix.is_empty()) {
1251     printHeader(f_out, '@', rname, rinfo);
1252     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1253     }
1254     else
1255 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1256 gpertea 13 rseq.chars(),rqv.chars() );
1257     }
1258     }
1259    
1260     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1261     if (freport==NULL || trashcode<=' ') return;
1262     if (trashcode=='3' || trashcode=='5') {
1263 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1264 gpertea 13 }
1265     else {
1266     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1267     }
1268     //tcounter++;
1269     }
1270    
1271     GStr getFext(GStr& s, int* xpos=NULL) {
1272     //s must be a filename without a path
1273     GStr r("");
1274     if (xpos!=NULL) *xpos=0;
1275     if (s.is_empty() || s=="-") return r;
1276     int p=s.rindex('.');
1277     int d=s.rindex('/');
1278     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1279     r=s.substr(p+1);
1280     if (xpos!=NULL) *xpos=p+1;
1281     r.lower();
1282     return r;
1283     }
1284    
1285     void baseFileName(GStr& fname) {
1286     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1287     int xpos=0;
1288     GStr fext=getFext(fname, &xpos);
1289     if (xpos<=1) return;
1290     bool extdel=false;
1291     GStr f2;
1292     if (fext=="z" || fext=="zip") {
1293     extdel=true;
1294     }
1295     else if (fext.length()>=2) {
1296     f2=fext.substr(0,2);
1297     extdel=(f2=="gz" || f2=="bz");
1298     }
1299     if (extdel) {
1300     fname.cut(xpos-1);
1301     fext=getFext(fname, &xpos);
1302     if (xpos<=1) return;
1303     }
1304     extdel=false;
1305     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1306     extdel=true;
1307     }
1308     else if (fext.length()>=2) {
1309     extdel=(fext.substr(0,2)=="fa");
1310     }
1311     if (extdel) fname.cut(xpos-1);
1312     GStr fncp(fname);
1313     fncp.lower();
1314     fncp.chomp("seq");
1315     fncp.chomp("sequence");
1316     fncp.trimR("_.");
1317     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1318     }
1319    
1320     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1321     FILE* f_out=NULL;
1322     GStr fname(getFileName(infname.chars()));
1323     //eliminate known extensions
1324     baseFileName(fname);
1325     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1326     else if (pocmd.is_empty()) {
1327     GStr oname(fname);
1328     oname.append('.');
1329     oname.append(outsuffix);
1330     f_out=fopen(oname.chars(),"w");
1331     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1332     }
1333     else {
1334     GStr oname(">");
1335     oname.append(fname);
1336     oname.append('.');
1337     oname.append(outsuffix);
1338     pocmd.append(oname);
1339     f_out=popen(pocmd.chars(), "w");
1340     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1341     }
1342     return f_out;
1343     }
1344    
1345     void guess_unzip(GStr& fname, GStr& picmd) {
1346     GStr fext=getFext(fname);
1347     if (fext=="gz" || fext=="gzip" || fext=="z") {
1348     picmd="gzip -cd ";
1349     }
1350     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1351     picmd="bzip2 -cd ";
1352     }
1353     }
1354    
1355 gpertea 170 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1356     //TODO: prepare CASeqData here, and collect hexamers as well
1357 gpertea 172 CASeqData adata(revCompl);
1358     int idx=adapters.Add(adata);
1359     if (idx<0) GError("Error: failed to add adaptor!\n");
1360     adapters[idx].update(seq.chars());
1361 gpertea 170 }
1362    
1363    
1364 gpertea 94 int loadAdapters(const char* fname) {
1365     GLineReader lr(fname);
1366     char* l;
1367     while ((l=lr.nextLine())!=NULL) {
1368     if (lr.length()<=3 || l[0]=='#') continue;
1369 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1370     l[0]==';'|| l[0]==':' ) {
1371 gpertea 94 int i=1;
1372     while (l[i]!=0 && isspace(l[i])) {
1373     i++;
1374     }
1375     if (l[i]!=0) {
1376 gpertea 102 GStr s(&(l[i]));
1377 gpertea 108 #ifdef GDEBUG
1378     //s.reverse();
1379     #endif
1380 gpertea 170 addAdapter(adapters3, s);
1381 gpertea 94 continue;
1382     }
1383     }
1384     else {
1385 gpertea 98 GStr s(l);
1386     s.startTokenize("\t ;,:");
1387     GStr a5,a3;
1388     if (s.nextToken(a5))
1389     s.nextToken(a3);
1390 gpertea 102 a5.upper();
1391     a3.upper();
1392 gpertea 108 #ifdef GDEBUG
1393     // a5.reverse();
1394     // a3.reverse();
1395     #endif
1396 gpertea 170 addAdapter(adapters5, a5);
1397     addAdapter(adapters3, a3);
1398 gpertea 94 }
1399     }
1400 gpertea 102 return adapters5.Count()+adapters3.Count();
1401 gpertea 94 }
1402    
1403 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1404     GStr& s, GStr& infname, GStr& infname2) {
1405 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1406     infname="";
1407     infname2="";
1408     f_in=NULL;
1409     f_in2=NULL;
1410     f_out=NULL;
1411     f_out2=NULL;
1412     //analyze outsuffix intent
1413     GStr pocmd;
1414 gpertea 76 if (outsuffix=="-") {
1415     f_out=stdout;
1416     }
1417     else {
1418     GStr ox=getFext(outsuffix);
1419     if (ox.length()>2) ox=ox.substr(0,2);
1420     if (ox=="gz") pocmd="gzip -9 -c ";
1421     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1422     }
1423 gpertea 13 if (s=="-") {
1424     f_in=stdin;
1425     infname="stdin";
1426     f_out=prepOutFile(infname, pocmd);
1427     return;
1428     } // streaming from stdin
1429     s.startTokenize(",:");
1430     s.nextToken(infname);
1431     bool paired=s.nextToken(infname2);
1432 gpertea 170 if (fileExists(infname.chars())==0)
1433 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1434     GStr fname(getFileName(infname.chars()));
1435     GStr picmd;
1436     guess_unzip(fname, picmd);
1437     if (picmd.is_empty()) {
1438     f_in=fopen(infname.chars(), "r");
1439     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1440     }
1441     else {
1442     picmd.append(infname);
1443     f_in=popen(picmd.chars(), "r");
1444     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1445     }
1446 gpertea 76 if (f_out==stdout) {
1447     if (paired) GError("Error: output suffix required for paired reads\n");
1448     return;
1449     }
1450 gpertea 13 f_out=prepOutFile(infname, pocmd);
1451     if (!paired) return;
1452     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1453     // ---- paired reads:-------------
1454 gpertea 170 if (fileExists(infname2.chars())==0)
1455 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1456     picmd="";
1457     GStr fname2(getFileName(infname2.chars()));
1458     guess_unzip(fname2, picmd);
1459     if (picmd.is_empty()) {
1460     f_in2=fopen(infname2.chars(), "r");
1461     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1462     }
1463     else {
1464     picmd.append(infname2);
1465     f_in2=popen(picmd.chars(), "r");
1466     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1467     }
1468     f_out2=prepOutFile(infname2, pocmd);
1469     }