ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 172
Committed: Wed Feb 15 03:33:40 2012 UTC (7 years, 9 months ago) by gpertea
File size: 42690 byte(s)
Log Message:
wip fqtrim

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 103 const int match_reward=2;
98     const int mismatch_penalty=3;
99     const int Xdrop=8;
100    
101 gpertea 94 const int poly_m_score=2; //match score
102     const int poly_mis_score=-3; //mismatch
103 gpertea 88 const int poly_dropoff_score=7;
104 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105 gpertea 4
106 gpertea 94 const char *polyA_seed="AAAA";
107     const char *polyT_seed="TTTT";
108 gpertea 12
109 gpertea 170 struct CASeqData {
110     //positional data for every possible hexamer in an adapter
111 gpertea 172 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 gpertea 170 GStr seq; //actual adapter sequence data
114     GStr seqr; //reverse complement sequence
115 gpertea 172 bool use_reverse;
116 gpertea 170 CASeqData(bool rev=false):seq(),seqr() {
117 gpertea 172 use_reverse=rev;
118     for (int i=0;i<4096;i++) {
119     pz[i]=NULL;
120     pzr[i]=NULL;
121     }
122     }
123    
124     void update(const char* s) {
125     seq=s;
126     table6mers(seq.chars(), seq.length(), pz);
127     if (!use_reverse) return;
128     //reverse complement
129     seqr=s;
130     int slen=seq.length();
131     for (int i=0;i<slen;i++)
132     seqr[i]=ntComplement(seq[slen-i-1]);
133     table6mers(seqr.chars(), seqr.length(), pzr);
134 gpertea 170 }
135    
136 gpertea 172 ~CASeqData() {
137     for (int i=0;i<4096;i++) {
138     delete pz[i];
139     delete pzr[i];
140 gpertea 170 }
141     }
142     };
143    
144     GVec<CASeqData> adapters5;
145     GVec<CASeqData> adapters3;
146    
147 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
148     CGreedyAlignData* gxmem_r=NULL;
149    
150 gpertea 4 // element in dhash:
151     class FqDupRec {
152     public:
153     int count; //how many of these reads are the same
154     int len; //length of qv
155     char* firstname; //optional, only if we want to keep the original read names
156     char* qv;
157     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
158     len=0;
159     qv=NULL;
160     firstname=NULL;
161     count=0;
162     if (qstr!=NULL) {
163     qv=Gstrdup(qstr->chars());
164     len=qstr->length();
165     count++;
166     }
167     if (rname!=NULL) firstname=Gstrdup(rname);
168     }
169     ~FqDupRec() {
170     GFREE(qv);
171     GFREE(firstname);
172     }
173     void add(GStr& d) { //collapse another record into this one
174     if (d.length()!=len)
175     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
176     count++;
177     for (int i=0;i<len;i++)
178     qv[i]=(qv[i]+d[i])/2;
179     }
180     };
181    
182     void openfw(FILE* &f, GArgs& args, char opt) {
183     GStr s=args.getOpt(opt);
184     if (!s.is_empty()) {
185     if (s=='-') f=stdout;
186     else {
187 gpertea 13 f=fopen(s.chars(),"w");
188 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
189     }
190     }
191     }
192    
193     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
194     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
195    
196     GHash<FqDupRec> dhash; //hash to keep track of duplicates
197    
198 gpertea 172 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199 gpertea 94 int loadAdapters(const char* fname);
200    
201 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202     GStr& s, GStr& infname, GStr& infname2);
203 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
204 gpertea 170
205 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
206     void trash_report(char trashcode, GStr& rname, FILE* freport);
207    
208 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
209 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
210    
211 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
212 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
213    
214 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
215     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
216 gpertea 4 int dust(GStr& seq);
217 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
218     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
219 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
220 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
221 gpertea 4
222 gpertea 12 void convertPhred(char* q, int len);
223     void convertPhred(GStr& q);
224 gpertea 4
225     int main(int argc, char * const argv[]) {
226 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227 gpertea 4 int e;
228     if ((e=args.isError())>0) {
229     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
230     exit(224);
231     }
232 gpertea 9 debug=(args.getOpt('Y')!=NULL);
233 gpertea 13 verbose=(args.getOpt('V')!=NULL);
234 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
235 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
236     doDust=(args.getOpt('D')!=NULL);
237 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
238 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
239 gpertea 12 /*
240 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
241     if (rawFormat) {
242     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
243     }
244 gpertea 12 */
245 gpertea 4 prefix=args.getOpt('n');
246     GStr s=args.getOpt('l');
247 gpertea 170 if (!s.is_empty())
248 gpertea 4 min_read_len=s.asInt();
249 gpertea 12 s=args.getOpt('m');
250 gpertea 170 if (!s.is_empty())
251 gpertea 12 max_perc_N=s.asDouble();
252 gpertea 4 s=args.getOpt('d');
253     if (!s.is_empty()) {
254     dust_cutoff=s.asInt();
255     doDust=true;
256     }
257 gpertea 12 s=args.getOpt('q');
258     if (!s.is_empty()) {
259 gpertea 13 qvtrim_qmin=s.asInt();
260 gpertea 12 }
261 gpertea 13 s=args.getOpt('t');
262     if (!s.is_empty()) {
263     qvtrim_max=s.asInt();
264     }
265 gpertea 12 s=args.getOpt('p');
266     if (!s.is_empty()) {
267     int v=s.asInt();
268     if (v==33) {
269     qv_phredtype=33;
270     qv_cvtadd=31;
271     }
272     else if (v==64) {
273     qv_phredtype=64;
274     qv_cvtadd=-31;
275     }
276 gpertea 170 else
277 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
278     }
279 gpertea 94 s=args.getOpt('f');
280     if (!s.is_empty()) {
281     loadAdapters(s.chars());
282     }
283 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
284 gpertea 94 s=args.getOpt('5');
285     if (!s.is_empty()) {
286     if (fileAdapters)
287     GError("Error: options -5 and -f cannot be used together!\n");
288     s.upper();
289 gpertea 172 addAdapter(adapters5, s);
290 gpertea 4 }
291 gpertea 94 s=args.getOpt('3');
292     if (!s.is_empty()) {
293     if (fileAdapters)
294     GError("Error: options -3 and -f cannot be used together!\n");
295 gpertea 102 s.upper();
296 gpertea 172 addAdapter(adapters3, s);
297 gpertea 4 }
298 gpertea 94 s=args.getOpt('y');
299 gpertea 13 if (!s.is_empty()) {
300 gpertea 94 int minmatch=s.asInt();
301     poly_minScore=minmatch*poly_m_score;
302 gpertea 13 }
303 gpertea 170
304 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
305 gpertea 76 else outsuffix="-";
306 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
307 gpertea 13 int fcount=args.startNonOpt();
308     if (fcount==0) {
309 gpertea 4 GMessage(USAGE);
310     exit(224);
311     }
312 gpertea 13 if (fcount>1 && doCollapse) {
313     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
314     }
315     //openfw(f_out, args, 'o');
316     //if (f_out==NULL) f_out=stdout;
317 gpertea 4 if (trashReport)
318     openfw(freport, args, 'r');
319     char* infile=NULL;
320 gpertea 103
321     if (adapters5.Count()>0)
322     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
323     if (adapters3.Count()>0)
324     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
325    
326 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
327 gpertea 103 //for each input file
328 gpertea 13 int incounter=0; //counter for input reads
329     int outcounter=0; //counter for output reads
330     int trash_s=0; //too short from the get go
331     int trash_Q=0;
332     int trash_N=0;
333     int trash_D=0;
334 gpertea 103 int trash_poly=0;
335 gpertea 13 int trash_A3=0;
336     int trash_A5=0;
337     s=infile;
338     GStr infname;
339     GStr infname2;
340     FILE* f_in=NULL;
341     FILE* f_in2=NULL;
342     FILE* f_out=NULL;
343     FILE* f_out2=NULL;
344     bool paired_reads=false;
345     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
346     GLineReader fq(f_in);
347     GLineReader* fq2=NULL;
348     if (f_in2!=NULL) {
349     fq2=new GLineReader(f_in2);
350     paired_reads=true;
351     }
352     GStr rseq, rqv, seqid, seqinfo;
353     GStr rseq2, rqv2, seqid2, seqinfo2;
354     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
355     incounter++;
356     int a5=0, a3=0, b5=0, b3=0;
357     char tcode=0, tcode2=0;
358     tcode=process_read(seqid, rseq, rqv, a5, a3);
359 gpertea 103 if (fq2!=NULL) {
360 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
361     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
362     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
363     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
364     }
365     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
366     //decide what to do with this pair and print rseq2 if the pair makes it
367     if (tcode>1 && tcode2<=1) {
368     //"untrash" rseq
369     if (a3-a5+1<min_read_len) {
370     a5=1;
371     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
372 gpertea 4 }
373 gpertea 13 tcode=1;
374     }
375     else if (tcode<=1 && tcode2>1) {
376     //"untrash" rseq2
377     if (b3-b5+1<min_read_len) {
378     b5=1;
379     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
380     }
381     tcode2=1;
382     }
383     if (tcode<=1) { //trimmed or left intact -- write it!
384     if (tcode>0) {
385     rseq2=rseq2.substr(b5,b3-b5+1);
386     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
387 gpertea 4 }
388 gpertea 13 int nocounter=0;
389     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
390     }
391 gpertea 103 } //pair read
392 gpertea 13 if (tcode>1) { //trashed
393 gpertea 103 #ifdef GDEBUG
394     GMessage(" !!!!TRASH => 'N'\n");
395     #endif
396 gpertea 13 if (tcode=='s') trash_s++;
397 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
398 gpertea 13 else if (tcode=='Q') trash_Q++;
399     else if (tcode=='N') trash_N++;
400     else if (tcode=='D') trash_D++;
401     else if (tcode=='3') trash_A3++;
402     else if (tcode=='5') trash_A5++;
403     if (trashReport) trash_report(tcode, seqid, freport);
404     }
405     else if (!doCollapse) { //write it
406     if (tcode>0) {
407     rseq=rseq.substr(a5,a3-a5+1);
408     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
409     }
410 gpertea 103 #ifdef GDEBUG
411     GMessage(" After trimming:\n");
412 gpertea 105 GMessage("%s\n",rseq.chars());
413 gpertea 103 #endif
414 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
415     }
416     } //for each fastq record
417     delete fq2;
418     FRCLOSE(f_in);
419     FRCLOSE(f_in2);
420     if (doCollapse) {
421     outcounter=0;
422     int maxdup_count=1;
423     char* maxdup_seq=NULL;
424     dhash.startIterate();
425     FqDupRec* qd=NULL;
426     char* seq=NULL;
427     while ((qd=dhash.NextData(seq))!=NULL) {
428     GStr rseq(seq);
429     //do the dusting here
430     if (doDust) {
431     int dustbases=dust(rseq);
432     if (dustbases>(rseq.length()>>1)) {
433     if (trashReport && qd->firstname!=NULL) {
434     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
435 gpertea 4 }
436 gpertea 13 trash_D+=qd->count;
437     continue;
438     }
439     }
440     outcounter++;
441 gpertea 170 if (qd->count>maxdup_count) {
442 gpertea 13 maxdup_count=qd->count;
443     maxdup_seq=seq;
444     }
445     if (isfasta) {
446     if (prefix.is_empty()) {
447 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
448 gpertea 13 rseq.chars());
449 gpertea 4 }
450 gpertea 13 else { //use custom read name
451     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
452     qd->count, rseq.chars());
453 gpertea 4 }
454 gpertea 13 }
455     else { //fastq format
456     if (convert_phred) convertPhred(qd->qv, qd->len);
457     if (prefix.is_empty()) {
458 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
459 gpertea 13 rseq.chars(), qd->qv);
460 gpertea 4 }
461 gpertea 13 else { //use custom read name
462     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
463     qd->count, rseq.chars(), qd->qv);
464     }
465     }
466     }//for each element of dhash
467     if (maxdup_count>1) {
468     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
469 gpertea 4 }
470 gpertea 13 } //collapse entries
471     if (verbose) {
472     if (paired_reads) {
473     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
474     GMessage("Number of input pairs :%9d\n", incounter);
475     GMessage(" Output pairs :%9d\n", outcounter);
476     }
477     else {
478     GMessage(">Input file : %s\n", infname.chars());
479     GMessage("Number of input reads :%9d\n", incounter);
480     GMessage(" Output reads :%9d\n", outcounter);
481     }
482     GMessage("------------------------------------\n");
483     if (num_trimmed5)
484     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
485     if (num_trimmed3)
486     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
487     GMessage("------------------------------------\n");
488     if (trash_s>0)
489     GMessage(" Trashed by length:%9d\n", trash_s);
490     if (trash_N>0)
491     GMessage(" Trashed by N%%:%9d\n", trash_N);
492     if (trash_Q>0)
493     GMessage("Trashed by low quality:%9d\n", trash_Q);
494 gpertea 103 if (trash_poly>0)
495     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
496 gpertea 13 if (trash_A5>0)
497     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
498     if (trash_A3>0)
499     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
500     }
501     if (trashReport) {
502     FWCLOSE(freport);
503 gpertea 4 }
504 gpertea 13 FWCLOSE(f_out);
505     FWCLOSE(f_out2);
506     } //while each input file
507 gpertea 103 delete gxmem_l;
508     delete gxmem_r;
509 gpertea 12 //getc(stdin);
510 gpertea 4 }
511    
512     class NData {
513     public:
514     int NPos[1024]; //there should be no reads longer than 1K ?
515     int NCount;
516     int end5;
517     int end3;
518     int seqlen;
519     double perc_N; //percentage of Ns in end5..end3 range only!
520     const char* seq;
521     bool valid;
522     NData() {
523 gpertea 170 seqlen=0;
524 gpertea 4 NCount=0;
525     end5=0;
526     end3=0;
527     seq=NULL;
528     perc_N=0;
529     valid=true;
530     }
531     void init(GStr& rseq) {
532     NCount=0;
533     perc_N=0;
534     seqlen=rseq.length();
535     seq=rseq.chars();
536     end5=1;
537     end3=seqlen;
538     for (int i=0;i<seqlen;i++)
539 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
540 gpertea 4 NPos[NCount]=i;
541     NCount++;
542     }
543     perc_N=(NCount*100.0)/seqlen;
544     valid=true;
545     }
546     void N_calc() { //only in the region after trimming
547     int n=0;
548     for (int i=end3-1;i<end5;i++) {
549     if (seq[i]=='N') n++;
550     }
551     perc_N=(n*100.0)/(end5-end3+1);
552     }
553     };
554 gpertea 170
555 gpertea 4 static NData feat;
556     int perc_lenN=12; // incremental distance from ends, in percentage of
557     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
558 gpertea 170
559 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
560     /* assumes feat was filled properly */
561     int old_dif, t5,t3,v;
562 gpertea 12 if (l3<l5+2 || p5>p3 ) {
563 gpertea 4 feat.end5=l5+1;
564     feat.end3=l3+1;
565 gpertea 170 return;
566 gpertea 4 }
567    
568     t5=feat.NPos[p5]-l5;
569     t3=l3-feat.NPos[p3];
570     old_dif=p3-p5;
571     v=(int)((((double)(l3-l5))*perc_lenN)/100);
572 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
573 gpertea 4 if (t5 < v ) {
574     l5=feat.NPos[p5]+1;
575     p5++;
576     }
577     if (t3 < v) {
578     l3=feat.NPos[p3]-1;
579     p3--;
580     }
581     /* restNs=p3-p5; number of Ns in the new CLR */
582     if (p3-p5==old_dif) { /* no change, return */
583     /* don't trim if this may shorten the read too much */
584     //if (l5-l3<min_read_len) return;
585     feat.end5=l5+1;
586     feat.end3=l3+1;
587     return;
588     }
589 gpertea 170 else
590 gpertea 4 N_analyze(l5,l3, p5,p3);
591     }
592    
593    
594 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
595 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
596 gpertea 12 if (qv_phredtype==0) {
597     //try to guess the Phred type
598     int vmin=256, vmax=0;
599     for (int i=0;i<qvs.length();i++) {
600     if (vmin>qvs[i]) vmin=qvs[i];
601     if (vmax<qvs[i]) vmax=qvs[i];
602     }
603     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
604     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
605     if (qv_phredtype==0) {
606     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
607     }
608 gpertea 13 if (verbose)
609     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
610     } //guessing Phred type
611 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
612 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
613 gpertea 12 }
614     //just in case, check also the 5' the end (?)
615     for (l5=0;l5<qvs.length()-3;l5++) {
616 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
617 gpertea 12 }
618 gpertea 13 if (qvtrim_max>0) {
619     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
620     if (l5>qvtrim_max) l5=qvtrim_max;
621     }
622 gpertea 12 return (l5>0 || l3<qvs.length()-1);
623     }
624    
625     bool ntrim(GStr& rseq, int &l5, int &l3) {
626     //count Ns in the sequence, trim N-rich ends
627 gpertea 4 feat.init(rseq);
628     l5=feat.end5-1;
629     l3=feat.end3-1;
630 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
632     if (feat.perc_N>max_perc_N) {
633     feat.valid=false;
634     l3=l5+1;
635     return true;
636     }
637     else {
638     return false; //nothing changed
639     }
640     }
641 gpertea 4 l5=feat.end5-1;
642     l3=feat.end3-1;
643     if (l3-l5+1<min_read_len) {
644     feat.valid=false;
645     return true;
646     }
647     feat.N_calc();
648 gpertea 170
649 gpertea 12 if (feat.perc_N>max_perc_N) {
650 gpertea 4 feat.valid=false;
651     l3=l5+1;
652     return true;
653     }
654     return true;
655     }
656    
657     //--------------- dust functions ----------------
658     class DNADuster {
659     public:
660 gpertea 170 int dustword;
661     int dustwindow;
662     int dustwindow2;
663 gpertea 4 int dustcutoff;
664     int mv, iv, jv;
665     int counts[32*32*32];
666     int iis[32*32*32];
667     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
668     dustword=wordsize;
669     dustwindow=winsize;
670     dustwindow2 = (winsize>>1);
671     dustcutoff=cutoff;
672     mv=0;
673     iv=0;
674     jv=0;
675     }
676     void setWindowSize(int value) {
677     dustwindow = value;
678     dustwindow2 = (dustwindow >> 1);
679     }
680     void setWordSize(int value) {
681     dustword=value;
682     }
683     void wo1(int len, const char* s, int ivv) {
684     int i, ii, j, v, t, n, n1, sum;
685     int js, nis;
686     n = 32 * 32 * 32;
687     n1 = n - 1;
688     nis = 0;
689     i = 0;
690     ii = 0;
691     sum = 0;
692     v = 0;
693     for (j=0; j < len; j++, s++) {
694     ii <<= 5;
695     if (*s<=32) {
696     i=0;
697     continue;
698     }
699     ii |= *s - 'A'; //assume uppercase!
700     ii &= n1;
701     i++;
702     if (i >= dustword) {
703     for (js=0; js < nis && iis[js] != ii; js++) ;
704     if (js == nis) {
705     iis[nis] = ii;
706     counts[ii] = 0;
707     nis++;
708     }
709     if ((t = counts[ii]) > 0) {
710     sum += t;
711     v = 10 * sum / j;
712     if (mv < v) {
713     mv = v;
714     iv = ivv;
715     jv = j;
716     }
717     }
718     counts[ii]++;
719     }
720     }
721     }
722    
723     int wo(int len, const char* s, int* beg, int* end) {
724     int i, l1;
725     l1 = len - dustword + 1;
726     if (l1 < 0) {
727     *beg = 0;
728     *end = len - 1;
729     return 0;
730     }
731     mv = 0;
732     iv = 0;
733     jv = 0;
734     for (i=0; i < l1; i++) {
735     wo1(len-i, s+i, i);
736     }
737     *beg = iv;
738     *end = iv + jv;
739     return mv;
740     }
741    
742     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
743     int i, j, l, a, b, v;
744     if (cutoff==0) cutoff=dustcutoff;
745     a=0;b=0;
746     //GMessage("Dust cutoff=%d\n", cutoff);
747     for (i=0; i < seqlen; i += dustwindow2) {
748     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
749     v = wo(l, seq+i, &a, &b);
750     if (v > cutoff) {
751     //for (j = a; j <= b && j < dustwindow2; j++) {
752     for (j = a; j <= b; j++) {
753     seqmsk[i+j]='N';//could be made lowercase instead
754     }
755     }
756     }
757 gpertea 170 //return first;
758 gpertea 4 }
759     };
760    
761     static DNADuster duster;
762    
763     int dust(GStr& rseq) {
764     char* seq=Gstrdup(rseq.chars());
765     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
766     //check the number of Ns:
767     int ncount=0;
768     for (int i=0;i<rseq.length();i++) {
769     if (seq[i]=='N') ncount++;
770     }
771     GFREE(seq);
772     return ncount;
773     }
774    
775 gpertea 94 struct SLocScore {
776     int pos;
777     int score;
778     SLocScore(int p=0,int s=0) {
779     pos=p;
780     score=s;
781     }
782     void set(int p, int s) {
783     pos=p;
784     score=s;
785     }
786     void add(int p, int add) {
787     pos=p;
788     score+=add;
789     }
790     };
791 gpertea 12
792 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
793 gpertea 103 if (!doPolyTrim) return false;
794 gpertea 94 int rlen=seq.length();
795     l5=0;
796     l3=rlen-1;
797     int32 seedVal=*(int32*)poly_seed;
798     char polyChar=poly_seed[0];
799     //assumes N trimming was already done
800     //so a poly match should be very close to the end of the read
801     // -- find the initial match (seed)
802 gpertea 108 int lmin=GMAX((rlen-16), 0);
803 gpertea 94 int li;
804     for (li=rlen-4;li>lmin;li--) {
805     if (seedVal==*(int*)&(seq[li])) {
806     break;
807     }
808 gpertea 12 }
809 gpertea 94 if (li<=lmin) return false;
810     //seed found, try to extend it both ways
811     //extend right
812     int ri=li+3;
813     SLocScore loc(ri, poly_m_score<<2);
814     SLocScore maxloc(loc);
815     //extend right
816 gpertea 170 while (ri<rlen-1) {
817 gpertea 94 ri++;
818     if (seq[ri]==polyChar) {
819     loc.add(ri,poly_m_score);
820     }
821     else if (seq[ri]=='N') {
822     loc.add(ri,0);
823     }
824     else { //mismatch
825     loc.add(ri,poly_mis_score);
826     if (maxloc.score-loc.score>poly_dropoff_score) break;
827     }
828     if (maxloc.score<=loc.score) {
829     maxloc=loc;
830     }
831 gpertea 12 }
832 gpertea 94 ri=maxloc.pos;
833 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
834 gpertea 94 //ri = right boundary for the poly match
835     //extend left
836     loc.set(li, maxloc.score);
837     maxloc.pos=li;
838     while (li>0) {
839     li--;
840     if (seq[li]==polyChar) {
841     loc.add(li,poly_m_score);
842     }
843     else if (seq[li]=='N') {
844     loc.add(li,0);
845     }
846     else { //mismatch
847     loc.add(li,poly_mis_score);
848     if (maxloc.score-loc.score>poly_dropoff_score) break;
849     }
850     if (maxloc.score<=loc.score) {
851     maxloc=loc;
852     }
853     }
854 gpertea 105 li=maxloc.pos;
855 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
856     (maxloc.score>poly_minScore && ri>=rlen-3) ||
857     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
858 gpertea 105 //trimming this li-ri match at 3' end
859     l3=li-1;
860     if (l3<0) l3=0;
861 gpertea 94 return true;
862     }
863     return false;
864 gpertea 12 }
865    
866 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
867 gpertea 103 if (!doPolyTrim) return false;
868 gpertea 88 int rlen=seq.length();
869     l5=0;
870     l3=rlen-1;
871 gpertea 94 int32 seedVal=*(int32*)poly_seed;
872     char polyChar=poly_seed[0];
873 gpertea 88 //assumes N trimming was already done
874     //so a poly match should be very close to the end of the read
875     // -- find the initial match (seed)
876 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
877 gpertea 94 int li;
878     for (li=0;li<=lmax;li++) {
879     if (seedVal==*(int*)&(seq[li])) {
880 gpertea 88 break;
881     }
882     }
883 gpertea 94 if (li>lmax) return false;
884     //seed found, try to extend it both ways
885     //extend left
886     int ri=li+3; //save rightmost base of the seed
887     SLocScore loc(li, poly_m_score<<2);
888     SLocScore maxloc(loc);
889     while (li>0) {
890     li--;
891     if (seq[li]==polyChar) {
892     loc.add(li,poly_m_score);
893     }
894     else if (seq[li]=='N') {
895     loc.add(li,0);
896     }
897     else { //mismatch
898     loc.add(li,poly_mis_score);
899     if (maxloc.score-loc.score>poly_dropoff_score) break;
900     }
901     if (maxloc.score<=loc.score) {
902     maxloc=loc;
903     }
904 gpertea 88 }
905 gpertea 94 li=maxloc.pos;
906 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
907 gpertea 94 //li = right boundary for the poly match
908    
909     //extend right
910     loc.set(ri, maxloc.score);
911     maxloc.pos=ri;
912 gpertea 170 while (ri<rlen-1) {
913 gpertea 94 ri++;
914     if (seq[ri]==polyChar) {
915     loc.add(ri,poly_m_score);
916     }
917     else if (seq[ri]=='N') {
918     loc.add(ri,0);
919     }
920     else { //mismatch
921     loc.add(ri,poly_mis_score);
922     if (maxloc.score-loc.score>poly_dropoff_score) break;
923     }
924     if (maxloc.score<=loc.score) {
925     maxloc=loc;
926     }
927     }
928 gpertea 105 ri=maxloc.pos;
929 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
930     (maxloc.score>poly_minScore && li<2)
931 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
932 gpertea 105 //adjust l5 to reflect this trimming of 5' end
933     l5=ri+1;
934     if (l5>rlen-1) l5=rlen-1;
935 gpertea 94 return true;
936     }
937     return false;
938 gpertea 88 }
939    
940 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
941 gpertea 103 if (adapters3.Count()==0) return false;
942 gpertea 4 int rlen=seq.length();
943     l5=0;
944     l3=rlen-1;
945 gpertea 103 bool trimmed=false;
946     GStr wseq(seq.chars());
947     int wlen=rlen;
948     for (int ai=0;ai<adapters3.Count();ai++) {
949 gpertea 172 //if (adapters3[ai].is_empty()) continue;
950     int alen=adapters3[ai].seq.length();
951     GStr& aseq=adapters3[ai].seq;
952     GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953     wseq.chars(), wlen, gxmem_r, 74);
954 gpertea 103 if (r_bestaln) {
955     trimmed=true;
956     //keep unmatched region on the left, if any
957     l3-=(wlen-r_bestaln->sl+1);
958     delete r_bestaln;
959     if (l3<0) l3=0;
960     if (l3-l5+1<min_read_len) return true;
961     wseq=seq.substr(l5,l3-l5+1);
962     wlen=wseq.length();
963     }
964     }//for each 5' adapter
965     return trimmed;
966 gpertea 4 }
967    
968     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
969 gpertea 103 if (adapters5.Count()==0) return false;
970 gpertea 4 int rlen=seq.length();
971     l5=0;
972     l3=rlen-1;
973 gpertea 102 bool trimmed=false;
974 gpertea 103 GStr wseq(seq.chars());
975     int wlen=rlen;
976 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
977 gpertea 172 //if (adapters5[ai].is_empty()) continue;
978     int alen=adapters5[ai].seq.length();
979     GStr& aseq=adapters5[ai].seq;
980 gpertea 170 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 gpertea 172 wseq.chars(), wlen, gxmem_l, 84);
982 gpertea 103 if (l_bestaln) {
983     trimmed=true;
984     l5+=l_bestaln->sr;
985     delete l_bestaln;
986     if (l5>=rlen) l5=rlen-1;
987     if (l3-l5+1<min_read_len) return true;
988     wseq=seq.substr(l5,l3-l5+1);
989     wlen=wseq.length();
990     }
991 gpertea 102 }//for each 5' adapter
992     return trimmed;
993 gpertea 98 }
994 gpertea 4
995 gpertea 170 //convert qvs to/from phred64 from/to phread33
996 gpertea 12 void convertPhred(GStr& q) {
997     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
998 gpertea 4 }
999    
1000 gpertea 12 void convertPhred(char* q, int len) {
1001     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1002 gpertea 4 }
1003    
1004 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1005 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1006     rseq="";
1007     rqv="";
1008     rname="";
1009     rinfo="";
1010     if (fq.eof()) return false;
1011     char* l=fq.getLine();
1012     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1013     if (l==NULL) return false;
1014     /* if (rawFormat) {
1015     //TODO: implement raw qseq parsing here
1016     //if (raw type=N) then continue; //skip invalid/bad records
1017     } //raw qseq format
1018     else { // FASTQ or FASTA */
1019     isfasta=(l[0]=='>');
1020 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1021 gpertea 13 infname.chars(), l);
1022     GStr s(l);
1023     rname=&(l[1]);
1024     for (int i=0;i<rname.length();i++)
1025 gpertea 170 if (rname[i]<=' ') {
1026     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027     rname.cut(i);
1028     break;
1029 gpertea 13 }
1030     //now get the sequence
1031 gpertea 170 if ((l=fq.getLine())==NULL)
1032 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1033     rname.chars(), infname.chars());
1034     rseq=l; //this must be the DNA line
1035     while ((l=fq.getLine())!=NULL) {
1036     //seq can span multiple lines
1037     if (l[0]=='>' || l[0]=='+') {
1038 gpertea 170 fq.pushBack();
1039 gpertea 13 break; //
1040     }
1041     rseq+=l;
1042 gpertea 170 } //check for multi-line seq
1043 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1044     if ((l=fq.getLine())==NULL)
1045     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1046     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1047 gpertea 170 if ((l=fq.getLine())==NULL)
1048 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1049     rqv=l;
1050 gpertea 170 //if (rqv.length()!=rseq.length())
1051 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1052     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1053     rqv+=l; //append to qv string
1054     }
1055     }// fastq
1056     // } //<-- FASTA or FASTQ
1057 gpertea 94 rseq.upper();
1058 gpertea 13 return true;
1059     }
1060    
1061 gpertea 105 #ifdef GDEBUG
1062     void showTrim(GStr& s, int l5, int l3) {
1063     if (l5>0) {
1064     color_bg(c_red);
1065     }
1066     for (int i=0;i<s.length()-1;i++) {
1067     if (i && i==l5) color_resetbg();
1068     fprintf(stderr, "%c", s[i]);
1069     if (i==l3) color_bg(c_red);
1070     }
1071     fprintf(stderr, "%c", s[s.length()-1]);
1072     color_reset();
1073     fprintf(stderr, "\n");
1074     }
1075     #endif
1076    
1077 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1078 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1079 gpertea 13 // and a trash code if it was trashed
1080     l5=0;
1081     l3=rseq.length()-1;
1082 gpertea 103 #ifdef GDEBUG
1083 gpertea 108 //rseq.reverse();
1084 gpertea 103 GMessage(">%s\n", rname.chars());
1085 gpertea 105 GMessage("%s\n",rseq.chars());
1086 gpertea 103 #endif
1087 gpertea 13 if (l3-l5+1<min_read_len) {
1088     return 's';
1089     }
1090     GStr wseq(rseq.chars());
1091     GStr wqv(rqv.chars());
1092     int w5=l5;
1093     int w3=l3;
1094     //first do the q-based trimming
1095     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1096     if (w3-w5+1<min_read_len) {
1097     return 'Q'; //invalid read
1098     }
1099     //-- keep only the w5..w3 range
1100     l5=w5;
1101     l3=w3;
1102     wseq=wseq.substr(w5, w3-w5+1);
1103     if (!wqv.is_empty())
1104     wqv=wqv.substr(w5, w3-w5+1);
1105     } //qv trimming
1106     // N-trimming on the remaining read seq
1107     if (ntrim(wseq, w5, w3)) {
1108     //GMessage("before: %s\n",wseq.chars());
1109     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1110     l5+=w5;
1111     l3-=(wseq.length()-1-w3);
1112     if (w3-w5+1<min_read_len) {
1113     return 'N'; //to be trashed
1114     }
1115     //-- keep only the w5..w3 range
1116     wseq=wseq.substr(w5, w3-w5+1);
1117     if (!wqv.is_empty())
1118     wqv=wqv.substr(w5, w3-w5+1);
1119     w5=0;
1120     w3=wseq.length()-1;
1121     }
1122 gpertea 103 char trim_code;
1123     do {
1124     trim_code=0;
1125     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1126     trim_code='A';
1127     }
1128     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1129     trim_code='T';
1130     }
1131     else if (trim_adapter5(wseq, w5, w3)) {
1132     trim_code='5';
1133     }
1134     if (trim_code) {
1135 gpertea 105 #ifdef GDEBUG
1136     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137     showTrim(wseq, w5, w3);
1138     #endif
1139 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1140 gpertea 103 num_trimmed5++;
1141     if (trimlen<min_trimmed5)
1142     min_trimmed5=trimlen;
1143 gpertea 13 l5+=w5;
1144     l3-=(wseq.length()-1-w3);
1145     if (w3-w5+1<min_read_len) {
1146 gpertea 103 return trim_code;
1147 gpertea 13 }
1148     //-- keep only the w5..w3 range
1149     wseq=wseq.substr(w5, w3-w5+1);
1150     if (!wqv.is_empty())
1151     wqv=wqv.substr(w5, w3-w5+1);
1152 gpertea 103 }// trimmed at 5' end
1153     } while (trim_code);
1154    
1155     do {
1156     trim_code=0;
1157     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1158     trim_code='A';
1159     }
1160     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1161     trim_code='T';
1162     }
1163     else if (trim_adapter3(wseq, w5, w3)) {
1164     trim_code='3';
1165     }
1166     if (trim_code) {
1167 gpertea 105 #ifdef GDEBUG
1168     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1169     showTrim(wseq, w5, w3);
1170     #endif
1171 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1172 gpertea 103 num_trimmed3++;
1173 gpertea 170 if (trimlen<min_trimmed3)
1174 gpertea 103 min_trimmed3=trimlen;
1175 gpertea 13 l5+=w5;
1176     l3-=(wseq.length()-1-w3);
1177     if (w3-w5+1<min_read_len) {
1178 gpertea 103 return trim_code;
1179 gpertea 13 }
1180     //-- keep only the w5..w3 range
1181     wseq=wseq.substr(w5, w3-w5+1);
1182     if (!wqv.is_empty())
1183     wqv=wqv.substr(w5, w3-w5+1);
1184 gpertea 103 }//trimming at 3' end
1185     } while (trim_code);
1186    
1187    
1188 gpertea 13 if (doCollapse) {
1189     //keep read for later
1190     FqDupRec* dr=dhash.Find(wseq.chars());
1191     if (dr==NULL) { //new entry
1192 gpertea 170 //if (prefix.is_empty())
1193     dhash.Add(wseq.chars(),
1194 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1195     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1196     }
1197     else
1198     dr->add(wqv);
1199     } //collapsing duplicates
1200     else { //not collapsing duplicates
1201     //apply the dust filter now
1202     if (doDust) {
1203     int dustbases=dust(wseq);
1204     if (dustbases>(wseq.length()>>1)) {
1205     return 'D';
1206     }
1207     }
1208     } //not collapsing duplicates
1209     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1210     }
1211    
1212     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1213     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1214     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1215     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1216     }
1217    
1218     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1219     outcounter++;
1220     bool asFasta=(rqv.is_empty() || fastaOutput);
1221     if (asFasta) {
1222     if (prefix.is_empty()) {
1223     printHeader(f_out, '>',rname,rinfo);
1224     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1225     }
1226     else {
1227 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1228 gpertea 13 rseq.chars());
1229     }
1230     }
1231     else { //fastq
1232     if (convert_phred) convertPhred(rqv);
1233     if (prefix.is_empty()) {
1234     printHeader(f_out, '@', rname, rinfo);
1235     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1236     }
1237     else
1238 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1239 gpertea 13 rseq.chars(),rqv.chars() );
1240     }
1241     }
1242    
1243     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1244     if (freport==NULL || trashcode<=' ') return;
1245     if (trashcode=='3' || trashcode=='5') {
1246 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1247 gpertea 13 }
1248     else {
1249     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1250     }
1251     //tcounter++;
1252     }
1253    
1254     GStr getFext(GStr& s, int* xpos=NULL) {
1255     //s must be a filename without a path
1256     GStr r("");
1257     if (xpos!=NULL) *xpos=0;
1258     if (s.is_empty() || s=="-") return r;
1259     int p=s.rindex('.');
1260     int d=s.rindex('/');
1261     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1262     r=s.substr(p+1);
1263     if (xpos!=NULL) *xpos=p+1;
1264     r.lower();
1265     return r;
1266     }
1267    
1268     void baseFileName(GStr& fname) {
1269     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1270     int xpos=0;
1271     GStr fext=getFext(fname, &xpos);
1272     if (xpos<=1) return;
1273     bool extdel=false;
1274     GStr f2;
1275     if (fext=="z" || fext=="zip") {
1276     extdel=true;
1277     }
1278     else if (fext.length()>=2) {
1279     f2=fext.substr(0,2);
1280     extdel=(f2=="gz" || f2=="bz");
1281     }
1282     if (extdel) {
1283     fname.cut(xpos-1);
1284     fext=getFext(fname, &xpos);
1285     if (xpos<=1) return;
1286     }
1287     extdel=false;
1288     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1289     extdel=true;
1290     }
1291     else if (fext.length()>=2) {
1292     extdel=(fext.substr(0,2)=="fa");
1293     }
1294     if (extdel) fname.cut(xpos-1);
1295     GStr fncp(fname);
1296     fncp.lower();
1297     fncp.chomp("seq");
1298     fncp.chomp("sequence");
1299     fncp.trimR("_.");
1300     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1301     }
1302    
1303     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1304     FILE* f_out=NULL;
1305     GStr fname(getFileName(infname.chars()));
1306     //eliminate known extensions
1307     baseFileName(fname);
1308     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1309     else if (pocmd.is_empty()) {
1310     GStr oname(fname);
1311     oname.append('.');
1312     oname.append(outsuffix);
1313     f_out=fopen(oname.chars(),"w");
1314     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1315     }
1316     else {
1317     GStr oname(">");
1318     oname.append(fname);
1319     oname.append('.');
1320     oname.append(outsuffix);
1321     pocmd.append(oname);
1322     f_out=popen(pocmd.chars(), "w");
1323     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1324     }
1325     return f_out;
1326     }
1327    
1328     void guess_unzip(GStr& fname, GStr& picmd) {
1329     GStr fext=getFext(fname);
1330     if (fext=="gz" || fext=="gzip" || fext=="z") {
1331     picmd="gzip -cd ";
1332     }
1333     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1334     picmd="bzip2 -cd ";
1335     }
1336     }
1337    
1338 gpertea 170 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339     //TODO: prepare CASeqData here, and collect hexamers as well
1340 gpertea 172 CASeqData adata(revCompl);
1341     int idx=adapters.Add(adata);
1342     if (idx<0) GError("Error: failed to add adaptor!\n");
1343     adapters[idx].update(seq.chars());
1344 gpertea 170 }
1345    
1346    
1347 gpertea 94 int loadAdapters(const char* fname) {
1348     GLineReader lr(fname);
1349     char* l;
1350     while ((l=lr.nextLine())!=NULL) {
1351     if (lr.length()<=3 || l[0]=='#') continue;
1352 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1353     l[0]==';'|| l[0]==':' ) {
1354 gpertea 94 int i=1;
1355     while (l[i]!=0 && isspace(l[i])) {
1356     i++;
1357     }
1358     if (l[i]!=0) {
1359 gpertea 102 GStr s(&(l[i]));
1360 gpertea 108 #ifdef GDEBUG
1361     //s.reverse();
1362     #endif
1363 gpertea 170 addAdapter(adapters3, s);
1364 gpertea 94 continue;
1365     }
1366     }
1367     else {
1368 gpertea 98 GStr s(l);
1369     s.startTokenize("\t ;,:");
1370     GStr a5,a3;
1371     if (s.nextToken(a5))
1372     s.nextToken(a3);
1373 gpertea 102 a5.upper();
1374     a3.upper();
1375 gpertea 108 #ifdef GDEBUG
1376     // a5.reverse();
1377     // a3.reverse();
1378     #endif
1379 gpertea 170 addAdapter(adapters5, a5);
1380     addAdapter(adapters3, a3);
1381 gpertea 94 }
1382     }
1383 gpertea 102 return adapters5.Count()+adapters3.Count();
1384 gpertea 94 }
1385    
1386 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387     GStr& s, GStr& infname, GStr& infname2) {
1388 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1389     infname="";
1390     infname2="";
1391     f_in=NULL;
1392     f_in2=NULL;
1393     f_out=NULL;
1394     f_out2=NULL;
1395     //analyze outsuffix intent
1396     GStr pocmd;
1397 gpertea 76 if (outsuffix=="-") {
1398     f_out=stdout;
1399     }
1400     else {
1401     GStr ox=getFext(outsuffix);
1402     if (ox.length()>2) ox=ox.substr(0,2);
1403     if (ox=="gz") pocmd="gzip -9 -c ";
1404     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1405     }
1406 gpertea 13 if (s=="-") {
1407     f_in=stdin;
1408     infname="stdin";
1409     f_out=prepOutFile(infname, pocmd);
1410     return;
1411     } // streaming from stdin
1412     s.startTokenize(",:");
1413     s.nextToken(infname);
1414     bool paired=s.nextToken(infname2);
1415 gpertea 170 if (fileExists(infname.chars())==0)
1416 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1417     GStr fname(getFileName(infname.chars()));
1418     GStr picmd;
1419     guess_unzip(fname, picmd);
1420     if (picmd.is_empty()) {
1421     f_in=fopen(infname.chars(), "r");
1422     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1423     }
1424     else {
1425     picmd.append(infname);
1426     f_in=popen(picmd.chars(), "r");
1427     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1428     }
1429 gpertea 76 if (f_out==stdout) {
1430     if (paired) GError("Error: output suffix required for paired reads\n");
1431     return;
1432     }
1433 gpertea 13 f_out=prepOutFile(infname, pocmd);
1434     if (!paired) return;
1435     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1436     // ---- paired reads:-------------
1437 gpertea 170 if (fileExists(infname2.chars())==0)
1438 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1439     picmd="";
1440     GStr fname2(getFileName(infname2.chars()));
1441     guess_unzip(fname2, picmd);
1442     if (picmd.is_empty()) {
1443     f_in2=fopen(infname2.chars(), "r");
1444     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1445     }
1446     else {
1447     picmd.append(infname2);
1448     f_in2=popen(picmd.chars(), "r");
1449     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1450     }
1451     f_out2=prepOutFile(infname2, pocmd);
1452     }