ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 88
Committed: Mon Oct 3 16:34:03 2011 UTC (7 years, 11 months ago) by gpertea
File size: 54110 byte(s)
Log Message:
starting to add polyA/T trimming

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6
7 #define USAGE "Usage:\n\
8 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 <input.fq>[,<input_mates.fq>\n\
12 \n\
13 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 for low complexity and collapse duplicate reads.\n\
15 If read pairs should be trimmed and kept together (i.e. without discarding\n\
16 one read in a pair), the two file names should be given delimited by a comma\n\
17 or a colon character\n\
18 \n\
19 Options:\n\
20 -n rename all the reads using the <prefix> followed by a read counter;\n\
21 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22 the read duplication count\n\
23 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
24 file(s) named <input>.<outsuffix> which will be created in the \n\
25 current (working) directory; (writes to stdout if -o- is given);\n\
26 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 -f file with adapter sequences to trim, each line having this format:\n\
28 <5'-adapter-sequence> <3'-adapter-sequence>\n\
29 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
30 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31 -3 trim the given adapter sequence at the 3' end of each read\n\
32 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 -a minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37 -l minimum \"clean\" length after trimming that a read must have\n\
38 in order to pass the filter (default: 16)\n\
39 -r write a simple \"trash report\" file listing the discarded reads with a\n\
40 one letter code for the reason why they were trashed\n\
41 -D apply a low-complexity (dust) filter and discard any read that has over\n\
42 50% of its length detected as low complexity\n\
43 -C collapse duplicate reads and add a prefix with their count to the read\n\
44 name (e.g. for microRNAs)\n\
45 -p input is phred64/phred33 (use -p64 or -p33)\n\
46 -Q convert quality values to the other Phred qv type\n\
47 -V verbose processing\n\
48 "
49
50 //-z for -o option, the output stream(s) will be first piped into the given\n
51 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52
53
54 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55
56 //For paired reads sequencing:
57 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 //FILE* f_out=NULL; //stdout if not provided
60 //FILE* f_out2=NULL; //for paired reads
61 //FILE* f_in=NULL; //input fastq (stdin if not provided)
62 //FILE* f_in2=NULL; //for paired reads
63 FILE* freport=NULL;
64
65 bool debug=false;
66 bool verbose=false;
67 bool doCollapse=false;
68 bool doDust=false;
69 bool fastaOutput=false;
70 bool trashReport=false;
71 //bool rawFormat=false;
72 int min_read_len=16;
73 double max_perc_N=7.0;
74 int dust_cutoff=16;
75 bool isfasta=false;
76 bool convert_phred=false;
77 GStr outsuffix; // -o
78 //GStr adapter3;
79 //GStr adapter5;
80 GStr prefix;
81 GStr zcmd;
82 int num_trimmed5=0;
83 int num_trimmed3=0;
84 int num_trimmedN=0;
85 int num_trimmedQ=0;
86 int min_trimmed5=INT_MAX;
87 int min_trimmed3=INT_MAX;
88
89 int qvtrim_qmin=0;
90 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
91 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92 int qv_cvtadd=0; //could be -31 or +31
93
94 int a3len=0;
95 int a5len=0;
96 // adaptor matching metrics -- for extendMatch() function
97 const int match_score=2; //match score
98 const int mismatch_score=-3; //mismatch
99
100 const int a_m_score=2; //match score
101 const int a_mis_score=-3; //mismatch
102 const int a_dropoff_score=7;
103 const int poly_dropoff_score=7;
104 int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 const int a_min_chain_score=15; //for gapped alignments
106
107 class CSegChain;
108
109 class CSegPair {
110 public:
111 GSeg a;
112 GSeg b; //the adapter segment
113 int score;
114 int flags;
115 CSegChain* chain;
116 CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
117 score=mscore;
118 if (score==0) score=a.len()*a_m_score;
119 flags=0;
120 chain=NULL;
121 }
122 int len() { return a.len(); }
123 bool operator==(CSegPair& d){
124 //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 //make equal even segments that are included into one another:
126 return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 }
128 bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 if (b.start==d.b.start) {
130 if (score==d.score) {
131 //just try to be consistent:
132 if (b.end==d.b.end) {
133 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 }
135 return (b.end>d.b.end);
136 }
137 else return (score<d.score);
138 }
139 return (b.start>d.b.start);
140 }
141 bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 /*if (b.start==d.b.start && b.end==d.b.end) {
143 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 }
145 return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 if (b.start==d.b.start) {
147 if (score==d.score) {
148 //just try to be consistent:
149 if (b.end==d.b.end) {
150 return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 }
152 return (b.end<d.b.end);
153 }
154 else return (score>d.score);
155 }
156 return (b.start<d.b.start);
157 }
158 };
159
160 int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
161 CSegPair& x = *(CSegPair *)sa;
162 CSegPair& y = *(CSegPair *)sb;
163 /*
164 if (x.b.end==y.b.end) {
165 if (x.b.start==y.b.start) {
166 if (x.a.end==y.a.end) {
167 if (x.a.start==y.a.start) return 0;
168 return ((x.a.start>y.a.start) ? -1 : 1);
169 }
170 else {
171 return ((x.a.end>y.a.end) ? -1 : 1);
172 }
173 }
174 else {
175 return ((x.b.start>y.b.start) ? -1 : 1);
176 }
177 }
178 else {
179 return ((x.b.end>y.b.end) ? -1 : 1);
180 }
181 */
182 if (x.b.end==y.b.end) {
183 if (x.score==y.score) {
184 if (x.b.start==y.b.start) {
185 if (x.a.end==y.a.end) {
186 if (x.a.start==y.a.start) return 0;
187 return ((x.a.start<y.a.start) ? -1 : 1);
188 }
189 else {
190 return ((x.a.end<y.a.end) ? -1 : 1);
191 }
192 }
193 else {
194 return ((x.b.start<y.b.start) ? -1 : 1);
195 }
196 } else return ((x.score>y.score) ? -1 : 1);
197 }
198 else {
199 return ((x.b.end>y.b.end) ? -1 : 1);
200 }
201
202 }
203
204 class CSegChain:public GList<CSegPair> {
205 public:
206 uint astart;
207 uint aend;
208 uint bstart;
209 uint bend;
210 int score;
211 bool endSort;
212 CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 //as SegPairs are inserted, they will be sorted by a.start coordinate
214 score=0;
215 astart=MAX_UINT;
216 aend=0;
217 bstart=MAX_UINT;
218 bend=0;
219 endSort=aln5;
220 if (aln5) { setSorted(cmpSegEnds); }
221 }
222 bool operator==(CSegChain& d) {
223 //return (score==d.score);
224 return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 }
226 bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 //return (score<d.score);
228 if (bstart==d.bstart && bend==d.bend) {
229 return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 }
231 return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 }
233 bool operator<(CSegChain& d) {
234 //return (score>d.score);
235 if (bstart==d.bstart && bend==d.bend) {
236 return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 }
238 return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 }
240 void addSegPair(CSegPair* segp) {
241 if (AddIfNew(segp)!=segp) return;
242 score+=segp->score;
243 if (astart>segp->a.start) astart=segp->a.start;
244 if (aend<segp->a.end) aend=segp->a.end;
245 if (bstart>segp->b.start) bstart=segp->b.start;
246 if (bend<segp->b.end) bend=segp->b.end;
247 }
248 //for building actual chains:
249 bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 int bgap=0;
251 int agap=0;
252 //if (endSort) {
253 if (bstart>segp->b.start) {
254 bgap = (int)(bstart-segp->b.end);
255 if (abs(bgap)>2) return false;
256 agap = (int)(astart-segp->a.end);
257 if (abs(agap)>2) return false;
258 }
259 else {
260 bgap = (int) (segp->b.start-bend);
261 if (abs(bgap)>2) return false;
262 agap = (int)(segp->a.start-aend);
263 if (abs(agap)>2) return false;
264 }
265 if (agap*bgap<0) return false;
266 addSegPair(segp);
267 score-=abs(agap)+abs(bgap);
268 return true;
269 }
270 };
271
272 // element in dhash:
273 class FqDupRec {
274 public:
275 int count; //how many of these reads are the same
276 int len; //length of qv
277 char* firstname; //optional, only if we want to keep the original read names
278 char* qv;
279 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
280 len=0;
281 qv=NULL;
282 firstname=NULL;
283 count=0;
284 if (qstr!=NULL) {
285 qv=Gstrdup(qstr->chars());
286 len=qstr->length();
287 count++;
288 }
289 if (rname!=NULL) firstname=Gstrdup(rname);
290 }
291 ~FqDupRec() {
292 GFREE(qv);
293 GFREE(firstname);
294 }
295 void add(GStr& d) { //collapse another record into this one
296 if (d.length()!=len)
297 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
298 count++;
299 for (int i=0;i<len;i++)
300 qv[i]=(qv[i]+d[i])/2;
301 }
302 };
303
304 void openfw(FILE* &f, GArgs& args, char opt) {
305 GStr s=args.getOpt(opt);
306 if (!s.is_empty()) {
307 if (s=='-') f=stdout;
308 else {
309 f=fopen(s.chars(),"w");
310 if (f==NULL) GError("Error creating file: %s\n", s.chars());
311 }
312 }
313 }
314
315 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
316 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
317
318 GHash<FqDupRec> dhash; //hash to keep track of duplicates
319
320 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
321 GStr& s, GStr& infname, GStr& infname2);
322 // uses outsuffix to generate output file names and open file handles as needed
323
324 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
325 void trash_report(char trashcode, GStr& rname, FILE* freport);
326
327 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
328 GStr& rname, GStr& rinfo, GStr& infname);
329
330 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
331 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
332
333 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
335 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
336 int dust(GStr& seq);
337 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
338 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
339
340 void convertPhred(char* q, int len);
341 void convertPhred(GStr& q);
342
343 int main(int argc, char * const argv[]) {
344 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
345 int e;
346 if ((e=args.isError())>0) {
347 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
348 exit(224);
349 }
350 debug=(args.getOpt('Y')!=NULL);
351 verbose=(args.getOpt('V')!=NULL);
352 convert_phred=(args.getOpt('Q')!=NULL);
353 doCollapse=(args.getOpt('C')!=NULL);
354 doDust=(args.getOpt('D')!=NULL);
355 /*
356 rawFormat=(args.getOpt('R')!=NULL);
357 if (rawFormat) {
358 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
359 }
360 */
361 prefix=args.getOpt('n');
362 GStr s=args.getOpt('l');
363 if (!s.is_empty())
364 min_read_len=s.asInt();
365 s=args.getOpt('m');
366 if (!s.is_empty())
367 max_perc_N=s.asDouble();
368 s=args.getOpt('d');
369 if (!s.is_empty()) {
370 dust_cutoff=s.asInt();
371 doDust=true;
372 }
373 s=args.getOpt('q');
374 if (!s.is_empty()) {
375 qvtrim_qmin=s.asInt();
376 }
377 s=args.getOpt('t');
378 if (!s.is_empty()) {
379 qvtrim_max=s.asInt();
380 }
381 s=args.getOpt('p');
382 if (!s.is_empty()) {
383 int v=s.asInt();
384 if (v==33) {
385 qv_phredtype=33;
386 qv_cvtadd=31;
387 }
388 else if (v==64) {
389 qv_phredtype=64;
390 qv_cvtadd=-31;
391 }
392 else
393 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
394 }
395 if (args.getOpt('3')!=NULL) {
396 adapter3=args.getOpt('3');
397 adapter3.upper();
398 a3len=adapter3.length();
399 }
400 if (args.getOpt('5')!=NULL) {
401 adapter5=args.getOpt('5');
402 adapter5.upper();
403 a5len=adapter5.length();
404 }
405 s=args.getOpt('a');
406 if (!s.is_empty()) {
407 int a_minmatch=s.asInt();
408 a_min_score=a_minmatch<<1;
409 }
410
411 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
412 else outsuffix="-";
413 trashReport= (args.getOpt('r')!=NULL);
414 int fcount=args.startNonOpt();
415 if (fcount==0) {
416 GMessage(USAGE);
417 exit(224);
418 }
419 if (fcount>1 && doCollapse) {
420 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
421 }
422 //openfw(f_out, args, 'o');
423 //if (f_out==NULL) f_out=stdout;
424 if (trashReport)
425 openfw(freport, args, 'r');
426 char* infile=NULL;
427 while ((infile=args.nextNonOpt())!=NULL) {
428 int incounter=0; //counter for input reads
429 int outcounter=0; //counter for output reads
430 int trash_s=0; //too short from the get go
431 int trash_Q=0;
432 int trash_N=0;
433 int trash_D=0;
434 int trash_A3=0;
435 int trash_A5=0;
436 s=infile;
437 GStr infname;
438 GStr infname2;
439 FILE* f_in=NULL;
440 FILE* f_in2=NULL;
441 FILE* f_out=NULL;
442 FILE* f_out2=NULL;
443 bool paired_reads=false;
444 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
445 GLineReader fq(f_in);
446 GLineReader* fq2=NULL;
447 if (f_in2!=NULL) {
448 fq2=new GLineReader(f_in2);
449 paired_reads=true;
450 }
451 GStr rseq, rqv, seqid, seqinfo;
452 GStr rseq2, rqv2, seqid2, seqinfo2;
453 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
454 incounter++;
455 int a5=0, a3=0, b5=0, b3=0;
456 char tcode=0, tcode2=0;
457 tcode=process_read(seqid, rseq, rqv, a5, a3);
458 //if (!doCollapse) {
459 if (fq2!=NULL) {
460 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
461 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
462 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
463 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
464 }
465 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
466 //decide what to do with this pair and print rseq2 if the pair makes it
467 if (tcode>1 && tcode2<=1) {
468 //"untrash" rseq
469 if (a3-a5+1<min_read_len) {
470 a5=1;
471 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
472 }
473 tcode=1;
474 }
475 else if (tcode<=1 && tcode2>1) {
476 //"untrash" rseq2
477 if (b3-b5+1<min_read_len) {
478 b5=1;
479 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
480 }
481 tcode2=1;
482 }
483 if (tcode<=1) { //trimmed or left intact -- write it!
484 if (tcode>0) {
485 rseq2=rseq2.substr(b5,b3-b5+1);
486 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
487 }
488 int nocounter=0;
489 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
490 }
491 } //paired read
492 // }
493 if (tcode>1) { //trashed
494 if (tcode=='s') trash_s++;
495 else if (tcode=='Q') trash_Q++;
496 else if (tcode=='N') trash_N++;
497 else if (tcode=='D') trash_D++;
498 else if (tcode=='3') trash_A3++;
499 else if (tcode=='5') trash_A5++;
500 if (trashReport) trash_report(tcode, seqid, freport);
501 }
502 else if (!doCollapse) { //write it
503 if (tcode>0) {
504 rseq=rseq.substr(a5,a3-a5+1);
505 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
506 }
507 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
508 }
509 } //for each fastq record
510 delete fq2;
511 FRCLOSE(f_in);
512 FRCLOSE(f_in2);
513 if (doCollapse) {
514 outcounter=0;
515 int maxdup_count=1;
516 char* maxdup_seq=NULL;
517 dhash.startIterate();
518 FqDupRec* qd=NULL;
519 char* seq=NULL;
520 while ((qd=dhash.NextData(seq))!=NULL) {
521 GStr rseq(seq);
522 //do the dusting here
523 if (doDust) {
524 int dustbases=dust(rseq);
525 if (dustbases>(rseq.length()>>1)) {
526 if (trashReport && qd->firstname!=NULL) {
527 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
528 }
529 trash_D+=qd->count;
530 continue;
531 }
532 }
533 outcounter++;
534 if (qd->count>maxdup_count) {
535 maxdup_count=qd->count;
536 maxdup_seq=seq;
537 }
538 if (isfasta) {
539 if (prefix.is_empty()) {
540 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
541 rseq.chars());
542 }
543 else { //use custom read name
544 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
545 qd->count, rseq.chars());
546 }
547 }
548 else { //fastq format
549 if (convert_phred) convertPhred(qd->qv, qd->len);
550 if (prefix.is_empty()) {
551 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
552 rseq.chars(), qd->qv);
553 }
554 else { //use custom read name
555 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
556 qd->count, rseq.chars(), qd->qv);
557 }
558 }
559 }//for each element of dhash
560 if (maxdup_count>1) {
561 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
562 }
563 } //collapse entries
564 if (verbose) {
565 if (paired_reads) {
566 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
567 GMessage("Number of input pairs :%9d\n", incounter);
568 GMessage(" Output pairs :%9d\n", outcounter);
569 }
570 else {
571 GMessage(">Input file : %s\n", infname.chars());
572 GMessage("Number of input reads :%9d\n", incounter);
573 GMessage(" Output reads :%9d\n", outcounter);
574 }
575 GMessage("------------------------------------\n");
576 if (num_trimmed5)
577 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
578 if (num_trimmed3)
579 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
580 GMessage("------------------------------------\n");
581 if (trash_s>0)
582 GMessage(" Trashed by length:%9d\n", trash_s);
583 if (trash_N>0)
584 GMessage(" Trashed by N%%:%9d\n", trash_N);
585 if (trash_Q>0)
586 GMessage("Trashed by low quality:%9d\n", trash_Q);
587 if (trash_A5>0)
588 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
589 if (trash_A3>0)
590 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
591 }
592 if (trashReport) {
593 FWCLOSE(freport);
594 }
595 FWCLOSE(f_out);
596 FWCLOSE(f_out2);
597 } //while each input file
598
599 //getc(stdin);
600 }
601
602 class NData {
603 public:
604 int NPos[1024]; //there should be no reads longer than 1K ?
605 int NCount;
606 int end5;
607 int end3;
608 int seqlen;
609 double perc_N; //percentage of Ns in end5..end3 range only!
610 const char* seq;
611 bool valid;
612 NData() {
613 NCount=0;
614 end5=0;
615 end3=0;
616 seq=NULL;
617 perc_N=0;
618 valid=true;
619 }
620 void init(GStr& rseq) {
621 NCount=0;
622 perc_N=0;
623 seqlen=rseq.length();
624 seq=rseq.chars();
625 end5=1;
626 end3=seqlen;
627 for (int i=0;i<seqlen;i++)
628 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629 NPos[NCount]=i;
630 NCount++;
631 }
632 perc_N=(NCount*100.0)/seqlen;
633 valid=true;
634 }
635 void N_calc() { //only in the region after trimming
636 int n=0;
637 for (int i=end3-1;i<end5;i++) {
638 if (seq[i]=='N') n++;
639 }
640 perc_N=(n*100.0)/(end5-end3+1);
641 }
642 };
643
644 static NData feat;
645 int perc_lenN=12; // incremental distance from ends, in percentage of
646 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647
648 void N_analyze(int l5, int l3, int p5, int p3) {
649 /* assumes feat was filled properly */
650 int old_dif, t5,t3,v;
651 if (l3<l5+2 || p5>p3 ) {
652 feat.end5=l5+1;
653 feat.end3=l3+1;
654 return;
655 }
656
657 t5=feat.NPos[p5]-l5;
658 t3=l3-feat.NPos[p3];
659 old_dif=p3-p5;
660 v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 if (v>20) v=20; /* enforce N-search limit for very long reads */
662 if (t5 < v ) {
663 l5=feat.NPos[p5]+1;
664 p5++;
665 }
666 if (t3 < v) {
667 l3=feat.NPos[p3]-1;
668 p3--;
669 }
670 /* restNs=p3-p5; number of Ns in the new CLR */
671 if (p3-p5==old_dif) { /* no change, return */
672 /* don't trim if this may shorten the read too much */
673 //if (l5-l3<min_read_len) return;
674 feat.end5=l5+1;
675 feat.end3=l3+1;
676 return;
677 }
678 else
679 N_analyze(l5,l3, p5,p3);
680 }
681
682
683 bool qtrim(GStr& qvs, int &l5, int &l3) {
684 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 if (qv_phredtype==0) {
686 //try to guess the Phred type
687 int vmin=256, vmax=0;
688 for (int i=0;i<qvs.length();i++) {
689 if (vmin>qvs[i]) vmin=qvs[i];
690 if (vmax<qvs[i]) vmax=qvs[i];
691 }
692 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694 if (qv_phredtype==0) {
695 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696 }
697 if (verbose)
698 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699 } //guessing Phred type
700 for (l3=qvs.length()-1;l3>2;l3--) {
701 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 }
703 //just in case, check also the 5' the end (?)
704 for (l5=0;l5<qvs.length()-3;l5++) {
705 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 }
707 if (qvtrim_max>0) {
708 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709 if (l5>qvtrim_max) l5=qvtrim_max;
710 }
711 return (l5>0 || l3<qvs.length()-1);
712 }
713
714 bool ntrim(GStr& rseq, int &l5, int &l3) {
715 //count Ns in the sequence, trim N-rich ends
716 feat.init(rseq);
717 l5=feat.end5-1;
718 l3=feat.end3-1;
719 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 if (l5==feat.end5-1 && l3==feat.end3-1) {
721 if (feat.perc_N>max_perc_N) {
722 feat.valid=false;
723 l3=l5+1;
724 return true;
725 }
726 else {
727 return false; //nothing changed
728 }
729 }
730 l5=feat.end5-1;
731 l3=feat.end3-1;
732 if (l3-l5+1<min_read_len) {
733 feat.valid=false;
734 return true;
735 }
736 feat.N_calc();
737
738 if (feat.perc_N>max_perc_N) {
739 feat.valid=false;
740 l3=l5+1;
741 return true;
742 }
743 return true;
744 }
745
746 //--------------- dust functions ----------------
747 class DNADuster {
748 public:
749 int dustword;
750 int dustwindow;
751 int dustwindow2;
752 int dustcutoff;
753 int mv, iv, jv;
754 int counts[32*32*32];
755 int iis[32*32*32];
756 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
757 dustword=wordsize;
758 dustwindow=winsize;
759 dustwindow2 = (winsize>>1);
760 dustcutoff=cutoff;
761 mv=0;
762 iv=0;
763 jv=0;
764 }
765 void setWindowSize(int value) {
766 dustwindow = value;
767 dustwindow2 = (dustwindow >> 1);
768 }
769 void setWordSize(int value) {
770 dustword=value;
771 }
772 void wo1(int len, const char* s, int ivv) {
773 int i, ii, j, v, t, n, n1, sum;
774 int js, nis;
775 n = 32 * 32 * 32;
776 n1 = n - 1;
777 nis = 0;
778 i = 0;
779 ii = 0;
780 sum = 0;
781 v = 0;
782 for (j=0; j < len; j++, s++) {
783 ii <<= 5;
784 if (*s<=32) {
785 i=0;
786 continue;
787 }
788 ii |= *s - 'A'; //assume uppercase!
789 ii &= n1;
790 i++;
791 if (i >= dustword) {
792 for (js=0; js < nis && iis[js] != ii; js++) ;
793 if (js == nis) {
794 iis[nis] = ii;
795 counts[ii] = 0;
796 nis++;
797 }
798 if ((t = counts[ii]) > 0) {
799 sum += t;
800 v = 10 * sum / j;
801 if (mv < v) {
802 mv = v;
803 iv = ivv;
804 jv = j;
805 }
806 }
807 counts[ii]++;
808 }
809 }
810 }
811
812 int wo(int len, const char* s, int* beg, int* end) {
813 int i, l1;
814 l1 = len - dustword + 1;
815 if (l1 < 0) {
816 *beg = 0;
817 *end = len - 1;
818 return 0;
819 }
820 mv = 0;
821 iv = 0;
822 jv = 0;
823 for (i=0; i < l1; i++) {
824 wo1(len-i, s+i, i);
825 }
826 *beg = iv;
827 *end = iv + jv;
828 return mv;
829 }
830
831 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
832 int i, j, l, a, b, v;
833 if (cutoff==0) cutoff=dustcutoff;
834 a=0;b=0;
835 //GMessage("Dust cutoff=%d\n", cutoff);
836 for (i=0; i < seqlen; i += dustwindow2) {
837 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
838 v = wo(l, seq+i, &a, &b);
839 if (v > cutoff) {
840 //for (j = a; j <= b && j < dustwindow2; j++) {
841 for (j = a; j <= b; j++) {
842 seqmsk[i+j]='N';//could be made lowercase instead
843 }
844 }
845 }
846 //return first;
847 }
848 };
849
850 static DNADuster duster;
851
852 int dust(GStr& rseq) {
853 char* seq=Gstrdup(rseq.chars());
854 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
855 //check the number of Ns:
856 int ncount=0;
857 for (int i=0;i<rseq.length();i++) {
858 if (seq[i]=='N') ncount++;
859 }
860 GFREE(seq);
861 return ncount;
862 }
863
864
865 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
866 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
867 //check minimum score and
868 //for 3' adapter trimming:
869 // require that the right end of the alignment for either the adaptor OR the read must be
870 // < 3 distance from its right end
871 // for 5' adapter trimming:
872 // require that the left end of the alignment for either the adaptor OR the read must
873 // be at coordinate < 3 from start
874
875 bool extendMatch(const char* a, int alen, int ai,
876 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
877 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
878 #ifdef DEBUG
879 GStr dbg(b);
880 #endif
881 //if (debug) {
882 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 // }
884 int a_l=ai; //alignment coordinates on a
885 int a_r=ai+mlen-1;
886 int b_l=bi; //alignment coordinates on b
887 int b_r=bi+mlen-1;
888 int ai_maxscore=ai;
889 int bi_maxscore=bi;
890 int score=mlen*a_m_score;
891 int maxscore=score;
892 int mism5score=a_mis_score;
893 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 //try to extend to the left first, if possible
895 while (ai>0 && bi>0) {
896 ai--;
897 bi--;
898 score+= (a[ai]==b[bi])? a_m_score : mism5score;
899 if (score>maxscore) {
900 ai_maxscore=ai;
901 bi_maxscore=bi;
902 maxscore=score;
903 }
904 else if (maxscore-score>a_dropoff_score) break;
905 }
906 a_l=ai_maxscore;
907 b_l=bi_maxscore;
908 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 //now extend to the right
910 ai_maxscore=a_r;
911 bi_maxscore=b_r;
912 ai=a_r;
913 bi=b_r;
914 score=maxscore;
915 //sometimes there are extra AAAAs at the end of the read, ignore those
916 if (strcmp(&a[alen-4],"AAAA")==0) {
917 alen-=3;
918 while (a[alen-1]=='A' && alen>ai) alen--;
919 }
920 while (ai<alen-1 && bi<blen-1) {
921 ai++;
922 bi++;
923 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924 if (a[ai]==b[bi]) { //match
925 score+=a_m_score;
926 if (ai>=alen-2) {
927 score+=a_m_score-(alen-ai-1);
928 }
929 }
930 else { //mismatch
931 score+=a_mis_score;
932 }
933 if (score>maxscore) {
934 ai_maxscore=ai;
935 bi_maxscore=bi;
936 maxscore=score;
937 }
938 else if (maxscore-score>a_dropoff_score) break;
939 }
940 a_r=ai_maxscore;
941 b_r=bi_maxscore;
942 int a_ovh3=alen-a_r-1;
943 int b_ovh3=blen-b_r-1;
944 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
945 int mmovh5=(a_l<b_l)? a_l : b_l;
946 //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
947 #ifdef DEBUG
948 if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
949 #endif
950 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
951 if (a_l<a_ovh3) {
952 //adapter closer to the left end (typical for 5' adapter)
953 l5=a_r+1;
954 l3=alen-1;
955 }
956 else {
957 //adapter matching at the right end (typical for 3' adapter)
958 l5=0;
959 l3=a_l-1;
960 }
961 return true;
962 }
963 else { //keep this segment pair for later (gapped alignment)
964 segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965 //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966 }
967 //do not trim:
968 l5=0;
969 l3=alen-1;
970 return false;
971 }
972
973 /*
974 int getWordValue(const char* s, int wlen) {
975 int r=0;
976 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 return r;
978 }
979 */
980 int get3mer_value(const char* s) {
981 return (s[0]<<16)+(s[1]<<8)+s[2];
982 }
983
984 int w3_match(int qv, const char* str, int slen, int start_index=0) {
985 if (start_index>=slen || start_index<0) return -1;
986 for (int i=start_index;i<slen-3;i++) {
987 int rv=get3mer_value(str+i);
988 if (rv==qv) return i;
989 }
990 return -1;
991 }
992
993 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994 if (end_index>=slen) return -1;
995 if (end_index<0) end_index=slen-1;
996 for (int i=end_index-2;i>=0;i--) {
997 int rv=get3mer_value(str+i);
998 if (rv==qv) return i;
999 }
1000 return -1;
1001 }
1002
1003 int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004 if (start_index>=slen || start_index<0) return -1;
1005 for (int i=start_index;i<slen-4;i++) {
1006 int32* rv=(int32*)(str+i);
1007 if (*rv==qv) return i;
1008 }
1009 return -1;
1010 }
1011
1012 int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013 if (end_index>=slen) return -1;
1014 if (end_index<0) end_index=slen-1;
1015 for (int i=end_index-3;i>=0;i--) {
1016 int32* rv=(int32*)(str+i);
1017 if (*rv==qv) return i;
1018 }
1019 return -1;
1020 }
1021
1022 #ifdef DEBUG
1023 void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 GStr s(aseq);
1025 for (int i=0;i<chain.Count();i++) {
1026 CSegPair& seg=*chain[i];
1027 GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1029 }
1030 }
1031 #endif
1032 const char *pA_seed="AAAA";
1033 const char *pT_seed="TTTT";
1034 const int polyA_seed=*(int*)pA_seed;
1035 const int polyT_seed=*(int*)pT_seed;
1036
1037 bool trim_poly3(GStr &seq, int &l5, int &l3) {
1038 int rlen=seq.length();
1039 l5=0;
1040 l3=rlen-1;
1041 //assumes N trimming was already done
1042 //so a poly match should be very close to the end of the read
1043 //TODO: should we attempt polyA and polyT at the same time, same end?
1044 // -- find the initial match (seed)
1045 int rlo=GMAX((rlen-10), 0);
1046 int si;
1047 for (si=rlen-5;si>rlo;si--) {
1048 if (polyA_seed==*(int*)&(seq.chars()+si)) {
1049 break;
1050 }
1051 }
1052 if (si<=rlo) return false;
1053 //seed found, try to extend it to left and right
1054 int score=4*match_score;
1055 max_rlo=rlo;
1056 rlo--;
1057 while (rlo>=0) {
1058 rlo--;
1059 score+=
1060
1061 }
1062 }
1063
1064 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1065 int rlen=seq.length();
1066 l5=0;
1067 l3=rlen-1;
1068 //first try a full match, we might get lucky
1069 int fi=-1;
1070 if ((fi=seq.index(adapter3))>=0) {
1071 if (fi<rlen-fi-a3len) {//match is closer to the right end
1072 l5=fi+a3len;
1073 l3=rlen-1;
1074 }
1075 else {
1076 l5=0;
1077 l3=fi-1;
1078 }
1079 return true;
1080 }
1081 #ifdef DEBUG
1082 if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1083 #endif
1084
1085 //also, for fast detection of other adapter-only reads that start past
1086 // the beginning of the adapter sequence, try to see if the first a3len-4
1087 // bases of the read are a substring of the adapter
1088 if (rlen>a3len-3) {
1089 GStr rstart=seq.substr(1,a3len-4);
1090 if ((fi=adapter3.index(rstart))>=0) {
1091 l3=rlen-1;
1092 l5=a3len-4;
1093 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094 return true;
1095 }
1096 }
1097 CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098 //check the easy cases - 11 bases exact match at the end
1099 int fdlen=11;
1100 if (a3len<16) {
1101 fdlen=a3len>>1;
1102 }
1103 if (fdlen>4) {
1104 //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 if ((fi=adapter3.index(rstart))>=0) {
1107 #ifdef DEBUG
1108 if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 #endif
1110 if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1112 return true;
1113 }
1114 //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 GStr bstr=adapter3.substr(0, fdlen);
1116 if ((fi=seq.rindex(bstr))>=0) {
1117 #ifdef DEBUG
1118 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 #endif
1120 if (extendMatch(seq.chars(), rlen, fi,
1121 adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1122 return true;
1123 }
1124 } //tried to match 11 bases first
1125
1126 //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127 //-- only extend if the match is in the 3' (ending) region of the read
1128 int wordSize=3;
1129 int hlen=12;
1130 if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131 int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132 if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133 imin=rlen-imin;
1134 for (int iw=0;iw<hlen;iw++) {
1135 //int32* qv=(int32*)(adapter3.chars()+iw);
1136 int qv=get3mer_value(adapter3.chars()+iw);
1137 fi=-1;
1138 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140 //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141
1142 #ifdef DEBUG
1143 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 #endif
1145 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 fi--;
1148 if (fi<imin) break;
1149 }
1150 } //for each wmer in the first hlen bases of the adaptor
1151 /*
1152 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 int hlen2=a3len-wordSize;
1156 //if (hlen2>a3len-4) hlen2=a3len-4;
1157 if (hlen2>hlen) {
1158 #ifdef DEBUG
1159 if (debug && a3segs.Count()>0) {
1160 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1161 }
1162 #endif
1163 for (int iw=hlen;iw<hlen2;iw++) {
1164 //int* qv=(int32 *)(adapter3.chars()+iw);
1165 int qv=get3mer_value(adapter3.chars()+iw);
1166 fi=-1;
1167 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1168 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1169 extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1170 a3len, iw, wordSize, l5,l3, a3segs);
1171 fi--;
1172 if (fi<imin) break;
1173 }
1174 } //for each wmer between hlen2 and hlen bases of the adaptor
1175 }
1176 //lastly, analyze collected a3segs for a possible gapped alignment:
1177 GList<CSegChain> segchains(false,true,false);
1178 #ifdef DEBUG
1179 if (debug && a3segs.Count()>0) {
1180 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1181 }
1182 #endif
1183 for (int i=0;i<a3segs.Count();i++) {
1184 if (a3segs[i]->chain==NULL) {
1185 if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186 CSegChain* newchain=new CSegChain();
1187 newchain->setFreeItem(false);
1188 newchain->addSegPair(a3segs[i]);
1189 a3segs[i]->chain=newchain;
1190 segchains.Add(newchain); //just to free them when done
1191 }
1192 for (int j=i+1;j<a3segs.Count();j++) {
1193 CSegChain* chain=a3segs[i]->chain;
1194 if (chain->extendChain(a3segs[j])) {
1195 a3segs[j]->chain=chain;
1196 #ifdef DEBUG
1197 if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 #endif
1199 //save time by checking here if the extended chain is already acceptable for trimming
1200 if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 l5=0;
1202 l3=chain->astart-2;
1203 #ifdef DEBUG
1204 if (debug && a3segs.Count()>0) {
1205 GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 }
1207 #endif
1208 return true;
1209 }
1210 } //chain can be extended
1211 }
1212 } //collect segment alignments into chains
1213 */
1214 return false; //no adapter parts found
1215 }
1216
1217 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1218 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1219 int rlen=seq.length();
1220 l5=0;
1221 l3=rlen-1;
1222 //try to see if adapter is fully included in the read
1223 int fi=-1;
1224 if ((fi=seq.index(adapter5))>=0) {
1225 if (fi<rlen-fi-a5len) {//match is closer to the right end
1226 l5=fi+a5len;
1227 l3=rlen-1;
1228 }
1229 else {
1230 l5=0;
1231 l3=fi-1;
1232 }
1233 return true;
1234 }
1235 #ifdef DEBUG
1236 if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1237 #endif
1238
1239 CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240
1241 //try the easy way out first - look for an exact match of 11 bases
1242 int fdlen=11;
1243 if (a5len<16) {
1244 fdlen=a5len>>1;
1245 }
1246 if (fdlen>4) {
1247 GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 if ((fi=adapter5.index(rstart))>=0) {
1249 #ifdef DEBUG
1250 if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 #endif
1252 if (extendMatch(seq.chars(), rlen, 1,
1253 adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1254 return true;
1255 }
1256 //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 GStr bstr=adapter5.substr(-fdlen);
1258 if ((fi=seq.index(bstr))>=0) {
1259 #ifdef DEBUG
1260 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 #endif
1262 if (extendMatch(seq.chars(), rlen, fi,
1263 adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1264 return true;
1265 }
1266 } //tried to matching at most 11 bases first
1267
1268 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 int wordSize=3;
1271 int hlen=12;
1272 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 for (int iw=0;iw<=hlen;iw++) {
1276 int apstart=a5len-iw-wordSize;
1277 fi=0;
1278 //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 int qv=get3mer_value(adapter5.chars()+apstart);
1280 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 #ifdef DEBUG
1283 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 #endif
1285 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 fi++;
1288 if (fi>imax) break;
1289 }
1290 } //for each wmer in the last hlen bases of the adaptor
1291 /*
1292
1293 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 int hlen2=a5len-wordSize;
1297 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 #ifdef DEBUG
1299 if (debug && a5segs.Count()>0) {
1300 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1301 }
1302 #endif
1303 if (hlen2>hlen) {
1304 for (int iw=hlen+1;iw<=hlen2;iw++) {
1305 int apstart=a5len-iw-wordSize;
1306 fi=0;
1307 //int* qv=(int32 *)(adapter5.chars()+apstart);
1308 int qv=get3mer_value(adapter5.chars()+apstart);
1309 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1310 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1311 extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1312 a5len, apstart, wordSize, l5,l3, a5segs, true);
1313 fi++;
1314 if (fi>imax) break;
1315 }
1316 } //for each wmer between hlen2 and hlen bases of the adaptor
1317 }
1318 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1319 // lastly, analyze collected a5segs for a possible gapped alignment:
1320 GList<CSegChain> segchains(false,true,false);
1321 #ifdef DEBUG
1322 if (debug && a5segs.Count()>0) {
1323 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1324 }
1325 #endif
1326 for (int i=0;i<a5segs.Count();i++) {
1327 if (a5segs[i]->chain==NULL) {
1328 if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 CSegChain* newchain=new CSegChain(true);
1330 newchain->setFreeItem(false);
1331 newchain->addSegPair(a5segs[i]);
1332 a5segs[i]->chain=newchain;
1333 segchains.Add(newchain); //just to free them when done
1334 }
1335 for (int j=i+1;j<a5segs.Count();j++) {
1336 CSegChain* chain=a5segs[i]->chain;
1337 if (chain->extendChain(a5segs[j])) {
1338 a5segs[j]->chain=chain;
1339 #ifdef DEBUG
1340 if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 #endif
1342 //save time by checking here if the extended chain is already acceptable for trimming
1343 if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 l5=chain->aend;
1345 l3=rlen-1;
1346 return true;
1347 }
1348 } //chain can be extended
1349 }
1350 } //collect segment alignments into chains
1351 */
1352 return false; //no adapter parts found
1353 }
1354
1355 //convert qvs to/from phred64 from/to phread33
1356 void convertPhred(GStr& q) {
1357 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1358 }
1359
1360 void convertPhred(char* q, int len) {
1361 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1362 }
1363
1364 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1365 GStr& rname, GStr& rinfo, GStr& infname) {
1366 rseq="";
1367 rqv="";
1368 rname="";
1369 rinfo="";
1370 if (fq.eof()) return false;
1371 char* l=fq.getLine();
1372 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1373 if (l==NULL) return false;
1374 /* if (rawFormat) {
1375 //TODO: implement raw qseq parsing here
1376 //if (raw type=N) then continue; //skip invalid/bad records
1377 } //raw qseq format
1378 else { // FASTQ or FASTA */
1379 isfasta=(l[0]=='>');
1380 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1381 infname.chars(), l);
1382 GStr s(l);
1383 rname=&(l[1]);
1384 for (int i=0;i<rname.length();i++)
1385 if (rname[i]<=' ') {
1386 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1387 rname.cut(i);
1388 break;
1389 }
1390 //now get the sequence
1391 if ((l=fq.getLine())==NULL)
1392 GError("Error: unexpected EOF after header for read %s (%s)\n",
1393 rname.chars(), infname.chars());
1394 rseq=l; //this must be the DNA line
1395 while ((l=fq.getLine())!=NULL) {
1396 //seq can span multiple lines
1397 if (l[0]=='>' || l[0]=='+') {
1398 fq.pushBack();
1399 break; //
1400 }
1401 rseq+=l;
1402 } //check for multi-line seq
1403 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1404 if ((l=fq.getLine())==NULL)
1405 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1406 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1407 if ((l=fq.getLine())==NULL)
1408 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1409 rqv=l;
1410 //if (rqv.length()!=rseq.length())
1411 // GError("Error: qv len != seq len for %s\n", rname.chars());
1412 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1413 rqv+=l; //append to qv string
1414 }
1415 }// fastq
1416 // } //<-- FASTA or FASTQ
1417 rseq.upper(); //TODO: what if we care about masking?
1418 return true;
1419 }
1420
1421 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1422 //returns 0 if the read was untouched, 1 if it was just trimmed
1423 // and a trash code if it was trashed
1424 l5=0;
1425 l3=rseq.length()-1;
1426 if (l3-l5+1<min_read_len) {
1427 return 's';
1428 }
1429 GStr wseq(rseq.chars());
1430 GStr wqv(rqv.chars());
1431 int w5=l5;
1432 int w3=l3;
1433 //first do the q-based trimming
1434 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1435 if (w3-w5+1<min_read_len) {
1436 return 'Q'; //invalid read
1437 }
1438 //-- keep only the w5..w3 range
1439 l5=w5;
1440 l3=w3;
1441 wseq=wseq.substr(w5, w3-w5+1);
1442 if (!wqv.is_empty())
1443 wqv=wqv.substr(w5, w3-w5+1);
1444 } //qv trimming
1445 // N-trimming on the remaining read seq
1446 if (ntrim(wseq, w5, w3)) {
1447 //GMessage("before: %s\n",wseq.chars());
1448 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1449 l5+=w5;
1450 l3-=(wseq.length()-1-w3);
1451 if (w3-w5+1<min_read_len) {
1452 return 'N'; //to be trashed
1453 }
1454 //-- keep only the w5..w3 range
1455 wseq=wseq.substr(w5, w3-w5+1);
1456 if (!wqv.is_empty())
1457 wqv=wqv.substr(w5, w3-w5+1);
1458 w5=0;
1459 w3=wseq.length()-1;
1460 }
1461 if (a3len>0) {
1462 if (trim_adapter3(wseq, w5, w3)) {
1463 int trimlen=wseq.length()-(w3-w5+1);
1464 num_trimmed3++;
1465 if (trimlen<min_trimmed3)
1466 min_trimmed3=trimlen;
1467 l5+=w5;
1468 l3-=(wseq.length()-1-w3);
1469 if (w3-w5+1<min_read_len) {
1470 return '3';
1471 }
1472 //-- keep only the w5..w3 range
1473 wseq=wseq.substr(w5, w3-w5+1);
1474 if (!wqv.is_empty())
1475 wqv=wqv.substr(w5, w3-w5+1);
1476 }//some adapter was trimmed
1477 } //adapter trimming
1478 if (a5len>0) {
1479 if (trim_adapter5(wseq, w5, w3)) {
1480 int trimlen=wseq.length()-(w3-w5+1);
1481 num_trimmed5++;
1482 if (trimlen<min_trimmed5)
1483 min_trimmed5=trimlen;
1484 l5+=w5;
1485 l3-=(wseq.length()-1-w3);
1486 if (w3-w5+1<min_read_len) {
1487 return '5';
1488 }
1489 //-- keep only the w5..w3 range
1490 wseq=wseq.substr(w5, w3-w5+1);
1491 if (!wqv.is_empty())
1492 wqv=wqv.substr(w5, w3-w5+1);
1493 }//some adapter was trimmed
1494 } //adapter trimming
1495 if (doCollapse) {
1496 //keep read for later
1497 FqDupRec* dr=dhash.Find(wseq.chars());
1498 if (dr==NULL) { //new entry
1499 //if (prefix.is_empty())
1500 dhash.Add(wseq.chars(),
1501 new FqDupRec(&wqv, rname.chars()));
1502 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1503 }
1504 else
1505 dr->add(wqv);
1506 } //collapsing duplicates
1507 else { //not collapsing duplicates
1508 //apply the dust filter now
1509 if (doDust) {
1510 int dustbases=dust(wseq);
1511 if (dustbases>(wseq.length()>>1)) {
1512 return 'D';
1513 }
1514 }
1515 } //not collapsing duplicates
1516 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1517 }
1518
1519 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1520 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1521 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1522 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1523 }
1524
1525 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1526 outcounter++;
1527 bool asFasta=(rqv.is_empty() || fastaOutput);
1528 if (asFasta) {
1529 if (prefix.is_empty()) {
1530 printHeader(f_out, '>',rname,rinfo);
1531 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1532 }
1533 else {
1534 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1535 rseq.chars());
1536 }
1537 }
1538 else { //fastq
1539 if (convert_phred) convertPhred(rqv);
1540 if (prefix.is_empty()) {
1541 printHeader(f_out, '@', rname, rinfo);
1542 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1543 }
1544 else
1545 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1546 rseq.chars(),rqv.chars() );
1547 }
1548 }
1549
1550 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1551 if (freport==NULL || trashcode<=' ') return;
1552 if (trashcode=='3' || trashcode=='5') {
1553 fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1554 }
1555 else {
1556 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1557 }
1558 //tcounter++;
1559 }
1560
1561 GStr getFext(GStr& s, int* xpos=NULL) {
1562 //s must be a filename without a path
1563 GStr r("");
1564 if (xpos!=NULL) *xpos=0;
1565 if (s.is_empty() || s=="-") return r;
1566 int p=s.rindex('.');
1567 int d=s.rindex('/');
1568 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1569 r=s.substr(p+1);
1570 if (xpos!=NULL) *xpos=p+1;
1571 r.lower();
1572 return r;
1573 }
1574
1575 void baseFileName(GStr& fname) {
1576 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1577 int xpos=0;
1578 GStr fext=getFext(fname, &xpos);
1579 if (xpos<=1) return;
1580 bool extdel=false;
1581 GStr f2;
1582 if (fext=="z" || fext=="zip") {
1583 extdel=true;
1584 }
1585 else if (fext.length()>=2) {
1586 f2=fext.substr(0,2);
1587 extdel=(f2=="gz" || f2=="bz");
1588 }
1589 if (extdel) {
1590 fname.cut(xpos-1);
1591 fext=getFext(fname, &xpos);
1592 if (xpos<=1) return;
1593 }
1594 extdel=false;
1595 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1596 extdel=true;
1597 }
1598 else if (fext.length()>=2) {
1599 extdel=(fext.substr(0,2)=="fa");
1600 }
1601 if (extdel) fname.cut(xpos-1);
1602 GStr fncp(fname);
1603 fncp.lower();
1604 fncp.chomp("seq");
1605 fncp.chomp("sequence");
1606 fncp.trimR("_.");
1607 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1608 }
1609
1610 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1611 FILE* f_out=NULL;
1612 GStr fname(getFileName(infname.chars()));
1613 //eliminate known extensions
1614 baseFileName(fname);
1615 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1616 else if (pocmd.is_empty()) {
1617 GStr oname(fname);
1618 oname.append('.');
1619 oname.append(outsuffix);
1620 f_out=fopen(oname.chars(),"w");
1621 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1622 }
1623 else {
1624 GStr oname(">");
1625 oname.append(fname);
1626 oname.append('.');
1627 oname.append(outsuffix);
1628 pocmd.append(oname);
1629 f_out=popen(pocmd.chars(), "w");
1630 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1631 }
1632 return f_out;
1633 }
1634
1635 void guess_unzip(GStr& fname, GStr& picmd) {
1636 GStr fext=getFext(fname);
1637 if (fext=="gz" || fext=="gzip" || fext=="z") {
1638 picmd="gzip -cd ";
1639 }
1640 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1641 picmd="bzip2 -cd ";
1642 }
1643 }
1644
1645 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1646 GStr& s, GStr& infname, GStr& infname2) {
1647 // uses outsuffix to generate output file names and open file handles as needed
1648 infname="";
1649 infname2="";
1650 f_in=NULL;
1651 f_in2=NULL;
1652 f_out=NULL;
1653 f_out2=NULL;
1654 //analyze outsuffix intent
1655 GStr pocmd;
1656 if (outsuffix=="-") {
1657 f_out=stdout;
1658 }
1659 else {
1660 GStr ox=getFext(outsuffix);
1661 if (ox.length()>2) ox=ox.substr(0,2);
1662 if (ox=="gz") pocmd="gzip -9 -c ";
1663 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1664 }
1665 if (s=="-") {
1666 f_in=stdin;
1667 infname="stdin";
1668 f_out=prepOutFile(infname, pocmd);
1669 return;
1670 } // streaming from stdin
1671 s.startTokenize(",:");
1672 s.nextToken(infname);
1673 bool paired=s.nextToken(infname2);
1674 if (fileExists(infname.chars())==0)
1675 GError("Error: cannot find file %s!\n",infname.chars());
1676 GStr fname(getFileName(infname.chars()));
1677 GStr picmd;
1678 guess_unzip(fname, picmd);
1679 if (picmd.is_empty()) {
1680 f_in=fopen(infname.chars(), "r");
1681 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1682 }
1683 else {
1684 picmd.append(infname);
1685 f_in=popen(picmd.chars(), "r");
1686 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1687 }
1688 if (f_out==stdout) {
1689 if (paired) GError("Error: output suffix required for paired reads\n");
1690 return;
1691 }
1692 f_out=prepOutFile(infname, pocmd);
1693 if (!paired) return;
1694 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1695 // ---- paired reads:-------------
1696 if (fileExists(infname2.chars())==0)
1697 GError("Error: cannot find file %s!\n",infname2.chars());
1698 picmd="";
1699 GStr fname2(getFileName(infname2.chars()));
1700 guess_unzip(fname2, picmd);
1701 if (picmd.is_empty()) {
1702 f_in2=fopen(infname2.chars(), "r");
1703 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1704 }
1705 else {
1706 picmd.append(infname2);
1707 f_in2=popen(picmd.chars(), "r");
1708 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1709 }
1710 f_out2=prepOutFile(infname2, pocmd);
1711 }