ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 13
Committed: Mon Jul 18 20:35:43 2011 UTC (8 years, 3 months ago) by gpertea
File size: 52599 byte(s)
Log Message:
sync with my local source

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6
7 #define USAGE "Usage:\n\
8 fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
9 [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
10 [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
11 \n\
12 Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
13 for low complexity and collapse duplicate reads\n\
14 If read pairs should be trimmed and kept together (i.e. without discarding\n\
15 one read in a pair), the two file names should be given delimited by a comma\n\
16 or a colon character\n\
17 \n\
18 Options:\n\
19 -n rename all the reads using the <prefix> followed by a read counter;\n\
20 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
21 the read duplication count\n\
22 -o write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
23 which will be created in the current (working) directory\n\
24 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
25 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
26 -3 trim the given adapter sequence at the 3' end of each read\n\
27 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
28 -a minimum bases to match to adaptor sequence (default 5)\n\
29 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
30 -t for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
31 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
32 -l minimum \"clean\" length after trimming that a read must have\n\
33 in order to pass the filter (default: 16)\n\
34 -r write a simple \"trash report\" file listing the discarded reads with a\n\
35 one letter code for the reason why they were trashed\n\
36 -D apply a low-complexity (dust) filter and discard any read that has over\n\
37 50% of its length detected as low complexity\n\
38 -C collapse duplicate reads and add a prefix with their count to the read\n\
39 name (e.g. for microRNAs)\n\
40 -p input is phred64/phred33 (use -p64 or -p33)\n\
41 -Q convert quality values to the other Phred qv type\n\
42 -V verbose processing\n\
43 "
44
45 //-z for -o option, the output stream(s) will be first piped into the given\n
46 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
47
48
49 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
50
51 //For pair ends sequencing:
52 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
53 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
54 //FILE* f_out=NULL; //stdout if not provided
55 //FILE* f_out2=NULL; //for paired reads
56 //FILE* f_in=NULL; //input fastq (stdin if not provided)
57 //FILE* f_in2=NULL; //for paired reads
58 FILE* freport=NULL;
59
60 bool debug=false;
61 bool verbose=false;
62 bool doCollapse=false;
63 bool doDust=false;
64 bool fastaOutput=false;
65 bool trashReport=false;
66 //bool rawFormat=false;
67 int min_read_len=16;
68 double max_perc_N=7.0;
69 int dust_cutoff=16;
70 bool isfasta=false;
71 bool convert_phred=false;
72 GStr outsuffix; // -o
73 GStr adapter3;
74 GStr adapter5;
75 GStr prefix;
76 GStr zcmd;
77 int num_trimmed5=0;
78 int num_trimmed3=0;
79 int num_trimmedN=0;
80 int num_trimmedQ=0;
81 int min_trimmed5=INT_MAX;
82 int min_trimmed3=INT_MAX;
83
84 int qvtrim_qmin=0;
85 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
86 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
87 int qv_cvtadd=0; //could be -31 or +31
88
89 int a3len=0;
90 int a5len=0;
91 // adaptor matching metrics -- for extendMatch() function
92 const int a_m_score=2; //match score
93 const int a_mis_score=-3; //mismatch
94 const int a_dropoff_score=7;
95 int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
96 const int a_min_chain_score=15; //for gapped alignments
97
98 class CSegChain;
99
100 class CSegPair {
101 public:
102 GSeg a;
103 GSeg b; //the adapter segment
104 int score;
105 int flags;
106 CSegChain* chain;
107 CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
108 score=mscore;
109 if (score==0) score=a.len()*a_m_score;
110 flags=0;
111 chain=NULL;
112 }
113 int len() { return a.len(); }
114 bool operator==(CSegPair& d){
115 //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
116 //make equal even segments that are included into one another:
117 return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
118 }
119 bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
120 if (b.start==d.b.start) {
121 if (score==d.score) {
122 //just try to be consistent:
123 if (b.end==d.b.end) {
124 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
125 }
126 return (b.end>d.b.end);
127 }
128 else return (score<d.score);
129 }
130 return (b.start>d.b.start);
131 }
132 bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
133 /*if (b.start==d.b.start && b.end==d.b.end) {
134 return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
135 }
136 return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
137 if (b.start==d.b.start) {
138 if (score==d.score) {
139 //just try to be consistent:
140 if (b.end==d.b.end) {
141 return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
142 }
143 return (b.end<d.b.end);
144 }
145 else return (score>d.score);
146 }
147 return (b.start<d.b.start);
148 }
149 };
150
151 int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
152 CSegPair& x = *(CSegPair *)sa;
153 CSegPair& y = *(CSegPair *)sb;
154 /*
155 if (x.b.end==y.b.end) {
156 if (x.b.start==y.b.start) {
157 if (x.a.end==y.a.end) {
158 if (x.a.start==y.a.start) return 0;
159 return ((x.a.start>y.a.start) ? -1 : 1);
160 }
161 else {
162 return ((x.a.end>y.a.end) ? -1 : 1);
163 }
164 }
165 else {
166 return ((x.b.start>y.b.start) ? -1 : 1);
167 }
168 }
169 else {
170 return ((x.b.end>y.b.end) ? -1 : 1);
171 }
172 */
173 if (x.b.end==y.b.end) {
174 if (x.score==y.score) {
175 if (x.b.start==y.b.start) {
176 if (x.a.end==y.a.end) {
177 if (x.a.start==y.a.start) return 0;
178 return ((x.a.start<y.a.start) ? -1 : 1);
179 }
180 else {
181 return ((x.a.end<y.a.end) ? -1 : 1);
182 }
183 }
184 else {
185 return ((x.b.start<y.b.start) ? -1 : 1);
186 }
187 } else return ((x.score>y.score) ? -1 : 1);
188 }
189 else {
190 return ((x.b.end>y.b.end) ? -1 : 1);
191 }
192
193 }
194
195 class CSegChain:public GList<CSegPair> {
196 public:
197 uint astart;
198 uint aend;
199 uint bstart;
200 uint bend;
201 int score;
202 bool endSort;
203 CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
204 //as SegPairs are inserted, they will be sorted by a.start coordinate
205 score=0;
206 astart=MAX_UINT;
207 aend=0;
208 bstart=MAX_UINT;
209 bend=0;
210 endSort=aln5;
211 if (aln5) { setSorted(cmpSegEnds); }
212 }
213 bool operator==(CSegChain& d) {
214 //return (score==d.score);
215 return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
216 }
217 bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
218 //return (score<d.score);
219 if (bstart==d.bstart && bend==d.bend) {
220 return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
221 }
222 return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
223 }
224 bool operator<(CSegChain& d) {
225 //return (score>d.score);
226 if (bstart==d.bstart && bend==d.bend) {
227 return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
228 }
229 return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
230 }
231 void addSegPair(CSegPair* segp) {
232 if (AddIfNew(segp)!=segp) return;
233 score+=segp->score;
234 if (astart>segp->a.start) astart=segp->a.start;
235 if (aend<segp->a.end) aend=segp->a.end;
236 if (bstart>segp->b.start) bstart=segp->b.start;
237 if (bend<segp->b.end) bend=segp->b.end;
238 }
239 //for building actual chains:
240 bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
241 int bgap=0;
242 int agap=0;
243 //if (endSort) {
244 if (bstart>segp->b.start) {
245 bgap = (int)(bstart-segp->b.end);
246 if (abs(bgap)>2) return false;
247 agap = (int)(astart-segp->a.end);
248 if (abs(agap)>2) return false;
249 }
250 else {
251 bgap = (int) (segp->b.start-bend);
252 if (abs(bgap)>2) return false;
253 agap = (int)(segp->a.start-aend);
254 if (abs(agap)>2) return false;
255 }
256 if (agap*bgap<0) return false;
257 addSegPair(segp);
258 score-=abs(agap)+abs(bgap);
259 return true;
260 }
261 };
262
263 // element in dhash:
264 class FqDupRec {
265 public:
266 int count; //how many of these reads are the same
267 int len; //length of qv
268 char* firstname; //optional, only if we want to keep the original read names
269 char* qv;
270 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
271 len=0;
272 qv=NULL;
273 firstname=NULL;
274 count=0;
275 if (qstr!=NULL) {
276 qv=Gstrdup(qstr->chars());
277 len=qstr->length();
278 count++;
279 }
280 if (rname!=NULL) firstname=Gstrdup(rname);
281 }
282 ~FqDupRec() {
283 GFREE(qv);
284 GFREE(firstname);
285 }
286 void add(GStr& d) { //collapse another record into this one
287 if (d.length()!=len)
288 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
289 count++;
290 for (int i=0;i<len;i++)
291 qv[i]=(qv[i]+d[i])/2;
292 }
293 };
294
295 void openfw(FILE* &f, GArgs& args, char opt) {
296 GStr s=args.getOpt(opt);
297 if (!s.is_empty()) {
298 if (s=='-') f=stdout;
299 else {
300 f=fopen(s.chars(),"w");
301 if (f==NULL) GError("Error creating file: %s\n", s.chars());
302 }
303 }
304 }
305
306 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
307 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
308
309 GHash<FqDupRec> dhash; //hash to keep track of duplicates
310
311 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
312 GStr& s, GStr& infname, GStr& infname2);
313 // uses outsuffix to generate output file names and open file handles as needed
314
315 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
316 void trash_report(char trashcode, GStr& rname, FILE* freport);
317
318 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
319 GStr& rname, GStr& rinfo, GStr& infname);
320
321 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
322 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
323
324 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
325 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
326 int dust(GStr& seq);
327 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
328 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
329
330 void convertPhred(char* q, int len);
331 void convertPhred(GStr& q);
332
333 int main(int argc, char * const argv[]) {
334 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
335 int e;
336 if ((e=args.isError())>0) {
337 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
338 exit(224);
339 }
340 debug=(args.getOpt('Y')!=NULL);
341 verbose=(args.getOpt('V')!=NULL);
342 convert_phred=(args.getOpt('Q')!=NULL);
343 doCollapse=(args.getOpt('C')!=NULL);
344 doDust=(args.getOpt('D')!=NULL);
345 /*
346 rawFormat=(args.getOpt('R')!=NULL);
347 if (rawFormat) {
348 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
349 }
350 */
351 prefix=args.getOpt('n');
352 GStr s=args.getOpt('l');
353 if (!s.is_empty())
354 min_read_len=s.asInt();
355 s=args.getOpt('m');
356 if (!s.is_empty())
357 max_perc_N=s.asDouble();
358 s=args.getOpt('d');
359 if (!s.is_empty()) {
360 dust_cutoff=s.asInt();
361 doDust=true;
362 }
363 s=args.getOpt('q');
364 if (!s.is_empty()) {
365 qvtrim_qmin=s.asInt();
366 }
367 s=args.getOpt('t');
368 if (!s.is_empty()) {
369 qvtrim_max=s.asInt();
370 }
371 s=args.getOpt('p');
372 if (!s.is_empty()) {
373 int v=s.asInt();
374 if (v==33) {
375 qv_phredtype=33;
376 qv_cvtadd=31;
377 }
378 else if (v==64) {
379 qv_phredtype=64;
380 qv_cvtadd=-31;
381 }
382 else
383 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
384 }
385 if (args.getOpt('3')!=NULL) {
386 adapter3=args.getOpt('3');
387 adapter3.upper();
388 a3len=adapter3.length();
389 }
390 if (args.getOpt('5')!=NULL) {
391 adapter5=args.getOpt('5');
392 adapter5.upper();
393 a5len=adapter5.length();
394 }
395 s=args.getOpt('a');
396 if (!s.is_empty()) {
397 int a_minmatch=s.asInt();
398 a_min_score=a_minmatch<<1;
399 }
400
401 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
402 trashReport= (args.getOpt('r')!=NULL);
403 int fcount=args.startNonOpt();
404 if (fcount==0) {
405 GMessage(USAGE);
406 exit(224);
407 }
408 if (fcount>1 && doCollapse) {
409 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
410 }
411 //openfw(f_out, args, 'o');
412 //if (f_out==NULL) f_out=stdout;
413 if (trashReport)
414 openfw(freport, args, 'r');
415 char* infile=NULL;
416 while ((infile=args.nextNonOpt())!=NULL) {
417 int incounter=0; //counter for input reads
418 int outcounter=0; //counter for output reads
419 int trash_s=0; //too short from the get go
420 int trash_Q=0;
421 int trash_N=0;
422 int trash_D=0;
423 int trash_A3=0;
424 int trash_A5=0;
425 s=infile;
426 GStr infname;
427 GStr infname2;
428 FILE* f_in=NULL;
429 FILE* f_in2=NULL;
430 FILE* f_out=NULL;
431 FILE* f_out2=NULL;
432 bool paired_reads=false;
433 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
434 GLineReader fq(f_in);
435 GLineReader* fq2=NULL;
436 if (f_in2!=NULL) {
437 fq2=new GLineReader(f_in2);
438 paired_reads=true;
439 }
440 GStr rseq, rqv, seqid, seqinfo;
441 GStr rseq2, rqv2, seqid2, seqinfo2;
442 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
443 incounter++;
444 int a5=0, a3=0, b5=0, b3=0;
445 char tcode=0, tcode2=0;
446 tcode=process_read(seqid, rseq, rqv, a5, a3);
447 //if (!doCollapse) {
448 if (fq2!=NULL) {
449 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
450 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
451 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
452 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
453 }
454 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
455 //decide what to do with this pair and print rseq2 if the pair makes it
456 if (tcode>1 && tcode2<=1) {
457 //"untrash" rseq
458 if (a3-a5+1<min_read_len) {
459 a5=1;
460 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
461 }
462 tcode=1;
463 }
464 else if (tcode<=1 && tcode2>1) {
465 //"untrash" rseq2
466 if (b3-b5+1<min_read_len) {
467 b5=1;
468 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
469 }
470 tcode2=1;
471 }
472 if (tcode<=1) { //trimmed or left intact -- write it!
473 if (tcode>0) {
474 rseq2=rseq2.substr(b5,b3-b5+1);
475 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
476 }
477 int nocounter=0;
478 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
479 }
480 } //paired read
481 // }
482 if (tcode>1) { //trashed
483 if (tcode=='s') trash_s++;
484 else if (tcode=='Q') trash_Q++;
485 else if (tcode=='N') trash_N++;
486 else if (tcode=='D') trash_D++;
487 else if (tcode=='3') trash_A3++;
488 else if (tcode=='5') trash_A5++;
489 if (trashReport) trash_report(tcode, seqid, freport);
490 }
491 else if (!doCollapse) { //write it
492 if (tcode>0) {
493 rseq=rseq.substr(a5,a3-a5+1);
494 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
495 }
496 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
497 }
498 } //for each fastq record
499 delete fq2;
500 FRCLOSE(f_in);
501 FRCLOSE(f_in2);
502 if (doCollapse) {
503 outcounter=0;
504 int maxdup_count=1;
505 char* maxdup_seq=NULL;
506 dhash.startIterate();
507 FqDupRec* qd=NULL;
508 char* seq=NULL;
509 while ((qd=dhash.NextData(seq))!=NULL) {
510 GStr rseq(seq);
511 //do the dusting here
512 if (doDust) {
513 int dustbases=dust(rseq);
514 if (dustbases>(rseq.length()>>1)) {
515 if (trashReport && qd->firstname!=NULL) {
516 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
517 }
518 trash_D+=qd->count;
519 continue;
520 }
521 }
522 outcounter++;
523 if (qd->count>maxdup_count) {
524 maxdup_count=qd->count;
525 maxdup_seq=seq;
526 }
527 if (isfasta) {
528 if (prefix.is_empty()) {
529 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
530 rseq.chars());
531 }
532 else { //use custom read name
533 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
534 qd->count, rseq.chars());
535 }
536 }
537 else { //fastq format
538 if (convert_phred) convertPhred(qd->qv, qd->len);
539 if (prefix.is_empty()) {
540 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
541 rseq.chars(), qd->qv);
542 }
543 else { //use custom read name
544 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
545 qd->count, rseq.chars(), qd->qv);
546 }
547 }
548 }//for each element of dhash
549 if (maxdup_count>1) {
550 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
551 }
552 } //collapse entries
553 if (verbose) {
554 if (paired_reads) {
555 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
556 GMessage("Number of input pairs :%9d\n", incounter);
557 GMessage(" Output pairs :%9d\n", outcounter);
558 }
559 else {
560 GMessage(">Input file : %s\n", infname.chars());
561 GMessage("Number of input reads :%9d\n", incounter);
562 GMessage(" Output reads :%9d\n", outcounter);
563 }
564 GMessage("------------------------------------\n");
565 if (num_trimmed5)
566 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
567 if (num_trimmed3)
568 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
569 GMessage("------------------------------------\n");
570 if (trash_s>0)
571 GMessage(" Trashed by length:%9d\n", trash_s);
572 if (trash_N>0)
573 GMessage(" Trashed by N%%:%9d\n", trash_N);
574 if (trash_Q>0)
575 GMessage("Trashed by low quality:%9d\n", trash_Q);
576 if (trash_A5>0)
577 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
578 if (trash_A3>0)
579 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
580 }
581 if (trashReport) {
582 FWCLOSE(freport);
583 }
584 FWCLOSE(f_out);
585 FWCLOSE(f_out2);
586 } //while each input file
587
588 //getc(stdin);
589 }
590
591 class NData {
592 public:
593 int NPos[1024]; //there should be no reads longer than 1K ?
594 int NCount;
595 int end5;
596 int end3;
597 int seqlen;
598 double perc_N; //percentage of Ns in end5..end3 range only!
599 const char* seq;
600 bool valid;
601 NData() {
602 NCount=0;
603 end5=0;
604 end3=0;
605 seq=NULL;
606 perc_N=0;
607 valid=true;
608 }
609 void init(GStr& rseq) {
610 NCount=0;
611 perc_N=0;
612 seqlen=rseq.length();
613 seq=rseq.chars();
614 end5=1;
615 end3=seqlen;
616 for (int i=0;i<seqlen;i++)
617 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618 NPos[NCount]=i;
619 NCount++;
620 }
621 perc_N=(NCount*100.0)/seqlen;
622 valid=true;
623 }
624 void N_calc() { //only in the region after trimming
625 int n=0;
626 for (int i=end3-1;i<end5;i++) {
627 if (seq[i]=='N') n++;
628 }
629 perc_N=(n*100.0)/(end5-end3+1);
630 }
631 };
632
633 static NData feat;
634 int perc_lenN=12; // incremental distance from ends, in percentage of
635 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
636
637 void N_analyze(int l5, int l3, int p5, int p3) {
638 /* assumes feat was filled properly */
639 int old_dif, t5,t3,v;
640 if (l3<l5+2 || p5>p3 ) {
641 feat.end5=l5+1;
642 feat.end3=l3+1;
643 return;
644 }
645
646 t5=feat.NPos[p5]-l5;
647 t3=l3-feat.NPos[p3];
648 old_dif=p3-p5;
649 v=(int)((((double)(l3-l5))*perc_lenN)/100);
650 if (v>20) v=20; /* enforce N-search limit for very long reads */
651 if (t5 < v ) {
652 l5=feat.NPos[p5]+1;
653 p5++;
654 }
655 if (t3 < v) {
656 l3=feat.NPos[p3]-1;
657 p3--;
658 }
659 /* restNs=p3-p5; number of Ns in the new CLR */
660 if (p3-p5==old_dif) { /* no change, return */
661 /* don't trim if this may shorten the read too much */
662 //if (l5-l3<min_read_len) return;
663 feat.end5=l5+1;
664 feat.end3=l3+1;
665 return;
666 }
667 else
668 N_analyze(l5,l3, p5,p3);
669 }
670
671
672 bool qtrim(GStr& qvs, int &l5, int &l3) {
673 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
674 if (qv_phredtype==0) {
675 //try to guess the Phred type
676 int vmin=256, vmax=0;
677 for (int i=0;i<qvs.length();i++) {
678 if (vmin>qvs[i]) vmin=qvs[i];
679 if (vmax<qvs[i]) vmax=qvs[i];
680 }
681 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683 if (qv_phredtype==0) {
684 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685 }
686 if (verbose)
687 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
688 } //guessing Phred type
689 for (l3=qvs.length()-1;l3>2;l3--) {
690 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
691 }
692 //just in case, check also the 5' the end (?)
693 for (l5=0;l5<qvs.length()-3;l5++) {
694 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
695 }
696 if (qvtrim_max>0) {
697 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
698 if (l5>qvtrim_max) l5=qvtrim_max;
699 }
700 return (l5>0 || l3<qvs.length()-1);
701 }
702
703 bool ntrim(GStr& rseq, int &l5, int &l3) {
704 //count Ns in the sequence, trim N-rich ends
705 feat.init(rseq);
706 l5=feat.end5-1;
707 l3=feat.end3-1;
708 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
709 if (l5==feat.end5-1 && l3==feat.end3-1) {
710 if (feat.perc_N>max_perc_N) {
711 feat.valid=false;
712 l3=l5+1;
713 return true;
714 }
715 else {
716 return false; //nothing changed
717 }
718 }
719 l5=feat.end5-1;
720 l3=feat.end3-1;
721 if (l3-l5+1<min_read_len) {
722 feat.valid=false;
723 return true;
724 }
725 feat.N_calc();
726
727 if (feat.perc_N>max_perc_N) {
728 feat.valid=false;
729 l3=l5+1;
730 return true;
731 }
732 return true;
733 }
734
735 //--------------- dust functions ----------------
736 class DNADuster {
737 public:
738 int dustword;
739 int dustwindow;
740 int dustwindow2;
741 int dustcutoff;
742 int mv, iv, jv;
743 int counts[32*32*32];
744 int iis[32*32*32];
745 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
746 dustword=wordsize;
747 dustwindow=winsize;
748 dustwindow2 = (winsize>>1);
749 dustcutoff=cutoff;
750 mv=0;
751 iv=0;
752 jv=0;
753 }
754 void setWindowSize(int value) {
755 dustwindow = value;
756 dustwindow2 = (dustwindow >> 1);
757 }
758 void setWordSize(int value) {
759 dustword=value;
760 }
761 void wo1(int len, const char* s, int ivv) {
762 int i, ii, j, v, t, n, n1, sum;
763 int js, nis;
764 n = 32 * 32 * 32;
765 n1 = n - 1;
766 nis = 0;
767 i = 0;
768 ii = 0;
769 sum = 0;
770 v = 0;
771 for (j=0; j < len; j++, s++) {
772 ii <<= 5;
773 if (*s<=32) {
774 i=0;
775 continue;
776 }
777 ii |= *s - 'A'; //assume uppercase!
778 ii &= n1;
779 i++;
780 if (i >= dustword) {
781 for (js=0; js < nis && iis[js] != ii; js++) ;
782 if (js == nis) {
783 iis[nis] = ii;
784 counts[ii] = 0;
785 nis++;
786 }
787 if ((t = counts[ii]) > 0) {
788 sum += t;
789 v = 10 * sum / j;
790 if (mv < v) {
791 mv = v;
792 iv = ivv;
793 jv = j;
794 }
795 }
796 counts[ii]++;
797 }
798 }
799 }
800
801 int wo(int len, const char* s, int* beg, int* end) {
802 int i, l1;
803 l1 = len - dustword + 1;
804 if (l1 < 0) {
805 *beg = 0;
806 *end = len - 1;
807 return 0;
808 }
809 mv = 0;
810 iv = 0;
811 jv = 0;
812 for (i=0; i < l1; i++) {
813 wo1(len-i, s+i, i);
814 }
815 *beg = iv;
816 *end = iv + jv;
817 return mv;
818 }
819
820 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
821 int i, j, l, a, b, v;
822 if (cutoff==0) cutoff=dustcutoff;
823 a=0;b=0;
824 //GMessage("Dust cutoff=%d\n", cutoff);
825 for (i=0; i < seqlen; i += dustwindow2) {
826 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
827 v = wo(l, seq+i, &a, &b);
828 if (v > cutoff) {
829 //for (j = a; j <= b && j < dustwindow2; j++) {
830 for (j = a; j <= b; j++) {
831 seqmsk[i+j]='N';//could be made lowercase instead
832 }
833 }
834 }
835 //return first;
836 }
837 };
838
839 static DNADuster duster;
840
841 int dust(GStr& rseq) {
842 char* seq=Gstrdup(rseq.chars());
843 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
844 //check the number of Ns:
845 int ncount=0;
846 for (int i=0;i<rseq.length();i++) {
847 if (seq[i]=='N') ncount++;
848 }
849 GFREE(seq);
850 return ncount;
851 }
852
853
854 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
855 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
856 //check minimum score and
857 //for 3' adapter trimming:
858 // require that the right end of the alignment for either the adaptor OR the read must be
859 // < 3 distance from its right end
860 // for 5' adapter trimming:
861 // require that the left end of the alignment for either the adaptor OR the read must
862 // be at coordinate < 3 from start
863
864 bool extendMatch(const char* a, int alen, int ai,
865 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
866 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
867 #ifdef DEBUG
868 GStr dbg(b);
869 #endif
870 //if (debug) {
871 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
872 // }
873 int a_l=ai; //alignment coordinates on a
874 int a_r=ai+mlen-1;
875 int b_l=bi; //alignment coordinates on b
876 int b_r=bi+mlen-1;
877 int ai_maxscore=ai;
878 int bi_maxscore=bi;
879 int score=mlen*a_m_score;
880 int maxscore=score;
881 int mism5score=a_mis_score;
882 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
883 //try to extend to the left first, if possible
884 while (ai>0 && bi>0) {
885 ai--;
886 bi--;
887 score+= (a[ai]==b[bi])? a_m_score : mism5score;
888 if (score>maxscore) {
889 ai_maxscore=ai;
890 bi_maxscore=bi;
891 maxscore=score;
892 }
893 else if (maxscore-score>a_dropoff_score) break;
894 }
895 a_l=ai_maxscore;
896 b_l=bi_maxscore;
897 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
898 //now extend to the right
899 ai_maxscore=a_r;
900 bi_maxscore=b_r;
901 ai=a_r;
902 bi=b_r;
903 score=maxscore;
904 //sometimes there are extra AAAAs at the end of the read, ignore those
905 if (strcmp(&a[alen-4],"AAAA")==0) {
906 alen-=3;
907 while (a[alen-1]=='A' && alen>ai) alen--;
908 }
909 while (ai<alen-1 && bi<blen-1) {
910 ai++;
911 bi++;
912 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
913 if (a[ai]==b[bi]) { //match
914 score+=a_m_score;
915 if (ai>=alen-2) {
916 score+=a_m_score-(alen-ai-1);
917 }
918 }
919 else { //mismatch
920 score+=a_mis_score;
921 }
922 if (score>maxscore) {
923 ai_maxscore=ai;
924 bi_maxscore=bi;
925 maxscore=score;
926 }
927 else if (maxscore-score>a_dropoff_score) break;
928 }
929 a_r=ai_maxscore;
930 b_r=bi_maxscore;
931 int a_ovh3=alen-a_r-1;
932 int b_ovh3=blen-b_r-1;
933 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
934 int mmovh5=(a_l<b_l)? a_l : b_l;
935 //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
936 #ifdef DEBUG
937 if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
938 #endif
939 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
940 if (a_l<a_ovh3) {
941 //adapter closer to the left end (typical for 5' adapter)
942 l5=a_r+1;
943 l3=alen-1;
944 }
945 else {
946 //adapter matching at the right end (typical for 3' adapter)
947 l5=0;
948 l3=a_l-1;
949 }
950 return true;
951 }
952 else { //keep this segment pair for later (gapped alignment)
953 segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
954 //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
955 }
956 //do not trim:
957 l5=0;
958 l3=alen-1;
959 return false;
960 }
961
962 /*
963 int getWordValue(const char* s, int wlen) {
964 int r=0;
965 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
966 return r;
967 }
968 */
969 int get3mer_value(const char* s) {
970 return (s[0]<<16)+(s[1]<<8)+s[2];
971 }
972
973 int w3_match(int qv, const char* str, int slen, int start_index=0) {
974 if (start_index>=slen || start_index<0) return -1;
975 for (int i=start_index;i<slen-3;i++) {
976 int rv=get3mer_value(str+i);
977 if (rv==qv) return i;
978 }
979 return -1;
980 }
981
982 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
983 if (end_index>=slen) return -1;
984 if (end_index<0) end_index=slen-1;
985 for (int i=end_index-2;i>=0;i--) {
986 int rv=get3mer_value(str+i);
987 if (rv==qv) return i;
988 }
989 return -1;
990 }
991
992 int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
993 if (start_index>=slen || start_index<0) return -1;
994 for (int i=start_index;i<slen-4;i++) {
995 int32* rv=(int32*)(str+i);
996 if (*rv==qv) return i;
997 }
998 return -1;
999 }
1000
1001 int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1002 if (end_index>=slen) return -1;
1003 if (end_index<0) end_index=slen-1;
1004 for (int i=end_index-3;i>=0;i--) {
1005 int32* rv=(int32*)(str+i);
1006 if (*rv==qv) return i;
1007 }
1008 return -1;
1009 }
1010
1011 #ifdef DEBUG
1012 void dbgPrintChain(CSegChain& chain, const char* aseq) {
1013 GStr s(aseq);
1014 for (int i=0;i<chain.Count();i++) {
1015 CSegPair& seg=*chain[i];
1016 GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1017 s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1018 }
1019 }
1020 #endif
1021
1022 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1023 int rlen=seq.length();
1024 l5=0;
1025 l3=rlen-1;
1026 //first try a full match, we might get lucky
1027 int fi=-1;
1028 if ((fi=seq.index(adapter3))>=0) {
1029 if (fi<rlen-fi-a3len) {//match is closer to the right end
1030 l5=fi+a3len;
1031 l3=rlen-1;
1032 }
1033 else {
1034 l5=0;
1035 l3=fi-1;
1036 }
1037 return true;
1038 }
1039 #ifdef DEBUG
1040 if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1041 #endif
1042
1043 //also, for fast detection of other adapter-only reads that start past
1044 // the beginning of the adapter sequence, try to see if the first a3len-4
1045 // bases of the read are a substring of the adapter
1046 if (rlen>a3len-3) {
1047 GStr rstart=seq.substr(1,a3len-4);
1048 if ((fi=adapter3.index(rstart))>=0) {
1049 l3=rlen-1;
1050 l5=a3len-4;
1051 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1052 return true;
1053 }
1054 }
1055 CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1056 //check the easy cases - 11 bases exact match at the end
1057 int fdlen=11;
1058 if (a3len<16) {
1059 fdlen=a3len>>1;
1060 }
1061 if (fdlen>4) {
1062 //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1063 GStr rstart=seq.substr(-fdlen-3,fdlen);
1064 if ((fi=adapter3.index(rstart))>=0) {
1065 #ifdef DEBUG
1066 if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1067 #endif
1068 if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1069 adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1070 return true;
1071 }
1072 //another easy case: first 11 characters of the adaptor found as a substring of the read
1073 GStr bstr=adapter3.substr(0, fdlen);
1074 if ((fi=seq.rindex(bstr))>=0) {
1075 #ifdef DEBUG
1076 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1077 #endif
1078 if (extendMatch(seq.chars(), rlen, fi,
1079 adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1080 return true;
1081 }
1082 } //tried to match 11 bases first
1083
1084 //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1085 //-- only extend if the match is in the 3' (ending) region of the read
1086 int wordSize=3;
1087 int hlen=12;
1088 if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1089 int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1090 if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1091 imin=rlen-imin;
1092 for (int iw=0;iw<hlen;iw++) {
1093 //int32* qv=(int32*)(adapter3.chars()+iw);
1094 int qv=get3mer_value(adapter3.chars()+iw);
1095 fi=-1;
1096 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1097 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1098 //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1099
1100 #ifdef DEBUG
1101 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1102 #endif
1103 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1104 a3len, iw, wordSize, l5,l3, a3segs)) return true;
1105 fi--;
1106 if (fi<imin) break;
1107 }
1108 } //for each wmer in the first hlen bases of the adaptor
1109 /*
1110 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1111 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1112 if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1113 int hlen2=a3len-wordSize;
1114 //if (hlen2>a3len-4) hlen2=a3len-4;
1115 if (hlen2>hlen) {
1116 #ifdef DEBUG
1117 if (debug && a3segs.Count()>0) {
1118 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1119 }
1120 #endif
1121 for (int iw=hlen;iw<hlen2;iw++) {
1122 //int* qv=(int32 *)(adapter3.chars()+iw);
1123 int qv=get3mer_value(adapter3.chars()+iw);
1124 fi=-1;
1125 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1126 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1127 extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1128 a3len, iw, wordSize, l5,l3, a3segs);
1129 fi--;
1130 if (fi<imin) break;
1131 }
1132 } //for each wmer between hlen2 and hlen bases of the adaptor
1133 }
1134 //lastly, analyze collected a3segs for a possible gapped alignment:
1135 GList<CSegChain> segchains(false,true,false);
1136 #ifdef DEBUG
1137 if (debug && a3segs.Count()>0) {
1138 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1139 }
1140 #endif
1141 for (int i=0;i<a3segs.Count();i++) {
1142 if (a3segs[i]->chain==NULL) {
1143 if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1144 CSegChain* newchain=new CSegChain();
1145 newchain->setFreeItem(false);
1146 newchain->addSegPair(a3segs[i]);
1147 a3segs[i]->chain=newchain;
1148 segchains.Add(newchain); //just to free them when done
1149 }
1150 for (int j=i+1;j<a3segs.Count();j++) {
1151 CSegChain* chain=a3segs[i]->chain;
1152 if (chain->extendChain(a3segs[j])) {
1153 a3segs[j]->chain=chain;
1154 #ifdef DEBUG
1155 if (debug) dbgPrintChain(*chain, adapter3.chars());
1156 #endif
1157 //save time by checking here if the extended chain is already acceptable for trimming
1158 if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1159 l5=0;
1160 l3=chain->astart-2;
1161 #ifdef DEBUG
1162 if (debug && a3segs.Count()>0) {
1163 GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1164 }
1165 #endif
1166 return true;
1167 }
1168 } //chain can be extended
1169 }
1170 } //collect segment alignments into chains
1171 */
1172 return false; //no adapter parts found
1173 }
1174
1175 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1176 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1177 int rlen=seq.length();
1178 l5=0;
1179 l3=rlen-1;
1180 //try to see if adapter is fully included in the read
1181 int fi=-1;
1182 if ((fi=seq.index(adapter5))>=0) {
1183 if (fi<rlen-fi-a5len) {//match is closer to the right end
1184 l5=fi+a5len;
1185 l3=rlen-1;
1186 }
1187 else {
1188 l5=0;
1189 l3=fi-1;
1190 }
1191 return true;
1192 }
1193 #ifdef DEBUG
1194 if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1195 #endif
1196
1197 CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1198
1199 //try the easy way out first - look for an exact match of 11 bases
1200 int fdlen=11;
1201 if (a5len<16) {
1202 fdlen=a5len>>1;
1203 }
1204 if (fdlen>4) {
1205 GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1206 if ((fi=adapter5.index(rstart))>=0) {
1207 #ifdef DEBUG
1208 if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1209 #endif
1210 if (extendMatch(seq.chars(), rlen, 1,
1211 adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1212 return true;
1213 }
1214 //another easy case: last 11 characters of the adaptor found as a substring of the read
1215 GStr bstr=adapter5.substr(-fdlen);
1216 if ((fi=seq.index(bstr))>=0) {
1217 #ifdef DEBUG
1218 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1219 #endif
1220 if (extendMatch(seq.chars(), rlen, fi,
1221 adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1222 return true;
1223 }
1224 } //tried to matching at most 11 bases first
1225
1226 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1227 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1228 int wordSize=3;
1229 int hlen=12;
1230 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1231 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1232 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1233 for (int iw=0;iw<=hlen;iw++) {
1234 int apstart=a5len-iw-wordSize;
1235 fi=0;
1236 //int* qv=(int32 *)(adapter5.chars()+apstart);
1237 int qv=get3mer_value(adapter5.chars()+apstart);
1238 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1239 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1240 #ifdef DEBUG
1241 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1242 #endif
1243 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1244 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1245 fi++;
1246 if (fi>imax) break;
1247 }
1248 } //for each wmer in the last hlen bases of the adaptor
1249 /*
1250
1251 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1252 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1253 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1254 int hlen2=a5len-wordSize;
1255 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1256 #ifdef DEBUG
1257 if (debug && a5segs.Count()>0) {
1258 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1259 }
1260 #endif
1261 if (hlen2>hlen) {
1262 for (int iw=hlen+1;iw<=hlen2;iw++) {
1263 int apstart=a5len-iw-wordSize;
1264 fi=0;
1265 //int* qv=(int32 *)(adapter5.chars()+apstart);
1266 int qv=get3mer_value(adapter5.chars()+apstart);
1267 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1268 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1269 extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1270 a5len, apstart, wordSize, l5,l3, a5segs, true);
1271 fi++;
1272 if (fi>imax) break;
1273 }
1274 } //for each wmer between hlen2 and hlen bases of the adaptor
1275 }
1276 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1277 // lastly, analyze collected a5segs for a possible gapped alignment:
1278 GList<CSegChain> segchains(false,true,false);
1279 #ifdef DEBUG
1280 if (debug && a5segs.Count()>0) {
1281 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1282 }
1283 #endif
1284 for (int i=0;i<a5segs.Count();i++) {
1285 if (a5segs[i]->chain==NULL) {
1286 if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1287 CSegChain* newchain=new CSegChain(true);
1288 newchain->setFreeItem(false);
1289 newchain->addSegPair(a5segs[i]);
1290 a5segs[i]->chain=newchain;
1291 segchains.Add(newchain); //just to free them when done
1292 }
1293 for (int j=i+1;j<a5segs.Count();j++) {
1294 CSegChain* chain=a5segs[i]->chain;
1295 if (chain->extendChain(a5segs[j])) {
1296 a5segs[j]->chain=chain;
1297 #ifdef DEBUG
1298 if (debug) dbgPrintChain(*chain, adapter5.chars());
1299 #endif
1300 //save time by checking here if the extended chain is already acceptable for trimming
1301 if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1302 l5=chain->aend;
1303 l3=rlen-1;
1304 return true;
1305 }
1306 } //chain can be extended
1307 }
1308 } //collect segment alignments into chains
1309 */
1310 return false; //no adapter parts found
1311 }
1312
1313 //convert qvs to/from phred64 from/to phread33
1314 void convertPhred(GStr& q) {
1315 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1316 }
1317
1318 void convertPhred(char* q, int len) {
1319 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1320 }
1321
1322 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1323 GStr& rname, GStr& rinfo, GStr& infname) {
1324 rseq="";
1325 rqv="";
1326 rname="";
1327 rinfo="";
1328 if (fq.eof()) return false;
1329 char* l=fq.getLine();
1330 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1331 if (l==NULL) return false;
1332 /* if (rawFormat) {
1333 //TODO: implement raw qseq parsing here
1334 //if (raw type=N) then continue; //skip invalid/bad records
1335 } //raw qseq format
1336 else { // FASTQ or FASTA */
1337 isfasta=(l[0]=='>');
1338 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1339 infname.chars(), l);
1340 GStr s(l);
1341 rname=&(l[1]);
1342 for (int i=0;i<rname.length();i++)
1343 if (rname[i]<=' ') {
1344 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1345 rname.cut(i);
1346 break;
1347 }
1348 //now get the sequence
1349 if ((l=fq.getLine())==NULL)
1350 GError("Error: unexpected EOF after header for read %s (%s)\n",
1351 rname.chars(), infname.chars());
1352 rseq=l; //this must be the DNA line
1353 while ((l=fq.getLine())!=NULL) {
1354 //seq can span multiple lines
1355 if (l[0]=='>' || l[0]=='+') {
1356 fq.pushBack();
1357 break; //
1358 }
1359 rseq+=l;
1360 } //check for multi-line seq
1361 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1362 if ((l=fq.getLine())==NULL)
1363 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1364 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1365 if ((l=fq.getLine())==NULL)
1366 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1367 rqv=l;
1368 //if (rqv.length()!=rseq.length())
1369 // GError("Error: qv len != seq len for %s\n", rname.chars());
1370 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1371 rqv+=l; //append to qv string
1372 }
1373 }// fastq
1374 // } //<-- FASTA or FASTQ
1375 rseq.upper(); //TODO: what if we care about masking?
1376 return true;
1377 }
1378
1379 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1380 //returns 0 if the read was untouched, 1 if it was just trimmed
1381 // and a trash code if it was trashed
1382 l5=0;
1383 l3=rseq.length()-1;
1384 if (l3-l5+1<min_read_len) {
1385 return 's';
1386 }
1387 GStr wseq(rseq.chars());
1388 GStr wqv(rqv.chars());
1389 int w5=l5;
1390 int w3=l3;
1391 //first do the q-based trimming
1392 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1393 if (w3-w5+1<min_read_len) {
1394 return 'Q'; //invalid read
1395 }
1396 //-- keep only the w5..w3 range
1397 l5=w5;
1398 l3=w3;
1399 wseq=wseq.substr(w5, w3-w5+1);
1400 if (!wqv.is_empty())
1401 wqv=wqv.substr(w5, w3-w5+1);
1402 } //qv trimming
1403 // N-trimming on the remaining read seq
1404 if (ntrim(wseq, w5, w3)) {
1405 //GMessage("before: %s\n",wseq.chars());
1406 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1407 l5+=w5;
1408 l3-=(wseq.length()-1-w3);
1409 if (w3-w5+1<min_read_len) {
1410 return 'N'; //to be trashed
1411 }
1412 //-- keep only the w5..w3 range
1413 wseq=wseq.substr(w5, w3-w5+1);
1414 if (!wqv.is_empty())
1415 wqv=wqv.substr(w5, w3-w5+1);
1416 w5=0;
1417 w3=wseq.length()-1;
1418 }
1419 if (a3len>0) {
1420 if (trim_adapter3(wseq, w5, w3)) {
1421 int trimlen=wseq.length()-(w3-w5+1);
1422 num_trimmed3++;
1423 if (trimlen<min_trimmed3)
1424 min_trimmed3=trimlen;
1425 l5+=w5;
1426 l3-=(wseq.length()-1-w3);
1427 if (w3-w5+1<min_read_len) {
1428 return '3';
1429 }
1430 //-- keep only the w5..w3 range
1431 wseq=wseq.substr(w5, w3-w5+1);
1432 if (!wqv.is_empty())
1433 wqv=wqv.substr(w5, w3-w5+1);
1434 }//some adapter was trimmed
1435 } //adapter trimming
1436 if (a5len>0) {
1437 if (trim_adapter5(wseq, w5, w3)) {
1438 int trimlen=wseq.length()-(w3-w5+1);
1439 num_trimmed5++;
1440 if (trimlen<min_trimmed5)
1441 min_trimmed5=trimlen;
1442 l5+=w5;
1443 l3-=(wseq.length()-1-w3);
1444 if (w3-w5+1<min_read_len) {
1445 return '5';
1446 }
1447 //-- keep only the w5..w3 range
1448 wseq=wseq.substr(w5, w3-w5+1);
1449 if (!wqv.is_empty())
1450 wqv=wqv.substr(w5, w3-w5+1);
1451 }//some adapter was trimmed
1452 } //adapter trimming
1453 if (doCollapse) {
1454 //keep read for later
1455 FqDupRec* dr=dhash.Find(wseq.chars());
1456 if (dr==NULL) { //new entry
1457 //if (prefix.is_empty())
1458 dhash.Add(wseq.chars(),
1459 new FqDupRec(&wqv, rname.chars()));
1460 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1461 }
1462 else
1463 dr->add(wqv);
1464 } //collapsing duplicates
1465 else { //not collapsing duplicates
1466 //apply the dust filter now
1467 if (doDust) {
1468 int dustbases=dust(wseq);
1469 if (dustbases>(wseq.length()>>1)) {
1470 return 'D';
1471 }
1472 }
1473 } //not collapsing duplicates
1474 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1475 }
1476
1477 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1478 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1479 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1480 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1481 }
1482
1483 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1484 outcounter++;
1485 bool asFasta=(rqv.is_empty() || fastaOutput);
1486 if (asFasta) {
1487 if (prefix.is_empty()) {
1488 printHeader(f_out, '>',rname,rinfo);
1489 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1490 }
1491 else {
1492 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1493 rseq.chars());
1494 }
1495 }
1496 else { //fastq
1497 if (convert_phred) convertPhred(rqv);
1498 if (prefix.is_empty()) {
1499 printHeader(f_out, '@', rname, rinfo);
1500 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1501 }
1502 else
1503 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1504 rseq.chars(),rqv.chars() );
1505 }
1506 }
1507
1508 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1509 if (freport==NULL || trashcode<=' ') return;
1510 if (trashcode=='3' || trashcode=='5') {
1511 fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1512 }
1513 else {
1514 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1515 }
1516 //tcounter++;
1517 }
1518
1519 GStr getFext(GStr& s, int* xpos=NULL) {
1520 //s must be a filename without a path
1521 GStr r("");
1522 if (xpos!=NULL) *xpos=0;
1523 if (s.is_empty() || s=="-") return r;
1524 int p=s.rindex('.');
1525 int d=s.rindex('/');
1526 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1527 r=s.substr(p+1);
1528 if (xpos!=NULL) *xpos=p+1;
1529 r.lower();
1530 return r;
1531 }
1532
1533 void baseFileName(GStr& fname) {
1534 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1535 int xpos=0;
1536 GStr fext=getFext(fname, &xpos);
1537 if (xpos<=1) return;
1538 bool extdel=false;
1539 GStr f2;
1540 if (fext=="z" || fext=="zip") {
1541 extdel=true;
1542 }
1543 else if (fext.length()>=2) {
1544 f2=fext.substr(0,2);
1545 extdel=(f2=="gz" || f2=="bz");
1546 }
1547 if (extdel) {
1548 fname.cut(xpos-1);
1549 fext=getFext(fname, &xpos);
1550 if (xpos<=1) return;
1551 }
1552 extdel=false;
1553 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1554 extdel=true;
1555 }
1556 else if (fext.length()>=2) {
1557 extdel=(fext.substr(0,2)=="fa");
1558 }
1559 if (extdel) fname.cut(xpos-1);
1560 GStr fncp(fname);
1561 fncp.lower();
1562 fncp.chomp("seq");
1563 fncp.chomp("sequence");
1564 fncp.trimR("_.");
1565 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1566 }
1567
1568 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1569 FILE* f_out=NULL;
1570 GStr fname(getFileName(infname.chars()));
1571 //eliminate known extensions
1572 baseFileName(fname);
1573 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1574 else if (pocmd.is_empty()) {
1575 GStr oname(fname);
1576 oname.append('.');
1577 oname.append(outsuffix);
1578 f_out=fopen(oname.chars(),"w");
1579 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1580 }
1581 else {
1582 GStr oname(">");
1583 oname.append(fname);
1584 oname.append('.');
1585 oname.append(outsuffix);
1586 pocmd.append(oname);
1587 f_out=popen(pocmd.chars(), "w");
1588 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1589 }
1590 return f_out;
1591 }
1592
1593 void guess_unzip(GStr& fname, GStr& picmd) {
1594 GStr fext=getFext(fname);
1595 if (fext=="gz" || fext=="gzip" || fext=="z") {
1596 picmd="gzip -cd ";
1597 }
1598 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1599 picmd="bzip2 -cd ";
1600 }
1601 }
1602
1603 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1604 GStr& s, GStr& infname, GStr& infname2) {
1605 // uses outsuffix to generate output file names and open file handles as needed
1606 infname="";
1607 infname2="";
1608 f_in=NULL;
1609 f_in2=NULL;
1610 f_out=NULL;
1611 f_out2=NULL;
1612 //analyze outsuffix intent
1613 GStr pocmd;
1614 GStr ox=getFext(outsuffix);
1615 if (ox.length()>2) ox=ox.substr(0,2);
1616 if (ox=="gz") pocmd="gzip -9 -c ";
1617 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1618 if (s=="-") {
1619 f_in=stdin;
1620 infname="stdin";
1621 f_out=prepOutFile(infname, pocmd);
1622 return;
1623 } // streaming from stdin
1624 s.startTokenize(",:");
1625 s.nextToken(infname);
1626 bool paired=s.nextToken(infname2);
1627 if (fileExists(infname.chars())==0)
1628 GError("Error: cannot find file %s!\n",infname.chars());
1629 GStr fname(getFileName(infname.chars()));
1630 GStr picmd;
1631 guess_unzip(fname, picmd);
1632 if (picmd.is_empty()) {
1633 f_in=fopen(infname.chars(), "r");
1634 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1635 }
1636 else {
1637 picmd.append(infname);
1638 f_in=popen(picmd.chars(), "r");
1639 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1640 }
1641 f_out=prepOutFile(infname, pocmd);
1642 if (!paired) return;
1643 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1644 // ---- paired reads:-------------
1645 if (fileExists(infname2.chars())==0)
1646 GError("Error: cannot find file %s!\n",infname2.chars());
1647 picmd="";
1648 GStr fname2(getFileName(infname2.chars()));
1649 guess_unzip(fname2, picmd);
1650 if (picmd.is_empty()) {
1651 f_in2=fopen(infname2.chars(), "r");
1652 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1653 }
1654 else {
1655 picmd.append(infname2);
1656 f_in2=popen(picmd.chars(), "r");
1657 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1658 }
1659 f_out2=prepOutFile(infname2, pocmd);
1660 }