ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 170
Committed: Tue Feb 14 22:35:51 2012 UTC (7 years, 9 months ago) by gpertea
File size: 42010 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 //bool rawFormat=false;
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 const int match_reward=2;
98 const int mismatch_penalty=3;
99 const int Xdrop=8;
100
101 const int poly_m_score=2; //match score
102 const int poly_mis_score=-3; //mismatch
103 const int poly_dropoff_score=7;
104 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105
106 const char *polyA_seed="AAAA";
107 const char *polyT_seed="TTTT";
108
109 struct CASeqData {
110 //positional data for every possible hexamer in an adapter
111 GVec<uint16>* pz[64]; //0-based coordinates of all possible hexamers in the adapter sequence
112 GVec<uint16>* pzr[64]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 GStr seq; //actual adapter sequence data
114 GStr seqr; //reverse complement sequence
115 CASeqData(bool rev=false):seq(),seqr() {
116 for (int i=0;i<63;i++) {
117 pz[i]=new GVec<uint16>(1);
118 if (rev) pzr[i]=new GVec<uint16>(1);
119 }
120 }
121
122 ~CSeqData() {
123 for (int i=0;i<63;i++) {
124 delete pz[i];
125 if (rev) { delete pzr[i]; }
126 }
127 }
128 };
129
130 GVec<CASeqData> adapters5;
131 GVec<CASeqData> adapters3;
132
133 CGreedyAlignData* gxmem_l=NULL;
134 CGreedyAlignData* gxmem_r=NULL;
135
136 // element in dhash:
137 class FqDupRec {
138 public:
139 int count; //how many of these reads are the same
140 int len; //length of qv
141 char* firstname; //optional, only if we want to keep the original read names
142 char* qv;
143 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
144 len=0;
145 qv=NULL;
146 firstname=NULL;
147 count=0;
148 if (qstr!=NULL) {
149 qv=Gstrdup(qstr->chars());
150 len=qstr->length();
151 count++;
152 }
153 if (rname!=NULL) firstname=Gstrdup(rname);
154 }
155 ~FqDupRec() {
156 GFREE(qv);
157 GFREE(firstname);
158 }
159 void add(GStr& d) { //collapse another record into this one
160 if (d.length()!=len)
161 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
162 count++;
163 for (int i=0;i<len;i++)
164 qv[i]=(qv[i]+d[i])/2;
165 }
166 };
167
168 void openfw(FILE* &f, GArgs& args, char opt) {
169 GStr s=args.getOpt(opt);
170 if (!s.is_empty()) {
171 if (s=='-') f=stdout;
172 else {
173 f=fopen(s.chars(),"w");
174 if (f==NULL) GError("Error creating file: %s\n", s.chars());
175 }
176 }
177 }
178
179 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
180 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
181
182 GHash<FqDupRec> dhash; //hash to keep track of duplicates
183
184 int loadAdapters(const char* fname);
185
186 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
187 GStr& s, GStr& infname, GStr& infname2);
188 // uses outsuffix to generate output file names and open file handles as needed
189
190 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
191 void trash_report(char trashcode, GStr& rname, FILE* freport);
192
193 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
194 GStr& rname, GStr& rinfo, GStr& infname);
195
196 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
197 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
198
199 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
200 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
201 int dust(GStr& seq);
202 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
203 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
204 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
205 bool trim_adapter3(GStr& seq, int &l5, int &l3);
206
207 void convertPhred(char* q, int len);
208 void convertPhred(GStr& q);
209
210 int main(int argc, char * const argv[]) {
211 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
212 int e;
213 if ((e=args.isError())>0) {
214 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
215 exit(224);
216 }
217 debug=(args.getOpt('Y')!=NULL);
218 verbose=(args.getOpt('V')!=NULL);
219 convert_phred=(args.getOpt('Q')!=NULL);
220 doCollapse=(args.getOpt('C')!=NULL);
221 doDust=(args.getOpt('D')!=NULL);
222 if (args.getOpt('A')) doPolyTrim=false;
223 /*
224 rawFormat=(args.getOpt('R')!=NULL);
225 if (rawFormat) {
226 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
227 }
228 */
229 prefix=args.getOpt('n');
230 GStr s=args.getOpt('l');
231 if (!s.is_empty())
232 min_read_len=s.asInt();
233 s=args.getOpt('m');
234 if (!s.is_empty())
235 max_perc_N=s.asDouble();
236 s=args.getOpt('d');
237 if (!s.is_empty()) {
238 dust_cutoff=s.asInt();
239 doDust=true;
240 }
241 s=args.getOpt('q');
242 if (!s.is_empty()) {
243 qvtrim_qmin=s.asInt();
244 }
245 s=args.getOpt('t');
246 if (!s.is_empty()) {
247 qvtrim_max=s.asInt();
248 }
249 s=args.getOpt('p');
250 if (!s.is_empty()) {
251 int v=s.asInt();
252 if (v==33) {
253 qv_phredtype=33;
254 qv_cvtadd=31;
255 }
256 else if (v==64) {
257 qv_phredtype=64;
258 qv_cvtadd=-31;
259 }
260 else
261 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
262 }
263 s=args.getOpt('f');
264 if (!s.is_empty()) {
265 loadAdapters(s.chars());
266 }
267 bool fileAdapters=adapters5.Count()+adapters3.Count();
268 s=args.getOpt('5');
269 if (!s.is_empty()) {
270 if (fileAdapters)
271 GError("Error: options -5 and -f cannot be used together!\n");
272 s.upper();
273 adapters5.Add(s);
274 }
275 s=args.getOpt('3');
276 if (!s.is_empty()) {
277 if (fileAdapters)
278 GError("Error: options -3 and -f cannot be used together!\n");
279 s.upper();
280 adapters3.Add(s);
281 }
282 s=args.getOpt('y');
283 if (!s.is_empty()) {
284 int minmatch=s.asInt();
285 poly_minScore=minmatch*poly_m_score;
286 }
287
288 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
289 else outsuffix="-";
290 trashReport= (args.getOpt('r')!=NULL);
291 int fcount=args.startNonOpt();
292 if (fcount==0) {
293 GMessage(USAGE);
294 exit(224);
295 }
296 if (fcount>1 && doCollapse) {
297 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
298 }
299 //openfw(f_out, args, 'o');
300 //if (f_out==NULL) f_out=stdout;
301 if (trashReport)
302 openfw(freport, args, 'r');
303 char* infile=NULL;
304
305 if (adapters5.Count()>0)
306 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
307 if (adapters3.Count()>0)
308 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
309
310 while ((infile=args.nextNonOpt())!=NULL) {
311 //for each input file
312 int incounter=0; //counter for input reads
313 int outcounter=0; //counter for output reads
314 int trash_s=0; //too short from the get go
315 int trash_Q=0;
316 int trash_N=0;
317 int trash_D=0;
318 int trash_poly=0;
319 int trash_A3=0;
320 int trash_A5=0;
321 s=infile;
322 GStr infname;
323 GStr infname2;
324 FILE* f_in=NULL;
325 FILE* f_in2=NULL;
326 FILE* f_out=NULL;
327 FILE* f_out2=NULL;
328 bool paired_reads=false;
329 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
330 GLineReader fq(f_in);
331 GLineReader* fq2=NULL;
332 if (f_in2!=NULL) {
333 fq2=new GLineReader(f_in2);
334 paired_reads=true;
335 }
336 GStr rseq, rqv, seqid, seqinfo;
337 GStr rseq2, rqv2, seqid2, seqinfo2;
338 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
339 incounter++;
340 int a5=0, a3=0, b5=0, b3=0;
341 char tcode=0, tcode2=0;
342 tcode=process_read(seqid, rseq, rqv, a5, a3);
343 if (fq2!=NULL) {
344 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
345 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
346 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
347 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
348 }
349 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
350 //decide what to do with this pair and print rseq2 if the pair makes it
351 if (tcode>1 && tcode2<=1) {
352 //"untrash" rseq
353 if (a3-a5+1<min_read_len) {
354 a5=1;
355 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
356 }
357 tcode=1;
358 }
359 else if (tcode<=1 && tcode2>1) {
360 //"untrash" rseq2
361 if (b3-b5+1<min_read_len) {
362 b5=1;
363 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
364 }
365 tcode2=1;
366 }
367 if (tcode<=1) { //trimmed or left intact -- write it!
368 if (tcode>0) {
369 rseq2=rseq2.substr(b5,b3-b5+1);
370 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
371 }
372 int nocounter=0;
373 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
374 }
375 } //pair read
376 if (tcode>1) { //trashed
377 #ifdef GDEBUG
378 GMessage(" !!!!TRASH => 'N'\n");
379 #endif
380 if (tcode=='s') trash_s++;
381 else if (tcode=='A' || tcode=='T') trash_poly++;
382 else if (tcode=='Q') trash_Q++;
383 else if (tcode=='N') trash_N++;
384 else if (tcode=='D') trash_D++;
385 else if (tcode=='3') trash_A3++;
386 else if (tcode=='5') trash_A5++;
387 if (trashReport) trash_report(tcode, seqid, freport);
388 }
389 else if (!doCollapse) { //write it
390 if (tcode>0) {
391 rseq=rseq.substr(a5,a3-a5+1);
392 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
393 }
394 #ifdef GDEBUG
395 GMessage(" After trimming:\n");
396 GMessage("%s\n",rseq.chars());
397 #endif
398 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
399 }
400 } //for each fastq record
401 delete fq2;
402 FRCLOSE(f_in);
403 FRCLOSE(f_in2);
404 if (doCollapse) {
405 outcounter=0;
406 int maxdup_count=1;
407 char* maxdup_seq=NULL;
408 dhash.startIterate();
409 FqDupRec* qd=NULL;
410 char* seq=NULL;
411 while ((qd=dhash.NextData(seq))!=NULL) {
412 GStr rseq(seq);
413 //do the dusting here
414 if (doDust) {
415 int dustbases=dust(rseq);
416 if (dustbases>(rseq.length()>>1)) {
417 if (trashReport && qd->firstname!=NULL) {
418 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
419 }
420 trash_D+=qd->count;
421 continue;
422 }
423 }
424 outcounter++;
425 if (qd->count>maxdup_count) {
426 maxdup_count=qd->count;
427 maxdup_seq=seq;
428 }
429 if (isfasta) {
430 if (prefix.is_empty()) {
431 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
432 rseq.chars());
433 }
434 else { //use custom read name
435 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
436 qd->count, rseq.chars());
437 }
438 }
439 else { //fastq format
440 if (convert_phred) convertPhred(qd->qv, qd->len);
441 if (prefix.is_empty()) {
442 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
443 rseq.chars(), qd->qv);
444 }
445 else { //use custom read name
446 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
447 qd->count, rseq.chars(), qd->qv);
448 }
449 }
450 }//for each element of dhash
451 if (maxdup_count>1) {
452 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
453 }
454 } //collapse entries
455 if (verbose) {
456 if (paired_reads) {
457 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
458 GMessage("Number of input pairs :%9d\n", incounter);
459 GMessage(" Output pairs :%9d\n", outcounter);
460 }
461 else {
462 GMessage(">Input file : %s\n", infname.chars());
463 GMessage("Number of input reads :%9d\n", incounter);
464 GMessage(" Output reads :%9d\n", outcounter);
465 }
466 GMessage("------------------------------------\n");
467 if (num_trimmed5)
468 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
469 if (num_trimmed3)
470 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
471 GMessage("------------------------------------\n");
472 if (trash_s>0)
473 GMessage(" Trashed by length:%9d\n", trash_s);
474 if (trash_N>0)
475 GMessage(" Trashed by N%%:%9d\n", trash_N);
476 if (trash_Q>0)
477 GMessage("Trashed by low quality:%9d\n", trash_Q);
478 if (trash_poly>0)
479 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
480 if (trash_A5>0)
481 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
482 if (trash_A3>0)
483 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
484 }
485 if (trashReport) {
486 FWCLOSE(freport);
487 }
488 FWCLOSE(f_out);
489 FWCLOSE(f_out2);
490 } //while each input file
491 delete gxmem_l;
492 delete gxmem_r;
493 //getc(stdin);
494 }
495
496 class NData {
497 public:
498 int NPos[1024]; //there should be no reads longer than 1K ?
499 int NCount;
500 int end5;
501 int end3;
502 int seqlen;
503 double perc_N; //percentage of Ns in end5..end3 range only!
504 const char* seq;
505 bool valid;
506 NData() {
507 seqlen=0;
508 NCount=0;
509 end5=0;
510 end3=0;
511 seq=NULL;
512 perc_N=0;
513 valid=true;
514 }
515 void init(GStr& rseq) {
516 NCount=0;
517 perc_N=0;
518 seqlen=rseq.length();
519 seq=rseq.chars();
520 end5=1;
521 end3=seqlen;
522 for (int i=0;i<seqlen;i++)
523 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
524 NPos[NCount]=i;
525 NCount++;
526 }
527 perc_N=(NCount*100.0)/seqlen;
528 valid=true;
529 }
530 void N_calc() { //only in the region after trimming
531 int n=0;
532 for (int i=end3-1;i<end5;i++) {
533 if (seq[i]=='N') n++;
534 }
535 perc_N=(n*100.0)/(end5-end3+1);
536 }
537 };
538
539 static NData feat;
540 int perc_lenN=12; // incremental distance from ends, in percentage of
541 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
542
543 void N_analyze(int l5, int l3, int p5, int p3) {
544 /* assumes feat was filled properly */
545 int old_dif, t5,t3,v;
546 if (l3<l5+2 || p5>p3 ) {
547 feat.end5=l5+1;
548 feat.end3=l3+1;
549 return;
550 }
551
552 t5=feat.NPos[p5]-l5;
553 t3=l3-feat.NPos[p3];
554 old_dif=p3-p5;
555 v=(int)((((double)(l3-l5))*perc_lenN)/100);
556 if (v>20) v=20; /* enforce N-search limit for very long reads */
557 if (t5 < v ) {
558 l5=feat.NPos[p5]+1;
559 p5++;
560 }
561 if (t3 < v) {
562 l3=feat.NPos[p3]-1;
563 p3--;
564 }
565 /* restNs=p3-p5; number of Ns in the new CLR */
566 if (p3-p5==old_dif) { /* no change, return */
567 /* don't trim if this may shorten the read too much */
568 //if (l5-l3<min_read_len) return;
569 feat.end5=l5+1;
570 feat.end3=l3+1;
571 return;
572 }
573 else
574 N_analyze(l5,l3, p5,p3);
575 }
576
577
578 bool qtrim(GStr& qvs, int &l5, int &l3) {
579 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
580 if (qv_phredtype==0) {
581 //try to guess the Phred type
582 int vmin=256, vmax=0;
583 for (int i=0;i<qvs.length();i++) {
584 if (vmin>qvs[i]) vmin=qvs[i];
585 if (vmax<qvs[i]) vmax=qvs[i];
586 }
587 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
588 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
589 if (qv_phredtype==0) {
590 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
591 }
592 if (verbose)
593 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
594 } //guessing Phred type
595 for (l3=qvs.length()-1;l3>2;l3--) {
596 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
597 }
598 //just in case, check also the 5' the end (?)
599 for (l5=0;l5<qvs.length()-3;l5++) {
600 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
601 }
602 if (qvtrim_max>0) {
603 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
604 if (l5>qvtrim_max) l5=qvtrim_max;
605 }
606 return (l5>0 || l3<qvs.length()-1);
607 }
608
609 bool ntrim(GStr& rseq, int &l5, int &l3) {
610 //count Ns in the sequence, trim N-rich ends
611 feat.init(rseq);
612 l5=feat.end5-1;
613 l3=feat.end3-1;
614 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
615 if (l5==feat.end5-1 && l3==feat.end3-1) {
616 if (feat.perc_N>max_perc_N) {
617 feat.valid=false;
618 l3=l5+1;
619 return true;
620 }
621 else {
622 return false; //nothing changed
623 }
624 }
625 l5=feat.end5-1;
626 l3=feat.end3-1;
627 if (l3-l5+1<min_read_len) {
628 feat.valid=false;
629 return true;
630 }
631 feat.N_calc();
632
633 if (feat.perc_N>max_perc_N) {
634 feat.valid=false;
635 l3=l5+1;
636 return true;
637 }
638 return true;
639 }
640
641 //--------------- dust functions ----------------
642 class DNADuster {
643 public:
644 int dustword;
645 int dustwindow;
646 int dustwindow2;
647 int dustcutoff;
648 int mv, iv, jv;
649 int counts[32*32*32];
650 int iis[32*32*32];
651 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
652 dustword=wordsize;
653 dustwindow=winsize;
654 dustwindow2 = (winsize>>1);
655 dustcutoff=cutoff;
656 mv=0;
657 iv=0;
658 jv=0;
659 }
660 void setWindowSize(int value) {
661 dustwindow = value;
662 dustwindow2 = (dustwindow >> 1);
663 }
664 void setWordSize(int value) {
665 dustword=value;
666 }
667 void wo1(int len, const char* s, int ivv) {
668 int i, ii, j, v, t, n, n1, sum;
669 int js, nis;
670 n = 32 * 32 * 32;
671 n1 = n - 1;
672 nis = 0;
673 i = 0;
674 ii = 0;
675 sum = 0;
676 v = 0;
677 for (j=0; j < len; j++, s++) {
678 ii <<= 5;
679 if (*s<=32) {
680 i=0;
681 continue;
682 }
683 ii |= *s - 'A'; //assume uppercase!
684 ii &= n1;
685 i++;
686 if (i >= dustword) {
687 for (js=0; js < nis && iis[js] != ii; js++) ;
688 if (js == nis) {
689 iis[nis] = ii;
690 counts[ii] = 0;
691 nis++;
692 }
693 if ((t = counts[ii]) > 0) {
694 sum += t;
695 v = 10 * sum / j;
696 if (mv < v) {
697 mv = v;
698 iv = ivv;
699 jv = j;
700 }
701 }
702 counts[ii]++;
703 }
704 }
705 }
706
707 int wo(int len, const char* s, int* beg, int* end) {
708 int i, l1;
709 l1 = len - dustword + 1;
710 if (l1 < 0) {
711 *beg = 0;
712 *end = len - 1;
713 return 0;
714 }
715 mv = 0;
716 iv = 0;
717 jv = 0;
718 for (i=0; i < l1; i++) {
719 wo1(len-i, s+i, i);
720 }
721 *beg = iv;
722 *end = iv + jv;
723 return mv;
724 }
725
726 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
727 int i, j, l, a, b, v;
728 if (cutoff==0) cutoff=dustcutoff;
729 a=0;b=0;
730 //GMessage("Dust cutoff=%d\n", cutoff);
731 for (i=0; i < seqlen; i += dustwindow2) {
732 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
733 v = wo(l, seq+i, &a, &b);
734 if (v > cutoff) {
735 //for (j = a; j <= b && j < dustwindow2; j++) {
736 for (j = a; j <= b; j++) {
737 seqmsk[i+j]='N';//could be made lowercase instead
738 }
739 }
740 }
741 //return first;
742 }
743 };
744
745 static DNADuster duster;
746
747 int dust(GStr& rseq) {
748 char* seq=Gstrdup(rseq.chars());
749 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
750 //check the number of Ns:
751 int ncount=0;
752 for (int i=0;i<rseq.length();i++) {
753 if (seq[i]=='N') ncount++;
754 }
755 GFREE(seq);
756 return ncount;
757 }
758
759 struct SLocScore {
760 int pos;
761 int score;
762 SLocScore(int p=0,int s=0) {
763 pos=p;
764 score=s;
765 }
766 void set(int p, int s) {
767 pos=p;
768 score=s;
769 }
770 void add(int p, int add) {
771 pos=p;
772 score+=add;
773 }
774 };
775
776 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
777 if (!doPolyTrim) return false;
778 int rlen=seq.length();
779 l5=0;
780 l3=rlen-1;
781 int32 seedVal=*(int32*)poly_seed;
782 char polyChar=poly_seed[0];
783 //assumes N trimming was already done
784 //so a poly match should be very close to the end of the read
785 // -- find the initial match (seed)
786 int lmin=GMAX((rlen-16), 0);
787 int li;
788 for (li=rlen-4;li>lmin;li--) {
789 if (seedVal==*(int*)&(seq[li])) {
790 break;
791 }
792 }
793 if (li<=lmin) return false;
794 //seed found, try to extend it both ways
795 //extend right
796 int ri=li+3;
797 SLocScore loc(ri, poly_m_score<<2);
798 SLocScore maxloc(loc);
799 //extend right
800 while (ri<rlen-1) {
801 ri++;
802 if (seq[ri]==polyChar) {
803 loc.add(ri,poly_m_score);
804 }
805 else if (seq[ri]=='N') {
806 loc.add(ri,0);
807 }
808 else { //mismatch
809 loc.add(ri,poly_mis_score);
810 if (maxloc.score-loc.score>poly_dropoff_score) break;
811 }
812 if (maxloc.score<=loc.score) {
813 maxloc=loc;
814 }
815 }
816 ri=maxloc.pos;
817 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
818 //ri = right boundary for the poly match
819 //extend left
820 loc.set(li, maxloc.score);
821 maxloc.pos=li;
822 while (li>0) {
823 li--;
824 if (seq[li]==polyChar) {
825 loc.add(li,poly_m_score);
826 }
827 else if (seq[li]=='N') {
828 loc.add(li,0);
829 }
830 else { //mismatch
831 loc.add(li,poly_mis_score);
832 if (maxloc.score-loc.score>poly_dropoff_score) break;
833 }
834 if (maxloc.score<=loc.score) {
835 maxloc=loc;
836 }
837 }
838 li=maxloc.pos;
839 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
840 (maxloc.score>poly_minScore && ri>=rlen-3) ||
841 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
842 //trimming this li-ri match at 3' end
843 l3=li-1;
844 if (l3<0) l3=0;
845 return true;
846 }
847 return false;
848 }
849
850 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
851 if (!doPolyTrim) return false;
852 int rlen=seq.length();
853 l5=0;
854 l3=rlen-1;
855 int32 seedVal=*(int32*)poly_seed;
856 char polyChar=poly_seed[0];
857 //assumes N trimming was already done
858 //so a poly match should be very close to the end of the read
859 // -- find the initial match (seed)
860 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
861 int li;
862 for (li=0;li<=lmax;li++) {
863 if (seedVal==*(int*)&(seq[li])) {
864 break;
865 }
866 }
867 if (li>lmax) return false;
868 //seed found, try to extend it both ways
869 //extend left
870 int ri=li+3; //save rightmost base of the seed
871 SLocScore loc(li, poly_m_score<<2);
872 SLocScore maxloc(loc);
873 while (li>0) {
874 li--;
875 if (seq[li]==polyChar) {
876 loc.add(li,poly_m_score);
877 }
878 else if (seq[li]=='N') {
879 loc.add(li,0);
880 }
881 else { //mismatch
882 loc.add(li,poly_mis_score);
883 if (maxloc.score-loc.score>poly_dropoff_score) break;
884 }
885 if (maxloc.score<=loc.score) {
886 maxloc=loc;
887 }
888 }
889 li=maxloc.pos;
890 if (li>5) return false; //no trimming wanted, too far from 5' end
891 //li = right boundary for the poly match
892
893 //extend right
894 loc.set(ri, maxloc.score);
895 maxloc.pos=ri;
896 while (ri<rlen-1) {
897 ri++;
898 if (seq[ri]==polyChar) {
899 loc.add(ri,poly_m_score);
900 }
901 else if (seq[ri]=='N') {
902 loc.add(ri,0);
903 }
904 else { //mismatch
905 loc.add(ri,poly_mis_score);
906 if (maxloc.score-loc.score>poly_dropoff_score) break;
907 }
908 if (maxloc.score<=loc.score) {
909 maxloc=loc;
910 }
911 }
912 ri=maxloc.pos;
913 if ((maxloc.score==poly_minScore && li==0) ||
914 (maxloc.score>poly_minScore && li<2)
915 || (maxloc.score>(poly_minScore*3) && li<8)) {
916 //adjust l5 to reflect this trimming of 5' end
917 l5=ri+1;
918 if (l5>rlen-1) l5=rlen-1;
919 return true;
920 }
921 return false;
922 }
923
924 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
925 if (adapters3.Count()==0) return false;
926 int rlen=seq.length();
927 l5=0;
928 l3=rlen-1;
929 bool trimmed=false;
930 GStr wseq(seq.chars());
931 int wlen=rlen;
932 for (int ai=0;ai<adapters3.Count();ai++) {
933 if (adapters3[ai].is_empty()) continue;
934 int alen=adapters3[ai].length();
935 GStr& aseq=adapters3[ai];
936 GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
937 if (r_bestaln) {
938 trimmed=true;
939 //keep unmatched region on the left, if any
940 l3-=(wlen-r_bestaln->sl+1);
941 delete r_bestaln;
942 if (l3<0) l3=0;
943 if (l3-l5+1<min_read_len) return true;
944 wseq=seq.substr(l5,l3-l5+1);
945 wlen=wseq.length();
946 }
947 }//for each 5' adapter
948 return trimmed;
949 }
950
951 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
952 if (adapters5.Count()==0) return false;
953 int rlen=seq.length();
954 l5=0;
955 l3=rlen-1;
956 bool trimmed=false;
957 GStr wseq(seq.chars());
958 int wlen=rlen;
959 for (int ai=0;ai<adapters5.Count();ai++) {
960 if (adapters5[ai].is_empty()) continue;
961 int alen=adapters5[ai].length();
962 GStr& aseq=adapters5[ai];
963 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
964 wseq.chars(), wlen, gxmem_l, 84);
965 if (l_bestaln) {
966 trimmed=true;
967 l5+=l_bestaln->sr;
968 delete l_bestaln;
969 if (l5>=rlen) l5=rlen-1;
970 if (l3-l5+1<min_read_len) return true;
971 wseq=seq.substr(l5,l3-l5+1);
972 wlen=wseq.length();
973 }
974 }//for each 5' adapter
975 return trimmed;
976 }
977
978 //convert qvs to/from phred64 from/to phread33
979 void convertPhred(GStr& q) {
980 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
981 }
982
983 void convertPhred(char* q, int len) {
984 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
985 }
986
987 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
988 GStr& rname, GStr& rinfo, GStr& infname) {
989 rseq="";
990 rqv="";
991 rname="";
992 rinfo="";
993 if (fq.eof()) return false;
994 char* l=fq.getLine();
995 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
996 if (l==NULL) return false;
997 /* if (rawFormat) {
998 //TODO: implement raw qseq parsing here
999 //if (raw type=N) then continue; //skip invalid/bad records
1000 } //raw qseq format
1001 else { // FASTQ or FASTA */
1002 isfasta=(l[0]=='>');
1003 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1004 infname.chars(), l);
1005 GStr s(l);
1006 rname=&(l[1]);
1007 for (int i=0;i<rname.length();i++)
1008 if (rname[i]<=' ') {
1009 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1010 rname.cut(i);
1011 break;
1012 }
1013 //now get the sequence
1014 if ((l=fq.getLine())==NULL)
1015 GError("Error: unexpected EOF after header for read %s (%s)\n",
1016 rname.chars(), infname.chars());
1017 rseq=l; //this must be the DNA line
1018 while ((l=fq.getLine())!=NULL) {
1019 //seq can span multiple lines
1020 if (l[0]=='>' || l[0]=='+') {
1021 fq.pushBack();
1022 break; //
1023 }
1024 rseq+=l;
1025 } //check for multi-line seq
1026 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1027 if ((l=fq.getLine())==NULL)
1028 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1029 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1030 if ((l=fq.getLine())==NULL)
1031 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1032 rqv=l;
1033 //if (rqv.length()!=rseq.length())
1034 // GError("Error: qv len != seq len for %s\n", rname.chars());
1035 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1036 rqv+=l; //append to qv string
1037 }
1038 }// fastq
1039 // } //<-- FASTA or FASTQ
1040 rseq.upper();
1041 return true;
1042 }
1043
1044 #ifdef GDEBUG
1045 void showTrim(GStr& s, int l5, int l3) {
1046 if (l5>0) {
1047 color_bg(c_red);
1048 }
1049 for (int i=0;i<s.length()-1;i++) {
1050 if (i && i==l5) color_resetbg();
1051 fprintf(stderr, "%c", s[i]);
1052 if (i==l3) color_bg(c_red);
1053 }
1054 fprintf(stderr, "%c", s[s.length()-1]);
1055 color_reset();
1056 fprintf(stderr, "\n");
1057 }
1058 #endif
1059
1060 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1061 //returns 0 if the read was untouched, 1 if it was just trimmed
1062 // and a trash code if it was trashed
1063 l5=0;
1064 l3=rseq.length()-1;
1065 #ifdef GDEBUG
1066 //rseq.reverse();
1067 GMessage(">%s\n", rname.chars());
1068 GMessage("%s\n",rseq.chars());
1069 #endif
1070 if (l3-l5+1<min_read_len) {
1071 return 's';
1072 }
1073 GStr wseq(rseq.chars());
1074 GStr wqv(rqv.chars());
1075 int w5=l5;
1076 int w3=l3;
1077 //first do the q-based trimming
1078 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1079 if (w3-w5+1<min_read_len) {
1080 return 'Q'; //invalid read
1081 }
1082 //-- keep only the w5..w3 range
1083 l5=w5;
1084 l3=w3;
1085 wseq=wseq.substr(w5, w3-w5+1);
1086 if (!wqv.is_empty())
1087 wqv=wqv.substr(w5, w3-w5+1);
1088 } //qv trimming
1089 // N-trimming on the remaining read seq
1090 if (ntrim(wseq, w5, w3)) {
1091 //GMessage("before: %s\n",wseq.chars());
1092 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1093 l5+=w5;
1094 l3-=(wseq.length()-1-w3);
1095 if (w3-w5+1<min_read_len) {
1096 return 'N'; //to be trashed
1097 }
1098 //-- keep only the w5..w3 range
1099 wseq=wseq.substr(w5, w3-w5+1);
1100 if (!wqv.is_empty())
1101 wqv=wqv.substr(w5, w3-w5+1);
1102 w5=0;
1103 w3=wseq.length()-1;
1104 }
1105 char trim_code;
1106 do {
1107 trim_code=0;
1108 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1109 trim_code='A';
1110 }
1111 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1112 trim_code='T';
1113 }
1114 else if (trim_adapter5(wseq, w5, w3)) {
1115 trim_code='5';
1116 }
1117 if (trim_code) {
1118 #ifdef GDEBUG
1119 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1120 showTrim(wseq, w5, w3);
1121 #endif
1122 int trimlen=wseq.length()-(w3-w5+1);
1123 num_trimmed5++;
1124 if (trimlen<min_trimmed5)
1125 min_trimmed5=trimlen;
1126 l5+=w5;
1127 l3-=(wseq.length()-1-w3);
1128 if (w3-w5+1<min_read_len) {
1129 return trim_code;
1130 }
1131 //-- keep only the w5..w3 range
1132 wseq=wseq.substr(w5, w3-w5+1);
1133 if (!wqv.is_empty())
1134 wqv=wqv.substr(w5, w3-w5+1);
1135 }// trimmed at 5' end
1136 } while (trim_code);
1137
1138 do {
1139 trim_code=0;
1140 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1141 trim_code='A';
1142 }
1143 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1144 trim_code='T';
1145 }
1146 else if (trim_adapter3(wseq, w5, w3)) {
1147 trim_code='3';
1148 }
1149 if (trim_code) {
1150 #ifdef GDEBUG
1151 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1152 showTrim(wseq, w5, w3);
1153 #endif
1154 int trimlen=wseq.length()-(w3-w5+1);
1155 num_trimmed3++;
1156 if (trimlen<min_trimmed3)
1157 min_trimmed3=trimlen;
1158 l5+=w5;
1159 l3-=(wseq.length()-1-w3);
1160 if (w3-w5+1<min_read_len) {
1161 return trim_code;
1162 }
1163 //-- keep only the w5..w3 range
1164 wseq=wseq.substr(w5, w3-w5+1);
1165 if (!wqv.is_empty())
1166 wqv=wqv.substr(w5, w3-w5+1);
1167 }//trimming at 3' end
1168 } while (trim_code);
1169
1170
1171 if (doCollapse) {
1172 //keep read for later
1173 FqDupRec* dr=dhash.Find(wseq.chars());
1174 if (dr==NULL) { //new entry
1175 //if (prefix.is_empty())
1176 dhash.Add(wseq.chars(),
1177 new FqDupRec(&wqv, rname.chars()));
1178 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1179 }
1180 else
1181 dr->add(wqv);
1182 } //collapsing duplicates
1183 else { //not collapsing duplicates
1184 //apply the dust filter now
1185 if (doDust) {
1186 int dustbases=dust(wseq);
1187 if (dustbases>(wseq.length()>>1)) {
1188 return 'D';
1189 }
1190 }
1191 } //not collapsing duplicates
1192 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1193 }
1194
1195 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1196 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1197 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1198 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1199 }
1200
1201 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1202 outcounter++;
1203 bool asFasta=(rqv.is_empty() || fastaOutput);
1204 if (asFasta) {
1205 if (prefix.is_empty()) {
1206 printHeader(f_out, '>',rname,rinfo);
1207 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1208 }
1209 else {
1210 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1211 rseq.chars());
1212 }
1213 }
1214 else { //fastq
1215 if (convert_phred) convertPhred(rqv);
1216 if (prefix.is_empty()) {
1217 printHeader(f_out, '@', rname, rinfo);
1218 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1219 }
1220 else
1221 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1222 rseq.chars(),rqv.chars() );
1223 }
1224 }
1225
1226 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1227 if (freport==NULL || trashcode<=' ') return;
1228 if (trashcode=='3' || trashcode=='5') {
1229 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1230 }
1231 else {
1232 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1233 }
1234 //tcounter++;
1235 }
1236
1237 GStr getFext(GStr& s, int* xpos=NULL) {
1238 //s must be a filename without a path
1239 GStr r("");
1240 if (xpos!=NULL) *xpos=0;
1241 if (s.is_empty() || s=="-") return r;
1242 int p=s.rindex('.');
1243 int d=s.rindex('/');
1244 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1245 r=s.substr(p+1);
1246 if (xpos!=NULL) *xpos=p+1;
1247 r.lower();
1248 return r;
1249 }
1250
1251 void baseFileName(GStr& fname) {
1252 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1253 int xpos=0;
1254 GStr fext=getFext(fname, &xpos);
1255 if (xpos<=1) return;
1256 bool extdel=false;
1257 GStr f2;
1258 if (fext=="z" || fext=="zip") {
1259 extdel=true;
1260 }
1261 else if (fext.length()>=2) {
1262 f2=fext.substr(0,2);
1263 extdel=(f2=="gz" || f2=="bz");
1264 }
1265 if (extdel) {
1266 fname.cut(xpos-1);
1267 fext=getFext(fname, &xpos);
1268 if (xpos<=1) return;
1269 }
1270 extdel=false;
1271 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1272 extdel=true;
1273 }
1274 else if (fext.length()>=2) {
1275 extdel=(fext.substr(0,2)=="fa");
1276 }
1277 if (extdel) fname.cut(xpos-1);
1278 GStr fncp(fname);
1279 fncp.lower();
1280 fncp.chomp("seq");
1281 fncp.chomp("sequence");
1282 fncp.trimR("_.");
1283 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1284 }
1285
1286 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1287 FILE* f_out=NULL;
1288 GStr fname(getFileName(infname.chars()));
1289 //eliminate known extensions
1290 baseFileName(fname);
1291 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1292 else if (pocmd.is_empty()) {
1293 GStr oname(fname);
1294 oname.append('.');
1295 oname.append(outsuffix);
1296 f_out=fopen(oname.chars(),"w");
1297 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1298 }
1299 else {
1300 GStr oname(">");
1301 oname.append(fname);
1302 oname.append('.');
1303 oname.append(outsuffix);
1304 pocmd.append(oname);
1305 f_out=popen(pocmd.chars(), "w");
1306 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1307 }
1308 return f_out;
1309 }
1310
1311 void guess_unzip(GStr& fname, GStr& picmd) {
1312 GStr fext=getFext(fname);
1313 if (fext=="gz" || fext=="gzip" || fext=="z") {
1314 picmd="gzip -cd ";
1315 }
1316 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1317 picmd="bzip2 -cd ";
1318 }
1319 }
1320
1321 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1322 //TODO: prepare CASeqData here, and collect hexamers as well
1323
1324 }
1325
1326
1327 int loadAdapters(const char* fname) {
1328 GLineReader lr(fname);
1329 char* l;
1330 while ((l=lr.nextLine())!=NULL) {
1331 if (lr.length()<=3 || l[0]=='#') continue;
1332 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1333 l[0]==';'|| l[0]==':' ) {
1334 int i=1;
1335 while (l[i]!=0 && isspace(l[i])) {
1336 i++;
1337 }
1338 if (l[i]!=0) {
1339 GStr s(&(l[i]));
1340 #ifdef GDEBUG
1341 //s.reverse();
1342 #endif
1343 addAdapter(adapters3, s);
1344 continue;
1345 }
1346 }
1347 else {
1348 GStr s(l);
1349 s.startTokenize("\t ;,:");
1350 GStr a5,a3;
1351 if (s.nextToken(a5))
1352 s.nextToken(a3);
1353 a5.upper();
1354 a3.upper();
1355 #ifdef GDEBUG
1356 // a5.reverse();
1357 // a3.reverse();
1358 #endif
1359 addAdapter(adapters5, a5);
1360 addAdapter(adapters3, a3);
1361 }
1362 }
1363 return adapters5.Count()+adapters3.Count();
1364 }
1365
1366 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1367 GStr& s, GStr& infname, GStr& infname2) {
1368 // uses outsuffix to generate output file names and open file handles as needed
1369 infname="";
1370 infname2="";
1371 f_in=NULL;
1372 f_in2=NULL;
1373 f_out=NULL;
1374 f_out2=NULL;
1375 //analyze outsuffix intent
1376 GStr pocmd;
1377 if (outsuffix=="-") {
1378 f_out=stdout;
1379 }
1380 else {
1381 GStr ox=getFext(outsuffix);
1382 if (ox.length()>2) ox=ox.substr(0,2);
1383 if (ox=="gz") pocmd="gzip -9 -c ";
1384 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1385 }
1386 if (s=="-") {
1387 f_in=stdin;
1388 infname="stdin";
1389 f_out=prepOutFile(infname, pocmd);
1390 return;
1391 } // streaming from stdin
1392 s.startTokenize(",:");
1393 s.nextToken(infname);
1394 bool paired=s.nextToken(infname2);
1395 if (fileExists(infname.chars())==0)
1396 GError("Error: cannot find file %s!\n",infname.chars());
1397 GStr fname(getFileName(infname.chars()));
1398 GStr picmd;
1399 guess_unzip(fname, picmd);
1400 if (picmd.is_empty()) {
1401 f_in=fopen(infname.chars(), "r");
1402 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1403 }
1404 else {
1405 picmd.append(infname);
1406 f_in=popen(picmd.chars(), "r");
1407 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1408 }
1409 if (f_out==stdout) {
1410 if (paired) GError("Error: output suffix required for paired reads\n");
1411 return;
1412 }
1413 f_out=prepOutFile(infname, pocmd);
1414 if (!paired) return;
1415 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1416 // ---- paired reads:-------------
1417 if (fileExists(infname2.chars())==0)
1418 GError("Error: cannot find file %s!\n",infname2.chars());
1419 picmd="";
1420 GStr fname2(getFileName(infname2.chars()));
1421 guess_unzip(fname2, picmd);
1422 if (picmd.is_empty()) {
1423 f_in2=fopen(infname2.chars(), "r");
1424 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1425 }
1426 else {
1427 picmd.append(infname2);
1428 f_in2=popen(picmd.chars(), "r");
1429 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1430 }
1431 f_out2=prepOutFile(infname2, pocmd);
1432 }