7 |
|
|
8 |
|
#define USAGE "Usage:\n\ |
9 |
|
fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\ |
10 |
< |
[-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\ |
10 |
> |
[-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\ |
11 |
|
[-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\ |
12 |
|
<input.fq>[,<input_mates.fq>\n\ |
13 |
|
\n\ |
31 |
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
32 |
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
33 |
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
34 |
< |
-A disable polyA trimming (enabled by default)\n\ |
35 |
< |
-T enable polyT trimming (disabled by default)\n\ |
34 |
> |
-A disable polyA/T trimming (enabled by default)\n\ |
35 |
|
-y minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
36 |
|
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
37 |
|
-t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\ |
48 |
|
-Q convert quality values to the other Phred qv type\n\ |
49 |
|
-V verbose processing\n\ |
50 |
|
" |
51 |
< |
|
51 |
> |
|
52 |
|
//-z for -o option, the output stream(s) will be first piped into the given\n |
53 |
|
// <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n |
54 |
|
|
55 |
< |
|
55 |
> |
|
56 |
|
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
57 |
|
|
58 |
|
//For paired reads sequencing: |
62 |
|
//FILE* f_out2=NULL; //for paired reads |
63 |
|
//FILE* f_in=NULL; //input fastq (stdin if not provided) |
64 |
|
//FILE* f_in2=NULL; //for paired reads |
65 |
+ |
|
66 |
|
FILE* freport=NULL; |
67 |
|
|
68 |
|
bool debug=false; |
69 |
|
bool verbose=false; |
70 |
|
bool doCollapse=false; |
71 |
|
bool doDust=false; |
72 |
+ |
bool doPolyTrim=true; |
73 |
|
bool fastaOutput=false; |
74 |
|
bool trashReport=false; |
75 |
< |
//bool rawFormat=false; |
75 |
> |
bool revCompl=false; //also reverse complement adaptor sequences |
76 |
|
int min_read_len=16; |
77 |
|
double max_perc_N=7.0; |
78 |
|
int dust_cutoff=16; |
79 |
|
bool isfasta=false; |
80 |
|
bool convert_phred=false; |
81 |
< |
GStr outsuffix; // -o |
81 |
> |
GStr outsuffix; // -o |
82 |
|
GStr prefix; |
83 |
|
GStr zcmd; |
84 |
|
int num_trimmed5=0; |
94 |
|
int qv_cvtadd=0; //could be -31 or +31 |
95 |
|
|
96 |
|
// adaptor matching metrics -- for X-drop ungapped extension |
97 |
+ |
const int match_reward=2; |
98 |
+ |
const int mismatch_penalty=3; |
99 |
+ |
const int Xdrop=8; |
100 |
+ |
|
101 |
|
const int poly_m_score=2; //match score |
102 |
|
const int poly_mis_score=-3; //mismatch |
103 |
|
const int poly_dropoff_score=7; |
106 |
|
const char *polyA_seed="AAAA"; |
107 |
|
const char *polyT_seed="TTTT"; |
108 |
|
|
109 |
< |
struct CAdapters { |
110 |
< |
GStr a5; |
111 |
< |
GStr a3; |
112 |
< |
CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) { |
113 |
< |
} |
109 |
> |
struct CASeqData { |
110 |
> |
//positional data for every possible hexamer in an adapter |
111 |
> |
GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence |
112 |
> |
GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence |
113 |
> |
GStr seq; //actual adapter sequence data |
114 |
> |
GStr seqr; //reverse complement sequence |
115 |
> |
bool use_reverse; |
116 |
> |
CASeqData(bool rev=false):seq(),seqr() { |
117 |
> |
use_reverse=rev; |
118 |
> |
for (int i=0;i<4096;i++) { |
119 |
> |
pz[i]=NULL; |
120 |
> |
pzr[i]=NULL; |
121 |
> |
} |
122 |
> |
} |
123 |
> |
|
124 |
> |
void update(const char* s) { |
125 |
> |
seq=s; |
126 |
> |
table6mers(seq.chars(), seq.length(), pz); |
127 |
> |
if (!use_reverse) return; |
128 |
> |
//reverse complement |
129 |
> |
seqr=s; |
130 |
> |
int slen=seq.length(); |
131 |
> |
for (int i=0;i<slen;i++) |
132 |
> |
seqr[i]=ntComplement(seq[slen-i-1]); |
133 |
> |
table6mers(seqr.chars(), seqr.length(), pzr); |
134 |
> |
} |
135 |
> |
|
136 |
> |
~CASeqData() { |
137 |
> |
for (int i=0;i<4096;i++) { |
138 |
> |
delete pz[i]; |
139 |
> |
delete pzr[i]; |
140 |
> |
} |
141 |
> |
} |
142 |
|
}; |
143 |
|
|
144 |
< |
GPVec<CAdapters> adapters; |
144 |
> |
GVec<CASeqData> adapters5; |
145 |
> |
GVec<CASeqData> adapters3; |
146 |
> |
|
147 |
> |
CGreedyAlignData* gxmem_l=NULL; |
148 |
> |
CGreedyAlignData* gxmem_r=NULL; |
149 |
|
|
150 |
|
// element in dhash: |
151 |
|
class FqDupRec { |
195 |
|
|
196 |
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
197 |
|
|
198 |
+ |
void addAdapter(GVec<CASeqData>& adapters, GStr& seq); |
199 |
|
int loadAdapters(const char* fname); |
200 |
|
|
201 |
< |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
202 |
< |
GStr& s, GStr& infname, GStr& infname2); |
201 |
> |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
202 |
> |
GStr& s, GStr& infname, GStr& infname2); |
203 |
|
// uses outsuffix to generate output file names and open file handles as needed |
204 |
< |
|
204 |
> |
|
205 |
|
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter); |
206 |
|
void trash_report(char trashcode, GStr& rname, FILE* freport); |
207 |
|
|
208 |
< |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
208 |
> |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
209 |
|
GStr& rname, GStr& rinfo, GStr& infname); |
210 |
|
|
211 |
< |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
211 |
> |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
212 |
|
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
213 |
|
|
214 |
|
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
223 |
|
void convertPhred(GStr& q); |
224 |
|
|
225 |
|
int main(int argc, char * const argv[]) { |
226 |
< |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:"); |
226 |
> |
GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:"); |
227 |
|
int e; |
228 |
|
if ((e=args.isError())>0) { |
229 |
|
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
234 |
|
convert_phred=(args.getOpt('Q')!=NULL); |
235 |
|
doCollapse=(args.getOpt('C')!=NULL); |
236 |
|
doDust=(args.getOpt('D')!=NULL); |
237 |
+ |
revCompl=(args.getOpt('R')!=NULL); |
238 |
+ |
if (args.getOpt('A')) doPolyTrim=false; |
239 |
|
/* |
240 |
|
rawFormat=(args.getOpt('R')!=NULL); |
241 |
|
if (rawFormat) { |
244 |
|
*/ |
245 |
|
prefix=args.getOpt('n'); |
246 |
|
GStr s=args.getOpt('l'); |
247 |
< |
if (!s.is_empty()) |
247 |
> |
if (!s.is_empty()) |
248 |
|
min_read_len=s.asInt(); |
249 |
|
s=args.getOpt('m'); |
250 |
< |
if (!s.is_empty()) |
250 |
> |
if (!s.is_empty()) |
251 |
|
max_perc_N=s.asDouble(); |
252 |
|
s=args.getOpt('d'); |
253 |
|
if (!s.is_empty()) { |
273 |
|
qv_phredtype=64; |
274 |
|
qv_cvtadd=-31; |
275 |
|
} |
276 |
< |
else |
276 |
> |
else |
277 |
|
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
278 |
|
} |
279 |
|
s=args.getOpt('f'); |
280 |
|
if (!s.is_empty()) { |
281 |
|
loadAdapters(s.chars()); |
282 |
|
} |
283 |
< |
bool fileAdapters=adapters.Count(); |
283 |
> |
bool fileAdapters=adapters5.Count()+adapters3.Count(); |
284 |
|
s=args.getOpt('5'); |
285 |
|
if (!s.is_empty()) { |
286 |
|
if (fileAdapters) |
287 |
|
GError("Error: options -5 and -f cannot be used together!\n"); |
288 |
|
s.upper(); |
289 |
< |
adapters.Add(new CAdapters(s.chars())); |
289 |
> |
addAdapter(adapters5, s); |
290 |
|
} |
291 |
|
s=args.getOpt('3'); |
292 |
|
if (!s.is_empty()) { |
293 |
|
if (fileAdapters) |
294 |
|
GError("Error: options -3 and -f cannot be used together!\n"); |
295 |
< |
s.upper(); |
296 |
< |
if (adapters.Count()>0) |
257 |
< |
adapters[0]->a3=s.chars(); |
258 |
< |
else adapters.Add(NULL, new CAdapters(s.chars())); |
295 |
> |
s.upper(); |
296 |
> |
addAdapter(adapters3, s); |
297 |
|
} |
298 |
|
s=args.getOpt('y'); |
299 |
|
if (!s.is_empty()) { |
300 |
|
int minmatch=s.asInt(); |
301 |
|
poly_minScore=minmatch*poly_m_score; |
302 |
|
} |
303 |
< |
|
303 |
> |
|
304 |
|
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
305 |
|
else outsuffix="-"; |
306 |
|
trashReport= (args.getOpt('r')!=NULL); |
317 |
|
if (trashReport) |
318 |
|
openfw(freport, args, 'r'); |
319 |
|
char* infile=NULL; |
320 |
+ |
|
321 |
+ |
if (adapters5.Count()>0) |
322 |
+ |
gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2); |
323 |
+ |
if (adapters3.Count()>0) |
324 |
+ |
gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop); |
325 |
+ |
|
326 |
|
while ((infile=args.nextNonOpt())!=NULL) { |
327 |
+ |
//for each input file |
328 |
|
int incounter=0; //counter for input reads |
329 |
|
int outcounter=0; //counter for output reads |
330 |
|
int trash_s=0; //too short from the get go |
331 |
|
int trash_Q=0; |
332 |
|
int trash_N=0; |
333 |
|
int trash_D=0; |
334 |
+ |
int trash_poly=0; |
335 |
|
int trash_A3=0; |
336 |
|
int trash_A5=0; |
337 |
|
s=infile; |
356 |
|
int a5=0, a3=0, b5=0, b3=0; |
357 |
|
char tcode=0, tcode2=0; |
358 |
|
tcode=process_read(seqid, rseq, rqv, a5, a3); |
359 |
< |
//if (!doCollapse) { |
314 |
< |
if (fq2!=NULL) { |
359 |
> |
if (fq2!=NULL) { |
360 |
|
getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2); |
361 |
|
if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) { |
362 |
|
GError("Error: no paired match for read %s vs %s (%s,%s)\n", |
388 |
|
int nocounter=0; |
389 |
|
writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter); |
390 |
|
} |
391 |
< |
} //paired read |
347 |
< |
// } |
391 |
> |
} //pair read |
392 |
|
if (tcode>1) { //trashed |
393 |
+ |
#ifdef GDEBUG |
394 |
+ |
GMessage(" !!!!TRASH => 'N'\n"); |
395 |
+ |
#endif |
396 |
|
if (tcode=='s') trash_s++; |
397 |
+ |
else if (tcode=='A' || tcode=='T') trash_poly++; |
398 |
|
else if (tcode=='Q') trash_Q++; |
399 |
|
else if (tcode=='N') trash_N++; |
400 |
|
else if (tcode=='D') trash_D++; |
407 |
|
rseq=rseq.substr(a5,a3-a5+1); |
408 |
|
if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1); |
409 |
|
} |
410 |
+ |
#ifdef GDEBUG |
411 |
+ |
GMessage(" After trimming:\n"); |
412 |
+ |
GMessage("%s\n",rseq.chars()); |
413 |
+ |
#endif |
414 |
|
writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter); |
415 |
|
} |
416 |
|
} //for each fastq record |
438 |
|
} |
439 |
|
} |
440 |
|
outcounter++; |
441 |
< |
if (qd->count>maxdup_count) { |
441 |
> |
if (qd->count>maxdup_count) { |
442 |
|
maxdup_count=qd->count; |
443 |
|
maxdup_seq=seq; |
444 |
|
} |
445 |
|
if (isfasta) { |
446 |
|
if (prefix.is_empty()) { |
447 |
< |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
447 |
> |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
448 |
|
rseq.chars()); |
449 |
|
} |
450 |
|
else { //use custom read name |
455 |
|
else { //fastq format |
456 |
|
if (convert_phred) convertPhred(qd->qv, qd->len); |
457 |
|
if (prefix.is_empty()) { |
458 |
< |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
458 |
> |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
459 |
|
rseq.chars(), qd->qv); |
460 |
|
} |
461 |
|
else { //use custom read name |
491 |
|
GMessage(" Trashed by N%%:%9d\n", trash_N); |
492 |
|
if (trash_Q>0) |
493 |
|
GMessage("Trashed by low quality:%9d\n", trash_Q); |
494 |
+ |
if (trash_poly>0) |
495 |
+ |
GMessage(" Trashed by poly-A/T:%9d\n", trash_poly); |
496 |
|
if (trash_A5>0) |
497 |
|
GMessage(" Trashed by 5' adapter:%9d\n", trash_A5); |
498 |
|
if (trash_A3>0) |
504 |
|
FWCLOSE(f_out); |
505 |
|
FWCLOSE(f_out2); |
506 |
|
} //while each input file |
507 |
< |
|
507 |
> |
delete gxmem_l; |
508 |
> |
delete gxmem_r; |
509 |
|
//getc(stdin); |
510 |
|
} |
511 |
|
|
520 |
|
const char* seq; |
521 |
|
bool valid; |
522 |
|
NData() { |
523 |
+ |
seqlen=0; |
524 |
|
NCount=0; |
525 |
|
end5=0; |
526 |
|
end3=0; |
551 |
|
perc_N=(n*100.0)/(end5-end3+1); |
552 |
|
} |
553 |
|
}; |
554 |
< |
|
554 |
> |
|
555 |
|
static NData feat; |
556 |
|
int perc_lenN=12; // incremental distance from ends, in percentage of |
557 |
|
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
558 |
< |
|
558 |
> |
|
559 |
|
void N_analyze(int l5, int l3, int p5, int p3) { |
560 |
|
/* assumes feat was filled properly */ |
561 |
|
int old_dif, t5,t3,v; |
562 |
|
if (l3<l5+2 || p5>p3 ) { |
563 |
|
feat.end5=l5+1; |
564 |
|
feat.end3=l3+1; |
565 |
< |
return; |
565 |
> |
return; |
566 |
|
} |
567 |
|
|
568 |
|
t5=feat.NPos[p5]-l5; |
569 |
|
t3=l3-feat.NPos[p3]; |
570 |
|
old_dif=p3-p5; |
571 |
|
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
572 |
< |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
572 |
> |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
573 |
|
if (t5 < v ) { |
574 |
|
l5=feat.NPos[p5]+1; |
575 |
|
p5++; |
586 |
|
feat.end3=l3+1; |
587 |
|
return; |
588 |
|
} |
589 |
< |
else |
589 |
> |
else |
590 |
|
N_analyze(l5,l3, p5,p3); |
591 |
|
} |
592 |
|
|
627 |
|
feat.init(rseq); |
628 |
|
l5=feat.end5-1; |
629 |
|
l3=feat.end3-1; |
630 |
< |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
630 |
> |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
631 |
|
if (l5==feat.end5-1 && l3==feat.end3-1) { |
632 |
|
if (feat.perc_N>max_perc_N) { |
633 |
|
feat.valid=false; |
645 |
|
return true; |
646 |
|
} |
647 |
|
feat.N_calc(); |
648 |
< |
|
648 |
> |
|
649 |
|
if (feat.perc_N>max_perc_N) { |
650 |
|
feat.valid=false; |
651 |
|
l3=l5+1; |
657 |
|
//--------------- dust functions ---------------- |
658 |
|
class DNADuster { |
659 |
|
public: |
660 |
< |
int dustword; |
661 |
< |
int dustwindow; |
662 |
< |
int dustwindow2; |
660 |
> |
int dustword; |
661 |
> |
int dustwindow; |
662 |
> |
int dustwindow2; |
663 |
|
int dustcutoff; |
664 |
|
int mv, iv, jv; |
665 |
|
int counts[32*32*32]; |
754 |
|
} |
755 |
|
} |
756 |
|
} |
757 |
< |
//return first; |
757 |
> |
//return first; |
758 |
|
} |
759 |
|
}; |
760 |
|
|
772 |
|
return ncount; |
773 |
|
} |
774 |
|
|
719 |
– |
int get3mer_value(const char* s) { |
720 |
– |
return (s[0]<<16)+(s[1]<<8)+s[2]; |
721 |
– |
} |
722 |
– |
|
723 |
– |
int w3_match(int qv, const char* str, int slen, int start_index=0) { |
724 |
– |
if (start_index>=slen || start_index<0) return -1; |
725 |
– |
for (int i=start_index;i<slen-3;i++) { |
726 |
– |
int rv=get3mer_value(str+i); |
727 |
– |
if (rv==qv) return i; |
728 |
– |
} |
729 |
– |
return -1; |
730 |
– |
} |
731 |
– |
|
732 |
– |
int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) { |
733 |
– |
if (end_index>=slen) return -1; |
734 |
– |
if (end_index<0) end_index=slen-1; |
735 |
– |
for (int i=end_index-2;i>=0;i--) { |
736 |
– |
int rv=get3mer_value(str+i); |
737 |
– |
if (rv==qv) return i; |
738 |
– |
} |
739 |
– |
return -1; |
740 |
– |
} |
741 |
– |
|
775 |
|
struct SLocScore { |
776 |
|
int pos; |
777 |
|
int score; |
790 |
|
}; |
791 |
|
|
792 |
|
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
793 |
+ |
if (!doPolyTrim) return false; |
794 |
|
int rlen=seq.length(); |
795 |
|
l5=0; |
796 |
|
l3=rlen-1; |
799 |
|
//assumes N trimming was already done |
800 |
|
//so a poly match should be very close to the end of the read |
801 |
|
// -- find the initial match (seed) |
802 |
< |
int lmin=GMAX((rlen-12), 0); |
802 |
> |
int lmin=GMAX((rlen-16), 0); |
803 |
|
int li; |
804 |
|
for (li=rlen-4;li>lmin;li--) { |
805 |
|
if (seedVal==*(int*)&(seq[li])) { |
813 |
|
SLocScore loc(ri, poly_m_score<<2); |
814 |
|
SLocScore maxloc(loc); |
815 |
|
//extend right |
816 |
< |
while (ri<rlen-2) { |
816 |
> |
while (ri<rlen-1) { |
817 |
|
ri++; |
818 |
|
if (seq[ri]==polyChar) { |
819 |
|
loc.add(ri,poly_m_score); |
830 |
|
} |
831 |
|
} |
832 |
|
ri=maxloc.pos; |
833 |
< |
if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end |
833 |
> |
if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end |
834 |
|
//ri = right boundary for the poly match |
835 |
|
//extend left |
836 |
|
loc.set(li, maxloc.score); |
851 |
|
maxloc=loc; |
852 |
|
} |
853 |
|
} |
854 |
< |
if (maxloc.score>poly_minScore && ri>=rlen-3) { |
855 |
< |
l5=li; |
856 |
< |
l3=ri; |
854 |
> |
li=maxloc.pos; |
855 |
> |
if ((maxloc.score==poly_minScore && ri==rlen-1) || |
856 |
> |
(maxloc.score>poly_minScore && ri>=rlen-3) || |
857 |
> |
(maxloc.score>(poly_minScore*3) && ri>=rlen-8)) { |
858 |
> |
//trimming this li-ri match at 3' end |
859 |
> |
l3=li-1; |
860 |
> |
if (l3<0) l3=0; |
861 |
|
return true; |
862 |
|
} |
863 |
|
return false; |
864 |
|
} |
865 |
|
|
828 |
– |
|
866 |
|
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
867 |
+ |
if (!doPolyTrim) return false; |
868 |
|
int rlen=seq.length(); |
869 |
|
l5=0; |
870 |
|
l3=rlen-1; |
873 |
|
//assumes N trimming was already done |
874 |
|
//so a poly match should be very close to the end of the read |
875 |
|
// -- find the initial match (seed) |
876 |
< |
int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds |
876 |
> |
int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds |
877 |
|
int li; |
878 |
|
for (li=0;li<=lmax;li++) { |
879 |
|
if (seedVal==*(int*)&(seq[li])) { |
903 |
|
} |
904 |
|
} |
905 |
|
li=maxloc.pos; |
906 |
< |
if (li>3) return false; //no trimming wanted, too far from 5' end |
906 |
> |
if (li>5) return false; //no trimming wanted, too far from 5' end |
907 |
|
//li = right boundary for the poly match |
908 |
|
|
909 |
|
//extend right |
910 |
|
loc.set(ri, maxloc.score); |
911 |
|
maxloc.pos=ri; |
912 |
< |
while (ri<rlen-2) { |
912 |
> |
while (ri<rlen-1) { |
913 |
|
ri++; |
914 |
|
if (seq[ri]==polyChar) { |
915 |
|
loc.add(ri,poly_m_score); |
925 |
|
maxloc=loc; |
926 |
|
} |
927 |
|
} |
928 |
< |
|
929 |
< |
if (maxloc.score>poly_minScore && li<=3) { |
930 |
< |
l5=li; |
931 |
< |
l3=ri; |
928 |
> |
ri=maxloc.pos; |
929 |
> |
if ((maxloc.score==poly_minScore && li==0) || |
930 |
> |
(maxloc.score>poly_minScore && li<2) |
931 |
> |
|| (maxloc.score>(poly_minScore*3) && li<8)) { |
932 |
> |
//adjust l5 to reflect this trimming of 5' end |
933 |
> |
l5=ri+1; |
934 |
> |
if (l5>rlen-1) l5=rlen-1; |
935 |
|
return true; |
936 |
|
} |
937 |
|
return false; |
938 |
|
} |
939 |
|
|
940 |
|
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
941 |
+ |
if (adapters3.Count()==0) return false; |
942 |
|
int rlen=seq.length(); |
943 |
|
l5=0; |
944 |
|
l3=rlen-1; |
945 |
< |
//first try a full match, we might get lucky |
946 |
< |
int fi=-1; |
947 |
< |
if ((fi=seq.index(adapter3))>=0) { |
948 |
< |
if (fi<rlen-fi-a3len) {//match is closer to the right end |
949 |
< |
l5=fi+a3len; |
950 |
< |
l3=rlen-1; |
951 |
< |
} |
952 |
< |
else { |
953 |
< |
l5=0; |
954 |
< |
l3=fi-1; |
955 |
< |
} |
956 |
< |
return true; |
957 |
< |
} |
958 |
< |
#ifdef DEBUG |
959 |
< |
if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars()); |
960 |
< |
#endif |
961 |
< |
|
962 |
< |
//also, for fast detection of other adapter-only reads that start past |
921 |
< |
// the beginning of the adapter sequence, try to see if the first a3len-4 |
922 |
< |
// bases of the read are a substring of the adapter |
923 |
< |
if (rlen>a3len-3) { |
924 |
< |
GStr rstart=seq.substr(1,a3len-4); |
925 |
< |
if ((fi=adapter3.index(rstart))>=0) { |
926 |
< |
l3=rlen-1; |
927 |
< |
l5=a3len-4; |
928 |
< |
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
929 |
< |
return true; |
930 |
< |
} |
931 |
< |
} |
932 |
< |
CSegChain a3segs; //no chains here, just an ordered collection of segment pairs |
933 |
< |
//check the easy cases - 11 bases exact match at the end |
934 |
< |
int fdlen=11; |
935 |
< |
if (a3len<16) { |
936 |
< |
fdlen=a3len>>1; |
937 |
< |
} |
938 |
< |
if (fdlen>4) { |
939 |
< |
//check if we're lucky enough to have the last 11 bases of the read a part of the adapter |
940 |
< |
GStr rstart=seq.substr(-fdlen-3,fdlen); |
941 |
< |
if ((fi=adapter3.index(rstart))>=0) { |
942 |
< |
#ifdef DEBUG |
943 |
< |
if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars()); |
944 |
< |
#endif |
945 |
< |
if (extendMatch(seq.chars(), rlen, rlen-fdlen-3, |
946 |
< |
adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs)) |
947 |
< |
return true; |
948 |
< |
} |
949 |
< |
//another easy case: first 11 characters of the adaptor found as a substring of the read |
950 |
< |
GStr bstr=adapter3.substr(0, fdlen); |
951 |
< |
if ((fi=seq.rindex(bstr))>=0) { |
952 |
< |
#ifdef DEBUG |
953 |
< |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
954 |
< |
#endif |
955 |
< |
if (extendMatch(seq.chars(), rlen, fi, |
956 |
< |
adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs)) |
957 |
< |
return true; |
958 |
< |
} |
959 |
< |
} //tried to match 11 bases first |
960 |
< |
|
961 |
< |
//no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor |
962 |
< |
//-- only extend if the match is in the 3' (ending) region of the read |
963 |
< |
int wordSize=3; |
964 |
< |
int hlen=12; |
965 |
< |
if (hlen>a3len-wordSize) hlen=a3len-wordSize; |
966 |
< |
int imin=rlen>>1; //last half of the read, left boundary for the wmer match |
967 |
< |
if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); } |
968 |
< |
imin=rlen-imin; |
969 |
< |
for (int iw=0;iw<hlen;iw++) { |
970 |
< |
//int32* qv=(int32*)(adapter3.chars()+iw); |
971 |
< |
int qv=get3mer_value(adapter3.chars()+iw); |
972 |
< |
fi=-1; |
973 |
< |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
974 |
< |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
975 |
< |
//GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin); |
976 |
< |
|
977 |
< |
#ifdef DEBUG |
978 |
< |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars()); |
979 |
< |
#endif |
980 |
< |
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
981 |
< |
a3len, iw, wordSize, l5,l3, a3segs)) return true; |
982 |
< |
fi--; |
983 |
< |
if (fi<imin) break; |
984 |
< |
} |
985 |
< |
} //for each wmer in the first hlen bases of the adaptor |
986 |
< |
/* |
987 |
< |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
988 |
< |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
989 |
< |
if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false; |
990 |
< |
int hlen2=a3len-wordSize; |
991 |
< |
//if (hlen2>a3len-4) hlen2=a3len-4; |
992 |
< |
if (hlen2>hlen) { |
993 |
< |
#ifdef DEBUG |
994 |
< |
if (debug && a3segs.Count()>0) { |
995 |
< |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
996 |
< |
} |
997 |
< |
#endif |
998 |
< |
for (int iw=hlen;iw<hlen2;iw++) { |
999 |
< |
//int* qv=(int32 *)(adapter3.chars()+iw); |
1000 |
< |
int qv=get3mer_value(adapter3.chars()+iw); |
1001 |
< |
fi=-1; |
1002 |
< |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1003 |
< |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1004 |
< |
extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1005 |
< |
a3len, iw, wordSize, l5,l3, a3segs); |
1006 |
< |
fi--; |
1007 |
< |
if (fi<imin) break; |
1008 |
< |
} |
1009 |
< |
} //for each wmer between hlen2 and hlen bases of the adaptor |
945 |
> |
bool trimmed=false; |
946 |
> |
GStr wseq(seq.chars()); |
947 |
> |
int wlen=rlen; |
948 |
> |
for (int ai=0;ai<adapters3.Count();ai++) { |
949 |
> |
//if (adapters3[ai].is_empty()) continue; |
950 |
> |
int alen=adapters3[ai].seq.length(); |
951 |
> |
GStr& aseq=adapters3[ai].seq; |
952 |
> |
GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz, |
953 |
> |
wseq.chars(), wlen, gxmem_r, 74); |
954 |
> |
if (r_bestaln) { |
955 |
> |
trimmed=true; |
956 |
> |
//keep unmatched region on the left, if any |
957 |
> |
l3-=(wlen-r_bestaln->sl+1); |
958 |
> |
delete r_bestaln; |
959 |
> |
if (l3<0) l3=0; |
960 |
> |
if (l3-l5+1<min_read_len) return true; |
961 |
> |
wseq=seq.substr(l5,l3-l5+1); |
962 |
> |
wlen=wseq.length(); |
963 |
|
} |
964 |
< |
//lastly, analyze collected a3segs for a possible gapped alignment: |
965 |
< |
GList<CSegChain> segchains(false,true,false); |
1013 |
< |
#ifdef DEBUG |
1014 |
< |
if (debug && a3segs.Count()>0) { |
1015 |
< |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1016 |
< |
} |
1017 |
< |
#endif |
1018 |
< |
for (int i=0;i<a3segs.Count();i++) { |
1019 |
< |
if (a3segs[i]->chain==NULL) { |
1020 |
< |
if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain |
1021 |
< |
CSegChain* newchain=new CSegChain(); |
1022 |
< |
newchain->setFreeItem(false); |
1023 |
< |
newchain->addSegPair(a3segs[i]); |
1024 |
< |
a3segs[i]->chain=newchain; |
1025 |
< |
segchains.Add(newchain); //just to free them when done |
1026 |
< |
} |
1027 |
< |
for (int j=i+1;j<a3segs.Count();j++) { |
1028 |
< |
CSegChain* chain=a3segs[i]->chain; |
1029 |
< |
if (chain->extendChain(a3segs[j])) { |
1030 |
< |
a3segs[j]->chain=chain; |
1031 |
< |
#ifdef DEBUG |
1032 |
< |
if (debug) dbgPrintChain(*chain, adapter3.chars()); |
1033 |
< |
#endif |
1034 |
< |
//save time by checking here if the extended chain is already acceptable for trimming |
1035 |
< |
if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) { |
1036 |
< |
l5=0; |
1037 |
< |
l3=chain->astart-2; |
1038 |
< |
#ifdef DEBUG |
1039 |
< |
if (debug && a3segs.Count()>0) { |
1040 |
< |
GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars()); |
1041 |
< |
} |
1042 |
< |
#endif |
1043 |
< |
return true; |
1044 |
< |
} |
1045 |
< |
} //chain can be extended |
1046 |
< |
} |
1047 |
< |
} //collect segment alignments into chains |
1048 |
< |
*/ |
1049 |
< |
return false; //no adapter parts found |
964 |
> |
}//for each 5' adapter |
965 |
> |
return trimmed; |
966 |
|
} |
967 |
|
|
968 |
|
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
969 |
< |
//if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars()); |
969 |
> |
if (adapters5.Count()==0) return false; |
970 |
|
int rlen=seq.length(); |
971 |
|
l5=0; |
972 |
|
l3=rlen-1; |
973 |
< |
//try to see if adapter is fully included in the read |
974 |
< |
int fi=-1; |
975 |
< |
if ((fi=seq.index(adapter5))>=0) { |
976 |
< |
if (fi<rlen-fi-a5len) {//match is closer to the right end |
977 |
< |
l5=fi+a5len; |
978 |
< |
l3=rlen-1; |
979 |
< |
} |
980 |
< |
else { |
981 |
< |
l5=0; |
982 |
< |
l3=fi-1; |
983 |
< |
} |
984 |
< |
return true; |
985 |
< |
} |
986 |
< |
#ifdef DEBUG |
987 |
< |
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
988 |
< |
#endif |
989 |
< |
|
1074 |
< |
CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded |
1075 |
< |
|
1076 |
< |
//try the easy way out first - look for an exact match of 11 bases |
1077 |
< |
int fdlen=11; |
1078 |
< |
if (a5len<16) { |
1079 |
< |
fdlen=a5len>>1; |
1080 |
< |
} |
1081 |
< |
if (fdlen>4) { |
1082 |
< |
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1083 |
< |
if ((fi=adapter5.index(rstart))>=0) { |
1084 |
< |
#ifdef DEBUG |
1085 |
< |
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1086 |
< |
#endif |
1087 |
< |
if (extendMatch(seq.chars(), rlen, 1, |
1088 |
< |
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1089 |
< |
return true; |
1090 |
< |
} |
1091 |
< |
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1092 |
< |
GStr bstr=adapter5.substr(-fdlen); |
1093 |
< |
if ((fi=seq.index(bstr))>=0) { |
1094 |
< |
#ifdef DEBUG |
1095 |
< |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1096 |
< |
#endif |
1097 |
< |
if (extendMatch(seq.chars(), rlen, fi, |
1098 |
< |
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1099 |
< |
return true; |
1100 |
< |
} |
1101 |
< |
} //tried to matching at most 11 bases first |
1102 |
< |
|
1103 |
< |
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1104 |
< |
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1105 |
< |
int wordSize=3; |
1106 |
< |
int hlen=12; |
1107 |
< |
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1108 |
< |
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1109 |
< |
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1110 |
< |
for (int iw=0;iw<=hlen;iw++) { |
1111 |
< |
int apstart=a5len-iw-wordSize; |
1112 |
< |
fi=0; |
1113 |
< |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1114 |
< |
int qv=get3mer_value(adapter5.chars()+apstart); |
1115 |
< |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1116 |
< |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1117 |
< |
#ifdef DEBUG |
1118 |
< |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1119 |
< |
#endif |
1120 |
< |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1121 |
< |
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1122 |
< |
fi++; |
1123 |
< |
if (fi>imax) break; |
1124 |
< |
} |
1125 |
< |
} //for each wmer in the last hlen bases of the adaptor |
1126 |
< |
/* |
1127 |
< |
|
1128 |
< |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1129 |
< |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1130 |
< |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1131 |
< |
int hlen2=a5len-wordSize; |
1132 |
< |
//if (hlen2>a5len-wordSize) hlen2=a5len-wordSize; |
1133 |
< |
#ifdef DEBUG |
1134 |
< |
if (debug && a5segs.Count()>0) { |
1135 |
< |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1136 |
< |
} |
1137 |
< |
#endif |
1138 |
< |
if (hlen2>hlen) { |
1139 |
< |
for (int iw=hlen+1;iw<=hlen2;iw++) { |
1140 |
< |
int apstart=a5len-iw-wordSize; |
1141 |
< |
fi=0; |
1142 |
< |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1143 |
< |
int qv=get3mer_value(adapter5.chars()+apstart); |
1144 |
< |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1145 |
< |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1146 |
< |
extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1147 |
< |
a5len, apstart, wordSize, l5,l3, a5segs, true); |
1148 |
< |
fi++; |
1149 |
< |
if (fi>imax) break; |
1150 |
< |
} |
1151 |
< |
} //for each wmer between hlen2 and hlen bases of the adaptor |
973 |
> |
bool trimmed=false; |
974 |
> |
GStr wseq(seq.chars()); |
975 |
> |
int wlen=rlen; |
976 |
> |
for (int ai=0;ai<adapters5.Count();ai++) { |
977 |
> |
//if (adapters5[ai].is_empty()) continue; |
978 |
> |
int alen=adapters5[ai].seq.length(); |
979 |
> |
GStr& aseq=adapters5[ai].seq; |
980 |
> |
GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz, |
981 |
> |
wseq.chars(), wlen, gxmem_l, 84); |
982 |
> |
if (l_bestaln) { |
983 |
> |
trimmed=true; |
984 |
> |
l5+=l_bestaln->sr; |
985 |
> |
delete l_bestaln; |
986 |
> |
if (l5>=rlen) l5=rlen-1; |
987 |
> |
if (l3-l5+1<min_read_len) return true; |
988 |
> |
wseq=seq.substr(l5,l3-l5+1); |
989 |
> |
wlen=wseq.length(); |
990 |
|
} |
991 |
< |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
992 |
< |
// lastly, analyze collected a5segs for a possible gapped alignment: |
993 |
< |
GList<CSegChain> segchains(false,true,false); |
1156 |
< |
#ifdef DEBUG |
1157 |
< |
if (debug && a5segs.Count()>0) { |
1158 |
< |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1159 |
< |
} |
1160 |
< |
#endif |
1161 |
< |
for (int i=0;i<a5segs.Count();i++) { |
1162 |
< |
if (a5segs[i]->chain==NULL) { |
1163 |
< |
if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain |
1164 |
< |
CSegChain* newchain=new CSegChain(true); |
1165 |
< |
newchain->setFreeItem(false); |
1166 |
< |
newchain->addSegPair(a5segs[i]); |
1167 |
< |
a5segs[i]->chain=newchain; |
1168 |
< |
segchains.Add(newchain); //just to free them when done |
1169 |
< |
} |
1170 |
< |
for (int j=i+1;j<a5segs.Count();j++) { |
1171 |
< |
CSegChain* chain=a5segs[i]->chain; |
1172 |
< |
if (chain->extendChain(a5segs[j])) { |
1173 |
< |
a5segs[j]->chain=chain; |
1174 |
< |
#ifdef DEBUG |
1175 |
< |
if (debug) dbgPrintChain(*chain, adapter5.chars()); |
1176 |
< |
#endif |
1177 |
< |
//save time by checking here if the extended chain is already acceptable for trimming |
1178 |
< |
if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) { |
1179 |
< |
l5=chain->aend; |
1180 |
< |
l3=rlen-1; |
1181 |
< |
return true; |
1182 |
< |
} |
1183 |
< |
} //chain can be extended |
1184 |
< |
} |
1185 |
< |
} //collect segment alignments into chains |
1186 |
< |
*/ |
1187 |
< |
return false; //no adapter parts found |
1188 |
< |
} |
991 |
> |
}//for each 5' adapter |
992 |
> |
return trimmed; |
993 |
> |
} |
994 |
|
|
995 |
< |
//convert qvs to/from phred64 from/to phread33 |
995 |
> |
//convert qvs to/from phred64 from/to phread33 |
996 |
|
void convertPhred(GStr& q) { |
997 |
|
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
998 |
|
} |
1001 |
|
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1002 |
|
} |
1003 |
|
|
1004 |
< |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1004 |
> |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1005 |
|
GStr& rname, GStr& rinfo, GStr& infname) { |
1006 |
|
rseq=""; |
1007 |
|
rqv=""; |
1017 |
|
} //raw qseq format |
1018 |
|
else { // FASTQ or FASTA */ |
1019 |
|
isfasta=(l[0]=='>'); |
1020 |
< |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1020 |
> |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1021 |
|
infname.chars(), l); |
1022 |
|
GStr s(l); |
1023 |
|
rname=&(l[1]); |
1024 |
|
for (int i=0;i<rname.length();i++) |
1025 |
< |
if (rname[i]<=' ') { |
1026 |
< |
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1027 |
< |
rname.cut(i); |
1028 |
< |
break; |
1025 |
> |
if (rname[i]<=' ') { |
1026 |
> |
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1027 |
> |
rname.cut(i); |
1028 |
> |
break; |
1029 |
|
} |
1030 |
|
//now get the sequence |
1031 |
< |
if ((l=fq.getLine())==NULL) |
1031 |
> |
if ((l=fq.getLine())==NULL) |
1032 |
|
GError("Error: unexpected EOF after header for read %s (%s)\n", |
1033 |
|
rname.chars(), infname.chars()); |
1034 |
|
rseq=l; //this must be the DNA line |
1035 |
|
while ((l=fq.getLine())!=NULL) { |
1036 |
|
//seq can span multiple lines |
1037 |
|
if (l[0]=='>' || l[0]=='+') { |
1038 |
< |
fq.pushBack(); |
1038 |
> |
fq.pushBack(); |
1039 |
|
break; // |
1040 |
|
} |
1041 |
|
rseq+=l; |
1042 |
< |
} //check for multi-line seq |
1042 |
> |
} //check for multi-line seq |
1043 |
|
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
1044 |
|
if ((l=fq.getLine())==NULL) |
1045 |
|
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
1046 |
|
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
1047 |
< |
if ((l=fq.getLine())==NULL) |
1047 |
> |
if ((l=fq.getLine())==NULL) |
1048 |
|
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
1049 |
|
rqv=l; |
1050 |
< |
//if (rqv.length()!=rseq.length()) |
1050 |
> |
//if (rqv.length()!=rseq.length()) |
1051 |
|
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
1052 |
|
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
1053 |
|
rqv+=l; //append to qv string |
1058 |
|
return true; |
1059 |
|
} |
1060 |
|
|
1061 |
+ |
#ifdef GDEBUG |
1062 |
+ |
void showTrim(GStr& s, int l5, int l3) { |
1063 |
+ |
if (l5>0) { |
1064 |
+ |
color_bg(c_red); |
1065 |
+ |
} |
1066 |
+ |
for (int i=0;i<s.length()-1;i++) { |
1067 |
+ |
if (i && i==l5) color_resetbg(); |
1068 |
+ |
fprintf(stderr, "%c", s[i]); |
1069 |
+ |
if (i==l3) color_bg(c_red); |
1070 |
+ |
} |
1071 |
+ |
fprintf(stderr, "%c", s[s.length()-1]); |
1072 |
+ |
color_reset(); |
1073 |
+ |
fprintf(stderr, "\n"); |
1074 |
+ |
} |
1075 |
+ |
#endif |
1076 |
+ |
|
1077 |
|
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) { |
1078 |
< |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1078 |
> |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1079 |
|
// and a trash code if it was trashed |
1080 |
|
l5=0; |
1081 |
|
l3=rseq.length()-1; |
1082 |
+ |
#ifdef GDEBUG |
1083 |
+ |
//rseq.reverse(); |
1084 |
+ |
GMessage(">%s\n", rname.chars()); |
1085 |
+ |
GMessage("%s\n",rseq.chars()); |
1086 |
+ |
#endif |
1087 |
|
if (l3-l5+1<min_read_len) { |
1088 |
|
return 's'; |
1089 |
|
} |
1119 |
|
w5=0; |
1120 |
|
w3=wseq.length()-1; |
1121 |
|
} |
1122 |
< |
if (a3len>0) { |
1123 |
< |
if (trim_adapter3(wseq, w5, w3)) { |
1122 |
> |
char trim_code; |
1123 |
> |
do { |
1124 |
> |
trim_code=0; |
1125 |
> |
if (trim_poly5(wseq, w5, w3, polyA_seed)) { |
1126 |
> |
trim_code='A'; |
1127 |
> |
} |
1128 |
> |
else if (trim_poly5(wseq, w5, w3, polyT_seed)) { |
1129 |
> |
trim_code='T'; |
1130 |
> |
} |
1131 |
> |
else if (trim_adapter5(wseq, w5, w3)) { |
1132 |
> |
trim_code='5'; |
1133 |
> |
} |
1134 |
> |
if (trim_code) { |
1135 |
> |
#ifdef GDEBUG |
1136 |
> |
GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3); |
1137 |
> |
showTrim(wseq, w5, w3); |
1138 |
> |
#endif |
1139 |
|
int trimlen=wseq.length()-(w3-w5+1); |
1140 |
< |
num_trimmed3++; |
1141 |
< |
if (trimlen<min_trimmed3) |
1142 |
< |
min_trimmed3=trimlen; |
1140 |
> |
num_trimmed5++; |
1141 |
> |
if (trimlen<min_trimmed5) |
1142 |
> |
min_trimmed5=trimlen; |
1143 |
|
l5+=w5; |
1144 |
|
l3-=(wseq.length()-1-w3); |
1145 |
|
if (w3-w5+1<min_read_len) { |
1146 |
< |
return '3'; |
1146 |
> |
return trim_code; |
1147 |
|
} |
1148 |
|
//-- keep only the w5..w3 range |
1149 |
|
wseq=wseq.substr(w5, w3-w5+1); |
1150 |
|
if (!wqv.is_empty()) |
1151 |
|
wqv=wqv.substr(w5, w3-w5+1); |
1152 |
< |
}//some adapter was trimmed |
1153 |
< |
} //adapter trimming |
1154 |
< |
if (a5len>0) { |
1155 |
< |
if (trim_adapter5(wseq, w5, w3)) { |
1152 |
> |
}// trimmed at 5' end |
1153 |
> |
} while (trim_code); |
1154 |
> |
|
1155 |
> |
do { |
1156 |
> |
trim_code=0; |
1157 |
> |
if (trim_poly3(wseq, w5, w3, polyA_seed)) { |
1158 |
> |
trim_code='A'; |
1159 |
> |
} |
1160 |
> |
else if (trim_poly3(wseq, w5, w3, polyT_seed)) { |
1161 |
> |
trim_code='T'; |
1162 |
> |
} |
1163 |
> |
else if (trim_adapter3(wseq, w5, w3)) { |
1164 |
> |
trim_code='3'; |
1165 |
> |
} |
1166 |
> |
if (trim_code) { |
1167 |
> |
#ifdef GDEBUG |
1168 |
> |
GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3); |
1169 |
> |
showTrim(wseq, w5, w3); |
1170 |
> |
#endif |
1171 |
|
int trimlen=wseq.length()-(w3-w5+1); |
1172 |
< |
num_trimmed5++; |
1173 |
< |
if (trimlen<min_trimmed5) |
1174 |
< |
min_trimmed5=trimlen; |
1172 |
> |
num_trimmed3++; |
1173 |
> |
if (trimlen<min_trimmed3) |
1174 |
> |
min_trimmed3=trimlen; |
1175 |
|
l5+=w5; |
1176 |
|
l3-=(wseq.length()-1-w3); |
1177 |
|
if (w3-w5+1<min_read_len) { |
1178 |
< |
return '5'; |
1178 |
> |
return trim_code; |
1179 |
|
} |
1180 |
|
//-- keep only the w5..w3 range |
1181 |
|
wseq=wseq.substr(w5, w3-w5+1); |
1182 |
|
if (!wqv.is_empty()) |
1183 |
|
wqv=wqv.substr(w5, w3-w5+1); |
1184 |
< |
}//some adapter was trimmed |
1185 |
< |
} //adapter trimming |
1184 |
> |
}//trimming at 3' end |
1185 |
> |
} while (trim_code); |
1186 |
> |
|
1187 |
> |
|
1188 |
|
if (doCollapse) { |
1189 |
|
//keep read for later |
1190 |
|
FqDupRec* dr=dhash.Find(wseq.chars()); |
1191 |
|
if (dr==NULL) { //new entry |
1192 |
< |
//if (prefix.is_empty()) |
1193 |
< |
dhash.Add(wseq.chars(), |
1192 |
> |
//if (prefix.is_empty()) |
1193 |
> |
dhash.Add(wseq.chars(), |
1194 |
|
new FqDupRec(&wqv, rname.chars())); |
1195 |
|
//else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length())); |
1196 |
|
} |
1224 |
|
fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now |
1225 |
|
} |
1226 |
|
else { |
1227 |
< |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1227 |
> |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1228 |
|
rseq.chars()); |
1229 |
|
} |
1230 |
|
} |
1235 |
|
fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars()); |
1236 |
|
} |
1237 |
|
else |
1238 |
< |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1238 |
> |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1239 |
|
rseq.chars(),rqv.chars() ); |
1240 |
|
} |
1241 |
|
} |
1243 |
|
void trash_report(char trashcode, GStr& rname, FILE* freport) { |
1244 |
|
if (freport==NULL || trashcode<=' ') return; |
1245 |
|
if (trashcode=='3' || trashcode=='5') { |
1246 |
< |
fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode); |
1246 |
> |
fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode); |
1247 |
|
} |
1248 |
|
else { |
1249 |
|
fprintf(freport, "%s\t%c\n",rname.chars(),trashcode); |
1335 |
|
} |
1336 |
|
} |
1337 |
|
|
1338 |
+ |
void addAdapter(GVec<CASeqData>& adapters, GStr& seq) { |
1339 |
+ |
//TODO: prepare CASeqData here, and collect hexamers as well |
1340 |
+ |
CASeqData adata(revCompl); |
1341 |
+ |
int idx=adapters.Add(adata); |
1342 |
+ |
if (idx<0) GError("Error: failed to add adaptor!\n"); |
1343 |
+ |
adapters[idx].update(seq.chars()); |
1344 |
+ |
} |
1345 |
+ |
|
1346 |
|
|
1347 |
|
int loadAdapters(const char* fname) { |
1348 |
|
GLineReader lr(fname); |
1349 |
|
char* l; |
1350 |
|
while ((l=lr.nextLine())!=NULL) { |
1351 |
|
if (lr.length()<=3 || l[0]=='#') continue; |
1352 |
< |
char* s5; |
1353 |
< |
char* s3; |
1488 |
< |
if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') { |
1352 |
> |
if ( l[0]==' ' || l[0]=='\t' || l[0]==',' || |
1353 |
> |
l[0]==';'|| l[0]==':' ) { |
1354 |
|
int i=1; |
1355 |
|
while (l[i]!=0 && isspace(l[i])) { |
1356 |
|
i++; |
1357 |
|
} |
1358 |
|
if (l[i]!=0) { |
1359 |
< |
adapters.Add(new CAdapters(NULL, &(l[i]))); |
1359 |
> |
GStr s(&(l[i])); |
1360 |
> |
#ifdef GDEBUG |
1361 |
> |
//s.reverse(); |
1362 |
> |
#endif |
1363 |
> |
addAdapter(adapters3, s); |
1364 |
|
continue; |
1365 |
|
} |
1366 |
|
} |
1367 |
|
else { |
1368 |
< |
p=l; |
1369 |
< |
while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) |
1370 |
< |
delpos++; |
1371 |
< |
s5.startTokenize("\t ;,:",); |
1372 |
< |
s5.nextToken(s3); |
1368 |
> |
GStr s(l); |
1369 |
> |
s.startTokenize("\t ;,:"); |
1370 |
> |
GStr a5,a3; |
1371 |
> |
if (s.nextToken(a5)) |
1372 |
> |
s.nextToken(a3); |
1373 |
> |
a5.upper(); |
1374 |
> |
a3.upper(); |
1375 |
> |
#ifdef GDEBUG |
1376 |
> |
// a5.reverse(); |
1377 |
> |
// a3.reverse(); |
1378 |
> |
#endif |
1379 |
> |
addAdapter(adapters5, a5); |
1380 |
> |
addAdapter(adapters3, a3); |
1381 |
|
} |
1382 |
|
} |
1383 |
< |
|
1383 |
> |
return adapters5.Count()+adapters3.Count(); |
1384 |
|
} |
1385 |
|
|
1386 |
< |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1387 |
< |
GStr& s, GStr& infname, GStr& infname2) { |
1386 |
> |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1387 |
> |
GStr& s, GStr& infname, GStr& infname2) { |
1388 |
|
// uses outsuffix to generate output file names and open file handles as needed |
1389 |
|
infname=""; |
1390 |
|
infname2=""; |
1412 |
|
s.startTokenize(",:"); |
1413 |
|
s.nextToken(infname); |
1414 |
|
bool paired=s.nextToken(infname2); |
1415 |
< |
if (fileExists(infname.chars())==0) |
1415 |
> |
if (fileExists(infname.chars())==0) |
1416 |
|
GError("Error: cannot find file %s!\n",infname.chars()); |
1417 |
|
GStr fname(getFileName(infname.chars())); |
1418 |
|
GStr picmd; |
1434 |
|
if (!paired) return; |
1435 |
|
if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n"); |
1436 |
|
// ---- paired reads:------------- |
1437 |
< |
if (fileExists(infname2.chars())==0) |
1437 |
> |
if (fileExists(infname2.chars())==0) |
1438 |
|
GError("Error: cannot find file %s!\n",infname2.chars()); |
1439 |
|
picmd=""; |
1440 |
|
GStr fname2(getFileName(infname2.chars())); |