1 |
gpertea |
4 |
#include "GArgs.h" |
2 |
|
|
#include "GStr.h" |
3 |
|
|
#include "GHash.hh" |
4 |
|
|
#include "GList.hh" |
5 |
|
|
|
6 |
|
|
#define USAGE "Usage:\n\ |
7 |
gpertea |
12 |
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\ |
8 |
|
|
[-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\ |
9 |
|
|
[-Q] <input.fq>\n\ |
10 |
gpertea |
4 |
\n\ |
11 |
gpertea |
12 |
Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\ |
12 |
|
|
for low complexity and collapse duplicate reads\n\ |
13 |
gpertea |
4 |
\n\ |
14 |
|
|
Options:\n\ |
15 |
|
|
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
16 |
|
|
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
17 |
|
|
the read duplication count\n\ |
18 |
|
|
-o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\ |
19 |
|
|
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
20 |
|
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
21 |
|
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
22 |
|
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
23 |
gpertea |
12 |
-q trim bases with quality value lower than <minq> (starting the 3' end)\n\ |
24 |
|
|
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
25 |
|
|
-l minimum \"clean\" length after trimming that a read must have\n\ |
26 |
|
|
in order to pass the filter (default: 16)\n\ |
27 |
gpertea |
4 |
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
28 |
|
|
one letter code for the reason why they were trashed\n\ |
29 |
gpertea |
12 |
-D apply a low-complexity (dust) filter and discard any read that has over\n\ |
30 |
|
|
50% of its length detected as low complexity\n\ |
31 |
gpertea |
4 |
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
32 |
gpertea |
12 |
name (e.g. for microRNAs)\n\ |
33 |
|
|
-p input is phred64/phred33 (use -p64 or -p33)\n\ |
34 |
|
|
-Q convert quality values to the other Phred qv type\n\ |
35 |
|
|
-V verbose processing\n\ |
36 |
gpertea |
4 |
" |
37 |
|
|
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
38 |
gpertea |
12 |
|
39 |
|
|
//For pair ends sequencing: |
40 |
|
|
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
41 |
|
|
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
42 |
gpertea |
4 |
FILE* f_out=NULL; //stdout if not provided |
43 |
|
|
FILE* f_in=NULL; //input fastq (stdin if not provided) |
44 |
|
|
FILE* freport=NULL; |
45 |
|
|
bool debug=false; |
46 |
gpertea |
12 |
bool verbose=false; |
47 |
gpertea |
4 |
bool doCollapse=false; |
48 |
|
|
bool doDust=false; |
49 |
|
|
bool trashReport=false; |
50 |
gpertea |
12 |
//bool rawFormat=false; |
51 |
gpertea |
4 |
int min_read_len=16; |
52 |
gpertea |
12 |
double max_perc_N=7.0; |
53 |
gpertea |
4 |
int dust_cutoff=16; |
54 |
|
|
bool isfasta=false; |
55 |
gpertea |
12 |
bool convert_phred=false; |
56 |
gpertea |
4 |
GStr prefix; |
57 |
|
|
GStr adapter3; |
58 |
|
|
GStr adapter5; |
59 |
gpertea |
12 |
|
60 |
|
|
int qvtrim_min=0; |
61 |
|
|
|
62 |
|
|
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
63 |
|
|
int qv_cvtadd=0; //could be -31 or +31 |
64 |
|
|
|
65 |
gpertea |
4 |
int a3len=0; |
66 |
|
|
int a5len=0; |
67 |
|
|
// adaptor matching metrics -- for extendMatch() function |
68 |
gpertea |
12 |
const int a_m_score=2; //match score |
69 |
|
|
const int a_mis_score=-3; //mismatch |
70 |
|
|
const int a_dropoff_score=7; |
71 |
|
|
const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed |
72 |
|
|
const int a_min_chain_score=15; //for gapped alignments |
73 |
gpertea |
4 |
|
74 |
gpertea |
12 |
class CSegChain; |
75 |
|
|
|
76 |
|
|
class CSegPair { |
77 |
|
|
public: |
78 |
|
|
GSeg a; |
79 |
|
|
GSeg b; //the adapter segment |
80 |
|
|
int score; |
81 |
|
|
int flags; |
82 |
|
|
CSegChain* chain; |
83 |
|
|
CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) { |
84 |
|
|
score=mscore; |
85 |
|
|
if (score==0) score=a.len()*a_m_score; |
86 |
|
|
flags=0; |
87 |
|
|
chain=NULL; |
88 |
|
|
} |
89 |
|
|
int len() { return a.len(); } |
90 |
|
|
bool operator==(CSegPair& d){ |
91 |
|
|
//return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end); |
92 |
|
|
//make equal even segments that are included into one another: |
93 |
|
|
return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end); |
94 |
|
|
} |
95 |
|
|
bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score |
96 |
|
|
if (b.start==d.b.start) { |
97 |
|
|
if (score==d.score) { |
98 |
|
|
//just try to be consistent: |
99 |
|
|
if (b.end==d.b.end) { |
100 |
|
|
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
101 |
|
|
} |
102 |
|
|
return (b.end>d.b.end); |
103 |
|
|
} |
104 |
|
|
else return (score<d.score); |
105 |
|
|
} |
106 |
|
|
return (b.start>d.b.start); |
107 |
|
|
} |
108 |
|
|
bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord |
109 |
|
|
/*if (b.start==d.b.start && b.end==d.b.end) { |
110 |
|
|
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
111 |
|
|
} |
112 |
|
|
return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/ |
113 |
|
|
if (b.start==d.b.start) { |
114 |
|
|
if (score==d.score) { |
115 |
|
|
//just try to be consistent: |
116 |
|
|
if (b.end==d.b.end) { |
117 |
|
|
return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start); |
118 |
|
|
} |
119 |
|
|
return (b.end<d.b.end); |
120 |
|
|
} |
121 |
|
|
else return (score>d.score); |
122 |
|
|
} |
123 |
|
|
return (b.start<d.b.start); |
124 |
|
|
} |
125 |
|
|
}; |
126 |
|
|
|
127 |
|
|
int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score |
128 |
|
|
CSegPair& x = *(CSegPair *)sa; |
129 |
|
|
CSegPair& y = *(CSegPair *)sb; |
130 |
|
|
/* |
131 |
|
|
if (x.b.end==y.b.end) { |
132 |
|
|
if (x.b.start==y.b.start) { |
133 |
|
|
if (x.a.end==y.a.end) { |
134 |
|
|
if (x.a.start==y.a.start) return 0; |
135 |
|
|
return ((x.a.start>y.a.start) ? -1 : 1); |
136 |
|
|
} |
137 |
|
|
else { |
138 |
|
|
return ((x.a.end>y.a.end) ? -1 : 1); |
139 |
|
|
} |
140 |
|
|
} |
141 |
|
|
else { |
142 |
|
|
return ((x.b.start>y.b.start) ? -1 : 1); |
143 |
|
|
} |
144 |
|
|
} |
145 |
|
|
else { |
146 |
|
|
return ((x.b.end>y.b.end) ? -1 : 1); |
147 |
|
|
} |
148 |
|
|
*/ |
149 |
|
|
if (x.b.end==y.b.end) { |
150 |
|
|
if (x.score==y.score) { |
151 |
|
|
if (x.b.start==y.b.start) { |
152 |
|
|
if (x.a.end==y.a.end) { |
153 |
|
|
if (x.a.start==y.a.start) return 0; |
154 |
|
|
return ((x.a.start<y.a.start) ? -1 : 1); |
155 |
|
|
} |
156 |
|
|
else { |
157 |
|
|
return ((x.a.end<y.a.end) ? -1 : 1); |
158 |
|
|
} |
159 |
|
|
} |
160 |
|
|
else { |
161 |
|
|
return ((x.b.start<y.b.start) ? -1 : 1); |
162 |
|
|
} |
163 |
|
|
} else return ((x.score>y.score) ? -1 : 1); |
164 |
|
|
} |
165 |
|
|
else { |
166 |
|
|
return ((x.b.end>y.b.end) ? -1 : 1); |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
} |
170 |
|
|
|
171 |
|
|
class CSegChain:public GList<CSegPair> { |
172 |
|
|
public: |
173 |
|
|
uint astart; |
174 |
|
|
uint aend; |
175 |
|
|
uint bstart; |
176 |
|
|
uint bend; |
177 |
|
|
int score; |
178 |
|
|
bool endSort; |
179 |
|
|
CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique |
180 |
|
|
//as SegPairs are inserted, they will be sorted by a.start coordinate |
181 |
|
|
score=0; |
182 |
|
|
astart=MAX_UINT; |
183 |
|
|
aend=0; |
184 |
|
|
bstart=MAX_UINT; |
185 |
|
|
bend=0; |
186 |
|
|
endSort=aln5; |
187 |
|
|
if (aln5) { setSorted(cmpSegEnds); } |
188 |
|
|
} |
189 |
|
|
bool operator==(CSegChain& d) { |
190 |
|
|
//return (score==d.score); |
191 |
|
|
return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend); |
192 |
|
|
} |
193 |
|
|
bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate |
194 |
|
|
//return (score<d.score); |
195 |
|
|
if (bstart==d.bstart && bend==d.bend) { |
196 |
|
|
return (astart==d.astart)?(aend>d.aend):(astart>d.astart); |
197 |
|
|
} |
198 |
|
|
return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart); |
199 |
|
|
} |
200 |
|
|
bool operator<(CSegChain& d) { |
201 |
|
|
//return (score>d.score); |
202 |
|
|
if (bstart==d.bstart && bend==d.bend) { |
203 |
|
|
return (astart==d.astart)?(aend<d.aend):(astart<d.astart); |
204 |
|
|
} |
205 |
|
|
return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart); |
206 |
|
|
} |
207 |
|
|
void addSegPair(CSegPair* segp) { |
208 |
|
|
if (AddIfNew(segp)!=segp) return; |
209 |
|
|
score+=segp->score; |
210 |
|
|
if (astart>segp->a.start) astart=segp->a.start; |
211 |
|
|
if (aend<segp->a.end) aend=segp->a.end; |
212 |
|
|
if (bstart>segp->b.start) bstart=segp->b.start; |
213 |
|
|
if (bend<segp->b.end) bend=segp->b.end; |
214 |
|
|
} |
215 |
|
|
//for building actual chains: |
216 |
|
|
bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain |
217 |
|
|
int bgap=0; |
218 |
|
|
int agap=0; |
219 |
|
|
//if (endSort) { |
220 |
|
|
if (bstart>segp->b.start) { |
221 |
|
|
bgap = (int)(bstart-segp->b.end); |
222 |
|
|
if (abs(bgap)>2) return false; |
223 |
|
|
agap = (int)(astart-segp->a.end); |
224 |
|
|
if (abs(agap)>2) return false; |
225 |
|
|
} |
226 |
|
|
else { |
227 |
|
|
bgap = (int) (segp->b.start-bend); |
228 |
|
|
if (abs(bgap)>2) return false; |
229 |
|
|
agap = (int)(segp->a.start-aend); |
230 |
|
|
if (abs(agap)>2) return false; |
231 |
|
|
} |
232 |
|
|
if (agap*bgap<0) return false; |
233 |
|
|
addSegPair(segp); |
234 |
|
|
score-=abs(agap)+abs(bgap); |
235 |
|
|
return true; |
236 |
|
|
} |
237 |
|
|
}; |
238 |
|
|
|
239 |
gpertea |
4 |
// element in dhash: |
240 |
|
|
class FqDupRec { |
241 |
|
|
public: |
242 |
|
|
int count; //how many of these reads are the same |
243 |
|
|
int len; //length of qv |
244 |
|
|
char* firstname; //optional, only if we want to keep the original read names |
245 |
|
|
char* qv; |
246 |
|
|
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
247 |
|
|
len=0; |
248 |
|
|
qv=NULL; |
249 |
|
|
firstname=NULL; |
250 |
|
|
count=0; |
251 |
|
|
if (qstr!=NULL) { |
252 |
|
|
qv=Gstrdup(qstr->chars()); |
253 |
|
|
len=qstr->length(); |
254 |
|
|
count++; |
255 |
|
|
} |
256 |
|
|
if (rname!=NULL) firstname=Gstrdup(rname); |
257 |
|
|
} |
258 |
|
|
~FqDupRec() { |
259 |
|
|
GFREE(qv); |
260 |
|
|
GFREE(firstname); |
261 |
|
|
} |
262 |
|
|
void add(GStr& d) { //collapse another record into this one |
263 |
|
|
if (d.length()!=len) |
264 |
|
|
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
265 |
|
|
count++; |
266 |
|
|
for (int i=0;i<len;i++) |
267 |
|
|
qv[i]=(qv[i]+d[i])/2; |
268 |
|
|
} |
269 |
|
|
}; |
270 |
|
|
|
271 |
|
|
void openfw(FILE* &f, GArgs& args, char opt) { |
272 |
|
|
GStr s=args.getOpt(opt); |
273 |
|
|
if (!s.is_empty()) { |
274 |
|
|
if (s=='-') f=stdout; |
275 |
|
|
else { |
276 |
|
|
f=fopen(s,"w"); |
277 |
|
|
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
278 |
|
|
} |
279 |
|
|
} |
280 |
|
|
} |
281 |
|
|
|
282 |
|
|
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
283 |
|
|
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
284 |
|
|
|
285 |
|
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
286 |
|
|
|
287 |
gpertea |
12 |
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
288 |
|
|
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
289 |
gpertea |
4 |
int dust(GStr& seq); |
290 |
|
|
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
291 |
|
|
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
292 |
|
|
|
293 |
gpertea |
12 |
void convertPhred(char* q, int len); |
294 |
|
|
void convertPhred(GStr& q); |
295 |
gpertea |
4 |
|
296 |
|
|
int main(int argc, char * const argv[]) { |
297 |
gpertea |
12 |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:"); |
298 |
gpertea |
4 |
int e; |
299 |
|
|
int icounter=0; //counter for input reads |
300 |
|
|
int outcounter=0; //counter for output reads |
301 |
|
|
if ((e=args.isError())>0) { |
302 |
|
|
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
303 |
|
|
exit(224); |
304 |
|
|
} |
305 |
gpertea |
9 |
debug=(args.getOpt('Y')!=NULL); |
306 |
gpertea |
12 |
debug=(args.getOpt('V')!=NULL); |
307 |
|
|
convert_phred=(args.getOpt('Q')!=NULL); |
308 |
gpertea |
4 |
doCollapse=(args.getOpt('C')!=NULL); |
309 |
|
|
doDust=(args.getOpt('D')!=NULL); |
310 |
gpertea |
12 |
/* |
311 |
gpertea |
4 |
rawFormat=(args.getOpt('R')!=NULL); |
312 |
|
|
if (rawFormat) { |
313 |
|
|
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
314 |
|
|
} |
315 |
gpertea |
12 |
*/ |
316 |
gpertea |
4 |
prefix=args.getOpt('n'); |
317 |
|
|
GStr s=args.getOpt('l'); |
318 |
|
|
if (!s.is_empty()) |
319 |
|
|
min_read_len=s.asInt(); |
320 |
gpertea |
12 |
s=args.getOpt('m'); |
321 |
|
|
if (!s.is_empty()) |
322 |
|
|
max_perc_N=s.asDouble(); |
323 |
gpertea |
4 |
s=args.getOpt('d'); |
324 |
|
|
if (!s.is_empty()) { |
325 |
|
|
dust_cutoff=s.asInt(); |
326 |
|
|
doDust=true; |
327 |
|
|
} |
328 |
gpertea |
12 |
s=args.getOpt('q'); |
329 |
|
|
if (!s.is_empty()) { |
330 |
|
|
qvtrim_min=s.asInt(); |
331 |
|
|
} |
332 |
|
|
s=args.getOpt('p'); |
333 |
|
|
if (!s.is_empty()) { |
334 |
|
|
int v=s.asInt(); |
335 |
|
|
if (v==33) { |
336 |
|
|
qv_phredtype=33; |
337 |
|
|
qv_cvtadd=31; |
338 |
|
|
} |
339 |
|
|
else if (v==64) { |
340 |
|
|
qv_phredtype=64; |
341 |
|
|
qv_cvtadd=-31; |
342 |
|
|
} |
343 |
|
|
else |
344 |
|
|
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
345 |
|
|
} |
346 |
gpertea |
4 |
if (args.getOpt('3')!=NULL) { |
347 |
|
|
adapter3=args.getOpt('3'); |
348 |
|
|
adapter3.upper(); |
349 |
|
|
a3len=adapter3.length(); |
350 |
|
|
} |
351 |
|
|
if (args.getOpt('5')!=NULL) { |
352 |
|
|
adapter5=args.getOpt('5'); |
353 |
|
|
adapter5.upper(); |
354 |
|
|
a5len=adapter5.length(); |
355 |
|
|
} |
356 |
|
|
trashReport= (args.getOpt('r')!=NULL); |
357 |
|
|
if (args.startNonOpt()==0) { |
358 |
|
|
GMessage(USAGE); |
359 |
|
|
exit(224); |
360 |
|
|
} |
361 |
|
|
|
362 |
|
|
openfw(f_out, args, 'o'); |
363 |
gpertea |
10 |
if (f_out==NULL) f_out=stdout; |
364 |
gpertea |
4 |
if (trashReport) |
365 |
|
|
openfw(freport, args, 'r'); |
366 |
|
|
char* infile=NULL; |
367 |
|
|
while ((infile=args.nextNonOpt())!=NULL) { |
368 |
|
|
GStr infname(infile); |
369 |
|
|
if (strcmp(infile,"-")==0) { |
370 |
|
|
f_in=stdin; infname="stdin"; } |
371 |
|
|
else { |
372 |
|
|
f_in=fopen(infile,"r"); |
373 |
|
|
if (f_in==NULL) |
374 |
|
|
GError("Cannot open input file %s!\n",infile); |
375 |
|
|
} |
376 |
|
|
GLineReader fq(f_in); |
377 |
|
|
char* l=NULL; |
378 |
|
|
while ((l=fq.getLine())!=NULL) { |
379 |
|
|
GStr rname; //current read name |
380 |
|
|
GStr rseq; //current read sequence |
381 |
|
|
GStr rqv; //current read quality values |
382 |
|
|
GStr s; |
383 |
gpertea |
12 |
/* if (rawFormat) { |
384 |
gpertea |
4 |
//TODO: implement qseq parsing here |
385 |
|
|
//if (raw type=N) then continue; //skip invalid/bad records |
386 |
|
|
|
387 |
|
|
} //raw qseq format |
388 |
gpertea |
12 |
else { // FASTQ or FASTA */ |
389 |
gpertea |
4 |
isfasta=(l[0]=='>'); |
390 |
|
|
if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n"); |
391 |
|
|
s=l; |
392 |
|
|
rname=&(l[1]); |
393 |
|
|
icounter++; |
394 |
|
|
for (int i=0;i<rname.length();i++) |
395 |
|
|
if (rname[i]<=' ') { rname.cut(i); break; } |
396 |
|
|
//now get the sequence |
397 |
|
|
if ((l=fq.getLine())==NULL) |
398 |
|
|
GError("Error: unexpected EOF after header for %s\n",rname.chars()); |
399 |
|
|
rseq=l; //this must be the DNA line |
400 |
gpertea |
12 |
while ((l=fq.getLine())!=NULL) { |
401 |
|
|
//seq can span multiple lines |
402 |
|
|
if (l[0]=='>' || l[0]=='+') { |
403 |
gpertea |
4 |
fq.pushBack(); |
404 |
|
|
break; // |
405 |
|
|
} |
406 |
|
|
rseq+=l; |
407 |
gpertea |
12 |
} //check for multi-line seq |
408 |
|
|
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
409 |
|
|
if ((l=fq.getLine())==NULL) |
410 |
gpertea |
4 |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
411 |
|
|
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
412 |
|
|
if ((l=fq.getLine())==NULL) |
413 |
|
|
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
414 |
|
|
rqv=l; |
415 |
gpertea |
12 |
//if (rqv.length()!=rseq.length()) |
416 |
|
|
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
417 |
|
|
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
418 |
|
|
rqv+=l; //append to qv string |
419 |
|
|
} |
420 |
|
|
}// fastq |
421 |
|
|
// } //<-- FASTA or FASTQ |
422 |
gpertea |
4 |
rseq.upper(); |
423 |
|
|
int l5=0; |
424 |
|
|
int l3=rseq.length()-1; |
425 |
|
|
if (l3-l5+1<min_read_len) { |
426 |
|
|
if (trashReport) { |
427 |
|
|
fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars()); |
428 |
|
|
} |
429 |
|
|
continue; |
430 |
|
|
} |
431 |
gpertea |
12 |
if (ntrim(rseq, l5, l3)) { // N-trimming |
432 |
|
|
//GMessage("before: %s\n",rseq.chars()); |
433 |
|
|
//GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1); |
434 |
gpertea |
4 |
if (l3-l5+1<min_read_len) { |
435 |
|
|
if (trashReport) { |
436 |
|
|
fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars()); |
437 |
|
|
} |
438 |
|
|
continue; //invalid read |
439 |
|
|
} |
440 |
|
|
//-- keep only the l5..l3 range |
441 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
442 |
|
|
if (!rqv.is_empty()) |
443 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
444 |
gpertea |
12 |
l5=0; |
445 |
|
|
l3=rseq.length()-1; |
446 |
gpertea |
4 |
} |
447 |
gpertea |
12 |
if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming |
448 |
|
|
if (l3-l5+1<min_read_len) { |
449 |
|
|
if (trashReport) { |
450 |
|
|
fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars()); |
451 |
|
|
} |
452 |
|
|
continue; //invalid read |
453 |
|
|
} |
454 |
|
|
//-- keep only the l5..l3 range |
455 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
456 |
|
|
if (!rqv.is_empty()) |
457 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
458 |
|
|
} //qv trimming |
459 |
gpertea |
4 |
if (a3len>0) { |
460 |
|
|
if (trim_adapter3(rseq, l5, l3)) { |
461 |
|
|
if (l3-l5+1<min_read_len) { |
462 |
|
|
if (trashReport) { |
463 |
|
|
fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars()); |
464 |
|
|
} |
465 |
|
|
continue; |
466 |
|
|
} |
467 |
|
|
//-- keep only the l5..l3 range |
468 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
469 |
|
|
if (!rqv.is_empty()) |
470 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
471 |
|
|
}//some adapter was trimmed |
472 |
|
|
} //adapter trimming |
473 |
|
|
if (a5len>0) { |
474 |
|
|
if (trim_adapter5(rseq, l5, l3)) { |
475 |
|
|
if (l3-l5+1<min_read_len) { |
476 |
|
|
if (trashReport) { |
477 |
|
|
fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars()); |
478 |
|
|
} |
479 |
|
|
continue; |
480 |
|
|
} |
481 |
|
|
//-- keep only the l5..l3 range |
482 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
483 |
|
|
if (!rqv.is_empty()) |
484 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
485 |
|
|
}//some adapter was trimmed |
486 |
|
|
} //adapter trimming |
487 |
|
|
if (doCollapse) { |
488 |
|
|
//keep read for later |
489 |
|
|
FqDupRec* dr=dhash.Find(rseq.chars()); |
490 |
|
|
if (dr==NULL) { //new entry |
491 |
|
|
//if (prefix.is_empty()) |
492 |
|
|
dhash.Add(rseq.chars(), |
493 |
|
|
new FqDupRec(&rqv, rname.chars())); |
494 |
|
|
//else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length())); |
495 |
|
|
} |
496 |
|
|
else |
497 |
|
|
dr->add(rqv); |
498 |
|
|
} //collapsing duplicates |
499 |
|
|
else { //not collapsing duplicates |
500 |
|
|
//do the dust filter now |
501 |
|
|
if (doDust) { |
502 |
|
|
int dustbases=dust(rseq); |
503 |
|
|
if (dustbases>(rseq.length()>>1)) { |
504 |
|
|
if (trashReport) { |
505 |
|
|
fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars()); |
506 |
|
|
} |
507 |
|
|
continue; |
508 |
|
|
} |
509 |
|
|
} |
510 |
|
|
//print this record here |
511 |
|
|
outcounter++; |
512 |
|
|
if (isfasta) { |
513 |
|
|
if (prefix.is_empty()) |
514 |
|
|
fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars()); |
515 |
|
|
else |
516 |
|
|
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
517 |
|
|
rseq.chars()); |
518 |
|
|
} |
519 |
|
|
else { //fastq |
520 |
gpertea |
12 |
if (convert_phred) convertPhred(rqv); |
521 |
gpertea |
4 |
if (prefix.is_empty()) |
522 |
|
|
fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars()); |
523 |
|
|
else |
524 |
|
|
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
525 |
|
|
rseq.chars(),rqv.chars() ); |
526 |
|
|
} |
527 |
|
|
} //not collapsing duplicates |
528 |
|
|
} //for each fastq record |
529 |
|
|
} //while each input file |
530 |
|
|
FRCLOSE(f_in); |
531 |
|
|
if (doCollapse) { |
532 |
|
|
outcounter=0; |
533 |
|
|
int maxdup_count=1; |
534 |
|
|
char* maxdup_seq=NULL; |
535 |
|
|
dhash.startIterate(); |
536 |
|
|
FqDupRec* qd=NULL; |
537 |
|
|
char* seq=NULL; |
538 |
|
|
while ((qd=dhash.NextData(seq))!=NULL) { |
539 |
|
|
GStr rseq(seq); |
540 |
|
|
//do the dusting here |
541 |
|
|
if (doDust) { |
542 |
|
|
int dustbases=dust(rseq); |
543 |
|
|
if (dustbases>(rseq.length()>>1)) { |
544 |
|
|
if (trashReport && qd->firstname!=NULL) { |
545 |
|
|
fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq); |
546 |
|
|
} |
547 |
|
|
continue; |
548 |
|
|
} |
549 |
|
|
} |
550 |
|
|
outcounter++; |
551 |
|
|
if (qd->count>maxdup_count) { |
552 |
|
|
maxdup_count=qd->count; |
553 |
|
|
maxdup_seq=seq; |
554 |
|
|
} |
555 |
|
|
if (isfasta) { |
556 |
|
|
if (prefix.is_empty()) { |
557 |
gpertea |
10 |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
558 |
gpertea |
4 |
rseq.chars()); |
559 |
|
|
} |
560 |
|
|
else { //use custom read name |
561 |
gpertea |
10 |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
562 |
gpertea |
4 |
qd->count, rseq.chars()); |
563 |
|
|
} |
564 |
|
|
} |
565 |
|
|
else { //fastq format |
566 |
gpertea |
12 |
if (convert_phred) convertPhred(qd->qv, qd->len); |
567 |
gpertea |
4 |
if (prefix.is_empty()) { |
568 |
|
|
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
569 |
|
|
rseq.chars(), qd->qv); |
570 |
|
|
} |
571 |
|
|
else { //use custom read name |
572 |
|
|
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
573 |
|
|
qd->count, rseq.chars(), qd->qv); |
574 |
|
|
} |
575 |
|
|
} |
576 |
|
|
}//for each element of dhash |
577 |
|
|
if (maxdup_count>1) { |
578 |
|
|
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
579 |
|
|
} |
580 |
|
|
} //report collapsed dhash entries |
581 |
|
|
GMessage("Number of input reads: %9d\n", icounter); |
582 |
|
|
GMessage(" Output records: %9d\n", outcounter); |
583 |
|
|
if (trashReport) { |
584 |
|
|
FWCLOSE(freport); |
585 |
|
|
} |
586 |
|
|
|
587 |
|
|
FWCLOSE(f_out); |
588 |
gpertea |
12 |
//getc(stdin); |
589 |
gpertea |
4 |
} |
590 |
|
|
|
591 |
|
|
class NData { |
592 |
|
|
public: |
593 |
|
|
int NPos[1024]; //there should be no reads longer than 1K ? |
594 |
|
|
int NCount; |
595 |
|
|
int end5; |
596 |
|
|
int end3; |
597 |
|
|
int seqlen; |
598 |
|
|
double perc_N; //percentage of Ns in end5..end3 range only! |
599 |
|
|
const char* seq; |
600 |
|
|
bool valid; |
601 |
|
|
NData() { |
602 |
|
|
NCount=0; |
603 |
|
|
end5=0; |
604 |
|
|
end3=0; |
605 |
|
|
seq=NULL; |
606 |
|
|
perc_N=0; |
607 |
|
|
valid=true; |
608 |
|
|
} |
609 |
|
|
void init(GStr& rseq) { |
610 |
|
|
NCount=0; |
611 |
|
|
perc_N=0; |
612 |
|
|
seqlen=rseq.length(); |
613 |
|
|
seq=rseq.chars(); |
614 |
|
|
end5=1; |
615 |
|
|
end3=seqlen; |
616 |
|
|
for (int i=0;i<seqlen;i++) |
617 |
gpertea |
12 |
if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
618 |
gpertea |
4 |
NPos[NCount]=i; |
619 |
|
|
NCount++; |
620 |
|
|
} |
621 |
|
|
perc_N=(NCount*100.0)/seqlen; |
622 |
|
|
valid=true; |
623 |
|
|
} |
624 |
|
|
void N_calc() { //only in the region after trimming |
625 |
|
|
int n=0; |
626 |
|
|
for (int i=end3-1;i<end5;i++) { |
627 |
|
|
if (seq[i]=='N') n++; |
628 |
|
|
} |
629 |
|
|
perc_N=(n*100.0)/(end5-end3+1); |
630 |
|
|
} |
631 |
|
|
}; |
632 |
|
|
|
633 |
|
|
static NData feat; |
634 |
|
|
int perc_lenN=12; // incremental distance from ends, in percentage of |
635 |
|
|
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
636 |
|
|
|
637 |
|
|
void N_analyze(int l5, int l3, int p5, int p3) { |
638 |
|
|
/* assumes feat was filled properly */ |
639 |
|
|
int old_dif, t5,t3,v; |
640 |
gpertea |
12 |
if (l3<l5+2 || p5>p3 ) { |
641 |
gpertea |
4 |
feat.end5=l5+1; |
642 |
|
|
feat.end3=l3+1; |
643 |
|
|
return; |
644 |
|
|
} |
645 |
|
|
|
646 |
|
|
t5=feat.NPos[p5]-l5; |
647 |
|
|
t3=l3-feat.NPos[p3]; |
648 |
|
|
old_dif=p3-p5; |
649 |
|
|
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
650 |
|
|
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
651 |
|
|
if (t5 < v ) { |
652 |
|
|
l5=feat.NPos[p5]+1; |
653 |
|
|
p5++; |
654 |
|
|
} |
655 |
|
|
if (t3 < v) { |
656 |
|
|
l3=feat.NPos[p3]-1; |
657 |
|
|
p3--; |
658 |
|
|
} |
659 |
|
|
/* restNs=p3-p5; number of Ns in the new CLR */ |
660 |
|
|
if (p3-p5==old_dif) { /* no change, return */ |
661 |
|
|
/* don't trim if this may shorten the read too much */ |
662 |
|
|
//if (l5-l3<min_read_len) return; |
663 |
|
|
feat.end5=l5+1; |
664 |
|
|
feat.end3=l3+1; |
665 |
|
|
return; |
666 |
|
|
} |
667 |
|
|
else |
668 |
|
|
N_analyze(l5,l3, p5,p3); |
669 |
|
|
} |
670 |
|
|
|
671 |
|
|
|
672 |
gpertea |
12 |
bool qtrim(GStr& qvs, int &l5, int &l3) { |
673 |
|
|
if (qvtrim_min==0 || qvs.is_empty()) return false; |
674 |
|
|
if (qv_phredtype==0) { |
675 |
|
|
//try to guess the Phred type |
676 |
|
|
int vmin=256, vmax=0; |
677 |
|
|
for (int i=0;i<qvs.length();i++) { |
678 |
|
|
if (vmin>qvs[i]) vmin=qvs[i]; |
679 |
|
|
if (vmax<qvs[i]) vmax=qvs[i]; |
680 |
|
|
} |
681 |
|
|
if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; } |
682 |
|
|
if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; } |
683 |
|
|
if (qv_phredtype==0) { |
684 |
|
|
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
685 |
|
|
} |
686 |
|
|
} //guessing the Phred type |
687 |
|
|
for (l3=qvs.length()-1;l3>2;l3--) { |
688 |
|
|
if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break; |
689 |
|
|
} |
690 |
|
|
//just in case, check also the 5' the end (?) |
691 |
|
|
for (l5=0;l5<qvs.length()-3;l5++) { |
692 |
|
|
if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break; |
693 |
|
|
} |
694 |
|
|
return (l5>0 || l3<qvs.length()-1); |
695 |
|
|
} |
696 |
|
|
|
697 |
|
|
bool ntrim(GStr& rseq, int &l5, int &l3) { |
698 |
|
|
//count Ns in the sequence, trim N-rich ends |
699 |
gpertea |
4 |
feat.init(rseq); |
700 |
|
|
l5=feat.end5-1; |
701 |
|
|
l3=feat.end3-1; |
702 |
|
|
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
703 |
gpertea |
12 |
if (l5==feat.end5-1 && l3==feat.end3-1) { |
704 |
|
|
if (feat.perc_N>max_perc_N) { |
705 |
|
|
feat.valid=false; |
706 |
|
|
l3=l5+1; |
707 |
|
|
return true; |
708 |
|
|
} |
709 |
|
|
else { |
710 |
|
|
return false; //nothing changed |
711 |
|
|
} |
712 |
|
|
} |
713 |
gpertea |
4 |
l5=feat.end5-1; |
714 |
|
|
l3=feat.end3-1; |
715 |
|
|
if (l3-l5+1<min_read_len) { |
716 |
|
|
feat.valid=false; |
717 |
|
|
return true; |
718 |
|
|
} |
719 |
|
|
feat.N_calc(); |
720 |
gpertea |
12 |
|
721 |
|
|
if (feat.perc_N>max_perc_N) { |
722 |
gpertea |
4 |
feat.valid=false; |
723 |
|
|
l3=l5+1; |
724 |
|
|
return true; |
725 |
|
|
} |
726 |
|
|
return true; |
727 |
|
|
} |
728 |
|
|
|
729 |
|
|
//--------------- dust functions ---------------- |
730 |
|
|
class DNADuster { |
731 |
|
|
public: |
732 |
|
|
int dustword; |
733 |
|
|
int dustwindow; |
734 |
|
|
int dustwindow2; |
735 |
|
|
int dustcutoff; |
736 |
|
|
int mv, iv, jv; |
737 |
|
|
int counts[32*32*32]; |
738 |
|
|
int iis[32*32*32]; |
739 |
|
|
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
740 |
|
|
dustword=wordsize; |
741 |
|
|
dustwindow=winsize; |
742 |
|
|
dustwindow2 = (winsize>>1); |
743 |
|
|
dustcutoff=cutoff; |
744 |
|
|
mv=0; |
745 |
|
|
iv=0; |
746 |
|
|
jv=0; |
747 |
|
|
} |
748 |
|
|
void setWindowSize(int value) { |
749 |
|
|
dustwindow = value; |
750 |
|
|
dustwindow2 = (dustwindow >> 1); |
751 |
|
|
} |
752 |
|
|
void setWordSize(int value) { |
753 |
|
|
dustword=value; |
754 |
|
|
} |
755 |
|
|
void wo1(int len, const char* s, int ivv) { |
756 |
|
|
int i, ii, j, v, t, n, n1, sum; |
757 |
|
|
int js, nis; |
758 |
|
|
n = 32 * 32 * 32; |
759 |
|
|
n1 = n - 1; |
760 |
|
|
nis = 0; |
761 |
|
|
i = 0; |
762 |
|
|
ii = 0; |
763 |
|
|
sum = 0; |
764 |
|
|
v = 0; |
765 |
|
|
for (j=0; j < len; j++, s++) { |
766 |
|
|
ii <<= 5; |
767 |
|
|
if (*s<=32) { |
768 |
|
|
i=0; |
769 |
|
|
continue; |
770 |
|
|
} |
771 |
|
|
ii |= *s - 'A'; //assume uppercase! |
772 |
|
|
ii &= n1; |
773 |
|
|
i++; |
774 |
|
|
if (i >= dustword) { |
775 |
|
|
for (js=0; js < nis && iis[js] != ii; js++) ; |
776 |
|
|
if (js == nis) { |
777 |
|
|
iis[nis] = ii; |
778 |
|
|
counts[ii] = 0; |
779 |
|
|
nis++; |
780 |
|
|
} |
781 |
|
|
if ((t = counts[ii]) > 0) { |
782 |
|
|
sum += t; |
783 |
|
|
v = 10 * sum / j; |
784 |
|
|
if (mv < v) { |
785 |
|
|
mv = v; |
786 |
|
|
iv = ivv; |
787 |
|
|
jv = j; |
788 |
|
|
} |
789 |
|
|
} |
790 |
|
|
counts[ii]++; |
791 |
|
|
} |
792 |
|
|
} |
793 |
|
|
} |
794 |
|
|
|
795 |
|
|
int wo(int len, const char* s, int* beg, int* end) { |
796 |
|
|
int i, l1; |
797 |
|
|
l1 = len - dustword + 1; |
798 |
|
|
if (l1 < 0) { |
799 |
|
|
*beg = 0; |
800 |
|
|
*end = len - 1; |
801 |
|
|
return 0; |
802 |
|
|
} |
803 |
|
|
mv = 0; |
804 |
|
|
iv = 0; |
805 |
|
|
jv = 0; |
806 |
|
|
for (i=0; i < l1; i++) { |
807 |
|
|
wo1(len-i, s+i, i); |
808 |
|
|
} |
809 |
|
|
*beg = iv; |
810 |
|
|
*end = iv + jv; |
811 |
|
|
return mv; |
812 |
|
|
} |
813 |
|
|
|
814 |
|
|
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
815 |
|
|
int i, j, l, a, b, v; |
816 |
|
|
if (cutoff==0) cutoff=dustcutoff; |
817 |
|
|
a=0;b=0; |
818 |
|
|
//GMessage("Dust cutoff=%d\n", cutoff); |
819 |
|
|
for (i=0; i < seqlen; i += dustwindow2) { |
820 |
|
|
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
821 |
|
|
v = wo(l, seq+i, &a, &b); |
822 |
|
|
if (v > cutoff) { |
823 |
|
|
//for (j = a; j <= b && j < dustwindow2; j++) { |
824 |
|
|
for (j = a; j <= b; j++) { |
825 |
|
|
seqmsk[i+j]='N';//could be made lowercase instead |
826 |
|
|
} |
827 |
|
|
} |
828 |
|
|
} |
829 |
|
|
//return first; |
830 |
|
|
} |
831 |
|
|
}; |
832 |
|
|
|
833 |
|
|
static DNADuster duster; |
834 |
|
|
|
835 |
|
|
int dust(GStr& rseq) { |
836 |
|
|
char* seq=Gstrdup(rseq.chars()); |
837 |
|
|
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
838 |
|
|
//check the number of Ns: |
839 |
|
|
int ncount=0; |
840 |
|
|
for (int i=0;i<rseq.length();i++) { |
841 |
|
|
if (seq[i]=='N') ncount++; |
842 |
|
|
} |
843 |
|
|
GFREE(seq); |
844 |
|
|
return ncount; |
845 |
|
|
} |
846 |
|
|
|
847 |
gpertea |
12 |
|
848 |
|
|
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
849 |
|
|
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
850 |
|
|
//check minimum score and |
851 |
|
|
//for 3' adapter trimming: |
852 |
gpertea |
4 |
// require that the right end of the alignment for either the adaptor OR the read must be |
853 |
|
|
// < 3 distance from its right end |
854 |
|
|
// for 5' adapter trimming: |
855 |
|
|
// require that the left end of the alignment for either the adaptor OR the read must |
856 |
gpertea |
12 |
// be at coordinate < 3 from start |
857 |
gpertea |
4 |
|
858 |
|
|
bool extendMatch(const char* a, int alen, int ai, |
859 |
gpertea |
12 |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) { |
860 |
gpertea |
4 |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
861 |
gpertea |
12 |
#ifdef DEBUG |
862 |
|
|
GStr dbg(b); |
863 |
|
|
#endif |
864 |
|
|
//if (debug) { |
865 |
|
|
// GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); |
866 |
|
|
// } |
867 |
gpertea |
4 |
int a_l=ai; //alignment coordinates on a |
868 |
|
|
int a_r=ai+mlen-1; |
869 |
|
|
int b_l=bi; //alignment coordinates on b |
870 |
|
|
int b_r=bi+mlen-1; |
871 |
|
|
int ai_maxscore=ai; |
872 |
|
|
int bi_maxscore=bi; |
873 |
|
|
int score=mlen*a_m_score; |
874 |
|
|
int maxscore=score; |
875 |
gpertea |
9 |
int mism5score=a_mis_score; |
876 |
|
|
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
877 |
gpertea |
4 |
//try to extend to the left first, if possible |
878 |
|
|
while (ai>0 && bi>0) { |
879 |
|
|
ai--; |
880 |
|
|
bi--; |
881 |
|
|
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
882 |
|
|
if (score>maxscore) { |
883 |
|
|
ai_maxscore=ai; |
884 |
|
|
bi_maxscore=bi; |
885 |
|
|
maxscore=score; |
886 |
|
|
} |
887 |
|
|
else if (maxscore-score>a_dropoff_score) break; |
888 |
|
|
} |
889 |
|
|
a_l=ai_maxscore; |
890 |
|
|
b_l=bi_maxscore; |
891 |
gpertea |
9 |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
892 |
gpertea |
4 |
//now extend to the right |
893 |
|
|
ai_maxscore=a_r; |
894 |
|
|
bi_maxscore=b_r; |
895 |
|
|
ai=a_r; |
896 |
|
|
bi=b_r; |
897 |
|
|
score=maxscore; |
898 |
|
|
//sometimes there are extra AAAAs at the end of the read, ignore those |
899 |
|
|
if (strcmp(&a[alen-4],"AAAA")==0) { |
900 |
|
|
alen-=3; |
901 |
|
|
while (a[alen-1]=='A' && alen>ai) alen--; |
902 |
|
|
} |
903 |
|
|
while (ai<alen-1 && bi<blen-1) { |
904 |
|
|
ai++; |
905 |
|
|
bi++; |
906 |
|
|
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
907 |
|
|
if (a[ai]==b[bi]) { //match |
908 |
|
|
score+=a_m_score; |
909 |
|
|
if (ai>=alen-2) { |
910 |
|
|
score+=a_m_score-(alen-ai-1); |
911 |
|
|
} |
912 |
|
|
} |
913 |
|
|
else { //mismatch |
914 |
|
|
score+=a_mis_score; |
915 |
|
|
} |
916 |
|
|
if (score>maxscore) { |
917 |
|
|
ai_maxscore=ai; |
918 |
|
|
bi_maxscore=bi; |
919 |
|
|
maxscore=score; |
920 |
|
|
} |
921 |
|
|
else if (maxscore-score>a_dropoff_score) break; |
922 |
|
|
} |
923 |
|
|
a_r=ai_maxscore; |
924 |
|
|
b_r=bi_maxscore; |
925 |
gpertea |
9 |
int a_ovh3=alen-a_r-1; |
926 |
|
|
int b_ovh3=blen-b_r-1; |
927 |
|
|
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
928 |
|
|
int mmovh5=(a_l<b_l)? a_l : b_l; |
929 |
|
|
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
930 |
gpertea |
12 |
#ifdef DEBUG |
931 |
|
|
if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars()); |
932 |
|
|
#endif |
933 |
gpertea |
9 |
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
934 |
|
|
if (a_l<a_ovh3) { |
935 |
gpertea |
4 |
//adapter closer to the left end (typical for 5' adapter) |
936 |
|
|
l5=a_r+1; |
937 |
|
|
l3=alen-1; |
938 |
|
|
} |
939 |
|
|
else { |
940 |
|
|
//adapter matching at the right end (typical for 3' adapter) |
941 |
|
|
l5=0; |
942 |
|
|
l3=a_l-1; |
943 |
|
|
} |
944 |
|
|
return true; |
945 |
|
|
} |
946 |
gpertea |
12 |
else { //keep this segment pair for later (gapped alignment) |
947 |
|
|
segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore)); |
948 |
|
|
//this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend) |
949 |
|
|
} |
950 |
gpertea |
11 |
//do not trim: |
951 |
|
|
l5=0; |
952 |
|
|
l3=alen-1; |
953 |
gpertea |
4 |
return false; |
954 |
|
|
} |
955 |
|
|
|
956 |
gpertea |
12 |
/* |
957 |
|
|
int getWordValue(const char* s, int wlen) { |
958 |
|
|
int r=0; |
959 |
|
|
while (wlen--) { r+=(((int)s[wlen])<<wlen) } |
960 |
|
|
return r; |
961 |
|
|
} |
962 |
|
|
*/ |
963 |
|
|
int get3mer_value(const char* s) { |
964 |
|
|
return (s[0]<<16)+(s[1]<<8)+s[2]; |
965 |
|
|
} |
966 |
|
|
|
967 |
|
|
int w3_match(int qv, const char* str, int slen, int start_index=0) { |
968 |
|
|
if (start_index>=slen || start_index<0) return -1; |
969 |
|
|
for (int i=start_index;i<slen-3;i++) { |
970 |
|
|
int rv=get3mer_value(str+i); |
971 |
|
|
if (rv==qv) return i; |
972 |
|
|
} |
973 |
|
|
return -1; |
974 |
|
|
} |
975 |
|
|
|
976 |
|
|
int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) { |
977 |
|
|
if (end_index>=slen) return -1; |
978 |
|
|
if (end_index<0) end_index=slen-1; |
979 |
|
|
for (int i=end_index-2;i>=0;i--) { |
980 |
|
|
int rv=get3mer_value(str+i); |
981 |
|
|
if (rv==qv) return i; |
982 |
|
|
} |
983 |
|
|
return -1; |
984 |
|
|
} |
985 |
|
|
|
986 |
|
|
int fast4match(int32 qv, const char* str, int slen, int start_index=0) { |
987 |
|
|
if (start_index>=slen || start_index<0) return -1; |
988 |
|
|
for (int i=start_index;i<slen-4;i++) { |
989 |
|
|
int32* rv=(int32*)(str+i); |
990 |
|
|
if (*rv==qv) return i; |
991 |
|
|
} |
992 |
|
|
return -1; |
993 |
|
|
} |
994 |
|
|
|
995 |
|
|
int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) { |
996 |
|
|
if (end_index>=slen) return -1; |
997 |
|
|
if (end_index<0) end_index=slen-1; |
998 |
|
|
for (int i=end_index-3;i>=0;i--) { |
999 |
|
|
int32* rv=(int32*)(str+i); |
1000 |
|
|
if (*rv==qv) return i; |
1001 |
|
|
} |
1002 |
|
|
return -1; |
1003 |
|
|
} |
1004 |
|
|
|
1005 |
|
|
#ifdef DEBUG |
1006 |
|
|
void dbgPrintChain(CSegChain& chain, const char* aseq) { |
1007 |
|
|
GStr s(aseq); |
1008 |
|
|
for (int i=0;i<chain.Count();i++) { |
1009 |
|
|
CSegPair& seg=*chain[i]; |
1010 |
|
|
GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(), |
1011 |
|
|
s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end); |
1012 |
|
|
} |
1013 |
|
|
} |
1014 |
|
|
#endif |
1015 |
|
|
|
1016 |
gpertea |
4 |
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
1017 |
|
|
int rlen=seq.length(); |
1018 |
|
|
l5=0; |
1019 |
|
|
l3=rlen-1; |
1020 |
|
|
//first try a full match, we might get lucky |
1021 |
|
|
int fi=-1; |
1022 |
|
|
if ((fi=seq.index(adapter3))>=0) { |
1023 |
|
|
if (fi<rlen-fi-a3len) {//match is closer to the right end |
1024 |
|
|
l5=fi+a3len; |
1025 |
|
|
l3=rlen-1; |
1026 |
|
|
} |
1027 |
|
|
else { |
1028 |
|
|
l5=0; |
1029 |
gpertea |
12 |
l3=fi-1; |
1030 |
gpertea |
4 |
} |
1031 |
|
|
return true; |
1032 |
|
|
} |
1033 |
gpertea |
12 |
#ifdef DEBUG |
1034 |
|
|
if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars()); |
1035 |
|
|
#endif |
1036 |
|
|
|
1037 |
gpertea |
4 |
//also, for fast detection of other adapter-only reads that start past |
1038 |
|
|
// the beginning of the adapter sequence, try to see if the first a3len-4 |
1039 |
gpertea |
12 |
// bases of the read are a substring of the adapter |
1040 |
|
|
if (rlen>a3len-3) { |
1041 |
|
|
GStr rstart=seq.substr(1,a3len-4); |
1042 |
|
|
if ((fi=adapter3.index(rstart))>=0) { |
1043 |
|
|
l3=rlen-1; |
1044 |
|
|
l5=a3len-4; |
1045 |
|
|
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
1046 |
|
|
return true; |
1047 |
|
|
} |
1048 |
|
|
} |
1049 |
|
|
CSegChain a3segs; //no chains here, just an ordered collection of segment pairs |
1050 |
|
|
//check the easy cases - 11 bases exact match at the end |
1051 |
|
|
int fdlen=11; |
1052 |
|
|
if (a3len<16) { |
1053 |
|
|
fdlen=a3len>>1; |
1054 |
gpertea |
4 |
} |
1055 |
gpertea |
12 |
if (fdlen>4) { |
1056 |
|
|
//check if we're lucky enough to have the last 11 bases of the read a part of the adapter |
1057 |
|
|
GStr rstart=seq.substr(-fdlen-3,fdlen); |
1058 |
|
|
if ((fi=adapter3.index(rstart))>=0) { |
1059 |
|
|
#ifdef DEBUG |
1060 |
|
|
if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars()); |
1061 |
|
|
#endif |
1062 |
|
|
if (extendMatch(seq.chars(), rlen, rlen-fdlen-3, |
1063 |
|
|
adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs)) |
1064 |
|
|
return true; |
1065 |
|
|
} |
1066 |
|
|
//another easy case: first 11 characters of the adaptor found as a substring of the read |
1067 |
|
|
GStr bstr=adapter3.substr(0, fdlen); |
1068 |
|
|
if ((fi=seq.rindex(bstr))>=0) { |
1069 |
|
|
#ifdef DEBUG |
1070 |
|
|
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1071 |
|
|
#endif |
1072 |
|
|
if (extendMatch(seq.chars(), rlen, fi, |
1073 |
|
|
adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs)) |
1074 |
|
|
return true; |
1075 |
|
|
} |
1076 |
|
|
} //tried to match 11 bases first |
1077 |
|
|
|
1078 |
|
|
//no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor |
1079 |
|
|
//-- only extend if the match is in the 3' (ending) region of the read |
1080 |
|
|
int wordSize=3; |
1081 |
|
|
int hlen=12; |
1082 |
|
|
if (hlen>a3len-wordSize) hlen=a3len-wordSize; |
1083 |
|
|
int imin=rlen>>1; //last half of the read, left boundary for the wmer match |
1084 |
|
|
if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); } |
1085 |
|
|
imin=rlen-imin; |
1086 |
|
|
for (int iw=0;iw<hlen;iw++) { |
1087 |
|
|
//int32* qv=(int32*)(adapter3.chars()+iw); |
1088 |
|
|
int qv=get3mer_value(adapter3.chars()+iw); |
1089 |
|
|
fi=-1; |
1090 |
|
|
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1091 |
|
|
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1092 |
|
|
//GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin); |
1093 |
|
|
|
1094 |
|
|
#ifdef DEBUG |
1095 |
|
|
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars()); |
1096 |
|
|
#endif |
1097 |
|
|
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1098 |
|
|
a3len, iw, wordSize, l5,l3, a3segs)) return true; |
1099 |
|
|
fi--; |
1100 |
|
|
if (fi<imin) break; |
1101 |
|
|
} |
1102 |
|
|
} //for each wmer in the first hlen bases of the adaptor |
1103 |
|
|
/* |
1104 |
|
|
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1105 |
|
|
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1106 |
|
|
if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false; |
1107 |
|
|
int hlen2=a3len-wordSize; |
1108 |
|
|
//if (hlen2>a3len-4) hlen2=a3len-4; |
1109 |
|
|
if (hlen2>hlen) { |
1110 |
|
|
#ifdef DEBUG |
1111 |
|
|
if (debug && a3segs.Count()>0) { |
1112 |
|
|
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1113 |
|
|
} |
1114 |
|
|
#endif |
1115 |
|
|
for (int iw=hlen;iw<hlen2;iw++) { |
1116 |
|
|
//int* qv=(int32 *)(adapter3.chars()+iw); |
1117 |
|
|
int qv=get3mer_value(adapter3.chars()+iw); |
1118 |
|
|
fi=-1; |
1119 |
|
|
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1120 |
|
|
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1121 |
|
|
extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1122 |
|
|
a3len, iw, wordSize, l5,l3, a3segs); |
1123 |
|
|
fi--; |
1124 |
|
|
if (fi<imin) break; |
1125 |
|
|
} |
1126 |
|
|
} //for each wmer between hlen2 and hlen bases of the adaptor |
1127 |
|
|
} |
1128 |
|
|
//lastly, analyze collected a3segs for a possible gapped alignment: |
1129 |
|
|
GList<CSegChain> segchains(false,true,false); |
1130 |
|
|
#ifdef DEBUG |
1131 |
|
|
if (debug && a3segs.Count()>0) { |
1132 |
|
|
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1133 |
gpertea |
4 |
} |
1134 |
gpertea |
12 |
#endif |
1135 |
|
|
for (int i=0;i<a3segs.Count();i++) { |
1136 |
|
|
if (a3segs[i]->chain==NULL) { |
1137 |
|
|
if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain |
1138 |
|
|
CSegChain* newchain=new CSegChain(); |
1139 |
|
|
newchain->setFreeItem(false); |
1140 |
|
|
newchain->addSegPair(a3segs[i]); |
1141 |
|
|
a3segs[i]->chain=newchain; |
1142 |
|
|
segchains.Add(newchain); //just to free them when done |
1143 |
|
|
} |
1144 |
|
|
for (int j=i+1;j<a3segs.Count();j++) { |
1145 |
|
|
CSegChain* chain=a3segs[i]->chain; |
1146 |
|
|
if (chain->extendChain(a3segs[j])) { |
1147 |
|
|
a3segs[j]->chain=chain; |
1148 |
|
|
#ifdef DEBUG |
1149 |
|
|
if (debug) dbgPrintChain(*chain, adapter3.chars()); |
1150 |
|
|
#endif |
1151 |
|
|
//save time by checking here if the extended chain is already acceptable for trimming |
1152 |
|
|
if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) { |
1153 |
|
|
l5=0; |
1154 |
|
|
l3=chain->astart-2; |
1155 |
|
|
#ifdef DEBUG |
1156 |
|
|
if (debug && a3segs.Count()>0) { |
1157 |
|
|
GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars()); |
1158 |
|
|
} |
1159 |
|
|
#endif |
1160 |
|
|
return true; |
1161 |
|
|
} |
1162 |
|
|
} //chain can be extended |
1163 |
gpertea |
4 |
} |
1164 |
gpertea |
12 |
} //collect segment alignments into chains |
1165 |
|
|
*/ |
1166 |
gpertea |
4 |
return false; //no adapter parts found |
1167 |
|
|
} |
1168 |
|
|
|
1169 |
|
|
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
1170 |
gpertea |
9 |
//if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars()); |
1171 |
gpertea |
4 |
int rlen=seq.length(); |
1172 |
|
|
l5=0; |
1173 |
|
|
l3=rlen-1; |
1174 |
|
|
//try to see if adapter is fully included in the read |
1175 |
|
|
int fi=-1; |
1176 |
|
|
if ((fi=seq.index(adapter5))>=0) { |
1177 |
|
|
if (fi<rlen-fi-a5len) {//match is closer to the right end |
1178 |
|
|
l5=fi+a5len; |
1179 |
|
|
l3=rlen-1; |
1180 |
|
|
} |
1181 |
|
|
else { |
1182 |
|
|
l5=0; |
1183 |
|
|
l3=fi-1; |
1184 |
|
|
} |
1185 |
|
|
return true; |
1186 |
|
|
} |
1187 |
gpertea |
12 |
#ifdef DEBUG |
1188 |
|
|
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
1189 |
|
|
#endif |
1190 |
|
|
|
1191 |
|
|
CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded |
1192 |
|
|
|
1193 |
|
|
//try the easy way out first - look for an exact match of 11 bases |
1194 |
|
|
int fdlen=11; |
1195 |
|
|
if (a5len<16) { |
1196 |
|
|
fdlen=a5len>>1; |
1197 |
gpertea |
4 |
} |
1198 |
gpertea |
12 |
if (fdlen>4) { |
1199 |
|
|
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1200 |
|
|
if ((fi=adapter5.index(rstart))>=0) { |
1201 |
|
|
#ifdef DEBUG |
1202 |
|
|
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1203 |
|
|
#endif |
1204 |
|
|
if (extendMatch(seq.chars(), rlen, 1, |
1205 |
|
|
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1206 |
|
|
return true; |
1207 |
|
|
} |
1208 |
|
|
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1209 |
|
|
GStr bstr=adapter5.substr(-fdlen); |
1210 |
|
|
if ((fi=seq.index(bstr))>=0) { |
1211 |
|
|
#ifdef DEBUG |
1212 |
|
|
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1213 |
|
|
#endif |
1214 |
|
|
if (extendMatch(seq.chars(), rlen, fi, |
1215 |
|
|
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1216 |
|
|
return true; |
1217 |
|
|
} |
1218 |
|
|
} //tried to matching at most 11 bases first |
1219 |
|
|
|
1220 |
|
|
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1221 |
|
|
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1222 |
|
|
int wordSize=3; |
1223 |
|
|
int hlen=12; |
1224 |
|
|
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1225 |
|
|
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1226 |
|
|
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1227 |
|
|
for (int iw=0;iw<=hlen;iw++) { |
1228 |
|
|
int apstart=a5len-iw-wordSize; |
1229 |
|
|
fi=0; |
1230 |
|
|
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1231 |
|
|
int qv=get3mer_value(adapter5.chars()+apstart); |
1232 |
|
|
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1233 |
|
|
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1234 |
|
|
#ifdef DEBUG |
1235 |
|
|
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1236 |
|
|
#endif |
1237 |
|
|
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1238 |
|
|
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1239 |
|
|
fi++; |
1240 |
|
|
if (fi>imax) break; |
1241 |
|
|
} |
1242 |
|
|
} //for each wmer in the last hlen bases of the adaptor |
1243 |
|
|
/* |
1244 |
|
|
|
1245 |
|
|
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1246 |
|
|
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1247 |
|
|
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1248 |
|
|
int hlen2=a5len-wordSize; |
1249 |
|
|
//if (hlen2>a5len-wordSize) hlen2=a5len-wordSize; |
1250 |
|
|
#ifdef DEBUG |
1251 |
|
|
if (debug && a5segs.Count()>0) { |
1252 |
|
|
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1253 |
|
|
} |
1254 |
|
|
#endif |
1255 |
|
|
if (hlen2>hlen) { |
1256 |
|
|
for (int iw=hlen+1;iw<=hlen2;iw++) { |
1257 |
|
|
int apstart=a5len-iw-wordSize; |
1258 |
|
|
fi=0; |
1259 |
|
|
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1260 |
|
|
int qv=get3mer_value(adapter5.chars()+apstart); |
1261 |
|
|
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1262 |
|
|
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1263 |
|
|
extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1264 |
|
|
a5len, apstart, wordSize, l5,l3, a5segs, true); |
1265 |
|
|
fi++; |
1266 |
|
|
if (fi>imax) break; |
1267 |
|
|
} |
1268 |
|
|
} //for each wmer between hlen2 and hlen bases of the adaptor |
1269 |
|
|
} |
1270 |
|
|
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1271 |
|
|
// lastly, analyze collected a5segs for a possible gapped alignment: |
1272 |
|
|
GList<CSegChain> segchains(false,true,false); |
1273 |
|
|
#ifdef DEBUG |
1274 |
|
|
if (debug && a5segs.Count()>0) { |
1275 |
|
|
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1276 |
gpertea |
4 |
} |
1277 |
gpertea |
12 |
#endif |
1278 |
|
|
for (int i=0;i<a5segs.Count();i++) { |
1279 |
|
|
if (a5segs[i]->chain==NULL) { |
1280 |
|
|
if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain |
1281 |
|
|
CSegChain* newchain=new CSegChain(true); |
1282 |
|
|
newchain->setFreeItem(false); |
1283 |
|
|
newchain->addSegPair(a5segs[i]); |
1284 |
|
|
a5segs[i]->chain=newchain; |
1285 |
|
|
segchains.Add(newchain); //just to free them when done |
1286 |
|
|
} |
1287 |
|
|
for (int j=i+1;j<a5segs.Count();j++) { |
1288 |
|
|
CSegChain* chain=a5segs[i]->chain; |
1289 |
|
|
if (chain->extendChain(a5segs[j])) { |
1290 |
|
|
a5segs[j]->chain=chain; |
1291 |
|
|
#ifdef DEBUG |
1292 |
|
|
if (debug) dbgPrintChain(*chain, adapter5.chars()); |
1293 |
|
|
#endif |
1294 |
|
|
//save time by checking here if the extended chain is already acceptable for trimming |
1295 |
|
|
if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) { |
1296 |
|
|
l5=chain->aend; |
1297 |
|
|
l3=rlen-1; |
1298 |
|
|
return true; |
1299 |
|
|
} |
1300 |
|
|
} //chain can be extended |
1301 |
gpertea |
4 |
} |
1302 |
gpertea |
12 |
} //collect segment alignments into chains |
1303 |
|
|
*/ |
1304 |
gpertea |
4 |
return false; //no adapter parts found |
1305 |
|
|
} |
1306 |
|
|
|
1307 |
gpertea |
12 |
//convert qvs to/from phred64 from/to phread33 |
1308 |
|
|
void convertPhred(GStr& q) { |
1309 |
|
|
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
1310 |
gpertea |
4 |
} |
1311 |
|
|
|
1312 |
gpertea |
12 |
void convertPhred(char* q, int len) { |
1313 |
|
|
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1314 |
gpertea |
4 |
} |