1 |
gpertea |
4 |
#include "GArgs.h" |
2 |
|
|
#include "GStr.h" |
3 |
|
|
#include "GHash.hh" |
4 |
|
|
#include "GList.hh" |
5 |
gpertea |
13 |
#include <ctype.h> |
6 |
gpertea |
94 |
#include "GAlnExtend.h" |
7 |
gpertea |
4 |
|
8 |
|
|
#define USAGE "Usage:\n\ |
9 |
gpertea |
76 |
fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\ |
10 |
gpertea |
172 |
[-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\ |
11 |
gpertea |
76 |
[-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\ |
12 |
|
|
<input.fq>[,<input_mates.fq>\n\ |
13 |
gpertea |
4 |
\n\ |
14 |
gpertea |
76 |
Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\ |
15 |
|
|
for low complexity and collapse duplicate reads.\n\ |
16 |
gpertea |
13 |
If read pairs should be trimmed and kept together (i.e. without discarding\n\ |
17 |
|
|
one read in a pair), the two file names should be given delimited by a comma\n\ |
18 |
|
|
or a colon character\n\ |
19 |
gpertea |
4 |
\n\ |
20 |
|
|
Options:\n\ |
21 |
|
|
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
22 |
|
|
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
23 |
|
|
the read duplication count\n\ |
24 |
gpertea |
76 |
-o unless this parameter is '-', write the trimmed/filtered reads to \n\ |
25 |
|
|
file(s) named <input>.<outsuffix> which will be created in the \n\ |
26 |
|
|
current (working) directory; (writes to stdout if -o- is given);\n\ |
27 |
|
|
a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\ |
28 |
|
|
-f file with adapter sequences to trim, each line having this format:\n\ |
29 |
|
|
<5'-adapter-sequence> <3'-adapter-sequence>\n\ |
30 |
gpertea |
4 |
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
31 |
|
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
32 |
|
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
33 |
|
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
34 |
gpertea |
103 |
-A disable polyA/T trimming (enabled by default)\n\ |
35 |
gpertea |
94 |
-y minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
36 |
gpertea |
13 |
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
37 |
gpertea |
76 |
-t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\ |
38 |
gpertea |
12 |
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
39 |
|
|
-l minimum \"clean\" length after trimming that a read must have\n\ |
40 |
|
|
in order to pass the filter (default: 16)\n\ |
41 |
gpertea |
4 |
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
42 |
|
|
one letter code for the reason why they were trashed\n\ |
43 |
gpertea |
12 |
-D apply a low-complexity (dust) filter and discard any read that has over\n\ |
44 |
|
|
50% of its length detected as low complexity\n\ |
45 |
gpertea |
4 |
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
46 |
gpertea |
12 |
name (e.g. for microRNAs)\n\ |
47 |
|
|
-p input is phred64/phred33 (use -p64 or -p33)\n\ |
48 |
|
|
-Q convert quality values to the other Phred qv type\n\ |
49 |
|
|
-V verbose processing\n\ |
50 |
gpertea |
4 |
" |
51 |
gpertea |
170 |
|
52 |
gpertea |
13 |
//-z for -o option, the output stream(s) will be first piped into the given\n |
53 |
|
|
// <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n |
54 |
|
|
|
55 |
gpertea |
170 |
|
56 |
gpertea |
4 |
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
57 |
gpertea |
12 |
|
58 |
gpertea |
76 |
//For paired reads sequencing: |
59 |
gpertea |
12 |
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
60 |
|
|
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
61 |
gpertea |
13 |
//FILE* f_out=NULL; //stdout if not provided |
62 |
|
|
//FILE* f_out2=NULL; //for paired reads |
63 |
|
|
//FILE* f_in=NULL; //input fastq (stdin if not provided) |
64 |
|
|
//FILE* f_in2=NULL; //for paired reads |
65 |
gpertea |
170 |
|
66 |
gpertea |
4 |
FILE* freport=NULL; |
67 |
gpertea |
13 |
|
68 |
gpertea |
4 |
bool debug=false; |
69 |
gpertea |
12 |
bool verbose=false; |
70 |
gpertea |
4 |
bool doCollapse=false; |
71 |
|
|
bool doDust=false; |
72 |
gpertea |
103 |
bool doPolyTrim=true; |
73 |
gpertea |
13 |
bool fastaOutput=false; |
74 |
gpertea |
4 |
bool trashReport=false; |
75 |
gpertea |
172 |
bool revCompl=false; //also reverse complement adaptor sequences |
76 |
gpertea |
4 |
int min_read_len=16; |
77 |
gpertea |
12 |
double max_perc_N=7.0; |
78 |
gpertea |
4 |
int dust_cutoff=16; |
79 |
|
|
bool isfasta=false; |
80 |
gpertea |
12 |
bool convert_phred=false; |
81 |
gpertea |
170 |
GStr outsuffix; // -o |
82 |
gpertea |
13 |
GStr prefix; |
83 |
|
|
GStr zcmd; |
84 |
|
|
int num_trimmed5=0; |
85 |
|
|
int num_trimmed3=0; |
86 |
|
|
int num_trimmedN=0; |
87 |
|
|
int num_trimmedQ=0; |
88 |
|
|
int min_trimmed5=INT_MAX; |
89 |
|
|
int min_trimmed3=INT_MAX; |
90 |
gpertea |
12 |
|
91 |
gpertea |
13 |
int qvtrim_qmin=0; |
92 |
|
|
int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases |
93 |
gpertea |
12 |
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
94 |
|
|
int qv_cvtadd=0; //could be -31 or +31 |
95 |
|
|
|
96 |
gpertea |
94 |
// adaptor matching metrics -- for X-drop ungapped extension |
97 |
gpertea |
177 |
//const int match_reward=2; |
98 |
|
|
//const int mismatch_penalty=3; |
99 |
gpertea |
103 |
const int match_reward=2; |
100 |
gpertea |
177 |
const int mismatch_penalty=4; |
101 |
|
|
const int Xdrop=10; |
102 |
gpertea |
103 |
|
103 |
gpertea |
94 |
const int poly_m_score=2; //match score |
104 |
|
|
const int poly_mis_score=-3; //mismatch |
105 |
gpertea |
88 |
const int poly_dropoff_score=7; |
106 |
gpertea |
94 |
int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed |
107 |
gpertea |
4 |
|
108 |
gpertea |
94 |
const char *polyA_seed="AAAA"; |
109 |
|
|
const char *polyT_seed="TTTT"; |
110 |
gpertea |
12 |
|
111 |
gpertea |
170 |
struct CASeqData { |
112 |
|
|
//positional data for every possible hexamer in an adapter |
113 |
gpertea |
172 |
GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence |
114 |
|
|
GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence |
115 |
gpertea |
170 |
GStr seq; //actual adapter sequence data |
116 |
|
|
GStr seqr; //reverse complement sequence |
117 |
gpertea |
175 |
int amlen; //fraction of adapter length matching that's |
118 |
|
|
//enough to consider the alignment |
119 |
gpertea |
172 |
bool use_reverse; |
120 |
gpertea |
175 |
CASeqData(bool rev=false):seq(),seqr(), |
121 |
|
|
amlen(0), use_reverse(rev) { |
122 |
gpertea |
172 |
for (int i=0;i<4096;i++) { |
123 |
|
|
pz[i]=NULL; |
124 |
|
|
pzr[i]=NULL; |
125 |
|
|
} |
126 |
|
|
} |
127 |
|
|
|
128 |
|
|
void update(const char* s) { |
129 |
gpertea |
175 |
seq=s; |
130 |
|
|
table6mers(seq.chars(), seq.length(), pz); |
131 |
|
|
amlen=iround(double(seq.length())*0.8); |
132 |
|
|
if (amlen<12) |
133 |
|
|
amlen=12; |
134 |
|
|
if (!use_reverse) return; |
135 |
|
|
//reverse complement |
136 |
|
|
seqr=s; |
137 |
|
|
int slen=seq.length(); |
138 |
|
|
for (int i=0;i<slen;i++) |
139 |
|
|
seqr[i]=ntComplement(seq[slen-i-1]); |
140 |
|
|
table6mers(seqr.chars(), seqr.length(), pzr); |
141 |
gpertea |
170 |
} |
142 |
|
|
|
143 |
gpertea |
172 |
~CASeqData() { |
144 |
|
|
for (int i=0;i<4096;i++) { |
145 |
|
|
delete pz[i]; |
146 |
|
|
delete pzr[i]; |
147 |
gpertea |
170 |
} |
148 |
|
|
} |
149 |
|
|
}; |
150 |
|
|
|
151 |
|
|
GVec<CASeqData> adapters5; |
152 |
|
|
GVec<CASeqData> adapters3; |
153 |
|
|
|
154 |
gpertea |
103 |
CGreedyAlignData* gxmem_l=NULL; |
155 |
|
|
CGreedyAlignData* gxmem_r=NULL; |
156 |
|
|
|
157 |
gpertea |
4 |
// element in dhash: |
158 |
|
|
class FqDupRec { |
159 |
|
|
public: |
160 |
|
|
int count; //how many of these reads are the same |
161 |
|
|
int len; //length of qv |
162 |
|
|
char* firstname; //optional, only if we want to keep the original read names |
163 |
|
|
char* qv; |
164 |
|
|
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
165 |
|
|
len=0; |
166 |
|
|
qv=NULL; |
167 |
|
|
firstname=NULL; |
168 |
|
|
count=0; |
169 |
|
|
if (qstr!=NULL) { |
170 |
|
|
qv=Gstrdup(qstr->chars()); |
171 |
|
|
len=qstr->length(); |
172 |
|
|
count++; |
173 |
|
|
} |
174 |
|
|
if (rname!=NULL) firstname=Gstrdup(rname); |
175 |
|
|
} |
176 |
|
|
~FqDupRec() { |
177 |
|
|
GFREE(qv); |
178 |
|
|
GFREE(firstname); |
179 |
|
|
} |
180 |
|
|
void add(GStr& d) { //collapse another record into this one |
181 |
|
|
if (d.length()!=len) |
182 |
|
|
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
183 |
|
|
count++; |
184 |
|
|
for (int i=0;i<len;i++) |
185 |
|
|
qv[i]=(qv[i]+d[i])/2; |
186 |
|
|
} |
187 |
|
|
}; |
188 |
|
|
|
189 |
|
|
void openfw(FILE* &f, GArgs& args, char opt) { |
190 |
|
|
GStr s=args.getOpt(opt); |
191 |
|
|
if (!s.is_empty()) { |
192 |
|
|
if (s=='-') f=stdout; |
193 |
|
|
else { |
194 |
gpertea |
13 |
f=fopen(s.chars(),"w"); |
195 |
gpertea |
4 |
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
196 |
|
|
} |
197 |
|
|
} |
198 |
|
|
} |
199 |
|
|
|
200 |
|
|
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
201 |
|
|
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
202 |
|
|
|
203 |
|
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
204 |
|
|
|
205 |
gpertea |
172 |
void addAdapter(GVec<CASeqData>& adapters, GStr& seq); |
206 |
gpertea |
94 |
int loadAdapters(const char* fname); |
207 |
|
|
|
208 |
gpertea |
170 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
209 |
|
|
GStr& s, GStr& infname, GStr& infname2); |
210 |
gpertea |
13 |
// uses outsuffix to generate output file names and open file handles as needed |
211 |
gpertea |
170 |
|
212 |
gpertea |
13 |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter); |
213 |
|
|
void trash_report(char trashcode, GStr& rname, FILE* freport); |
214 |
|
|
|
215 |
gpertea |
170 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
216 |
gpertea |
13 |
GStr& rname, GStr& rinfo, GStr& infname); |
217 |
|
|
|
218 |
gpertea |
170 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
219 |
gpertea |
13 |
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
220 |
|
|
|
221 |
gpertea |
12 |
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
222 |
|
|
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
223 |
gpertea |
4 |
int dust(GStr& seq); |
224 |
gpertea |
94 |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured |
225 |
|
|
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed); |
226 |
gpertea |
4 |
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
227 |
gpertea |
94 |
bool trim_adapter3(GStr& seq, int &l5, int &l3); |
228 |
gpertea |
4 |
|
229 |
gpertea |
12 |
void convertPhred(char* q, int len); |
230 |
|
|
void convertPhred(GStr& q); |
231 |
gpertea |
4 |
|
232 |
|
|
int main(int argc, char * const argv[]) { |
233 |
gpertea |
172 |
GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:"); |
234 |
gpertea |
4 |
int e; |
235 |
|
|
if ((e=args.isError())>0) { |
236 |
|
|
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
237 |
|
|
exit(224); |
238 |
|
|
} |
239 |
gpertea |
9 |
debug=(args.getOpt('Y')!=NULL); |
240 |
gpertea |
13 |
verbose=(args.getOpt('V')!=NULL); |
241 |
gpertea |
12 |
convert_phred=(args.getOpt('Q')!=NULL); |
242 |
gpertea |
4 |
doCollapse=(args.getOpt('C')!=NULL); |
243 |
|
|
doDust=(args.getOpt('D')!=NULL); |
244 |
gpertea |
172 |
revCompl=(args.getOpt('R')!=NULL); |
245 |
gpertea |
103 |
if (args.getOpt('A')) doPolyTrim=false; |
246 |
gpertea |
12 |
/* |
247 |
gpertea |
4 |
rawFormat=(args.getOpt('R')!=NULL); |
248 |
|
|
if (rawFormat) { |
249 |
|
|
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
250 |
|
|
} |
251 |
gpertea |
12 |
*/ |
252 |
gpertea |
4 |
prefix=args.getOpt('n'); |
253 |
|
|
GStr s=args.getOpt('l'); |
254 |
gpertea |
170 |
if (!s.is_empty()) |
255 |
gpertea |
4 |
min_read_len=s.asInt(); |
256 |
gpertea |
12 |
s=args.getOpt('m'); |
257 |
gpertea |
170 |
if (!s.is_empty()) |
258 |
gpertea |
12 |
max_perc_N=s.asDouble(); |
259 |
gpertea |
4 |
s=args.getOpt('d'); |
260 |
|
|
if (!s.is_empty()) { |
261 |
|
|
dust_cutoff=s.asInt(); |
262 |
|
|
doDust=true; |
263 |
|
|
} |
264 |
gpertea |
12 |
s=args.getOpt('q'); |
265 |
|
|
if (!s.is_empty()) { |
266 |
gpertea |
13 |
qvtrim_qmin=s.asInt(); |
267 |
gpertea |
12 |
} |
268 |
gpertea |
13 |
s=args.getOpt('t'); |
269 |
|
|
if (!s.is_empty()) { |
270 |
|
|
qvtrim_max=s.asInt(); |
271 |
|
|
} |
272 |
gpertea |
12 |
s=args.getOpt('p'); |
273 |
|
|
if (!s.is_empty()) { |
274 |
|
|
int v=s.asInt(); |
275 |
|
|
if (v==33) { |
276 |
|
|
qv_phredtype=33; |
277 |
|
|
qv_cvtadd=31; |
278 |
|
|
} |
279 |
|
|
else if (v==64) { |
280 |
|
|
qv_phredtype=64; |
281 |
|
|
qv_cvtadd=-31; |
282 |
|
|
} |
283 |
gpertea |
170 |
else |
284 |
gpertea |
12 |
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
285 |
|
|
} |
286 |
gpertea |
94 |
s=args.getOpt('f'); |
287 |
|
|
if (!s.is_empty()) { |
288 |
|
|
loadAdapters(s.chars()); |
289 |
|
|
} |
290 |
gpertea |
102 |
bool fileAdapters=adapters5.Count()+adapters3.Count(); |
291 |
gpertea |
94 |
s=args.getOpt('5'); |
292 |
|
|
if (!s.is_empty()) { |
293 |
|
|
if (fileAdapters) |
294 |
|
|
GError("Error: options -5 and -f cannot be used together!\n"); |
295 |
|
|
s.upper(); |
296 |
gpertea |
172 |
addAdapter(adapters5, s); |
297 |
gpertea |
4 |
} |
298 |
gpertea |
94 |
s=args.getOpt('3'); |
299 |
|
|
if (!s.is_empty()) { |
300 |
|
|
if (fileAdapters) |
301 |
|
|
GError("Error: options -3 and -f cannot be used together!\n"); |
302 |
gpertea |
102 |
s.upper(); |
303 |
gpertea |
172 |
addAdapter(adapters3, s); |
304 |
gpertea |
4 |
} |
305 |
gpertea |
94 |
s=args.getOpt('y'); |
306 |
gpertea |
13 |
if (!s.is_empty()) { |
307 |
gpertea |
94 |
int minmatch=s.asInt(); |
308 |
|
|
poly_minScore=minmatch*poly_m_score; |
309 |
gpertea |
13 |
} |
310 |
gpertea |
170 |
|
311 |
gpertea |
13 |
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
312 |
gpertea |
76 |
else outsuffix="-"; |
313 |
gpertea |
4 |
trashReport= (args.getOpt('r')!=NULL); |
314 |
gpertea |
13 |
int fcount=args.startNonOpt(); |
315 |
|
|
if (fcount==0) { |
316 |
gpertea |
4 |
GMessage(USAGE); |
317 |
|
|
exit(224); |
318 |
|
|
} |
319 |
gpertea |
13 |
if (fcount>1 && doCollapse) { |
320 |
|
|
GError("%s Sorry, the -C option only works with a single input.\n", USAGE); |
321 |
|
|
} |
322 |
|
|
//openfw(f_out, args, 'o'); |
323 |
|
|
//if (f_out==NULL) f_out=stdout; |
324 |
gpertea |
4 |
if (trashReport) |
325 |
|
|
openfw(freport, args, 'r'); |
326 |
|
|
char* infile=NULL; |
327 |
gpertea |
103 |
|
328 |
|
|
if (adapters5.Count()>0) |
329 |
gpertea |
177 |
//gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2); |
330 |
|
|
gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop); |
331 |
gpertea |
103 |
if (adapters3.Count()>0) |
332 |
|
|
gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop); |
333 |
|
|
|
334 |
gpertea |
4 |
while ((infile=args.nextNonOpt())!=NULL) { |
335 |
gpertea |
103 |
//for each input file |
336 |
gpertea |
13 |
int incounter=0; //counter for input reads |
337 |
|
|
int outcounter=0; //counter for output reads |
338 |
|
|
int trash_s=0; //too short from the get go |
339 |
|
|
int trash_Q=0; |
340 |
|
|
int trash_N=0; |
341 |
|
|
int trash_D=0; |
342 |
gpertea |
103 |
int trash_poly=0; |
343 |
gpertea |
13 |
int trash_A3=0; |
344 |
|
|
int trash_A5=0; |
345 |
|
|
s=infile; |
346 |
|
|
GStr infname; |
347 |
|
|
GStr infname2; |
348 |
|
|
FILE* f_in=NULL; |
349 |
|
|
FILE* f_in2=NULL; |
350 |
|
|
FILE* f_out=NULL; |
351 |
|
|
FILE* f_out2=NULL; |
352 |
|
|
bool paired_reads=false; |
353 |
|
|
setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2); |
354 |
|
|
GLineReader fq(f_in); |
355 |
|
|
GLineReader* fq2=NULL; |
356 |
|
|
if (f_in2!=NULL) { |
357 |
|
|
fq2=new GLineReader(f_in2); |
358 |
|
|
paired_reads=true; |
359 |
|
|
} |
360 |
|
|
GStr rseq, rqv, seqid, seqinfo; |
361 |
|
|
GStr rseq2, rqv2, seqid2, seqinfo2; |
362 |
|
|
while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) { |
363 |
|
|
incounter++; |
364 |
|
|
int a5=0, a3=0, b5=0, b3=0; |
365 |
|
|
char tcode=0, tcode2=0; |
366 |
|
|
tcode=process_read(seqid, rseq, rqv, a5, a3); |
367 |
gpertea |
103 |
if (fq2!=NULL) { |
368 |
gpertea |
13 |
getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2); |
369 |
|
|
if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) { |
370 |
|
|
GError("Error: no paired match for read %s vs %s (%s,%s)\n", |
371 |
|
|
seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars()); |
372 |
|
|
} |
373 |
|
|
tcode2=process_read(seqid2, rseq2, rqv2, b5, b3); |
374 |
|
|
//decide what to do with this pair and print rseq2 if the pair makes it |
375 |
|
|
if (tcode>1 && tcode2<=1) { |
376 |
|
|
//"untrash" rseq |
377 |
|
|
if (a3-a5+1<min_read_len) { |
378 |
|
|
a5=1; |
379 |
|
|
if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); } |
380 |
gpertea |
4 |
} |
381 |
gpertea |
13 |
tcode=1; |
382 |
|
|
} |
383 |
|
|
else if (tcode<=1 && tcode2>1) { |
384 |
|
|
//"untrash" rseq2 |
385 |
|
|
if (b3-b5+1<min_read_len) { |
386 |
|
|
b5=1; |
387 |
|
|
if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); } |
388 |
|
|
} |
389 |
|
|
tcode2=1; |
390 |
|
|
} |
391 |
|
|
if (tcode<=1) { //trimmed or left intact -- write it! |
392 |
|
|
if (tcode>0) { |
393 |
|
|
rseq2=rseq2.substr(b5,b3-b5+1); |
394 |
|
|
if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1); |
395 |
gpertea |
4 |
} |
396 |
gpertea |
13 |
int nocounter=0; |
397 |
|
|
writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter); |
398 |
|
|
} |
399 |
gpertea |
103 |
} //pair read |
400 |
gpertea |
13 |
if (tcode>1) { //trashed |
401 |
gpertea |
103 |
#ifdef GDEBUG |
402 |
gpertea |
175 |
GMessage(" !!!!TRASH code = %c\n",tcode); |
403 |
gpertea |
103 |
#endif |
404 |
gpertea |
13 |
if (tcode=='s') trash_s++; |
405 |
gpertea |
103 |
else if (tcode=='A' || tcode=='T') trash_poly++; |
406 |
gpertea |
13 |
else if (tcode=='Q') trash_Q++; |
407 |
|
|
else if (tcode=='N') trash_N++; |
408 |
|
|
else if (tcode=='D') trash_D++; |
409 |
|
|
else if (tcode=='3') trash_A3++; |
410 |
|
|
else if (tcode=='5') trash_A5++; |
411 |
|
|
if (trashReport) trash_report(tcode, seqid, freport); |
412 |
|
|
} |
413 |
|
|
else if (!doCollapse) { //write it |
414 |
|
|
if (tcode>0) { |
415 |
|
|
rseq=rseq.substr(a5,a3-a5+1); |
416 |
|
|
if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1); |
417 |
|
|
} |
418 |
gpertea |
103 |
#ifdef GDEBUG |
419 |
|
|
GMessage(" After trimming:\n"); |
420 |
gpertea |
105 |
GMessage("%s\n",rseq.chars()); |
421 |
gpertea |
103 |
#endif |
422 |
gpertea |
13 |
writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter); |
423 |
|
|
} |
424 |
|
|
} //for each fastq record |
425 |
|
|
delete fq2; |
426 |
|
|
FRCLOSE(f_in); |
427 |
|
|
FRCLOSE(f_in2); |
428 |
|
|
if (doCollapse) { |
429 |
|
|
outcounter=0; |
430 |
|
|
int maxdup_count=1; |
431 |
|
|
char* maxdup_seq=NULL; |
432 |
|
|
dhash.startIterate(); |
433 |
|
|
FqDupRec* qd=NULL; |
434 |
|
|
char* seq=NULL; |
435 |
|
|
while ((qd=dhash.NextData(seq))!=NULL) { |
436 |
|
|
GStr rseq(seq); |
437 |
|
|
//do the dusting here |
438 |
|
|
if (doDust) { |
439 |
|
|
int dustbases=dust(rseq); |
440 |
|
|
if (dustbases>(rseq.length()>>1)) { |
441 |
|
|
if (trashReport && qd->firstname!=NULL) { |
442 |
|
|
fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count); |
443 |
gpertea |
4 |
} |
444 |
gpertea |
13 |
trash_D+=qd->count; |
445 |
|
|
continue; |
446 |
|
|
} |
447 |
|
|
} |
448 |
|
|
outcounter++; |
449 |
gpertea |
170 |
if (qd->count>maxdup_count) { |
450 |
gpertea |
13 |
maxdup_count=qd->count; |
451 |
|
|
maxdup_seq=seq; |
452 |
|
|
} |
453 |
|
|
if (isfasta) { |
454 |
|
|
if (prefix.is_empty()) { |
455 |
gpertea |
170 |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
456 |
gpertea |
13 |
rseq.chars()); |
457 |
gpertea |
4 |
} |
458 |
gpertea |
13 |
else { //use custom read name |
459 |
|
|
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
460 |
|
|
qd->count, rseq.chars()); |
461 |
gpertea |
4 |
} |
462 |
gpertea |
13 |
} |
463 |
|
|
else { //fastq format |
464 |
|
|
if (convert_phred) convertPhred(qd->qv, qd->len); |
465 |
|
|
if (prefix.is_empty()) { |
466 |
gpertea |
170 |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
467 |
gpertea |
13 |
rseq.chars(), qd->qv); |
468 |
gpertea |
4 |
} |
469 |
gpertea |
13 |
else { //use custom read name |
470 |
|
|
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
471 |
|
|
qd->count, rseq.chars(), qd->qv); |
472 |
|
|
} |
473 |
|
|
} |
474 |
|
|
}//for each element of dhash |
475 |
|
|
if (maxdup_count>1) { |
476 |
|
|
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
477 |
gpertea |
4 |
} |
478 |
gpertea |
13 |
} //collapse entries |
479 |
|
|
if (verbose) { |
480 |
|
|
if (paired_reads) { |
481 |
|
|
GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars()); |
482 |
|
|
GMessage("Number of input pairs :%9d\n", incounter); |
483 |
|
|
GMessage(" Output pairs :%9d\n", outcounter); |
484 |
|
|
} |
485 |
|
|
else { |
486 |
|
|
GMessage(">Input file : %s\n", infname.chars()); |
487 |
|
|
GMessage("Number of input reads :%9d\n", incounter); |
488 |
|
|
GMessage(" Output reads :%9d\n", outcounter); |
489 |
|
|
} |
490 |
|
|
GMessage("------------------------------------\n"); |
491 |
|
|
if (num_trimmed5) |
492 |
|
|
GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5); |
493 |
|
|
if (num_trimmed3) |
494 |
|
|
GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3); |
495 |
|
|
GMessage("------------------------------------\n"); |
496 |
|
|
if (trash_s>0) |
497 |
|
|
GMessage(" Trashed by length:%9d\n", trash_s); |
498 |
|
|
if (trash_N>0) |
499 |
|
|
GMessage(" Trashed by N%%:%9d\n", trash_N); |
500 |
|
|
if (trash_Q>0) |
501 |
|
|
GMessage("Trashed by low quality:%9d\n", trash_Q); |
502 |
gpertea |
103 |
if (trash_poly>0) |
503 |
|
|
GMessage(" Trashed by poly-A/T:%9d\n", trash_poly); |
504 |
gpertea |
13 |
if (trash_A5>0) |
505 |
|
|
GMessage(" Trashed by 5' adapter:%9d\n", trash_A5); |
506 |
|
|
if (trash_A3>0) |
507 |
|
|
GMessage(" Trashed by 3' adapter:%9d\n", trash_A3); |
508 |
|
|
} |
509 |
|
|
if (trashReport) { |
510 |
|
|
FWCLOSE(freport); |
511 |
gpertea |
4 |
} |
512 |
gpertea |
13 |
FWCLOSE(f_out); |
513 |
|
|
FWCLOSE(f_out2); |
514 |
|
|
} //while each input file |
515 |
gpertea |
103 |
delete gxmem_l; |
516 |
|
|
delete gxmem_r; |
517 |
gpertea |
12 |
//getc(stdin); |
518 |
gpertea |
4 |
} |
519 |
|
|
|
520 |
|
|
class NData { |
521 |
|
|
public: |
522 |
|
|
int NPos[1024]; //there should be no reads longer than 1K ? |
523 |
|
|
int NCount; |
524 |
|
|
int end5; |
525 |
|
|
int end3; |
526 |
|
|
int seqlen; |
527 |
|
|
double perc_N; //percentage of Ns in end5..end3 range only! |
528 |
|
|
const char* seq; |
529 |
|
|
bool valid; |
530 |
|
|
NData() { |
531 |
gpertea |
170 |
seqlen=0; |
532 |
gpertea |
4 |
NCount=0; |
533 |
|
|
end5=0; |
534 |
|
|
end3=0; |
535 |
|
|
seq=NULL; |
536 |
|
|
perc_N=0; |
537 |
|
|
valid=true; |
538 |
|
|
} |
539 |
|
|
void init(GStr& rseq) { |
540 |
|
|
NCount=0; |
541 |
|
|
perc_N=0; |
542 |
|
|
seqlen=rseq.length(); |
543 |
|
|
seq=rseq.chars(); |
544 |
|
|
end5=1; |
545 |
|
|
end3=seqlen; |
546 |
|
|
for (int i=0;i<seqlen;i++) |
547 |
gpertea |
12 |
if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
548 |
gpertea |
4 |
NPos[NCount]=i; |
549 |
|
|
NCount++; |
550 |
|
|
} |
551 |
|
|
perc_N=(NCount*100.0)/seqlen; |
552 |
|
|
valid=true; |
553 |
|
|
} |
554 |
|
|
void N_calc() { //only in the region after trimming |
555 |
|
|
int n=0; |
556 |
|
|
for (int i=end3-1;i<end5;i++) { |
557 |
|
|
if (seq[i]=='N') n++; |
558 |
|
|
} |
559 |
|
|
perc_N=(n*100.0)/(end5-end3+1); |
560 |
|
|
} |
561 |
|
|
}; |
562 |
gpertea |
170 |
|
563 |
gpertea |
4 |
static NData feat; |
564 |
|
|
int perc_lenN=12; // incremental distance from ends, in percentage of |
565 |
|
|
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
566 |
gpertea |
170 |
|
567 |
gpertea |
4 |
void N_analyze(int l5, int l3, int p5, int p3) { |
568 |
|
|
/* assumes feat was filled properly */ |
569 |
|
|
int old_dif, t5,t3,v; |
570 |
gpertea |
12 |
if (l3<l5+2 || p5>p3 ) { |
571 |
gpertea |
4 |
feat.end5=l5+1; |
572 |
|
|
feat.end3=l3+1; |
573 |
gpertea |
170 |
return; |
574 |
gpertea |
4 |
} |
575 |
|
|
|
576 |
|
|
t5=feat.NPos[p5]-l5; |
577 |
|
|
t3=l3-feat.NPos[p3]; |
578 |
|
|
old_dif=p3-p5; |
579 |
|
|
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
580 |
gpertea |
170 |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
581 |
gpertea |
4 |
if (t5 < v ) { |
582 |
|
|
l5=feat.NPos[p5]+1; |
583 |
|
|
p5++; |
584 |
|
|
} |
585 |
|
|
if (t3 < v) { |
586 |
|
|
l3=feat.NPos[p3]-1; |
587 |
|
|
p3--; |
588 |
|
|
} |
589 |
|
|
/* restNs=p3-p5; number of Ns in the new CLR */ |
590 |
|
|
if (p3-p5==old_dif) { /* no change, return */ |
591 |
|
|
/* don't trim if this may shorten the read too much */ |
592 |
|
|
//if (l5-l3<min_read_len) return; |
593 |
|
|
feat.end5=l5+1; |
594 |
|
|
feat.end3=l3+1; |
595 |
|
|
return; |
596 |
|
|
} |
597 |
gpertea |
170 |
else |
598 |
gpertea |
4 |
N_analyze(l5,l3, p5,p3); |
599 |
|
|
} |
600 |
|
|
|
601 |
|
|
|
602 |
gpertea |
12 |
bool qtrim(GStr& qvs, int &l5, int &l3) { |
603 |
gpertea |
13 |
if (qvtrim_qmin==0 || qvs.is_empty()) return false; |
604 |
gpertea |
12 |
if (qv_phredtype==0) { |
605 |
|
|
//try to guess the Phred type |
606 |
|
|
int vmin=256, vmax=0; |
607 |
|
|
for (int i=0;i<qvs.length();i++) { |
608 |
|
|
if (vmin>qvs[i]) vmin=qvs[i]; |
609 |
|
|
if (vmax<qvs[i]) vmax=qvs[i]; |
610 |
|
|
} |
611 |
|
|
if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; } |
612 |
|
|
if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; } |
613 |
|
|
if (qv_phredtype==0) { |
614 |
|
|
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
615 |
|
|
} |
616 |
gpertea |
13 |
if (verbose) |
617 |
|
|
GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64)); |
618 |
|
|
} //guessing Phred type |
619 |
gpertea |
12 |
for (l3=qvs.length()-1;l3>2;l3--) { |
620 |
gpertea |
13 |
if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break; |
621 |
gpertea |
12 |
} |
622 |
|
|
//just in case, check also the 5' the end (?) |
623 |
|
|
for (l5=0;l5<qvs.length()-3;l5++) { |
624 |
gpertea |
13 |
if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break; |
625 |
gpertea |
12 |
} |
626 |
gpertea |
13 |
if (qvtrim_max>0) { |
627 |
|
|
if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max; |
628 |
|
|
if (l5>qvtrim_max) l5=qvtrim_max; |
629 |
|
|
} |
630 |
gpertea |
12 |
return (l5>0 || l3<qvs.length()-1); |
631 |
|
|
} |
632 |
|
|
|
633 |
|
|
bool ntrim(GStr& rseq, int &l5, int &l3) { |
634 |
|
|
//count Ns in the sequence, trim N-rich ends |
635 |
gpertea |
4 |
feat.init(rseq); |
636 |
|
|
l5=feat.end5-1; |
637 |
|
|
l3=feat.end3-1; |
638 |
gpertea |
170 |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
639 |
gpertea |
12 |
if (l5==feat.end5-1 && l3==feat.end3-1) { |
640 |
|
|
if (feat.perc_N>max_perc_N) { |
641 |
|
|
feat.valid=false; |
642 |
|
|
l3=l5+1; |
643 |
|
|
return true; |
644 |
|
|
} |
645 |
|
|
else { |
646 |
|
|
return false; //nothing changed |
647 |
|
|
} |
648 |
|
|
} |
649 |
gpertea |
4 |
l5=feat.end5-1; |
650 |
|
|
l3=feat.end3-1; |
651 |
|
|
if (l3-l5+1<min_read_len) { |
652 |
|
|
feat.valid=false; |
653 |
|
|
return true; |
654 |
|
|
} |
655 |
|
|
feat.N_calc(); |
656 |
gpertea |
170 |
|
657 |
gpertea |
12 |
if (feat.perc_N>max_perc_N) { |
658 |
gpertea |
4 |
feat.valid=false; |
659 |
|
|
l3=l5+1; |
660 |
|
|
return true; |
661 |
|
|
} |
662 |
|
|
return true; |
663 |
|
|
} |
664 |
|
|
|
665 |
|
|
//--------------- dust functions ---------------- |
666 |
|
|
class DNADuster { |
667 |
|
|
public: |
668 |
gpertea |
170 |
int dustword; |
669 |
|
|
int dustwindow; |
670 |
|
|
int dustwindow2; |
671 |
gpertea |
4 |
int dustcutoff; |
672 |
|
|
int mv, iv, jv; |
673 |
|
|
int counts[32*32*32]; |
674 |
|
|
int iis[32*32*32]; |
675 |
|
|
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
676 |
|
|
dustword=wordsize; |
677 |
|
|
dustwindow=winsize; |
678 |
|
|
dustwindow2 = (winsize>>1); |
679 |
|
|
dustcutoff=cutoff; |
680 |
|
|
mv=0; |
681 |
|
|
iv=0; |
682 |
|
|
jv=0; |
683 |
|
|
} |
684 |
|
|
void setWindowSize(int value) { |
685 |
|
|
dustwindow = value; |
686 |
|
|
dustwindow2 = (dustwindow >> 1); |
687 |
|
|
} |
688 |
|
|
void setWordSize(int value) { |
689 |
|
|
dustword=value; |
690 |
|
|
} |
691 |
|
|
void wo1(int len, const char* s, int ivv) { |
692 |
|
|
int i, ii, j, v, t, n, n1, sum; |
693 |
|
|
int js, nis; |
694 |
|
|
n = 32 * 32 * 32; |
695 |
|
|
n1 = n - 1; |
696 |
|
|
nis = 0; |
697 |
|
|
i = 0; |
698 |
|
|
ii = 0; |
699 |
|
|
sum = 0; |
700 |
|
|
v = 0; |
701 |
|
|
for (j=0; j < len; j++, s++) { |
702 |
|
|
ii <<= 5; |
703 |
|
|
if (*s<=32) { |
704 |
|
|
i=0; |
705 |
|
|
continue; |
706 |
|
|
} |
707 |
|
|
ii |= *s - 'A'; //assume uppercase! |
708 |
|
|
ii &= n1; |
709 |
|
|
i++; |
710 |
|
|
if (i >= dustword) { |
711 |
|
|
for (js=0; js < nis && iis[js] != ii; js++) ; |
712 |
|
|
if (js == nis) { |
713 |
|
|
iis[nis] = ii; |
714 |
|
|
counts[ii] = 0; |
715 |
|
|
nis++; |
716 |
|
|
} |
717 |
|
|
if ((t = counts[ii]) > 0) { |
718 |
|
|
sum += t; |
719 |
|
|
v = 10 * sum / j; |
720 |
|
|
if (mv < v) { |
721 |
|
|
mv = v; |
722 |
|
|
iv = ivv; |
723 |
|
|
jv = j; |
724 |
|
|
} |
725 |
|
|
} |
726 |
|
|
counts[ii]++; |
727 |
|
|
} |
728 |
|
|
} |
729 |
|
|
} |
730 |
|
|
|
731 |
|
|
int wo(int len, const char* s, int* beg, int* end) { |
732 |
|
|
int i, l1; |
733 |
|
|
l1 = len - dustword + 1; |
734 |
|
|
if (l1 < 0) { |
735 |
|
|
*beg = 0; |
736 |
|
|
*end = len - 1; |
737 |
|
|
return 0; |
738 |
|
|
} |
739 |
|
|
mv = 0; |
740 |
|
|
iv = 0; |
741 |
|
|
jv = 0; |
742 |
|
|
for (i=0; i < l1; i++) { |
743 |
|
|
wo1(len-i, s+i, i); |
744 |
|
|
} |
745 |
|
|
*beg = iv; |
746 |
|
|
*end = iv + jv; |
747 |
|
|
return mv; |
748 |
|
|
} |
749 |
|
|
|
750 |
|
|
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
751 |
|
|
int i, j, l, a, b, v; |
752 |
|
|
if (cutoff==0) cutoff=dustcutoff; |
753 |
|
|
a=0;b=0; |
754 |
|
|
//GMessage("Dust cutoff=%d\n", cutoff); |
755 |
|
|
for (i=0; i < seqlen; i += dustwindow2) { |
756 |
|
|
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
757 |
|
|
v = wo(l, seq+i, &a, &b); |
758 |
|
|
if (v > cutoff) { |
759 |
|
|
//for (j = a; j <= b && j < dustwindow2; j++) { |
760 |
|
|
for (j = a; j <= b; j++) { |
761 |
|
|
seqmsk[i+j]='N';//could be made lowercase instead |
762 |
|
|
} |
763 |
|
|
} |
764 |
|
|
} |
765 |
gpertea |
170 |
//return first; |
766 |
gpertea |
4 |
} |
767 |
|
|
}; |
768 |
|
|
|
769 |
|
|
static DNADuster duster; |
770 |
|
|
|
771 |
|
|
int dust(GStr& rseq) { |
772 |
|
|
char* seq=Gstrdup(rseq.chars()); |
773 |
|
|
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
774 |
|
|
//check the number of Ns: |
775 |
|
|
int ncount=0; |
776 |
|
|
for (int i=0;i<rseq.length();i++) { |
777 |
|
|
if (seq[i]=='N') ncount++; |
778 |
|
|
} |
779 |
|
|
GFREE(seq); |
780 |
|
|
return ncount; |
781 |
|
|
} |
782 |
|
|
|
783 |
gpertea |
94 |
struct SLocScore { |
784 |
|
|
int pos; |
785 |
|
|
int score; |
786 |
|
|
SLocScore(int p=0,int s=0) { |
787 |
|
|
pos=p; |
788 |
|
|
score=s; |
789 |
|
|
} |
790 |
|
|
void set(int p, int s) { |
791 |
|
|
pos=p; |
792 |
|
|
score=s; |
793 |
|
|
} |
794 |
|
|
void add(int p, int add) { |
795 |
|
|
pos=p; |
796 |
|
|
score+=add; |
797 |
|
|
} |
798 |
|
|
}; |
799 |
gpertea |
12 |
|
800 |
gpertea |
94 |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
801 |
gpertea |
103 |
if (!doPolyTrim) return false; |
802 |
gpertea |
94 |
int rlen=seq.length(); |
803 |
|
|
l5=0; |
804 |
|
|
l3=rlen-1; |
805 |
|
|
int32 seedVal=*(int32*)poly_seed; |
806 |
|
|
char polyChar=poly_seed[0]; |
807 |
|
|
//assumes N trimming was already done |
808 |
|
|
//so a poly match should be very close to the end of the read |
809 |
|
|
// -- find the initial match (seed) |
810 |
gpertea |
108 |
int lmin=GMAX((rlen-16), 0); |
811 |
gpertea |
94 |
int li; |
812 |
|
|
for (li=rlen-4;li>lmin;li--) { |
813 |
|
|
if (seedVal==*(int*)&(seq[li])) { |
814 |
|
|
break; |
815 |
|
|
} |
816 |
gpertea |
12 |
} |
817 |
gpertea |
94 |
if (li<=lmin) return false; |
818 |
|
|
//seed found, try to extend it both ways |
819 |
|
|
//extend right |
820 |
|
|
int ri=li+3; |
821 |
|
|
SLocScore loc(ri, poly_m_score<<2); |
822 |
|
|
SLocScore maxloc(loc); |
823 |
|
|
//extend right |
824 |
gpertea |
170 |
while (ri<rlen-1) { |
825 |
gpertea |
94 |
ri++; |
826 |
|
|
if (seq[ri]==polyChar) { |
827 |
|
|
loc.add(ri,poly_m_score); |
828 |
|
|
} |
829 |
|
|
else if (seq[ri]=='N') { |
830 |
|
|
loc.add(ri,0); |
831 |
|
|
} |
832 |
|
|
else { //mismatch |
833 |
|
|
loc.add(ri,poly_mis_score); |
834 |
|
|
if (maxloc.score-loc.score>poly_dropoff_score) break; |
835 |
|
|
} |
836 |
|
|
if (maxloc.score<=loc.score) { |
837 |
|
|
maxloc=loc; |
838 |
|
|
} |
839 |
gpertea |
12 |
} |
840 |
gpertea |
94 |
ri=maxloc.pos; |
841 |
gpertea |
103 |
if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end |
842 |
gpertea |
94 |
//ri = right boundary for the poly match |
843 |
|
|
//extend left |
844 |
|
|
loc.set(li, maxloc.score); |
845 |
|
|
maxloc.pos=li; |
846 |
|
|
while (li>0) { |
847 |
|
|
li--; |
848 |
|
|
if (seq[li]==polyChar) { |
849 |
|
|
loc.add(li,poly_m_score); |
850 |
|
|
} |
851 |
|
|
else if (seq[li]=='N') { |
852 |
|
|
loc.add(li,0); |
853 |
|
|
} |
854 |
|
|
else { //mismatch |
855 |
|
|
loc.add(li,poly_mis_score); |
856 |
|
|
if (maxloc.score-loc.score>poly_dropoff_score) break; |
857 |
|
|
} |
858 |
|
|
if (maxloc.score<=loc.score) { |
859 |
|
|
maxloc=loc; |
860 |
|
|
} |
861 |
|
|
} |
862 |
gpertea |
105 |
li=maxloc.pos; |
863 |
gpertea |
108 |
if ((maxloc.score==poly_minScore && ri==rlen-1) || |
864 |
|
|
(maxloc.score>poly_minScore && ri>=rlen-3) || |
865 |
|
|
(maxloc.score>(poly_minScore*3) && ri>=rlen-8)) { |
866 |
gpertea |
105 |
//trimming this li-ri match at 3' end |
867 |
|
|
l3=li-1; |
868 |
|
|
if (l3<0) l3=0; |
869 |
gpertea |
94 |
return true; |
870 |
|
|
} |
871 |
|
|
return false; |
872 |
gpertea |
12 |
} |
873 |
|
|
|
874 |
gpertea |
94 |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
875 |
gpertea |
103 |
if (!doPolyTrim) return false; |
876 |
gpertea |
88 |
int rlen=seq.length(); |
877 |
|
|
l5=0; |
878 |
|
|
l3=rlen-1; |
879 |
gpertea |
94 |
int32 seedVal=*(int32*)poly_seed; |
880 |
|
|
char polyChar=poly_seed[0]; |
881 |
gpertea |
88 |
//assumes N trimming was already done |
882 |
|
|
//so a poly match should be very close to the end of the read |
883 |
|
|
// -- find the initial match (seed) |
884 |
gpertea |
108 |
int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds |
885 |
gpertea |
94 |
int li; |
886 |
|
|
for (li=0;li<=lmax;li++) { |
887 |
|
|
if (seedVal==*(int*)&(seq[li])) { |
888 |
gpertea |
88 |
break; |
889 |
|
|
} |
890 |
|
|
} |
891 |
gpertea |
94 |
if (li>lmax) return false; |
892 |
|
|
//seed found, try to extend it both ways |
893 |
|
|
//extend left |
894 |
|
|
int ri=li+3; //save rightmost base of the seed |
895 |
|
|
SLocScore loc(li, poly_m_score<<2); |
896 |
|
|
SLocScore maxloc(loc); |
897 |
|
|
while (li>0) { |
898 |
|
|
li--; |
899 |
|
|
if (seq[li]==polyChar) { |
900 |
|
|
loc.add(li,poly_m_score); |
901 |
|
|
} |
902 |
|
|
else if (seq[li]=='N') { |
903 |
|
|
loc.add(li,0); |
904 |
|
|
} |
905 |
|
|
else { //mismatch |
906 |
|
|
loc.add(li,poly_mis_score); |
907 |
|
|
if (maxloc.score-loc.score>poly_dropoff_score) break; |
908 |
|
|
} |
909 |
|
|
if (maxloc.score<=loc.score) { |
910 |
|
|
maxloc=loc; |
911 |
|
|
} |
912 |
gpertea |
88 |
} |
913 |
gpertea |
94 |
li=maxloc.pos; |
914 |
gpertea |
103 |
if (li>5) return false; //no trimming wanted, too far from 5' end |
915 |
gpertea |
94 |
//li = right boundary for the poly match |
916 |
|
|
|
917 |
|
|
//extend right |
918 |
|
|
loc.set(ri, maxloc.score); |
919 |
|
|
maxloc.pos=ri; |
920 |
gpertea |
170 |
while (ri<rlen-1) { |
921 |
gpertea |
94 |
ri++; |
922 |
|
|
if (seq[ri]==polyChar) { |
923 |
|
|
loc.add(ri,poly_m_score); |
924 |
|
|
} |
925 |
|
|
else if (seq[ri]=='N') { |
926 |
|
|
loc.add(ri,0); |
927 |
|
|
} |
928 |
|
|
else { //mismatch |
929 |
|
|
loc.add(ri,poly_mis_score); |
930 |
|
|
if (maxloc.score-loc.score>poly_dropoff_score) break; |
931 |
|
|
} |
932 |
|
|
if (maxloc.score<=loc.score) { |
933 |
|
|
maxloc=loc; |
934 |
|
|
} |
935 |
|
|
} |
936 |
gpertea |
105 |
ri=maxloc.pos; |
937 |
gpertea |
103 |
if ((maxloc.score==poly_minScore && li==0) || |
938 |
|
|
(maxloc.score>poly_minScore && li<2) |
939 |
gpertea |
108 |
|| (maxloc.score>(poly_minScore*3) && li<8)) { |
940 |
gpertea |
105 |
//adjust l5 to reflect this trimming of 5' end |
941 |
|
|
l5=ri+1; |
942 |
|
|
if (l5>rlen-1) l5=rlen-1; |
943 |
gpertea |
94 |
return true; |
944 |
|
|
} |
945 |
|
|
return false; |
946 |
gpertea |
88 |
} |
947 |
|
|
|
948 |
gpertea |
4 |
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
949 |
gpertea |
103 |
if (adapters3.Count()==0) return false; |
950 |
gpertea |
4 |
int rlen=seq.length(); |
951 |
|
|
l5=0; |
952 |
|
|
l3=rlen-1; |
953 |
gpertea |
103 |
bool trimmed=false; |
954 |
gpertea |
175 |
GStr wseq(seq); |
955 |
gpertea |
103 |
int wlen=rlen; |
956 |
gpertea |
175 |
GXSeqData seqdata; |
957 |
gpertea |
103 |
for (int ai=0;ai<adapters3.Count();ai++) { |
958 |
gpertea |
175 |
seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(), |
959 |
|
|
adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen); |
960 |
|
|
GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86); |
961 |
|
|
if (bestaln) { |
962 |
gpertea |
103 |
trimmed=true; |
963 |
gpertea |
175 |
//keep unmatched region on the left OR right (the longer one) |
964 |
|
|
if (bestaln->sl > wlen-bestaln->sr) { |
965 |
|
|
//keep left side |
966 |
|
|
l3-=(wlen-bestaln->sl+1); |
967 |
|
|
if (l3<0) l3=0; |
968 |
|
|
} |
969 |
|
|
else { //keep right side |
970 |
|
|
l5+=bestaln->sr; |
971 |
|
|
if (l5>=rlen) l5=rlen-1; |
972 |
|
|
} |
973 |
|
|
delete bestaln; |
974 |
gpertea |
103 |
if (l3-l5+1<min_read_len) return true; |
975 |
|
|
wseq=seq.substr(l5,l3-l5+1); |
976 |
|
|
wlen=wseq.length(); |
977 |
|
|
} |
978 |
|
|
}//for each 5' adapter |
979 |
|
|
return trimmed; |
980 |
gpertea |
4 |
} |
981 |
|
|
|
982 |
|
|
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
983 |
gpertea |
103 |
if (adapters5.Count()==0) return false; |
984 |
gpertea |
4 |
int rlen=seq.length(); |
985 |
|
|
l5=0; |
986 |
|
|
l3=rlen-1; |
987 |
gpertea |
102 |
bool trimmed=false; |
988 |
gpertea |
175 |
GStr wseq(seq); |
989 |
gpertea |
103 |
int wlen=rlen; |
990 |
gpertea |
175 |
GXSeqData seqdata; |
991 |
gpertea |
102 |
for (int ai=0;ai<adapters5.Count();ai++) { |
992 |
gpertea |
175 |
seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(), |
993 |
|
|
adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen); |
994 |
|
|
GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90); |
995 |
|
|
if (bestaln) { |
996 |
gpertea |
103 |
trimmed=true; |
997 |
gpertea |
177 |
if (bestaln->sl-1 > wlen-bestaln->sr) { |
998 |
gpertea |
175 |
//keep left side |
999 |
|
|
l3-=(wlen-bestaln->sl+1); |
1000 |
|
|
if (l3<0) l3=0; |
1001 |
|
|
} |
1002 |
|
|
else { //keep right side |
1003 |
|
|
l5+=bestaln->sr; |
1004 |
|
|
if (l5>=rlen) l5=rlen-1; |
1005 |
|
|
} |
1006 |
|
|
delete bestaln; |
1007 |
gpertea |
103 |
if (l3-l5+1<min_read_len) return true; |
1008 |
|
|
wseq=seq.substr(l5,l3-l5+1); |
1009 |
|
|
wlen=wseq.length(); |
1010 |
|
|
} |
1011 |
gpertea |
102 |
}//for each 5' adapter |
1012 |
|
|
return trimmed; |
1013 |
gpertea |
98 |
} |
1014 |
gpertea |
4 |
|
1015 |
gpertea |
170 |
//convert qvs to/from phred64 from/to phread33 |
1016 |
gpertea |
12 |
void convertPhred(GStr& q) { |
1017 |
|
|
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
1018 |
gpertea |
4 |
} |
1019 |
|
|
|
1020 |
gpertea |
12 |
void convertPhred(char* q, int len) { |
1021 |
|
|
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1022 |
gpertea |
4 |
} |
1023 |
|
|
|
1024 |
gpertea |
170 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1025 |
gpertea |
13 |
GStr& rname, GStr& rinfo, GStr& infname) { |
1026 |
|
|
rseq=""; |
1027 |
|
|
rqv=""; |
1028 |
|
|
rname=""; |
1029 |
|
|
rinfo=""; |
1030 |
|
|
if (fq.eof()) return false; |
1031 |
|
|
char* l=fq.getLine(); |
1032 |
|
|
while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines |
1033 |
|
|
if (l==NULL) return false; |
1034 |
|
|
/* if (rawFormat) { |
1035 |
|
|
//TODO: implement raw qseq parsing here |
1036 |
|
|
//if (raw type=N) then continue; //skip invalid/bad records |
1037 |
|
|
} //raw qseq format |
1038 |
|
|
else { // FASTQ or FASTA */ |
1039 |
|
|
isfasta=(l[0]=='>'); |
1040 |
gpertea |
170 |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1041 |
gpertea |
13 |
infname.chars(), l); |
1042 |
|
|
GStr s(l); |
1043 |
|
|
rname=&(l[1]); |
1044 |
|
|
for (int i=0;i<rname.length();i++) |
1045 |
gpertea |
170 |
if (rname[i]<=' ') { |
1046 |
|
|
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1047 |
|
|
rname.cut(i); |
1048 |
|
|
break; |
1049 |
gpertea |
13 |
} |
1050 |
|
|
//now get the sequence |
1051 |
gpertea |
170 |
if ((l=fq.getLine())==NULL) |
1052 |
gpertea |
13 |
GError("Error: unexpected EOF after header for read %s (%s)\n", |
1053 |
|
|
rname.chars(), infname.chars()); |
1054 |
|
|
rseq=l; //this must be the DNA line |
1055 |
|
|
while ((l=fq.getLine())!=NULL) { |
1056 |
|
|
//seq can span multiple lines |
1057 |
|
|
if (l[0]=='>' || l[0]=='+') { |
1058 |
gpertea |
170 |
fq.pushBack(); |
1059 |
gpertea |
13 |
break; // |
1060 |
|
|
} |
1061 |
|
|
rseq+=l; |
1062 |
gpertea |
170 |
} //check for multi-line seq |
1063 |
gpertea |
13 |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
1064 |
|
|
if ((l=fq.getLine())==NULL) |
1065 |
|
|
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
1066 |
|
|
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
1067 |
gpertea |
170 |
if ((l=fq.getLine())==NULL) |
1068 |
gpertea |
13 |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
1069 |
|
|
rqv=l; |
1070 |
gpertea |
170 |
//if (rqv.length()!=rseq.length()) |
1071 |
gpertea |
13 |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
1072 |
|
|
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
1073 |
|
|
rqv+=l; //append to qv string |
1074 |
|
|
} |
1075 |
|
|
}// fastq |
1076 |
|
|
// } //<-- FASTA or FASTQ |
1077 |
gpertea |
94 |
rseq.upper(); |
1078 |
gpertea |
13 |
return true; |
1079 |
|
|
} |
1080 |
|
|
|
1081 |
gpertea |
105 |
#ifdef GDEBUG |
1082 |
|
|
void showTrim(GStr& s, int l5, int l3) { |
1083 |
gpertea |
177 |
if (l5>0 || l3==0) { |
1084 |
gpertea |
105 |
color_bg(c_red); |
1085 |
|
|
} |
1086 |
|
|
for (int i=0;i<s.length()-1;i++) { |
1087 |
|
|
if (i && i==l5) color_resetbg(); |
1088 |
|
|
fprintf(stderr, "%c", s[i]); |
1089 |
gpertea |
177 |
if (i && i==l3) color_bg(c_red); |
1090 |
gpertea |
105 |
} |
1091 |
|
|
fprintf(stderr, "%c", s[s.length()-1]); |
1092 |
|
|
color_reset(); |
1093 |
|
|
fprintf(stderr, "\n"); |
1094 |
|
|
} |
1095 |
|
|
#endif |
1096 |
|
|
|
1097 |
gpertea |
13 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) { |
1098 |
gpertea |
170 |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1099 |
gpertea |
13 |
// and a trash code if it was trashed |
1100 |
|
|
l5=0; |
1101 |
|
|
l3=rseq.length()-1; |
1102 |
gpertea |
103 |
#ifdef GDEBUG |
1103 |
gpertea |
108 |
//rseq.reverse(); |
1104 |
gpertea |
103 |
GMessage(">%s\n", rname.chars()); |
1105 |
gpertea |
105 |
GMessage("%s\n",rseq.chars()); |
1106 |
gpertea |
103 |
#endif |
1107 |
gpertea |
13 |
if (l3-l5+1<min_read_len) { |
1108 |
|
|
return 's'; |
1109 |
|
|
} |
1110 |
|
|
GStr wseq(rseq.chars()); |
1111 |
|
|
GStr wqv(rqv.chars()); |
1112 |
|
|
int w5=l5; |
1113 |
|
|
int w3=l3; |
1114 |
|
|
//first do the q-based trimming |
1115 |
|
|
if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming |
1116 |
|
|
if (w3-w5+1<min_read_len) { |
1117 |
|
|
return 'Q'; //invalid read |
1118 |
|
|
} |
1119 |
|
|
//-- keep only the w5..w3 range |
1120 |
|
|
l5=w5; |
1121 |
|
|
l3=w3; |
1122 |
|
|
wseq=wseq.substr(w5, w3-w5+1); |
1123 |
|
|
if (!wqv.is_empty()) |
1124 |
|
|
wqv=wqv.substr(w5, w3-w5+1); |
1125 |
|
|
} //qv trimming |
1126 |
|
|
// N-trimming on the remaining read seq |
1127 |
|
|
if (ntrim(wseq, w5, w3)) { |
1128 |
|
|
//GMessage("before: %s\n",wseq.chars()); |
1129 |
|
|
//GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1); |
1130 |
|
|
l5+=w5; |
1131 |
|
|
l3-=(wseq.length()-1-w3); |
1132 |
|
|
if (w3-w5+1<min_read_len) { |
1133 |
|
|
return 'N'; //to be trashed |
1134 |
|
|
} |
1135 |
|
|
//-- keep only the w5..w3 range |
1136 |
|
|
wseq=wseq.substr(w5, w3-w5+1); |
1137 |
|
|
if (!wqv.is_empty()) |
1138 |
|
|
wqv=wqv.substr(w5, w3-w5+1); |
1139 |
|
|
w5=0; |
1140 |
|
|
w3=wseq.length()-1; |
1141 |
|
|
} |
1142 |
gpertea |
103 |
char trim_code; |
1143 |
|
|
do { |
1144 |
|
|
trim_code=0; |
1145 |
|
|
if (trim_poly5(wseq, w5, w3, polyA_seed)) { |
1146 |
|
|
trim_code='A'; |
1147 |
|
|
} |
1148 |
|
|
else if (trim_poly5(wseq, w5, w3, polyT_seed)) { |
1149 |
|
|
trim_code='T'; |
1150 |
|
|
} |
1151 |
|
|
else if (trim_adapter5(wseq, w5, w3)) { |
1152 |
|
|
trim_code='5'; |
1153 |
|
|
} |
1154 |
|
|
if (trim_code) { |
1155 |
gpertea |
105 |
#ifdef GDEBUG |
1156 |
|
|
GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3); |
1157 |
|
|
showTrim(wseq, w5, w3); |
1158 |
|
|
#endif |
1159 |
gpertea |
13 |
int trimlen=wseq.length()-(w3-w5+1); |
1160 |
gpertea |
103 |
num_trimmed5++; |
1161 |
|
|
if (trimlen<min_trimmed5) |
1162 |
|
|
min_trimmed5=trimlen; |
1163 |
gpertea |
13 |
l5+=w5; |
1164 |
|
|
l3-=(wseq.length()-1-w3); |
1165 |
|
|
if (w3-w5+1<min_read_len) { |
1166 |
gpertea |
103 |
return trim_code; |
1167 |
gpertea |
13 |
} |
1168 |
|
|
//-- keep only the w5..w3 range |
1169 |
|
|
wseq=wseq.substr(w5, w3-w5+1); |
1170 |
|
|
if (!wqv.is_empty()) |
1171 |
|
|
wqv=wqv.substr(w5, w3-w5+1); |
1172 |
gpertea |
103 |
}// trimmed at 5' end |
1173 |
|
|
} while (trim_code); |
1174 |
|
|
|
1175 |
|
|
do { |
1176 |
|
|
trim_code=0; |
1177 |
|
|
if (trim_poly3(wseq, w5, w3, polyA_seed)) { |
1178 |
|
|
trim_code='A'; |
1179 |
|
|
} |
1180 |
|
|
else if (trim_poly3(wseq, w5, w3, polyT_seed)) { |
1181 |
|
|
trim_code='T'; |
1182 |
|
|
} |
1183 |
|
|
else if (trim_adapter3(wseq, w5, w3)) { |
1184 |
|
|
trim_code='3'; |
1185 |
|
|
} |
1186 |
|
|
if (trim_code) { |
1187 |
gpertea |
105 |
#ifdef GDEBUG |
1188 |
|
|
GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3); |
1189 |
|
|
showTrim(wseq, w5, w3); |
1190 |
|
|
#endif |
1191 |
gpertea |
13 |
int trimlen=wseq.length()-(w3-w5+1); |
1192 |
gpertea |
103 |
num_trimmed3++; |
1193 |
gpertea |
170 |
if (trimlen<min_trimmed3) |
1194 |
gpertea |
103 |
min_trimmed3=trimlen; |
1195 |
gpertea |
13 |
l5+=w5; |
1196 |
|
|
l3-=(wseq.length()-1-w3); |
1197 |
|
|
if (w3-w5+1<min_read_len) { |
1198 |
gpertea |
103 |
return trim_code; |
1199 |
gpertea |
13 |
} |
1200 |
|
|
//-- keep only the w5..w3 range |
1201 |
|
|
wseq=wseq.substr(w5, w3-w5+1); |
1202 |
|
|
if (!wqv.is_empty()) |
1203 |
|
|
wqv=wqv.substr(w5, w3-w5+1); |
1204 |
gpertea |
103 |
}//trimming at 3' end |
1205 |
|
|
} while (trim_code); |
1206 |
|
|
|
1207 |
|
|
|
1208 |
gpertea |
13 |
if (doCollapse) { |
1209 |
|
|
//keep read for later |
1210 |
|
|
FqDupRec* dr=dhash.Find(wseq.chars()); |
1211 |
|
|
if (dr==NULL) { //new entry |
1212 |
gpertea |
170 |
//if (prefix.is_empty()) |
1213 |
|
|
dhash.Add(wseq.chars(), |
1214 |
gpertea |
13 |
new FqDupRec(&wqv, rname.chars())); |
1215 |
|
|
//else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length())); |
1216 |
|
|
} |
1217 |
|
|
else |
1218 |
|
|
dr->add(wqv); |
1219 |
|
|
} //collapsing duplicates |
1220 |
|
|
else { //not collapsing duplicates |
1221 |
|
|
//apply the dust filter now |
1222 |
|
|
if (doDust) { |
1223 |
|
|
int dustbases=dust(wseq); |
1224 |
|
|
if (dustbases>(wseq.length()>>1)) { |
1225 |
|
|
return 'D'; |
1226 |
|
|
} |
1227 |
|
|
} |
1228 |
|
|
} //not collapsing duplicates |
1229 |
|
|
return (l5>0 || l3<rseq.length()-1) ? 1 : 0; |
1230 |
|
|
} |
1231 |
|
|
|
1232 |
|
|
void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) { |
1233 |
|
|
//GMessage("printing Header..%c%s\n",recmarker, rname.chars()); |
1234 |
|
|
if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars()); |
1235 |
|
|
else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars()); |
1236 |
|
|
} |
1237 |
|
|
|
1238 |
|
|
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) { |
1239 |
|
|
outcounter++; |
1240 |
|
|
bool asFasta=(rqv.is_empty() || fastaOutput); |
1241 |
|
|
if (asFasta) { |
1242 |
|
|
if (prefix.is_empty()) { |
1243 |
|
|
printHeader(f_out, '>',rname,rinfo); |
1244 |
|
|
fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now |
1245 |
|
|
} |
1246 |
|
|
else { |
1247 |
gpertea |
170 |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1248 |
gpertea |
13 |
rseq.chars()); |
1249 |
|
|
} |
1250 |
|
|
} |
1251 |
|
|
else { //fastq |
1252 |
|
|
if (convert_phred) convertPhred(rqv); |
1253 |
|
|
if (prefix.is_empty()) { |
1254 |
|
|
printHeader(f_out, '@', rname, rinfo); |
1255 |
|
|
fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars()); |
1256 |
|
|
} |
1257 |
|
|
else |
1258 |
gpertea |
170 |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1259 |
gpertea |
13 |
rseq.chars(),rqv.chars() ); |
1260 |
|
|
} |
1261 |
|
|
} |
1262 |
|
|
|
1263 |
|
|
void trash_report(char trashcode, GStr& rname, FILE* freport) { |
1264 |
|
|
if (freport==NULL || trashcode<=' ') return; |
1265 |
|
|
if (trashcode=='3' || trashcode=='5') { |
1266 |
gpertea |
103 |
fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode); |
1267 |
gpertea |
13 |
} |
1268 |
|
|
else { |
1269 |
|
|
fprintf(freport, "%s\t%c\n",rname.chars(),trashcode); |
1270 |
|
|
} |
1271 |
|
|
//tcounter++; |
1272 |
|
|
} |
1273 |
|
|
|
1274 |
|
|
GStr getFext(GStr& s, int* xpos=NULL) { |
1275 |
|
|
//s must be a filename without a path |
1276 |
|
|
GStr r(""); |
1277 |
|
|
if (xpos!=NULL) *xpos=0; |
1278 |
|
|
if (s.is_empty() || s=="-") return r; |
1279 |
|
|
int p=s.rindex('.'); |
1280 |
|
|
int d=s.rindex('/'); |
1281 |
|
|
if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r; |
1282 |
|
|
r=s.substr(p+1); |
1283 |
|
|
if (xpos!=NULL) *xpos=p+1; |
1284 |
|
|
r.lower(); |
1285 |
|
|
return r; |
1286 |
|
|
} |
1287 |
|
|
|
1288 |
|
|
void baseFileName(GStr& fname) { |
1289 |
|
|
//remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) . |
1290 |
|
|
int xpos=0; |
1291 |
|
|
GStr fext=getFext(fname, &xpos); |
1292 |
|
|
if (xpos<=1) return; |
1293 |
|
|
bool extdel=false; |
1294 |
|
|
GStr f2; |
1295 |
|
|
if (fext=="z" || fext=="zip") { |
1296 |
|
|
extdel=true; |
1297 |
|
|
} |
1298 |
|
|
else if (fext.length()>=2) { |
1299 |
|
|
f2=fext.substr(0,2); |
1300 |
|
|
extdel=(f2=="gz" || f2=="bz"); |
1301 |
|
|
} |
1302 |
|
|
if (extdel) { |
1303 |
|
|
fname.cut(xpos-1); |
1304 |
|
|
fext=getFext(fname, &xpos); |
1305 |
|
|
if (xpos<=1) return; |
1306 |
|
|
} |
1307 |
|
|
extdel=false; |
1308 |
|
|
if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") { |
1309 |
|
|
extdel=true; |
1310 |
|
|
} |
1311 |
|
|
else if (fext.length()>=2) { |
1312 |
|
|
extdel=(fext.substr(0,2)=="fa"); |
1313 |
|
|
} |
1314 |
|
|
if (extdel) fname.cut(xpos-1); |
1315 |
|
|
GStr fncp(fname); |
1316 |
|
|
fncp.lower(); |
1317 |
|
|
fncp.chomp("seq"); |
1318 |
|
|
fncp.chomp("sequence"); |
1319 |
|
|
fncp.trimR("_."); |
1320 |
|
|
if (fncp.length()<fname.length()) fname.cut(fncp.length()); |
1321 |
|
|
} |
1322 |
|
|
|
1323 |
|
|
FILE* prepOutFile(GStr& infname, GStr& pocmd) { |
1324 |
|
|
FILE* f_out=NULL; |
1325 |
|
|
GStr fname(getFileName(infname.chars())); |
1326 |
|
|
//eliminate known extensions |
1327 |
|
|
baseFileName(fname); |
1328 |
|
|
if (outsuffix.is_empty() || outsuffix=="-") { return stdout; } |
1329 |
|
|
else if (pocmd.is_empty()) { |
1330 |
|
|
GStr oname(fname); |
1331 |
|
|
oname.append('.'); |
1332 |
|
|
oname.append(outsuffix); |
1333 |
|
|
f_out=fopen(oname.chars(),"w"); |
1334 |
|
|
if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars()); |
1335 |
|
|
} |
1336 |
|
|
else { |
1337 |
|
|
GStr oname(">"); |
1338 |
|
|
oname.append(fname); |
1339 |
|
|
oname.append('.'); |
1340 |
|
|
oname.append(outsuffix); |
1341 |
|
|
pocmd.append(oname); |
1342 |
|
|
f_out=popen(pocmd.chars(), "w"); |
1343 |
|
|
if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars()); |
1344 |
|
|
} |
1345 |
|
|
return f_out; |
1346 |
|
|
} |
1347 |
|
|
|
1348 |
|
|
void guess_unzip(GStr& fname, GStr& picmd) { |
1349 |
|
|
GStr fext=getFext(fname); |
1350 |
|
|
if (fext=="gz" || fext=="gzip" || fext=="z") { |
1351 |
|
|
picmd="gzip -cd "; |
1352 |
|
|
} |
1353 |
|
|
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
1354 |
|
|
picmd="bzip2 -cd "; |
1355 |
|
|
} |
1356 |
|
|
} |
1357 |
|
|
|
1358 |
gpertea |
170 |
void addAdapter(GVec<CASeqData>& adapters, GStr& seq) { |
1359 |
|
|
//TODO: prepare CASeqData here, and collect hexamers as well |
1360 |
gpertea |
172 |
CASeqData adata(revCompl); |
1361 |
|
|
int idx=adapters.Add(adata); |
1362 |
|
|
if (idx<0) GError("Error: failed to add adaptor!\n"); |
1363 |
|
|
adapters[idx].update(seq.chars()); |
1364 |
gpertea |
170 |
} |
1365 |
|
|
|
1366 |
|
|
|
1367 |
gpertea |
94 |
int loadAdapters(const char* fname) { |
1368 |
|
|
GLineReader lr(fname); |
1369 |
|
|
char* l; |
1370 |
|
|
while ((l=lr.nextLine())!=NULL) { |
1371 |
|
|
if (lr.length()<=3 || l[0]=='#') continue; |
1372 |
gpertea |
98 |
if ( l[0]==' ' || l[0]=='\t' || l[0]==',' || |
1373 |
|
|
l[0]==';'|| l[0]==':' ) { |
1374 |
gpertea |
94 |
int i=1; |
1375 |
|
|
while (l[i]!=0 && isspace(l[i])) { |
1376 |
|
|
i++; |
1377 |
|
|
} |
1378 |
|
|
if (l[i]!=0) { |
1379 |
gpertea |
102 |
GStr s(&(l[i])); |
1380 |
gpertea |
108 |
#ifdef GDEBUG |
1381 |
|
|
//s.reverse(); |
1382 |
|
|
#endif |
1383 |
gpertea |
170 |
addAdapter(adapters3, s); |
1384 |
gpertea |
94 |
continue; |
1385 |
|
|
} |
1386 |
|
|
} |
1387 |
|
|
else { |
1388 |
gpertea |
98 |
GStr s(l); |
1389 |
|
|
s.startTokenize("\t ;,:"); |
1390 |
|
|
GStr a5,a3; |
1391 |
|
|
if (s.nextToken(a5)) |
1392 |
|
|
s.nextToken(a3); |
1393 |
gpertea |
102 |
a5.upper(); |
1394 |
|
|
a3.upper(); |
1395 |
gpertea |
108 |
#ifdef GDEBUG |
1396 |
|
|
// a5.reverse(); |
1397 |
|
|
// a3.reverse(); |
1398 |
|
|
#endif |
1399 |
gpertea |
170 |
addAdapter(adapters5, a5); |
1400 |
|
|
addAdapter(adapters3, a3); |
1401 |
gpertea |
94 |
} |
1402 |
|
|
} |
1403 |
gpertea |
102 |
return adapters5.Count()+adapters3.Count(); |
1404 |
gpertea |
94 |
} |
1405 |
|
|
|
1406 |
gpertea |
170 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1407 |
|
|
GStr& s, GStr& infname, GStr& infname2) { |
1408 |
gpertea |
13 |
// uses outsuffix to generate output file names and open file handles as needed |
1409 |
|
|
infname=""; |
1410 |
|
|
infname2=""; |
1411 |
|
|
f_in=NULL; |
1412 |
|
|
f_in2=NULL; |
1413 |
|
|
f_out=NULL; |
1414 |
|
|
f_out2=NULL; |
1415 |
|
|
//analyze outsuffix intent |
1416 |
|
|
GStr pocmd; |
1417 |
gpertea |
76 |
if (outsuffix=="-") { |
1418 |
|
|
f_out=stdout; |
1419 |
|
|
} |
1420 |
|
|
else { |
1421 |
|
|
GStr ox=getFext(outsuffix); |
1422 |
|
|
if (ox.length()>2) ox=ox.substr(0,2); |
1423 |
|
|
if (ox=="gz") pocmd="gzip -9 -c "; |
1424 |
|
|
else if (ox=="bz") pocmd="bzip2 -9 -c "; |
1425 |
|
|
} |
1426 |
gpertea |
13 |
if (s=="-") { |
1427 |
|
|
f_in=stdin; |
1428 |
|
|
infname="stdin"; |
1429 |
|
|
f_out=prepOutFile(infname, pocmd); |
1430 |
|
|
return; |
1431 |
|
|
} // streaming from stdin |
1432 |
|
|
s.startTokenize(",:"); |
1433 |
|
|
s.nextToken(infname); |
1434 |
|
|
bool paired=s.nextToken(infname2); |
1435 |
gpertea |
170 |
if (fileExists(infname.chars())==0) |
1436 |
gpertea |
13 |
GError("Error: cannot find file %s!\n",infname.chars()); |
1437 |
|
|
GStr fname(getFileName(infname.chars())); |
1438 |
|
|
GStr picmd; |
1439 |
|
|
guess_unzip(fname, picmd); |
1440 |
|
|
if (picmd.is_empty()) { |
1441 |
|
|
f_in=fopen(infname.chars(), "r"); |
1442 |
|
|
if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars()); |
1443 |
|
|
} |
1444 |
|
|
else { |
1445 |
|
|
picmd.append(infname); |
1446 |
|
|
f_in=popen(picmd.chars(), "r"); |
1447 |
|
|
if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1448 |
|
|
} |
1449 |
gpertea |
76 |
if (f_out==stdout) { |
1450 |
|
|
if (paired) GError("Error: output suffix required for paired reads\n"); |
1451 |
|
|
return; |
1452 |
|
|
} |
1453 |
gpertea |
13 |
f_out=prepOutFile(infname, pocmd); |
1454 |
|
|
if (!paired) return; |
1455 |
|
|
if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n"); |
1456 |
|
|
// ---- paired reads:------------- |
1457 |
gpertea |
170 |
if (fileExists(infname2.chars())==0) |
1458 |
gpertea |
13 |
GError("Error: cannot find file %s!\n",infname2.chars()); |
1459 |
|
|
picmd=""; |
1460 |
|
|
GStr fname2(getFileName(infname2.chars())); |
1461 |
|
|
guess_unzip(fname2, picmd); |
1462 |
|
|
if (picmd.is_empty()) { |
1463 |
|
|
f_in2=fopen(infname2.chars(), "r"); |
1464 |
|
|
if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars()); |
1465 |
|
|
} |
1466 |
|
|
else { |
1467 |
|
|
picmd.append(infname2); |
1468 |
|
|
f_in2=popen(picmd.chars(), "r"); |
1469 |
|
|
if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1470 |
|
|
} |
1471 |
|
|
f_out2=prepOutFile(infname2, pocmd); |
1472 |
|
|
} |