3 |
|
#include "GHash.hh" |
4 |
|
#include "GList.hh" |
5 |
|
#include <ctype.h> |
6 |
+ |
#include "GAlnExtend.h" |
7 |
|
|
8 |
|
#define USAGE "Usage:\n\ |
9 |
|
fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\ |
31 |
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
32 |
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
33 |
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
34 |
< |
-a minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
34 |
> |
-A disable polyA trimming (enabled by default)\n\ |
35 |
> |
-T enable polyT trimming (disabled by default)\n\ |
36 |
> |
-y minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
37 |
|
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
38 |
|
-t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\ |
39 |
|
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
78 |
|
bool isfasta=false; |
79 |
|
bool convert_phred=false; |
80 |
|
GStr outsuffix; // -o |
78 |
– |
//GStr adapter3; |
79 |
– |
//GStr adapter5; |
81 |
|
GStr prefix; |
82 |
|
GStr zcmd; |
83 |
|
int num_trimmed5=0; |
92 |
|
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
93 |
|
int qv_cvtadd=0; //could be -31 or +31 |
94 |
|
|
95 |
< |
int a3len=0; |
96 |
< |
int a5len=0; |
97 |
< |
// adaptor matching metrics -- for extendMatch() function |
98 |
< |
const int a_m_score=2; //match score |
99 |
< |
const int a_mis_score=-3; //mismatch |
100 |
< |
const int a_dropoff_score=7; |
101 |
< |
int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed |
102 |
< |
const int a_min_chain_score=15; //for gapped alignments |
103 |
< |
|
104 |
< |
class CSegChain; |
105 |
< |
|
106 |
< |
class CSegPair { |
107 |
< |
public: |
107 |
< |
GSeg a; |
108 |
< |
GSeg b; //the adapter segment |
109 |
< |
int score; |
110 |
< |
int flags; |
111 |
< |
CSegChain* chain; |
112 |
< |
CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) { |
113 |
< |
score=mscore; |
114 |
< |
if (score==0) score=a.len()*a_m_score; |
115 |
< |
flags=0; |
116 |
< |
chain=NULL; |
95 |
> |
// adaptor matching metrics -- for X-drop ungapped extension |
96 |
> |
const int poly_m_score=2; //match score |
97 |
> |
const int poly_mis_score=-3; //mismatch |
98 |
> |
const int poly_dropoff_score=7; |
99 |
> |
int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed |
100 |
> |
|
101 |
> |
const char *polyA_seed="AAAA"; |
102 |
> |
const char *polyT_seed="TTTT"; |
103 |
> |
|
104 |
> |
struct CAdapters { |
105 |
> |
GStr a5; |
106 |
> |
GStr a3; |
107 |
> |
CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) { |
108 |
|
} |
109 |
< |
int len() { return a.len(); } |
119 |
< |
bool operator==(CSegPair& d){ |
120 |
< |
//return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end); |
121 |
< |
//make equal even segments that are included into one another: |
122 |
< |
return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end); |
123 |
< |
} |
124 |
< |
bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score |
125 |
< |
if (b.start==d.b.start) { |
126 |
< |
if (score==d.score) { |
127 |
< |
//just try to be consistent: |
128 |
< |
if (b.end==d.b.end) { |
129 |
< |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
130 |
< |
} |
131 |
< |
return (b.end>d.b.end); |
132 |
< |
} |
133 |
< |
else return (score<d.score); |
134 |
< |
} |
135 |
< |
return (b.start>d.b.start); |
136 |
< |
} |
137 |
< |
bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord |
138 |
< |
/*if (b.start==d.b.start && b.end==d.b.end) { |
139 |
< |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
140 |
< |
} |
141 |
< |
return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/ |
142 |
< |
if (b.start==d.b.start) { |
143 |
< |
if (score==d.score) { |
144 |
< |
//just try to be consistent: |
145 |
< |
if (b.end==d.b.end) { |
146 |
< |
return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start); |
147 |
< |
} |
148 |
< |
return (b.end<d.b.end); |
149 |
< |
} |
150 |
< |
else return (score>d.score); |
151 |
< |
} |
152 |
< |
return (b.start<d.b.start); |
153 |
< |
} |
154 |
< |
}; |
155 |
< |
|
156 |
< |
int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score |
157 |
< |
CSegPair& x = *(CSegPair *)sa; |
158 |
< |
CSegPair& y = *(CSegPair *)sb; |
159 |
< |
/* |
160 |
< |
if (x.b.end==y.b.end) { |
161 |
< |
if (x.b.start==y.b.start) { |
162 |
< |
if (x.a.end==y.a.end) { |
163 |
< |
if (x.a.start==y.a.start) return 0; |
164 |
< |
return ((x.a.start>y.a.start) ? -1 : 1); |
165 |
< |
} |
166 |
< |
else { |
167 |
< |
return ((x.a.end>y.a.end) ? -1 : 1); |
168 |
< |
} |
169 |
< |
} |
170 |
< |
else { |
171 |
< |
return ((x.b.start>y.b.start) ? -1 : 1); |
172 |
< |
} |
173 |
< |
} |
174 |
< |
else { |
175 |
< |
return ((x.b.end>y.b.end) ? -1 : 1); |
176 |
< |
} |
177 |
< |
*/ |
178 |
< |
if (x.b.end==y.b.end) { |
179 |
< |
if (x.score==y.score) { |
180 |
< |
if (x.b.start==y.b.start) { |
181 |
< |
if (x.a.end==y.a.end) { |
182 |
< |
if (x.a.start==y.a.start) return 0; |
183 |
< |
return ((x.a.start<y.a.start) ? -1 : 1); |
184 |
< |
} |
185 |
< |
else { |
186 |
< |
return ((x.a.end<y.a.end) ? -1 : 1); |
187 |
< |
} |
188 |
< |
} |
189 |
< |
else { |
190 |
< |
return ((x.b.start<y.b.start) ? -1 : 1); |
191 |
< |
} |
192 |
< |
} else return ((x.score>y.score) ? -1 : 1); |
193 |
< |
} |
194 |
< |
else { |
195 |
< |
return ((x.b.end>y.b.end) ? -1 : 1); |
196 |
< |
} |
197 |
< |
|
198 |
< |
} |
109 |
> |
}; |
110 |
|
|
111 |
< |
class CSegChain:public GList<CSegPair> { |
201 |
< |
public: |
202 |
< |
uint astart; |
203 |
< |
uint aend; |
204 |
< |
uint bstart; |
205 |
< |
uint bend; |
206 |
< |
int score; |
207 |
< |
bool endSort; |
208 |
< |
CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique |
209 |
< |
//as SegPairs are inserted, they will be sorted by a.start coordinate |
210 |
< |
score=0; |
211 |
< |
astart=MAX_UINT; |
212 |
< |
aend=0; |
213 |
< |
bstart=MAX_UINT; |
214 |
< |
bend=0; |
215 |
< |
endSort=aln5; |
216 |
< |
if (aln5) { setSorted(cmpSegEnds); } |
217 |
< |
} |
218 |
< |
bool operator==(CSegChain& d) { |
219 |
< |
//return (score==d.score); |
220 |
< |
return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend); |
221 |
< |
} |
222 |
< |
bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate |
223 |
< |
//return (score<d.score); |
224 |
< |
if (bstart==d.bstart && bend==d.bend) { |
225 |
< |
return (astart==d.astart)?(aend>d.aend):(astart>d.astart); |
226 |
< |
} |
227 |
< |
return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart); |
228 |
< |
} |
229 |
< |
bool operator<(CSegChain& d) { |
230 |
< |
//return (score>d.score); |
231 |
< |
if (bstart==d.bstart && bend==d.bend) { |
232 |
< |
return (astart==d.astart)?(aend<d.aend):(astart<d.astart); |
233 |
< |
} |
234 |
< |
return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart); |
235 |
< |
} |
236 |
< |
void addSegPair(CSegPair* segp) { |
237 |
< |
if (AddIfNew(segp)!=segp) return; |
238 |
< |
score+=segp->score; |
239 |
< |
if (astart>segp->a.start) astart=segp->a.start; |
240 |
< |
if (aend<segp->a.end) aend=segp->a.end; |
241 |
< |
if (bstart>segp->b.start) bstart=segp->b.start; |
242 |
< |
if (bend<segp->b.end) bend=segp->b.end; |
243 |
< |
} |
244 |
< |
//for building actual chains: |
245 |
< |
bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain |
246 |
< |
int bgap=0; |
247 |
< |
int agap=0; |
248 |
< |
//if (endSort) { |
249 |
< |
if (bstart>segp->b.start) { |
250 |
< |
bgap = (int)(bstart-segp->b.end); |
251 |
< |
if (abs(bgap)>2) return false; |
252 |
< |
agap = (int)(astart-segp->a.end); |
253 |
< |
if (abs(agap)>2) return false; |
254 |
< |
} |
255 |
< |
else { |
256 |
< |
bgap = (int) (segp->b.start-bend); |
257 |
< |
if (abs(bgap)>2) return false; |
258 |
< |
agap = (int)(segp->a.start-aend); |
259 |
< |
if (abs(agap)>2) return false; |
260 |
< |
} |
261 |
< |
if (agap*bgap<0) return false; |
262 |
< |
addSegPair(segp); |
263 |
< |
score-=abs(agap)+abs(bgap); |
264 |
< |
return true; |
265 |
< |
} |
266 |
< |
}; |
111 |
> |
GPVec<CAdapters> adapters; |
112 |
|
|
113 |
|
// element in dhash: |
114 |
|
class FqDupRec { |
158 |
|
|
159 |
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
160 |
|
|
161 |
+ |
int loadAdapters(const char* fname); |
162 |
+ |
|
163 |
|
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
164 |
|
GStr& s, GStr& infname, GStr& infname2); |
165 |
|
// uses outsuffix to generate output file names and open file handles as needed |
176 |
|
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
177 |
|
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
178 |
|
int dust(GStr& seq); |
179 |
< |
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
179 |
> |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured |
180 |
> |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed); |
181 |
|
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
182 |
+ |
bool trim_adapter3(GStr& seq, int &l5, int &l3); |
183 |
|
|
184 |
|
void convertPhred(char* q, int len); |
185 |
|
void convertPhred(GStr& q); |
236 |
|
else |
237 |
|
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
238 |
|
} |
239 |
< |
if (args.getOpt('3')!=NULL) { |
240 |
< |
adapter3=args.getOpt('3'); |
241 |
< |
adapter3.upper(); |
242 |
< |
a3len=adapter3.length(); |
243 |
< |
} |
244 |
< |
if (args.getOpt('5')!=NULL) { |
245 |
< |
adapter5=args.getOpt('5'); |
246 |
< |
adapter5.upper(); |
247 |
< |
a5len=adapter5.length(); |
239 |
> |
s=args.getOpt('f'); |
240 |
> |
if (!s.is_empty()) { |
241 |
> |
loadAdapters(s.chars()); |
242 |
> |
} |
243 |
> |
bool fileAdapters=adapters.Count(); |
244 |
> |
s=args.getOpt('5'); |
245 |
> |
if (!s.is_empty()) { |
246 |
> |
if (fileAdapters) |
247 |
> |
GError("Error: options -5 and -f cannot be used together!\n"); |
248 |
> |
s.upper(); |
249 |
> |
adapters.Add(new CAdapters(s.chars())); |
250 |
|
} |
251 |
< |
s=args.getOpt('a'); |
251 |
> |
s=args.getOpt('3'); |
252 |
|
if (!s.is_empty()) { |
253 |
< |
int a_minmatch=s.asInt(); |
254 |
< |
a_min_score=a_minmatch<<1; |
253 |
> |
if (fileAdapters) |
254 |
> |
GError("Error: options -3 and -f cannot be used together!\n"); |
255 |
> |
s.upper(); |
256 |
> |
if (adapters.Count()>0) |
257 |
> |
adapters[0]->a3=s.chars(); |
258 |
> |
else adapters.Add(NULL, new CAdapters(s.chars())); |
259 |
> |
} |
260 |
> |
s=args.getOpt('y'); |
261 |
> |
if (!s.is_empty()) { |
262 |
> |
int minmatch=s.asInt(); |
263 |
> |
poly_minScore=minmatch*poly_m_score; |
264 |
|
} |
265 |
|
|
266 |
|
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
716 |
|
return ncount; |
717 |
|
} |
718 |
|
|
859 |
– |
|
860 |
– |
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
861 |
– |
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
862 |
– |
//check minimum score and |
863 |
– |
//for 3' adapter trimming: |
864 |
– |
// require that the right end of the alignment for either the adaptor OR the read must be |
865 |
– |
// < 3 distance from its right end |
866 |
– |
// for 5' adapter trimming: |
867 |
– |
// require that the left end of the alignment for either the adaptor OR the read must |
868 |
– |
// be at coordinate < 3 from start |
869 |
– |
|
870 |
– |
bool extendMatch(const char* a, int alen, int ai, |
871 |
– |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) { |
872 |
– |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
873 |
– |
#ifdef DEBUG |
874 |
– |
GStr dbg(b); |
875 |
– |
#endif |
876 |
– |
//if (debug) { |
877 |
– |
// GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); |
878 |
– |
// } |
879 |
– |
int a_l=ai; //alignment coordinates on a |
880 |
– |
int a_r=ai+mlen-1; |
881 |
– |
int b_l=bi; //alignment coordinates on b |
882 |
– |
int b_r=bi+mlen-1; |
883 |
– |
int ai_maxscore=ai; |
884 |
– |
int bi_maxscore=bi; |
885 |
– |
int score=mlen*a_m_score; |
886 |
– |
int maxscore=score; |
887 |
– |
int mism5score=a_mis_score; |
888 |
– |
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
889 |
– |
//try to extend to the left first, if possible |
890 |
– |
while (ai>0 && bi>0) { |
891 |
– |
ai--; |
892 |
– |
bi--; |
893 |
– |
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
894 |
– |
if (score>maxscore) { |
895 |
– |
ai_maxscore=ai; |
896 |
– |
bi_maxscore=bi; |
897 |
– |
maxscore=score; |
898 |
– |
} |
899 |
– |
else if (maxscore-score>a_dropoff_score) break; |
900 |
– |
} |
901 |
– |
a_l=ai_maxscore; |
902 |
– |
b_l=bi_maxscore; |
903 |
– |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
904 |
– |
//now extend to the right |
905 |
– |
ai_maxscore=a_r; |
906 |
– |
bi_maxscore=b_r; |
907 |
– |
ai=a_r; |
908 |
– |
bi=b_r; |
909 |
– |
score=maxscore; |
910 |
– |
//sometimes there are extra AAAAs at the end of the read, ignore those |
911 |
– |
if (strcmp(&a[alen-4],"AAAA")==0) { |
912 |
– |
alen-=3; |
913 |
– |
while (a[alen-1]=='A' && alen>ai) alen--; |
914 |
– |
} |
915 |
– |
while (ai<alen-1 && bi<blen-1) { |
916 |
– |
ai++; |
917 |
– |
bi++; |
918 |
– |
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
919 |
– |
if (a[ai]==b[bi]) { //match |
920 |
– |
score+=a_m_score; |
921 |
– |
if (ai>=alen-2) { |
922 |
– |
score+=a_m_score-(alen-ai-1); |
923 |
– |
} |
924 |
– |
} |
925 |
– |
else { //mismatch |
926 |
– |
score+=a_mis_score; |
927 |
– |
} |
928 |
– |
if (score>maxscore) { |
929 |
– |
ai_maxscore=ai; |
930 |
– |
bi_maxscore=bi; |
931 |
– |
maxscore=score; |
932 |
– |
} |
933 |
– |
else if (maxscore-score>a_dropoff_score) break; |
934 |
– |
} |
935 |
– |
a_r=ai_maxscore; |
936 |
– |
b_r=bi_maxscore; |
937 |
– |
int a_ovh3=alen-a_r-1; |
938 |
– |
int b_ovh3=blen-b_r-1; |
939 |
– |
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
940 |
– |
int mmovh5=(a_l<b_l)? a_l : b_l; |
941 |
– |
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
942 |
– |
#ifdef DEBUG |
943 |
– |
if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars()); |
944 |
– |
#endif |
945 |
– |
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
946 |
– |
if (a_l<a_ovh3) { |
947 |
– |
//adapter closer to the left end (typical for 5' adapter) |
948 |
– |
l5=a_r+1; |
949 |
– |
l3=alen-1; |
950 |
– |
} |
951 |
– |
else { |
952 |
– |
//adapter matching at the right end (typical for 3' adapter) |
953 |
– |
l5=0; |
954 |
– |
l3=a_l-1; |
955 |
– |
} |
956 |
– |
return true; |
957 |
– |
} |
958 |
– |
else { //keep this segment pair for later (gapped alignment) |
959 |
– |
segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore)); |
960 |
– |
//this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend) |
961 |
– |
} |
962 |
– |
//do not trim: |
963 |
– |
l5=0; |
964 |
– |
l3=alen-1; |
965 |
– |
return false; |
966 |
– |
} |
967 |
– |
|
968 |
– |
/* |
969 |
– |
int getWordValue(const char* s, int wlen) { |
970 |
– |
int r=0; |
971 |
– |
while (wlen--) { r+=(((int)s[wlen])<<wlen) } |
972 |
– |
return r; |
973 |
– |
} |
974 |
– |
*/ |
719 |
|
int get3mer_value(const char* s) { |
720 |
|
return (s[0]<<16)+(s[1]<<8)+s[2]; |
721 |
|
} |
739 |
|
return -1; |
740 |
|
} |
741 |
|
|
742 |
< |
int fast4match(int32 qv, const char* str, int slen, int start_index=0) { |
743 |
< |
if (start_index>=slen || start_index<0) return -1; |
744 |
< |
for (int i=start_index;i<slen-4;i++) { |
745 |
< |
int32* rv=(int32*)(str+i); |
746 |
< |
if (*rv==qv) return i; |
747 |
< |
} |
748 |
< |
return -1; |
749 |
< |
} |
742 |
> |
struct SLocScore { |
743 |
> |
int pos; |
744 |
> |
int score; |
745 |
> |
SLocScore(int p=0,int s=0) { |
746 |
> |
pos=p; |
747 |
> |
score=s; |
748 |
> |
} |
749 |
> |
void set(int p, int s) { |
750 |
> |
pos=p; |
751 |
> |
score=s; |
752 |
> |
} |
753 |
> |
void add(int p, int add) { |
754 |
> |
pos=p; |
755 |
> |
score+=add; |
756 |
> |
} |
757 |
> |
}; |
758 |
|
|
759 |
< |
int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) { |
760 |
< |
if (end_index>=slen) return -1; |
761 |
< |
if (end_index<0) end_index=slen-1; |
762 |
< |
for (int i=end_index-3;i>=0;i--) { |
763 |
< |
int32* rv=(int32*)(str+i); |
764 |
< |
if (*rv==qv) return i; |
759 |
> |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
760 |
> |
int rlen=seq.length(); |
761 |
> |
l5=0; |
762 |
> |
l3=rlen-1; |
763 |
> |
int32 seedVal=*(int32*)poly_seed; |
764 |
> |
char polyChar=poly_seed[0]; |
765 |
> |
//assumes N trimming was already done |
766 |
> |
//so a poly match should be very close to the end of the read |
767 |
> |
// -- find the initial match (seed) |
768 |
> |
int lmin=GMAX((rlen-12), 0); |
769 |
> |
int li; |
770 |
> |
for (li=rlen-4;li>lmin;li--) { |
771 |
> |
if (seedVal==*(int*)&(seq[li])) { |
772 |
> |
break; |
773 |
> |
} |
774 |
|
} |
775 |
< |
return -1; |
776 |
< |
} |
775 |
> |
if (li<=lmin) return false; |
776 |
> |
//seed found, try to extend it both ways |
777 |
> |
//extend right |
778 |
> |
int ri=li+3; |
779 |
> |
SLocScore loc(ri, poly_m_score<<2); |
780 |
> |
SLocScore maxloc(loc); |
781 |
> |
//extend right |
782 |
> |
while (ri<rlen-2) { |
783 |
> |
ri++; |
784 |
> |
if (seq[ri]==polyChar) { |
785 |
> |
loc.add(ri,poly_m_score); |
786 |
> |
} |
787 |
> |
else if (seq[ri]=='N') { |
788 |
> |
loc.add(ri,0); |
789 |
> |
} |
790 |
> |
else { //mismatch |
791 |
> |
loc.add(ri,poly_mis_score); |
792 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
793 |
> |
} |
794 |
> |
if (maxloc.score<=loc.score) { |
795 |
> |
maxloc=loc; |
796 |
> |
} |
797 |
> |
} |
798 |
> |
ri=maxloc.pos; |
799 |
> |
if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end |
800 |
> |
//ri = right boundary for the poly match |
801 |
> |
//extend left |
802 |
> |
loc.set(li, maxloc.score); |
803 |
> |
maxloc.pos=li; |
804 |
> |
while (li>0) { |
805 |
> |
li--; |
806 |
> |
if (seq[li]==polyChar) { |
807 |
> |
loc.add(li,poly_m_score); |
808 |
> |
} |
809 |
> |
else if (seq[li]=='N') { |
810 |
> |
loc.add(li,0); |
811 |
> |
} |
812 |
> |
else { //mismatch |
813 |
> |
loc.add(li,poly_mis_score); |
814 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
815 |
> |
} |
816 |
> |
if (maxloc.score<=loc.score) { |
817 |
> |
maxloc=loc; |
818 |
> |
} |
819 |
> |
} |
820 |
> |
if (maxloc.score>poly_minScore && ri>=rlen-3) { |
821 |
> |
l5=li; |
822 |
> |
l3=ri; |
823 |
> |
return true; |
824 |
> |
} |
825 |
> |
return false; |
826 |
> |
} |
827 |
|
|
828 |
< |
#ifdef DEBUG |
829 |
< |
void dbgPrintChain(CSegChain& chain, const char* aseq) { |
830 |
< |
GStr s(aseq); |
831 |
< |
for (int i=0;i<chain.Count();i++) { |
832 |
< |
CSegPair& seg=*chain[i]; |
833 |
< |
GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(), |
834 |
< |
s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end); |
828 |
> |
|
829 |
> |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
830 |
> |
int rlen=seq.length(); |
831 |
> |
l5=0; |
832 |
> |
l3=rlen-1; |
833 |
> |
int32 seedVal=*(int32*)poly_seed; |
834 |
> |
char polyChar=poly_seed[0]; |
835 |
> |
//assumes N trimming was already done |
836 |
> |
//so a poly match should be very close to the end of the read |
837 |
> |
// -- find the initial match (seed) |
838 |
> |
int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds |
839 |
> |
int li; |
840 |
> |
for (li=0;li<=lmax;li++) { |
841 |
> |
if (seedVal==*(int*)&(seq[li])) { |
842 |
> |
break; |
843 |
> |
} |
844 |
> |
} |
845 |
> |
if (li>lmax) return false; |
846 |
> |
//seed found, try to extend it both ways |
847 |
> |
//extend left |
848 |
> |
int ri=li+3; //save rightmost base of the seed |
849 |
> |
SLocScore loc(li, poly_m_score<<2); |
850 |
> |
SLocScore maxloc(loc); |
851 |
> |
while (li>0) { |
852 |
> |
li--; |
853 |
> |
if (seq[li]==polyChar) { |
854 |
> |
loc.add(li,poly_m_score); |
855 |
> |
} |
856 |
> |
else if (seq[li]=='N') { |
857 |
> |
loc.add(li,0); |
858 |
> |
} |
859 |
> |
else { //mismatch |
860 |
> |
loc.add(li,poly_mis_score); |
861 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
862 |
> |
} |
863 |
> |
if (maxloc.score<=loc.score) { |
864 |
> |
maxloc=loc; |
865 |
> |
} |
866 |
> |
} |
867 |
> |
li=maxloc.pos; |
868 |
> |
if (li>3) return false; //no trimming wanted, too far from 5' end |
869 |
> |
//li = right boundary for the poly match |
870 |
> |
|
871 |
> |
//extend right |
872 |
> |
loc.set(ri, maxloc.score); |
873 |
> |
maxloc.pos=ri; |
874 |
> |
while (ri<rlen-2) { |
875 |
> |
ri++; |
876 |
> |
if (seq[ri]==polyChar) { |
877 |
> |
loc.add(ri,poly_m_score); |
878 |
> |
} |
879 |
> |
else if (seq[ri]=='N') { |
880 |
> |
loc.add(ri,0); |
881 |
> |
} |
882 |
> |
else { //mismatch |
883 |
> |
loc.add(ri,poly_mis_score); |
884 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
885 |
> |
} |
886 |
> |
if (maxloc.score<=loc.score) { |
887 |
> |
maxloc=loc; |
888 |
> |
} |
889 |
|
} |
890 |
+ |
|
891 |
+ |
if (maxloc.score>poly_minScore && li<=3) { |
892 |
+ |
l5=li; |
893 |
+ |
l3=ri; |
894 |
+ |
return true; |
895 |
+ |
} |
896 |
+ |
return false; |
897 |
|
} |
1026 |
– |
#endif |
898 |
|
|
899 |
|
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
900 |
|
int rlen=seq.length(); |
1056 |
|
l3=rlen-1; |
1057 |
|
//try to see if adapter is fully included in the read |
1058 |
|
int fi=-1; |
1059 |
< |
if ((fi=seq.index(adapter5))>=0) { |
1060 |
< |
if (fi<rlen-fi-a5len) {//match is closer to the right end |
1061 |
< |
l5=fi+a5len; |
1062 |
< |
l3=rlen-1; |
1063 |
< |
} |
1064 |
< |
else { |
1065 |
< |
l5=0; |
1066 |
< |
l3=fi-1; |
1196 |
< |
} |
1197 |
< |
return true; |
1198 |
< |
} |
1199 |
< |
#ifdef DEBUG |
1200 |
< |
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
1201 |
< |
#endif |
1202 |
< |
|
1203 |
< |
CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded |
1204 |
< |
|
1205 |
< |
//try the easy way out first - look for an exact match of 11 bases |
1206 |
< |
int fdlen=11; |
1207 |
< |
if (a5len<16) { |
1208 |
< |
fdlen=a5len>>1; |
1209 |
< |
} |
1210 |
< |
if (fdlen>4) { |
1211 |
< |
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1212 |
< |
if ((fi=adapter5.index(rstart))>=0) { |
1213 |
< |
#ifdef DEBUG |
1214 |
< |
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1215 |
< |
#endif |
1216 |
< |
if (extendMatch(seq.chars(), rlen, 1, |
1217 |
< |
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1218 |
< |
return true; |
1059 |
> |
for (int ai=0;ai<adapters.Count();ai++) { |
1060 |
> |
if (adapters[ai]->a5.is_empty()) continue; |
1061 |
> |
int a5len=adapters[ai]->a5.length(); |
1062 |
> |
GStr& adapter5=adapters[ai]->a5; |
1063 |
> |
if ((fi=seq.index(adapter5))>=0) { |
1064 |
> |
if (fi<rlen-fi-a5len) {//match is closer to the right end |
1065 |
> |
l5=fi+a5len; |
1066 |
> |
l3=rlen-1; |
1067 |
|
} |
1068 |
< |
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1069 |
< |
GStr bstr=adapter5.substr(-fdlen); |
1070 |
< |
if ((fi=seq.index(bstr))>=0) { |
1223 |
< |
#ifdef DEBUG |
1224 |
< |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1225 |
< |
#endif |
1226 |
< |
if (extendMatch(seq.chars(), rlen, fi, |
1227 |
< |
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1228 |
< |
return true; |
1068 |
> |
else { |
1069 |
> |
l5=0; |
1070 |
> |
l3=fi-1; |
1071 |
|
} |
1072 |
< |
} //tried to matching at most 11 bases first |
1073 |
< |
|
1074 |
< |
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1075 |
< |
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1076 |
< |
int wordSize=3; |
1235 |
< |
int hlen=12; |
1236 |
< |
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1237 |
< |
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1238 |
< |
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1239 |
< |
for (int iw=0;iw<=hlen;iw++) { |
1240 |
< |
int apstart=a5len-iw-wordSize; |
1241 |
< |
fi=0; |
1242 |
< |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1243 |
< |
int qv=get3mer_value(adapter5.chars()+apstart); |
1244 |
< |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1245 |
< |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1246 |
< |
#ifdef DEBUG |
1247 |
< |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1248 |
< |
#endif |
1249 |
< |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1250 |
< |
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1251 |
< |
fi++; |
1252 |
< |
if (fi>imax) break; |
1253 |
< |
} |
1254 |
< |
} //for each wmer in the last hlen bases of the adaptor |
1255 |
< |
/* |
1072 |
> |
return true; |
1073 |
> |
} |
1074 |
> |
#ifdef DEBUG |
1075 |
> |
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
1076 |
> |
#endif |
1077 |
|
|
1078 |
< |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1079 |
< |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1080 |
< |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1081 |
< |
int hlen2=a5len-wordSize; |
1082 |
< |
//if (hlen2>a5len-wordSize) hlen2=a5len-wordSize; |
1083 |
< |
#ifdef DEBUG |
1084 |
< |
if (debug && a5segs.Count()>0) { |
1085 |
< |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1086 |
< |
} |
1087 |
< |
#endif |
1088 |
< |
if (hlen2>hlen) { |
1089 |
< |
for (int iw=hlen+1;iw<=hlen2;iw++) { |
1090 |
< |
int apstart=a5len-iw-wordSize; |
1270 |
< |
fi=0; |
1271 |
< |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1272 |
< |
int qv=get3mer_value(adapter5.chars()+apstart); |
1273 |
< |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1274 |
< |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1275 |
< |
extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1276 |
< |
a5len, apstart, wordSize, l5,l3, a5segs, true); |
1277 |
< |
fi++; |
1278 |
< |
if (fi>imax) break; |
1279 |
< |
} |
1280 |
< |
} //for each wmer between hlen2 and hlen bases of the adaptor |
1281 |
< |
} |
1282 |
< |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1283 |
< |
// lastly, analyze collected a5segs for a possible gapped alignment: |
1284 |
< |
GList<CSegChain> segchains(false,true,false); |
1285 |
< |
#ifdef DEBUG |
1286 |
< |
if (debug && a5segs.Count()>0) { |
1287 |
< |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1288 |
< |
} |
1289 |
< |
#endif |
1290 |
< |
for (int i=0;i<a5segs.Count();i++) { |
1291 |
< |
if (a5segs[i]->chain==NULL) { |
1292 |
< |
if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain |
1293 |
< |
CSegChain* newchain=new CSegChain(true); |
1294 |
< |
newchain->setFreeItem(false); |
1295 |
< |
newchain->addSegPair(a5segs[i]); |
1296 |
< |
a5segs[i]->chain=newchain; |
1297 |
< |
segchains.Add(newchain); //just to free them when done |
1298 |
< |
} |
1299 |
< |
for (int j=i+1;j<a5segs.Count();j++) { |
1300 |
< |
CSegChain* chain=a5segs[i]->chain; |
1301 |
< |
if (chain->extendChain(a5segs[j])) { |
1302 |
< |
a5segs[j]->chain=chain; |
1303 |
< |
#ifdef DEBUG |
1304 |
< |
if (debug) dbgPrintChain(*chain, adapter5.chars()); |
1305 |
< |
#endif |
1306 |
< |
//save time by checking here if the extended chain is already acceptable for trimming |
1307 |
< |
if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) { |
1308 |
< |
l5=chain->aend; |
1309 |
< |
l3=rlen-1; |
1078 |
> |
//try the easy way out first - look for an exact match of 11 bases |
1079 |
> |
int fdlen=11; |
1080 |
> |
if (a5len<16) { |
1081 |
> |
fdlen=a5len>>1; |
1082 |
> |
} |
1083 |
> |
if (fdlen>4) { |
1084 |
> |
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1085 |
> |
if ((fi=adapter5.index(rstart))>=0) { |
1086 |
> |
#ifdef DEBUG |
1087 |
> |
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1088 |
> |
#endif |
1089 |
> |
if (extendMatch(seq.chars(), rlen, 1, |
1090 |
> |
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1091 |
|
return true; |
1092 |
< |
} |
1093 |
< |
} //chain can be extended |
1092 |
> |
} |
1093 |
> |
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1094 |
> |
GStr bstr=adapter5.substr(-fdlen); |
1095 |
> |
if ((fi=seq.index(bstr))>=0) { |
1096 |
> |
#ifdef DEBUG |
1097 |
> |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1098 |
> |
#endif |
1099 |
> |
if (extendMatch(seq.chars(), rlen, fi, |
1100 |
> |
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1101 |
> |
return true; |
1102 |
> |
} |
1103 |
> |
} //tried to matching at most 11 bases first |
1104 |
> |
|
1105 |
> |
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1106 |
> |
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1107 |
> |
int wordSize=3; |
1108 |
> |
int hlen=12; |
1109 |
> |
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1110 |
> |
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1111 |
> |
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1112 |
> |
for (int iw=0;iw<=hlen;iw++) { |
1113 |
> |
int apstart=a5len-iw-wordSize; |
1114 |
> |
fi=0; |
1115 |
> |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1116 |
> |
int qv=get3mer_value(adapter5.chars()+apstart); |
1117 |
> |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1118 |
> |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1119 |
> |
#ifdef DEBUG |
1120 |
> |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1121 |
> |
#endif |
1122 |
> |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1123 |
> |
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1124 |
> |
fi++; |
1125 |
> |
if (fi>imax) break; |
1126 |
|
} |
1127 |
< |
} //collect segment alignments into chains |
1128 |
< |
*/ |
1129 |
< |
return false; //no adapter parts found |
1130 |
< |
} |
1127 |
> |
} //for each wmer in the last hlen bases of the adaptor |
1128 |
> |
//if we're here we couldn't find a good extension |
1129 |
> |
return false; //no adapter parts found |
1130 |
> |
}//for each 5' adapter |
1131 |
> |
} |
1132 |
|
|
1133 |
|
//convert qvs to/from phred64 from/to phread33 |
1134 |
|
void convertPhred(GStr& q) { |
1192 |
|
} |
1193 |
|
}// fastq |
1194 |
|
// } //<-- FASTA or FASTQ |
1195 |
< |
rseq.upper(); //TODO: what if we care about masking? |
1195 |
> |
rseq.upper(); |
1196 |
|
return true; |
1197 |
|
} |
1198 |
|
|
1420 |
|
} |
1421 |
|
} |
1422 |
|
|
1423 |
+ |
|
1424 |
+ |
int loadAdapters(const char* fname) { |
1425 |
+ |
GLineReader lr(fname); |
1426 |
+ |
char* l; |
1427 |
+ |
while ((l=lr.nextLine())!=NULL) { |
1428 |
+ |
if (lr.length()<=3 || l[0]=='#') continue; |
1429 |
+ |
if ( l[0]==' ' || l[0]=='\t' || l[0]==',' || |
1430 |
+ |
l[0]==';'|| l[0]==':' ) { |
1431 |
+ |
int i=1; |
1432 |
+ |
while (l[i]!=0 && isspace(l[i])) { |
1433 |
+ |
i++; |
1434 |
+ |
} |
1435 |
+ |
if (l[i]!=0) { |
1436 |
+ |
adapters.Add(new CAdapters(NULL, &(l[i]))); |
1437 |
+ |
continue; |
1438 |
+ |
} |
1439 |
+ |
} |
1440 |
+ |
else { |
1441 |
+ |
GStr s(l); |
1442 |
+ |
s.startTokenize("\t ;,:"); |
1443 |
+ |
GStr a5,a3; |
1444 |
+ |
if (s.nextToken(a5)) |
1445 |
+ |
s.nextToken(a3); |
1446 |
+ |
adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(), |
1447 |
+ |
a3.is_empty()?NULL:a3.chars())); |
1448 |
+ |
} |
1449 |
+ |
} |
1450 |
+ |
return adapters.Count(); |
1451 |
+ |
} |
1452 |
+ |
|
1453 |
|
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1454 |
|
GStr& s, GStr& infname, GStr& infname2) { |
1455 |
|
// uses outsuffix to generate output file names and open file handles as needed |