3 |
|
#include "GHash.hh" |
4 |
|
#include "GList.hh" |
5 |
|
#include <ctype.h> |
6 |
+ |
#include "GAlnExtend.h" |
7 |
|
|
8 |
|
#define USAGE "Usage:\n\ |
9 |
|
fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\ |
31 |
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
32 |
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
33 |
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
34 |
< |
-a minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
34 |
> |
-A disable polyA trimming (enabled by default)\n\ |
35 |
> |
-T enable polyT trimming (disabled by default)\n\ |
36 |
> |
-y minimum length of exact match to adaptor sequence at the proper end (6)\n\ |
37 |
|
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
38 |
|
-t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\ |
39 |
|
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
78 |
|
bool isfasta=false; |
79 |
|
bool convert_phred=false; |
80 |
|
GStr outsuffix; // -o |
78 |
– |
//GStr adapter3; |
79 |
– |
//GStr adapter5; |
81 |
|
GStr prefix; |
82 |
|
GStr zcmd; |
83 |
|
int num_trimmed5=0; |
92 |
|
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
93 |
|
int qv_cvtadd=0; //could be -31 or +31 |
94 |
|
|
95 |
< |
int a3len=0; |
96 |
< |
int a5len=0; |
97 |
< |
// adaptor matching metrics -- for extendMatch() function |
97 |
< |
const int match_score=2; //match score |
98 |
< |
const int mismatch_score=-3; //mismatch |
99 |
< |
|
100 |
< |
const int a_m_score=2; //match score |
101 |
< |
const int a_mis_score=-3; //mismatch |
102 |
< |
const int a_dropoff_score=7; |
95 |
> |
// adaptor matching metrics -- for X-drop ungapped extension |
96 |
> |
const int poly_m_score=2; //match score |
97 |
> |
const int poly_mis_score=-3; //mismatch |
98 |
|
const int poly_dropoff_score=7; |
99 |
< |
int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed |
105 |
< |
const int a_min_chain_score=15; //for gapped alignments |
99 |
> |
int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed |
100 |
|
|
101 |
< |
class CSegChain; |
101 |
> |
const char *polyA_seed="AAAA"; |
102 |
> |
const char *polyT_seed="TTTT"; |
103 |
|
|
104 |
< |
class CSegPair { |
105 |
< |
public: |
106 |
< |
GSeg a; |
107 |
< |
GSeg b; //the adapter segment |
113 |
< |
int score; |
114 |
< |
int flags; |
115 |
< |
CSegChain* chain; |
116 |
< |
CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) { |
117 |
< |
score=mscore; |
118 |
< |
if (score==0) score=a.len()*a_m_score; |
119 |
< |
flags=0; |
120 |
< |
chain=NULL; |
104 |
> |
struct CAdapters { |
105 |
> |
GStr a5; |
106 |
> |
GStr a3; |
107 |
> |
CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) { |
108 |
|
} |
109 |
< |
int len() { return a.len(); } |
123 |
< |
bool operator==(CSegPair& d){ |
124 |
< |
//return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end); |
125 |
< |
//make equal even segments that are included into one another: |
126 |
< |
return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end); |
127 |
< |
} |
128 |
< |
bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score |
129 |
< |
if (b.start==d.b.start) { |
130 |
< |
if (score==d.score) { |
131 |
< |
//just try to be consistent: |
132 |
< |
if (b.end==d.b.end) { |
133 |
< |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
134 |
< |
} |
135 |
< |
return (b.end>d.b.end); |
136 |
< |
} |
137 |
< |
else return (score<d.score); |
138 |
< |
} |
139 |
< |
return (b.start>d.b.start); |
140 |
< |
} |
141 |
< |
bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord |
142 |
< |
/*if (b.start==d.b.start && b.end==d.b.end) { |
143 |
< |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
144 |
< |
} |
145 |
< |
return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/ |
146 |
< |
if (b.start==d.b.start) { |
147 |
< |
if (score==d.score) { |
148 |
< |
//just try to be consistent: |
149 |
< |
if (b.end==d.b.end) { |
150 |
< |
return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start); |
151 |
< |
} |
152 |
< |
return (b.end<d.b.end); |
153 |
< |
} |
154 |
< |
else return (score>d.score); |
155 |
< |
} |
156 |
< |
return (b.start<d.b.start); |
157 |
< |
} |
158 |
< |
}; |
159 |
< |
|
160 |
< |
int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score |
161 |
< |
CSegPair& x = *(CSegPair *)sa; |
162 |
< |
CSegPair& y = *(CSegPair *)sb; |
163 |
< |
/* |
164 |
< |
if (x.b.end==y.b.end) { |
165 |
< |
if (x.b.start==y.b.start) { |
166 |
< |
if (x.a.end==y.a.end) { |
167 |
< |
if (x.a.start==y.a.start) return 0; |
168 |
< |
return ((x.a.start>y.a.start) ? -1 : 1); |
169 |
< |
} |
170 |
< |
else { |
171 |
< |
return ((x.a.end>y.a.end) ? -1 : 1); |
172 |
< |
} |
173 |
< |
} |
174 |
< |
else { |
175 |
< |
return ((x.b.start>y.b.start) ? -1 : 1); |
176 |
< |
} |
177 |
< |
} |
178 |
< |
else { |
179 |
< |
return ((x.b.end>y.b.end) ? -1 : 1); |
180 |
< |
} |
181 |
< |
*/ |
182 |
< |
if (x.b.end==y.b.end) { |
183 |
< |
if (x.score==y.score) { |
184 |
< |
if (x.b.start==y.b.start) { |
185 |
< |
if (x.a.end==y.a.end) { |
186 |
< |
if (x.a.start==y.a.start) return 0; |
187 |
< |
return ((x.a.start<y.a.start) ? -1 : 1); |
188 |
< |
} |
189 |
< |
else { |
190 |
< |
return ((x.a.end<y.a.end) ? -1 : 1); |
191 |
< |
} |
192 |
< |
} |
193 |
< |
else { |
194 |
< |
return ((x.b.start<y.b.start) ? -1 : 1); |
195 |
< |
} |
196 |
< |
} else return ((x.score>y.score) ? -1 : 1); |
197 |
< |
} |
198 |
< |
else { |
199 |
< |
return ((x.b.end>y.b.end) ? -1 : 1); |
200 |
< |
} |
201 |
< |
|
202 |
< |
} |
109 |
> |
}; |
110 |
|
|
111 |
< |
class CSegChain:public GList<CSegPair> { |
205 |
< |
public: |
206 |
< |
uint astart; |
207 |
< |
uint aend; |
208 |
< |
uint bstart; |
209 |
< |
uint bend; |
210 |
< |
int score; |
211 |
< |
bool endSort; |
212 |
< |
CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique |
213 |
< |
//as SegPairs are inserted, they will be sorted by a.start coordinate |
214 |
< |
score=0; |
215 |
< |
astart=MAX_UINT; |
216 |
< |
aend=0; |
217 |
< |
bstart=MAX_UINT; |
218 |
< |
bend=0; |
219 |
< |
endSort=aln5; |
220 |
< |
if (aln5) { setSorted(cmpSegEnds); } |
221 |
< |
} |
222 |
< |
bool operator==(CSegChain& d) { |
223 |
< |
//return (score==d.score); |
224 |
< |
return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend); |
225 |
< |
} |
226 |
< |
bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate |
227 |
< |
//return (score<d.score); |
228 |
< |
if (bstart==d.bstart && bend==d.bend) { |
229 |
< |
return (astart==d.astart)?(aend>d.aend):(astart>d.astart); |
230 |
< |
} |
231 |
< |
return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart); |
232 |
< |
} |
233 |
< |
bool operator<(CSegChain& d) { |
234 |
< |
//return (score>d.score); |
235 |
< |
if (bstart==d.bstart && bend==d.bend) { |
236 |
< |
return (astart==d.astart)?(aend<d.aend):(astart<d.astart); |
237 |
< |
} |
238 |
< |
return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart); |
239 |
< |
} |
240 |
< |
void addSegPair(CSegPair* segp) { |
241 |
< |
if (AddIfNew(segp)!=segp) return; |
242 |
< |
score+=segp->score; |
243 |
< |
if (astart>segp->a.start) astart=segp->a.start; |
244 |
< |
if (aend<segp->a.end) aend=segp->a.end; |
245 |
< |
if (bstart>segp->b.start) bstart=segp->b.start; |
246 |
< |
if (bend<segp->b.end) bend=segp->b.end; |
247 |
< |
} |
248 |
< |
//for building actual chains: |
249 |
< |
bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain |
250 |
< |
int bgap=0; |
251 |
< |
int agap=0; |
252 |
< |
//if (endSort) { |
253 |
< |
if (bstart>segp->b.start) { |
254 |
< |
bgap = (int)(bstart-segp->b.end); |
255 |
< |
if (abs(bgap)>2) return false; |
256 |
< |
agap = (int)(astart-segp->a.end); |
257 |
< |
if (abs(agap)>2) return false; |
258 |
< |
} |
259 |
< |
else { |
260 |
< |
bgap = (int) (segp->b.start-bend); |
261 |
< |
if (abs(bgap)>2) return false; |
262 |
< |
agap = (int)(segp->a.start-aend); |
263 |
< |
if (abs(agap)>2) return false; |
264 |
< |
} |
265 |
< |
if (agap*bgap<0) return false; |
266 |
< |
addSegPair(segp); |
267 |
< |
score-=abs(agap)+abs(bgap); |
268 |
< |
return true; |
269 |
< |
} |
270 |
< |
}; |
111 |
> |
GPVec<CAdapters> adapters; |
112 |
|
|
113 |
|
// element in dhash: |
114 |
|
class FqDupRec { |
158 |
|
|
159 |
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
160 |
|
|
161 |
+ |
int loadAdapters(const char* fname); |
162 |
+ |
|
163 |
|
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
164 |
|
GStr& s, GStr& infname, GStr& infname2); |
165 |
|
// uses outsuffix to generate output file names and open file handles as needed |
174 |
|
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
175 |
|
|
176 |
|
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
334 |
– |
bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
177 |
|
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
178 |
|
int dust(GStr& seq); |
179 |
< |
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
179 |
> |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured |
180 |
> |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed); |
181 |
|
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
182 |
+ |
bool trim_adapter3(GStr& seq, int &l5, int &l3); |
183 |
|
|
184 |
|
void convertPhred(char* q, int len); |
185 |
|
void convertPhred(GStr& q); |
236 |
|
else |
237 |
|
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
238 |
|
} |
239 |
< |
if (args.getOpt('3')!=NULL) { |
240 |
< |
adapter3=args.getOpt('3'); |
241 |
< |
adapter3.upper(); |
242 |
< |
a3len=adapter3.length(); |
243 |
< |
} |
244 |
< |
if (args.getOpt('5')!=NULL) { |
245 |
< |
adapter5=args.getOpt('5'); |
246 |
< |
adapter5.upper(); |
247 |
< |
a5len=adapter5.length(); |
239 |
> |
s=args.getOpt('f'); |
240 |
> |
if (!s.is_empty()) { |
241 |
> |
loadAdapters(s.chars()); |
242 |
> |
} |
243 |
> |
bool fileAdapters=adapters.Count(); |
244 |
> |
s=args.getOpt('5'); |
245 |
> |
if (!s.is_empty()) { |
246 |
> |
if (fileAdapters) |
247 |
> |
GError("Error: options -5 and -f cannot be used together!\n"); |
248 |
> |
s.upper(); |
249 |
> |
adapters.Add(new CAdapters(s.chars())); |
250 |
|
} |
251 |
< |
s=args.getOpt('a'); |
251 |
> |
s=args.getOpt('3'); |
252 |
|
if (!s.is_empty()) { |
253 |
< |
int a_minmatch=s.asInt(); |
254 |
< |
a_min_score=a_minmatch<<1; |
253 |
> |
if (fileAdapters) |
254 |
> |
GError("Error: options -3 and -f cannot be used together!\n"); |
255 |
> |
s.upper(); |
256 |
> |
if (adapters.Count()>0) |
257 |
> |
adapters[0]->a3=s.chars(); |
258 |
> |
else adapters.Add(NULL, new CAdapters(s.chars())); |
259 |
> |
} |
260 |
> |
s=args.getOpt('y'); |
261 |
> |
if (!s.is_empty()) { |
262 |
> |
int minmatch=s.asInt(); |
263 |
> |
poly_minScore=minmatch*poly_m_score; |
264 |
|
} |
265 |
|
|
266 |
|
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
716 |
|
return ncount; |
717 |
|
} |
718 |
|
|
864 |
– |
|
865 |
– |
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
866 |
– |
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
867 |
– |
//check minimum score and |
868 |
– |
//for 3' adapter trimming: |
869 |
– |
// require that the right end of the alignment for either the adaptor OR the read must be |
870 |
– |
// < 3 distance from its right end |
871 |
– |
// for 5' adapter trimming: |
872 |
– |
// require that the left end of the alignment for either the adaptor OR the read must |
873 |
– |
// be at coordinate < 3 from start |
874 |
– |
|
875 |
– |
bool extendMatch(const char* a, int alen, int ai, |
876 |
– |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) { |
877 |
– |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
878 |
– |
#ifdef DEBUG |
879 |
– |
GStr dbg(b); |
880 |
– |
#endif |
881 |
– |
//if (debug) { |
882 |
– |
// GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); |
883 |
– |
// } |
884 |
– |
int a_l=ai; //alignment coordinates on a |
885 |
– |
int a_r=ai+mlen-1; |
886 |
– |
int b_l=bi; //alignment coordinates on b |
887 |
– |
int b_r=bi+mlen-1; |
888 |
– |
int ai_maxscore=ai; |
889 |
– |
int bi_maxscore=bi; |
890 |
– |
int score=mlen*a_m_score; |
891 |
– |
int maxscore=score; |
892 |
– |
int mism5score=a_mis_score; |
893 |
– |
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
894 |
– |
//try to extend to the left first, if possible |
895 |
– |
while (ai>0 && bi>0) { |
896 |
– |
ai--; |
897 |
– |
bi--; |
898 |
– |
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
899 |
– |
if (score>maxscore) { |
900 |
– |
ai_maxscore=ai; |
901 |
– |
bi_maxscore=bi; |
902 |
– |
maxscore=score; |
903 |
– |
} |
904 |
– |
else if (maxscore-score>a_dropoff_score) break; |
905 |
– |
} |
906 |
– |
a_l=ai_maxscore; |
907 |
– |
b_l=bi_maxscore; |
908 |
– |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
909 |
– |
//now extend to the right |
910 |
– |
ai_maxscore=a_r; |
911 |
– |
bi_maxscore=b_r; |
912 |
– |
ai=a_r; |
913 |
– |
bi=b_r; |
914 |
– |
score=maxscore; |
915 |
– |
//sometimes there are extra AAAAs at the end of the read, ignore those |
916 |
– |
if (strcmp(&a[alen-4],"AAAA")==0) { |
917 |
– |
alen-=3; |
918 |
– |
while (a[alen-1]=='A' && alen>ai) alen--; |
919 |
– |
} |
920 |
– |
while (ai<alen-1 && bi<blen-1) { |
921 |
– |
ai++; |
922 |
– |
bi++; |
923 |
– |
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
924 |
– |
if (a[ai]==b[bi]) { //match |
925 |
– |
score+=a_m_score; |
926 |
– |
if (ai>=alen-2) { |
927 |
– |
score+=a_m_score-(alen-ai-1); |
928 |
– |
} |
929 |
– |
} |
930 |
– |
else { //mismatch |
931 |
– |
score+=a_mis_score; |
932 |
– |
} |
933 |
– |
if (score>maxscore) { |
934 |
– |
ai_maxscore=ai; |
935 |
– |
bi_maxscore=bi; |
936 |
– |
maxscore=score; |
937 |
– |
} |
938 |
– |
else if (maxscore-score>a_dropoff_score) break; |
939 |
– |
} |
940 |
– |
a_r=ai_maxscore; |
941 |
– |
b_r=bi_maxscore; |
942 |
– |
int a_ovh3=alen-a_r-1; |
943 |
– |
int b_ovh3=blen-b_r-1; |
944 |
– |
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
945 |
– |
int mmovh5=(a_l<b_l)? a_l : b_l; |
946 |
– |
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
947 |
– |
#ifdef DEBUG |
948 |
– |
if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars()); |
949 |
– |
#endif |
950 |
– |
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
951 |
– |
if (a_l<a_ovh3) { |
952 |
– |
//adapter closer to the left end (typical for 5' adapter) |
953 |
– |
l5=a_r+1; |
954 |
– |
l3=alen-1; |
955 |
– |
} |
956 |
– |
else { |
957 |
– |
//adapter matching at the right end (typical for 3' adapter) |
958 |
– |
l5=0; |
959 |
– |
l3=a_l-1; |
960 |
– |
} |
961 |
– |
return true; |
962 |
– |
} |
963 |
– |
else { //keep this segment pair for later (gapped alignment) |
964 |
– |
segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore)); |
965 |
– |
//this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend) |
966 |
– |
} |
967 |
– |
//do not trim: |
968 |
– |
l5=0; |
969 |
– |
l3=alen-1; |
970 |
– |
return false; |
971 |
– |
} |
972 |
– |
|
973 |
– |
/* |
974 |
– |
int getWordValue(const char* s, int wlen) { |
975 |
– |
int r=0; |
976 |
– |
while (wlen--) { r+=(((int)s[wlen])<<wlen) } |
977 |
– |
return r; |
978 |
– |
} |
979 |
– |
*/ |
719 |
|
int get3mer_value(const char* s) { |
720 |
|
return (s[0]<<16)+(s[1]<<8)+s[2]; |
721 |
|
} |
739 |
|
return -1; |
740 |
|
} |
741 |
|
|
742 |
< |
int fast4match(int32 qv, const char* str, int slen, int start_index=0) { |
743 |
< |
if (start_index>=slen || start_index<0) return -1; |
744 |
< |
for (int i=start_index;i<slen-4;i++) { |
745 |
< |
int32* rv=(int32*)(str+i); |
746 |
< |
if (*rv==qv) return i; |
747 |
< |
} |
748 |
< |
return -1; |
749 |
< |
} |
742 |
> |
struct SLocScore { |
743 |
> |
int pos; |
744 |
> |
int score; |
745 |
> |
SLocScore(int p=0,int s=0) { |
746 |
> |
pos=p; |
747 |
> |
score=s; |
748 |
> |
} |
749 |
> |
void set(int p, int s) { |
750 |
> |
pos=p; |
751 |
> |
score=s; |
752 |
> |
} |
753 |
> |
void add(int p, int add) { |
754 |
> |
pos=p; |
755 |
> |
score+=add; |
756 |
> |
} |
757 |
> |
}; |
758 |
|
|
759 |
< |
int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) { |
760 |
< |
if (end_index>=slen) return -1; |
761 |
< |
if (end_index<0) end_index=slen-1; |
762 |
< |
for (int i=end_index-3;i>=0;i--) { |
763 |
< |
int32* rv=(int32*)(str+i); |
764 |
< |
if (*rv==qv) return i; |
759 |
> |
bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
760 |
> |
int rlen=seq.length(); |
761 |
> |
l5=0; |
762 |
> |
l3=rlen-1; |
763 |
> |
int32 seedVal=*(int32*)poly_seed; |
764 |
> |
char polyChar=poly_seed[0]; |
765 |
> |
//assumes N trimming was already done |
766 |
> |
//so a poly match should be very close to the end of the read |
767 |
> |
// -- find the initial match (seed) |
768 |
> |
int lmin=GMAX((rlen-12), 0); |
769 |
> |
int li; |
770 |
> |
for (li=rlen-4;li>lmin;li--) { |
771 |
> |
if (seedVal==*(int*)&(seq[li])) { |
772 |
> |
break; |
773 |
> |
} |
774 |
|
} |
775 |
< |
return -1; |
776 |
< |
} |
777 |
< |
|
778 |
< |
#ifdef DEBUG |
779 |
< |
void dbgPrintChain(CSegChain& chain, const char* aseq) { |
780 |
< |
GStr s(aseq); |
781 |
< |
for (int i=0;i<chain.Count();i++) { |
782 |
< |
CSegPair& seg=*chain[i]; |
783 |
< |
GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(), |
784 |
< |
s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end); |
775 |
> |
if (li<=lmin) return false; |
776 |
> |
//seed found, try to extend it both ways |
777 |
> |
//extend right |
778 |
> |
int ri=li+3; |
779 |
> |
SLocScore loc(ri, poly_m_score<<2); |
780 |
> |
SLocScore maxloc(loc); |
781 |
> |
//extend right |
782 |
> |
while (ri<rlen-2) { |
783 |
> |
ri++; |
784 |
> |
if (seq[ri]==polyChar) { |
785 |
> |
loc.add(ri,poly_m_score); |
786 |
> |
} |
787 |
> |
else if (seq[ri]=='N') { |
788 |
> |
loc.add(ri,0); |
789 |
> |
} |
790 |
> |
else { //mismatch |
791 |
> |
loc.add(ri,poly_mis_score); |
792 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
793 |
> |
} |
794 |
> |
if (maxloc.score<=loc.score) { |
795 |
> |
maxloc=loc; |
796 |
> |
} |
797 |
|
} |
798 |
+ |
ri=maxloc.pos; |
799 |
+ |
if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end |
800 |
+ |
//ri = right boundary for the poly match |
801 |
+ |
//extend left |
802 |
+ |
loc.set(li, maxloc.score); |
803 |
+ |
maxloc.pos=li; |
804 |
+ |
while (li>0) { |
805 |
+ |
li--; |
806 |
+ |
if (seq[li]==polyChar) { |
807 |
+ |
loc.add(li,poly_m_score); |
808 |
+ |
} |
809 |
+ |
else if (seq[li]=='N') { |
810 |
+ |
loc.add(li,0); |
811 |
+ |
} |
812 |
+ |
else { //mismatch |
813 |
+ |
loc.add(li,poly_mis_score); |
814 |
+ |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
815 |
+ |
} |
816 |
+ |
if (maxloc.score<=loc.score) { |
817 |
+ |
maxloc=loc; |
818 |
+ |
} |
819 |
+ |
} |
820 |
+ |
if (maxloc.score>poly_minScore && ri>=rlen-3) { |
821 |
+ |
l5=li; |
822 |
+ |
l3=ri; |
823 |
+ |
return true; |
824 |
+ |
} |
825 |
+ |
return false; |
826 |
|
} |
1031 |
– |
#endif |
1032 |
– |
const char *pA_seed="AAAA"; |
1033 |
– |
const char *pT_seed="TTTT"; |
1034 |
– |
const int polyA_seed=*(int*)pA_seed; |
1035 |
– |
const int polyT_seed=*(int*)pT_seed; |
827 |
|
|
828 |
< |
bool trim_poly3(GStr &seq, int &l5, int &l3) { |
828 |
> |
|
829 |
> |
bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) { |
830 |
|
int rlen=seq.length(); |
831 |
|
l5=0; |
832 |
|
l3=rlen-1; |
833 |
+ |
int32 seedVal=*(int32*)poly_seed; |
834 |
+ |
char polyChar=poly_seed[0]; |
835 |
|
//assumes N trimming was already done |
836 |
|
//so a poly match should be very close to the end of the read |
1043 |
– |
//TODO: should we attempt polyA and polyT at the same time, same end? |
837 |
|
// -- find the initial match (seed) |
838 |
< |
int rlo=GMAX((rlen-10), 0); |
839 |
< |
int si; |
840 |
< |
for (si=rlen-5;si>rlo;si--) { |
841 |
< |
if (polyA_seed==*(int*)&(seq.chars()+si)) { |
838 |
> |
int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds |
839 |
> |
int li; |
840 |
> |
for (li=0;li<=lmax;li++) { |
841 |
> |
if (seedVal==*(int*)&(seq[li])) { |
842 |
|
break; |
843 |
|
} |
844 |
|
} |
845 |
< |
if (si<=rlo) return false; |
846 |
< |
//seed found, try to extend it to left and right |
847 |
< |
int score=4*match_score; |
848 |
< |
max_rlo=rlo; |
849 |
< |
rlo--; |
850 |
< |
while (rlo>=0) { |
851 |
< |
rlo--; |
852 |
< |
score+= |
853 |
< |
|
845 |
> |
if (li>lmax) return false; |
846 |
> |
//seed found, try to extend it both ways |
847 |
> |
//extend left |
848 |
> |
int ri=li+3; //save rightmost base of the seed |
849 |
> |
SLocScore loc(li, poly_m_score<<2); |
850 |
> |
SLocScore maxloc(loc); |
851 |
> |
while (li>0) { |
852 |
> |
li--; |
853 |
> |
if (seq[li]==polyChar) { |
854 |
> |
loc.add(li,poly_m_score); |
855 |
> |
} |
856 |
> |
else if (seq[li]=='N') { |
857 |
> |
loc.add(li,0); |
858 |
> |
} |
859 |
> |
else { //mismatch |
860 |
> |
loc.add(li,poly_mis_score); |
861 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
862 |
> |
} |
863 |
> |
if (maxloc.score<=loc.score) { |
864 |
> |
maxloc=loc; |
865 |
> |
} |
866 |
> |
} |
867 |
> |
li=maxloc.pos; |
868 |
> |
if (li>3) return false; //no trimming wanted, too far from 5' end |
869 |
> |
//li = right boundary for the poly match |
870 |
> |
|
871 |
> |
//extend right |
872 |
> |
loc.set(ri, maxloc.score); |
873 |
> |
maxloc.pos=ri; |
874 |
> |
while (ri<rlen-2) { |
875 |
> |
ri++; |
876 |
> |
if (seq[ri]==polyChar) { |
877 |
> |
loc.add(ri,poly_m_score); |
878 |
> |
} |
879 |
> |
else if (seq[ri]=='N') { |
880 |
> |
loc.add(ri,0); |
881 |
> |
} |
882 |
> |
else { //mismatch |
883 |
> |
loc.add(ri,poly_mis_score); |
884 |
> |
if (maxloc.score-loc.score>poly_dropoff_score) break; |
885 |
> |
} |
886 |
> |
if (maxloc.score<=loc.score) { |
887 |
> |
maxloc=loc; |
888 |
> |
} |
889 |
> |
} |
890 |
> |
|
891 |
> |
if (maxloc.score>poly_minScore && li<=3) { |
892 |
> |
l5=li; |
893 |
> |
l3=ri; |
894 |
> |
return true; |
895 |
|
} |
896 |
+ |
return false; |
897 |
|
} |
898 |
|
|
899 |
|
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
1249 |
|
} |
1250 |
|
}// fastq |
1251 |
|
// } //<-- FASTA or FASTQ |
1252 |
< |
rseq.upper(); //TODO: what if we care about masking? |
1252 |
> |
rseq.upper(); |
1253 |
|
return true; |
1254 |
|
} |
1255 |
|
|
1477 |
|
} |
1478 |
|
} |
1479 |
|
|
1480 |
+ |
|
1481 |
+ |
int loadAdapters(const char* fname) { |
1482 |
+ |
GLineReader lr(fname); |
1483 |
+ |
char* l; |
1484 |
+ |
while ((l=lr.nextLine())!=NULL) { |
1485 |
+ |
if (lr.length()<=3 || l[0]=='#') continue; |
1486 |
+ |
char* s5; |
1487 |
+ |
char* s3; |
1488 |
+ |
if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') { |
1489 |
+ |
int i=1; |
1490 |
+ |
while (l[i]!=0 && isspace(l[i])) { |
1491 |
+ |
i++; |
1492 |
+ |
} |
1493 |
+ |
if (l[i]!=0) { |
1494 |
+ |
adapters.Add(new CAdapters(NULL, &(l[i]))); |
1495 |
+ |
continue; |
1496 |
+ |
} |
1497 |
+ |
} |
1498 |
+ |
else { |
1499 |
+ |
p=l; |
1500 |
+ |
while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) |
1501 |
+ |
delpos++; |
1502 |
+ |
s5.startTokenize("\t ;,:",); |
1503 |
+ |
s5.nextToken(s3); |
1504 |
+ |
} |
1505 |
+ |
} |
1506 |
+ |
|
1507 |
+ |
} |
1508 |
+ |
|
1509 |
|
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1510 |
|
GStr& s, GStr& infname, GStr& infname2) { |
1511 |
|
// uses outsuffix to generate output file names and open file handles as needed |