112 |
|
GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence |
113 |
|
GStr seq; //actual adapter sequence data |
114 |
|
GStr seqr; //reverse complement sequence |
115 |
+ |
int amlen; //fraction of adapter length matching that's |
116 |
+ |
//enough to consider the alignment |
117 |
|
bool use_reverse; |
118 |
< |
CASeqData(bool rev=false):seq(),seqr() { |
119 |
< |
use_reverse=rev; |
118 |
> |
CASeqData(bool rev=false):seq(),seqr(), |
119 |
> |
amlen(0), use_reverse(rev) { |
120 |
|
for (int i=0;i<4096;i++) { |
121 |
|
pz[i]=NULL; |
122 |
|
pzr[i]=NULL; |
124 |
|
} |
125 |
|
|
126 |
|
void update(const char* s) { |
127 |
< |
seq=s; |
128 |
< |
table6mers(seq.chars(), seq.length(), pz); |
129 |
< |
if (!use_reverse) return; |
130 |
< |
//reverse complement |
131 |
< |
seqr=s; |
132 |
< |
int slen=seq.length(); |
133 |
< |
for (int i=0;i<slen;i++) |
134 |
< |
seqr[i]=ntComplement(seq[slen-i-1]); |
135 |
< |
table6mers(seqr.chars(), seqr.length(), pzr); |
127 |
> |
seq=s; |
128 |
> |
table6mers(seq.chars(), seq.length(), pz); |
129 |
> |
amlen=iround(double(seq.length())*0.8); |
130 |
> |
if (amlen<12) |
131 |
> |
amlen=12; |
132 |
> |
if (!use_reverse) return; |
133 |
> |
//reverse complement |
134 |
> |
seqr=s; |
135 |
> |
int slen=seq.length(); |
136 |
> |
for (int i=0;i<slen;i++) |
137 |
> |
seqr[i]=ntComplement(seq[slen-i-1]); |
138 |
> |
table6mers(seqr.chars(), seqr.length(), pzr); |
139 |
|
} |
140 |
|
|
141 |
|
~CASeqData() { |
396 |
|
} //pair read |
397 |
|
if (tcode>1) { //trashed |
398 |
|
#ifdef GDEBUG |
399 |
< |
GMessage(" !!!!TRASH => 'N'\n"); |
399 |
> |
GMessage(" !!!!TRASH code = %c\n",tcode); |
400 |
|
#endif |
401 |
|
if (tcode=='s') trash_s++; |
402 |
|
else if (tcode=='A' || tcode=='T') trash_poly++; |
948 |
|
l5=0; |
949 |
|
l3=rlen-1; |
950 |
|
bool trimmed=false; |
951 |
< |
GStr wseq(seq.chars()); |
951 |
> |
GStr wseq(seq); |
952 |
|
int wlen=rlen; |
953 |
+ |
GXSeqData seqdata; |
954 |
|
for (int ai=0;ai<adapters3.Count();ai++) { |
955 |
< |
//if (adapters3[ai].is_empty()) continue; |
956 |
< |
int alen=adapters3[ai].seq.length(); |
957 |
< |
GStr& aseq=adapters3[ai].seq; |
958 |
< |
GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz, |
953 |
< |
wseq.chars(), wlen, gxmem_r, 74); |
954 |
< |
if (r_bestaln) { |
955 |
> |
seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(), |
956 |
> |
adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen); |
957 |
> |
GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86); |
958 |
> |
if (bestaln) { |
959 |
|
trimmed=true; |
960 |
< |
//keep unmatched region on the left, if any |
961 |
< |
l3-=(wlen-r_bestaln->sl+1); |
962 |
< |
delete r_bestaln; |
963 |
< |
if (l3<0) l3=0; |
960 |
> |
//keep unmatched region on the left OR right (the longer one) |
961 |
> |
if (bestaln->sl > wlen-bestaln->sr) { |
962 |
> |
//keep left side |
963 |
> |
l3-=(wlen-bestaln->sl+1); |
964 |
> |
if (l3<0) l3=0; |
965 |
> |
} |
966 |
> |
else { //keep right side |
967 |
> |
l5+=bestaln->sr; |
968 |
> |
if (l5>=rlen) l5=rlen-1; |
969 |
> |
} |
970 |
> |
delete bestaln; |
971 |
|
if (l3-l5+1<min_read_len) return true; |
972 |
|
wseq=seq.substr(l5,l3-l5+1); |
973 |
|
wlen=wseq.length(); |
982 |
|
l5=0; |
983 |
|
l3=rlen-1; |
984 |
|
bool trimmed=false; |
985 |
< |
GStr wseq(seq.chars()); |
985 |
> |
GStr wseq(seq); |
986 |
|
int wlen=rlen; |
987 |
+ |
GXSeqData seqdata; |
988 |
|
for (int ai=0;ai<adapters5.Count();ai++) { |
989 |
< |
//if (adapters5[ai].is_empty()) continue; |
990 |
< |
int alen=adapters5[ai].seq.length(); |
991 |
< |
GStr& aseq=adapters5[ai].seq; |
992 |
< |
GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz, |
981 |
< |
wseq.chars(), wlen, gxmem_l, 84); |
982 |
< |
if (l_bestaln) { |
989 |
> |
seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(), |
990 |
> |
adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen); |
991 |
> |
GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90); |
992 |
> |
if (bestaln) { |
993 |
|
trimmed=true; |
994 |
< |
l5+=l_bestaln->sr; |
995 |
< |
delete l_bestaln; |
996 |
< |
if (l5>=rlen) l5=rlen-1; |
994 |
> |
if (bestaln->sl > wlen-bestaln->sr) { |
995 |
> |
//keep left side |
996 |
> |
l3-=(wlen-bestaln->sl+1); |
997 |
> |
if (l3<0) l3=0; |
998 |
> |
} |
999 |
> |
else { //keep right side |
1000 |
> |
l5+=bestaln->sr; |
1001 |
> |
if (l5>=rlen) l5=rlen-1; |
1002 |
> |
} |
1003 |
> |
delete bestaln; |
1004 |
|
if (l3-l5+1<min_read_len) return true; |
1005 |
|
wseq=seq.substr(l5,l3-l5+1); |
1006 |
|
wlen=wseq.length(); |