ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 112 | Line 112
112     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113     GStr seq; //actual adapter sequence data
114     GStr seqr; //reverse complement sequence
115 +   int amlen; //fraction of adapter length matching that's
116 +              //enough to consider the alignment
117     bool use_reverse;
118 <   CASeqData(bool rev=false):seq(),seqr() {
119 <     use_reverse=rev;
118 >   CASeqData(bool rev=false):seq(),seqr(),
119 >             amlen(0), use_reverse(rev) {
120       for (int i=0;i<4096;i++) {
121          pz[i]=NULL;
122          pzr[i]=NULL;
# Line 122 | Line 124
124       }
125  
126     void update(const char* s) {
127 <   seq=s;
128 <   table6mers(seq.chars(), seq.length(), pz);
129 <   if (!use_reverse) return;
130 <   //reverse complement
131 <   seqr=s;
132 <   int slen=seq.length();
133 <   for (int i=0;i<slen;i++)
134 <     seqr[i]=ntComplement(seq[slen-i-1]);
135 <   table6mers(seqr.chars(), seqr.length(), pzr);
127 >         seq=s;
128 >         table6mers(seq.chars(), seq.length(), pz);
129 >         amlen=iround(double(seq.length())*0.8);
130 >         if (amlen<12)
131 >                amlen=12;
132 >         if (!use_reverse) return;
133 >         //reverse complement
134 >         seqr=s;
135 >         int slen=seq.length();
136 >         for (int i=0;i<slen;i++)
137 >           seqr[i]=ntComplement(seq[slen-i-1]);
138 >         table6mers(seqr.chars(), seqr.length(), pzr);
139     }
140  
141     ~CASeqData() {
# Line 391 | Line 396
396              } //pair read
397         if (tcode>1) { //trashed
398           #ifdef GDEBUG
399 <         GMessage(" !!!!TRASH => 'N'\n");
399 >         GMessage(" !!!!TRASH code = %c\n",tcode);
400           #endif
401            if (tcode=='s') trash_s++;
402            else if (tcode=='A' || tcode=='T') trash_poly++;
# Line 943 | Line 948
948   l5=0;
949   l3=rlen-1;
950   bool trimmed=false;
951 < GStr wseq(seq.chars());
951 > GStr wseq(seq);
952   int wlen=rlen;
953 + GXSeqData seqdata;
954   for (int ai=0;ai<adapters3.Count();ai++) {
955 <  //if (adapters3[ai].is_empty()) continue;
956 <  int alen=adapters3[ai].seq.length();
957 <  GStr& aseq=adapters3[ai].seq;
958 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 <                            wseq.chars(), wlen, gxmem_r, 74);
954 <  if (r_bestaln) {
955 >  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
956 >       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
957 >  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
958 >  if (bestaln) {
959       trimmed=true;
960 <     //keep unmatched region on the left, if any
961 <     l3-=(wlen-r_bestaln->sl+1);
962 <     delete r_bestaln;
963 <     if (l3<0) l3=0;
960 >     //keep unmatched region on the left OR right (the longer one)
961 >     if (bestaln->sl > wlen-bestaln->sr) {
962 >         //keep left side
963 >         l3-=(wlen-bestaln->sl+1);
964 >         if (l3<0) l3=0;
965 >         }
966 >     else { //keep right side
967 >         l5+=bestaln->sr;
968 >         if (l5>=rlen) l5=rlen-1;
969 >         }
970 >     delete bestaln;
971       if (l3-l5+1<min_read_len) return true;
972       wseq=seq.substr(l5,l3-l5+1);
973       wlen=wseq.length();
# Line 971 | Line 982
982   l5=0;
983   l3=rlen-1;
984   bool trimmed=false;
985 < GStr wseq(seq.chars());
985 > GStr wseq(seq);
986   int wlen=rlen;
987 + GXSeqData seqdata;
988   for (int ai=0;ai<adapters5.Count();ai++) {
989 <  //if (adapters5[ai].is_empty()) continue;
990 <  int alen=adapters5[ai].seq.length();
991 <  GStr& aseq=adapters5[ai].seq;
992 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 <                 wseq.chars(), wlen, gxmem_l, 84);
982 <  if (l_bestaln) {
989 >   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
990 >       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
991 >  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
992 >  if (bestaln) {
993       trimmed=true;
994 <     l5+=l_bestaln->sr;
995 <     delete l_bestaln;
996 <     if (l5>=rlen) l5=rlen-1;
994 >     if (bestaln->sl > wlen-bestaln->sr) {
995 >         //keep left side
996 >         l3-=(wlen-bestaln->sl+1);
997 >         if (l3<0) l3=0;
998 >         }
999 >     else { //keep right side
1000 >         l5+=bestaln->sr;
1001 >         if (l5>=rlen) l5=rlen-1;
1002 >         }
1003 >     delete bestaln;
1004       if (l3-l5+1<min_read_len) return true;
1005       wseq=seq.substr(l5,l3-l5+1);
1006       wlen=wseq.length();

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines