ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 94 | Line 94
94   int qv_cvtadd=0; //could be -31 or +31
95  
96   // adaptor matching metrics -- for X-drop ungapped extension
97 + //const int match_reward=2;
98 + //const int mismatch_penalty=3;
99   const int match_reward=2;
100 < const int mismatch_penalty=3;
101 < const int Xdrop=8;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102  
103   const int poly_m_score=2; //match score
104   const int poly_mis_score=-3; //mismatch
# Line 112 | Line 114
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115     GStr seq; //actual adapter sequence data
116     GStr seqr; //reverse complement sequence
117 +   int amlen; //fraction of adapter length matching that's
118 +              //enough to consider the alignment
119     bool use_reverse;
120 <   CASeqData(bool rev=false):seq(),seqr() {
121 <     use_reverse=rev;
120 >   CASeqData(bool rev=false):seq(),seqr(),
121 >             amlen(0), use_reverse(rev) {
122       for (int i=0;i<4096;i++) {
123          pz[i]=NULL;
124          pzr[i]=NULL;
# Line 122 | Line 126
126       }
127  
128     void update(const char* s) {
129 <   seq=s;
130 <   table6mers(seq.chars(), seq.length(), pz);
131 <   if (!use_reverse) return;
132 <   //reverse complement
133 <   seqr=s;
134 <   int slen=seq.length();
135 <   for (int i=0;i<slen;i++)
136 <     seqr[i]=ntComplement(seq[slen-i-1]);
137 <   table6mers(seqr.chars(), seqr.length(), pzr);
129 >         seq=s;
130 >         table6mers(seq.chars(), seq.length(), pz);
131 >         amlen=iround(double(seq.length())*0.8);
132 >         if (amlen<12)
133 >                amlen=12;
134 >         if (!use_reverse) return;
135 >         //reverse complement
136 >         seqr=s;
137 >         int slen=seq.length();
138 >         for (int i=0;i<slen;i++)
139 >           seqr[i]=ntComplement(seq[slen-i-1]);
140 >         table6mers(seqr.chars(), seqr.length(), pzr);
141     }
142  
143     ~CASeqData() {
# Line 319 | Line 326
326    char* infile=NULL;
327  
328    if (adapters5.Count()>0)
329 <    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
329 >    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
330 >        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
331    if (adapters3.Count()>0)
332      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333  
# Line 391 | Line 399
399              } //pair read
400         if (tcode>1) { //trashed
401           #ifdef GDEBUG
402 <         GMessage(" !!!!TRASH => 'N'\n");
402 >         GMessage(" !!!!TRASH code = %c\n",tcode);
403           #endif
404            if (tcode=='s') trash_s++;
405            else if (tcode=='A' || tcode=='T') trash_poly++;
# Line 943 | Line 951
951   l5=0;
952   l3=rlen-1;
953   bool trimmed=false;
954 < GStr wseq(seq.chars());
954 > GStr wseq(seq);
955   int wlen=rlen;
956 + GXSeqData seqdata;
957   for (int ai=0;ai<adapters3.Count();ai++) {
958 <  //if (adapters3[ai].is_empty()) continue;
959 <  int alen=adapters3[ai].seq.length();
960 <  GStr& aseq=adapters3[ai].seq;
961 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 <                            wseq.chars(), wlen, gxmem_r, 74);
954 <  if (r_bestaln) {
958 >  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
959 >       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
960 >  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
961 >  if (bestaln) {
962       trimmed=true;
963 <     //keep unmatched region on the left, if any
964 <     l3-=(wlen-r_bestaln->sl+1);
965 <     delete r_bestaln;
966 <     if (l3<0) l3=0;
963 >     //keep unmatched region on the left OR right (the longer one)
964 >     if (bestaln->sl > wlen-bestaln->sr) {
965 >         //keep left side
966 >         l3-=(wlen-bestaln->sl+1);
967 >         if (l3<0) l3=0;
968 >         }
969 >     else { //keep right side
970 >         l5+=bestaln->sr;
971 >         if (l5>=rlen) l5=rlen-1;
972 >         }
973 >     delete bestaln;
974       if (l3-l5+1<min_read_len) return true;
975       wseq=seq.substr(l5,l3-l5+1);
976       wlen=wseq.length();
# Line 971 | Line 985
985   l5=0;
986   l3=rlen-1;
987   bool trimmed=false;
988 < GStr wseq(seq.chars());
988 > GStr wseq(seq);
989   int wlen=rlen;
990 + GXSeqData seqdata;
991   for (int ai=0;ai<adapters5.Count();ai++) {
992 <  //if (adapters5[ai].is_empty()) continue;
993 <  int alen=adapters5[ai].seq.length();
994 <  GStr& aseq=adapters5[ai].seq;
995 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 <                 wseq.chars(), wlen, gxmem_l, 84);
982 <  if (l_bestaln) {
992 >   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
993 >       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
994 >  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
995 >  if (bestaln) {
996       trimmed=true;
997 <     l5+=l_bestaln->sr;
998 <     delete l_bestaln;
999 <     if (l5>=rlen) l5=rlen-1;
997 >     if (bestaln->sl-1 > wlen-bestaln->sr) {
998 >         //keep left side
999 >         l3-=(wlen-bestaln->sl+1);
1000 >         if (l3<0) l3=0;
1001 >         }
1002 >     else { //keep right side
1003 >         l5+=bestaln->sr;
1004 >         if (l5>=rlen) l5=rlen-1;
1005 >         }
1006 >     delete bestaln;
1007       if (l3-l5+1<min_read_len) return true;
1008       wseq=seq.substr(l5,l3-l5+1);
1009       wlen=wseq.length();
# Line 1060 | Line 1080
1080  
1081   #ifdef GDEBUG
1082   void showTrim(GStr& s, int l5, int l3) {
1083 <  if (l5>0) {
1083 >  if (l5>0 || l3==0) {
1084      color_bg(c_red);
1085      }
1086    for (int i=0;i<s.length()-1;i++) {
1087      if (i && i==l5) color_resetbg();
1088      fprintf(stderr, "%c", s[i]);
1089 <    if (i==l3) color_bg(c_red);
1089 >    if (i && i==l3) color_bg(c_red);
1090     }
1091    fprintf(stderr, "%c", s[s.length()-1]);
1092    color_reset();

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines