ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 1056 | Line 1056
1056   l3=rlen-1;
1057   //try to see if adapter is fully included in the read
1058   int fi=-1;
1059 < if ((fi=seq.index(adapter5))>=0) {
1060 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1061 <      l5=fi+a5len;
1062 <      l3=rlen-1;
1063 <      }
1064 <    else {
1065 <      l5=0;
1066 <      l3=fi-1;
1067 <      }
1068 <   return true;
1069 <   }
1070 < #ifdef DEBUG
1071 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1072 < #endif
1073 <
1074 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1075 <
1076 < //try the easy way out first - look for an exact match of 11 bases
1077 < int fdlen=11;
1078 <  if (a5len<16) {
1079 <   fdlen=a5len>>1;
1080 <   }
1081 < if (fdlen>4) {
1082 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1083 <     if ((fi=adapter5.index(rstart))>=0) {
1084 < #ifdef DEBUG
1085 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1086 < #endif
1087 <       if (extendMatch(seq.chars(), rlen, 1,
1088 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1089 <           return true;
1059 > for (int ai=0;ai<adapters.Count();ai++) {
1060 >  if (adapters[ai]->a5.is_empty()) continue;
1061 >  int a5len=adapters[ai]->a5.length();
1062 >  GStr& adapter5=adapters[ai]->a5;
1063 >  if ((fi=seq.index(adapter5))>=0) {
1064 >    if (fi<rlen-fi-a5len) {//match is closer to the right end
1065 >       l5=fi+a5len;
1066 >       l3=rlen-1;
1067         }
1068 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1069 <     GStr bstr=adapter5.substr(-fdlen);
1070 <     if ((fi=seq.index(bstr))>=0) {
1094 < #ifdef DEBUG
1095 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1096 < #endif
1097 <       if (extendMatch(seq.chars(), rlen, fi,
1098 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1099 <          return true;
1100 <       }
1101 <     } //tried to matching at most 11 bases first
1102 <    
1103 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1104 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1105 < int wordSize=3;
1106 < int hlen=12;
1107 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1108 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1109 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1110 < for (int iw=0;iw<=hlen;iw++) {
1111 <   int apstart=a5len-iw-wordSize;
1112 <   fi=0;
1113 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1114 <   int qv=get3mer_value(adapter5.chars()+apstart);
1115 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1116 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1117 < #ifdef DEBUG
1118 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1119 < #endif
1120 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1121 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1122 <     fi++;
1123 <     if (fi>imax) break;
1124 <     }
1125 <   } //for each wmer in the last hlen bases of the adaptor
1126 < /*
1127 <
1128 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1129 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1130 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1131 < int hlen2=a5len-wordSize;
1132 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1133 < #ifdef DEBUG
1134 <      if (debug && a5segs.Count()>0) {
1135 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1136 <        }
1137 < #endif
1138 < if (hlen2>hlen) {
1139 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1140 <         int apstart=a5len-iw-wordSize;
1141 <         fi=0;
1142 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1143 <         int qv=get3mer_value(adapter5.chars()+apstart);
1144 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1145 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1146 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1147 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1148 <           fi++;
1149 <           if (fi>imax) break;
1150 <           }
1151 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1152 <     }
1153 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1154 < // lastly, analyze collected a5segs for a possible gapped alignment:
1155 < GList<CSegChain> segchains(false,true,false);
1156 < #ifdef DEBUG
1157 < if (debug && a5segs.Count()>0) {
1158 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1159 <   }
1160 < #endif
1161 < for (int i=0;i<a5segs.Count();i++) {
1162 <   if (a5segs[i]->chain==NULL) {
1163 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1164 <       CSegChain* newchain=new CSegChain(true);
1165 <       newchain->setFreeItem(false);
1166 <       newchain->addSegPair(a5segs[i]);
1167 <       a5segs[i]->chain=newchain;
1168 <       segchains.Add(newchain); //just to free them when done
1068 >     else {
1069 >       l5=0;
1070 >       l3=fi-1;
1071         }
1072 <   for (int j=i+1;j<a5segs.Count();j++) {
1073 <      CSegChain* chain=a5segs[i]->chain;
1074 <      if (chain->extendChain(a5segs[j])) {
1075 <         a5segs[j]->chain=chain;
1076 < #ifdef DEBUG
1077 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1078 < #endif      
1079 <      //save time by checking here if the extended chain is already acceptable for trimming
1080 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1081 <            l5=chain->aend;
1082 <            l3=rlen-1;
1072 >    return true;
1073 >    }
1074 > #ifdef DEBUG
1075 >  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 > #endif
1077 >
1078 >  //try the easy way out first - look for an exact match of 11 bases
1079 >  int fdlen=11;
1080 >   if (a5len<16) {
1081 >    fdlen=a5len>>1;
1082 >    }
1083 >  if (fdlen>4) {
1084 >      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 >      if ((fi=adapter5.index(rstart))>=0) {
1086 > #ifdef DEBUG
1087 >        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 > #endif
1089 >        if (extendMatch(seq.chars(), rlen, 1,
1090 >                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091              return true;
1092 <            }
1093 <         } //chain can be extended
1092 >        }
1093 >      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 >      GStr bstr=adapter5.substr(-fdlen);
1095 >      if ((fi=seq.index(bstr))>=0) {
1096 > #ifdef DEBUG
1097 >        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 > #endif
1099 >        if (extendMatch(seq.chars(), rlen, fi,
1100 >                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 >           return true;
1102 >        }
1103 >      } //tried to matching at most 11 bases first
1104 >
1105 >  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106 >  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 >  int wordSize=3;
1108 >  int hlen=12;
1109 >  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 >  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 >  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 >  for (int iw=0;iw<=hlen;iw++) {
1113 >    int apstart=a5len-iw-wordSize;
1114 >    fi=0;
1115 >    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 >    int qv=get3mer_value(adapter5.chars()+apstart);
1117 >    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 >    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 > #ifdef DEBUG
1120 >      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 > #endif
1122 >      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 >                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 >      fi++;
1125 >      if (fi>imax) break;
1126        }
1127 <   } //collect segment alignments into chains
1128 < */
1129 < return false; //no adapter parts found
1130 < }
1127 >    } //for each wmer in the last hlen bases of the adaptor
1128 > //if we're here we couldn't find a good extension
1129 >  return false; //no adapter parts found
1130 > }//for each 5' adapter
1131 > }
1132  
1133   //convert qvs to/from phred64 from/to phread33
1134   void convertPhred(GStr& q) {
# Line 1483 | Line 1426
1426    char* l;
1427    while ((l=lr.nextLine())!=NULL) {
1428     if (lr.length()<=3 || l[0]=='#') continue;
1429 <   char* s5;
1430 <   char* s3;
1488 <   if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') {
1429 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1430 >        l[0]==';'|| l[0]==':' ) {
1431        int i=1;
1432        while (l[i]!=0 && isspace(l[i])) {
1433          i++;
# Line 1496 | Line 1438
1438          }
1439        }
1440      else {
1441 <      p=l;
1442 <      while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
1443 <             delpos++;
1444 <      s5.startTokenize("\t ;,:",);
1445 <      s5.nextToken(s3);
1441 >      GStr s(l);
1442 >      s.startTokenize("\t ;,:");
1443 >      GStr a5,a3;
1444 >      if (s.nextToken(a5))
1445 >         s.nextToken(a3);
1446 >      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1447 >                                a3.is_empty()?NULL:a3.chars()));
1448        }
1449     }
1450 <
1450 >   return adapters.Count();
1451   }
1452  
1453   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines