ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 100 | Line 100
100  
101   const char *polyA_seed="AAAA";
102   const char *polyT_seed="TTTT";
103 <
103 > /*
104   struct CAdapters {
105      GStr a5;
106      GStr a3;
107      CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108        }
109   };
110 <
111 < GPVec<CAdapters> adapters;
110 > */
111 > GVec<GStr> adapters5;
112 > GVec<GStr> adapters3;
113  
114   // element in dhash:
115   class FqDupRec {
# Line 240 | Line 241
241    if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 <  bool fileAdapters=adapters.Count();
244 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
245    s=args.getOpt('5');
246    if (!s.is_empty()) {
247      if (fileAdapters)
248        GError("Error: options -5 and -f cannot be used together!\n");
249      s.upper();
250 <    adapters.Add(new CAdapters(s.chars()));
250 >    adapters5.Add(s);
251      }
252    s=args.getOpt('3');
253    if (!s.is_empty()) {
254      if (fileAdapters)
255        GError("Error: options -3 and -f cannot be used together!\n");
256 <    s.upper();
257 <    if (adapters.Count()>0)
257 <          adapters[0]->a3=s.chars();
258 <     else adapters.Add(NULL, new CAdapters(s.chars()));
256 >      s.upper();
257 >      adapters3.Add(s);
258      }
259    s=args.getOpt('y');
260    if (!s.is_empty()) {
# Line 900 | Line 899
899   int rlen=seq.length();
900   l5=0;
901   l3=rlen-1;
902 < //first try a full match, we might get lucky
904 < int fi=-1;
905 < if ((fi=seq.index(adapter3))>=0) {
906 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
907 <      l5=fi+a3len;
908 <      l3=rlen-1;
909 <      }
910 <    else {
911 <      l5=0;
912 <      l3=fi-1;
913 <      }
914 <   return true;
915 <   }
916 < #ifdef DEBUG
917 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
918 < #endif
919 <
920 < //also, for fast detection of other adapter-only reads that start past
921 < // the beginning of the adapter sequence, try to see if the first a3len-4
922 < // bases of the read are a substring of the adapter
923 < if (rlen>a3len-3) {
924 <   GStr rstart=seq.substr(1,a3len-4);
925 <   if ((fi=adapter3.index(rstart))>=0) {
926 <     l3=rlen-1;
927 <     l5=a3len-4;
928 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 <     return true;
930 <     }
931 <  }
932 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 <  //check the easy cases - 11 bases exact match at the end
934 < int fdlen=11;
935 <  if (a3len<16) {
936 <   fdlen=a3len>>1;
937 <   }
938 < if (fdlen>4) {
939 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
941 <     if ((fi=adapter3.index(rstart))>=0) {
942 < #ifdef DEBUG
943 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 < #endif
945 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
947 <            return true;
948 <       }
949 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
950 <     GStr bstr=adapter3.substr(0, fdlen);
951 <     if ((fi=seq.rindex(bstr))>=0) {
952 < #ifdef DEBUG
953 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
954 < #endif
955 <       if (extendMatch(seq.chars(), rlen, fi,
956 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
957 <            return true;
958 <       }
959 <     } //tried to match 11 bases first
960 <    
961 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 < //-- only extend if the match is in the 3' (ending) region of the read
963 < int wordSize=3;
964 < int hlen=12;
965 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 < imin=rlen-imin;
969 < for (int iw=0;iw<hlen;iw++) {
970 <   //int32* qv=(int32*)(adapter3.chars()+iw);
971 <   int qv=get3mer_value(adapter3.chars()+iw);
972 <   fi=-1;
973 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976 <
977 < #ifdef DEBUG
978 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 < #endif
980 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 <     fi--;
983 <     if (fi<imin) break;
984 <     }
985 <   } //for each wmer in the first hlen bases of the adaptor
986 < /*
987 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 < int hlen2=a3len-wordSize;
991 < //if (hlen2>a3len-4) hlen2=a3len-4;
992 < if (hlen2>hlen) {
993 < #ifdef DEBUG
994 <     if (debug && a3segs.Count()>0) {
995 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
996 <        }
997 < #endif
998 <     for (int iw=hlen;iw<hlen2;iw++) {
999 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1000 <         int qv=get3mer_value(adapter3.chars()+iw);
1001 <         fi=-1;
1002 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005 <                         a3len, iw, wordSize, l5,l3, a3segs);
1006 <           fi--;
1007 <           if (fi<imin) break;
1008 <           }
1009 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1010 <     }
1011 < //lastly, analyze collected a3segs for a possible gapped alignment:
1012 < GList<CSegChain> segchains(false,true,false);
1013 < #ifdef DEBUG
1014 < if (debug && a3segs.Count()>0) {
1015 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1016 <   }
1017 < #endif
1018 < for (int i=0;i<a3segs.Count();i++) {
1019 <   if (a3segs[i]->chain==NULL) {
1020 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 <       CSegChain* newchain=new CSegChain();
1022 <       newchain->setFreeItem(false);
1023 <       newchain->addSegPair(a3segs[i]);
1024 <       a3segs[i]->chain=newchain;
1025 <       segchains.Add(newchain); //just to free them when done
1026 <       }
1027 <   for (int j=i+1;j<a3segs.Count();j++) {
1028 <      CSegChain* chain=a3segs[i]->chain;
1029 <      if (chain->extendChain(a3segs[j])) {
1030 <          a3segs[j]->chain=chain;
1031 < #ifdef DEBUG
1032 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 < #endif      
1034 <          //save time by checking here if the extended chain is already acceptable for trimming
1035 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 <            l5=0;
1037 <            l3=chain->astart-2;
1038 < #ifdef DEBUG
1039 <          if (debug && a3segs.Count()>0) {
1040 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 <            }
1042 < #endif
1043 <            return true;
1044 <            }
1045 <          } //chain can be extended
1046 <      }
1047 <   } //collect segment alignments into chains
1048 < */  
902 >
903   return false; //no adapter parts found
904   }
905  
906   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1053 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
907   int rlen=seq.length();
908   l5=0;
909   l3=rlen-1;
910 < //try to see if adapter is fully included in the read
911 < int fi=-1;
912 < for (int ai=0;ai<adapters.Count();ai++) {
913 <  if (adapters[ai]->a5.is_empty()) continue;
914 <  int a5len=adapters[ai]->a5.length();
1062 <  GStr& adapter5=adapters[ai]->a5;
1063 <  if ((fi=seq.index(adapter5))>=0) {
1064 <    if (fi<rlen-fi-a5len) {//match is closer to the right end
1065 <       l5=fi+a5len;
1066 <       l3=rlen-1;
1067 <       }
1068 <     else {
1069 <       l5=0;
1070 <       l3=fi-1;
1071 <       }
1072 <    return true;
1073 <    }
1074 < #ifdef DEBUG
1075 <  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 < #endif
1077 <
1078 <  //try the easy way out first - look for an exact match of 11 bases
1079 <  int fdlen=11;
1080 <   if (a5len<16) {
1081 <    fdlen=a5len>>1;
1082 <    }
1083 <  if (fdlen>4) {
1084 <      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 <      if ((fi=adapter5.index(rstart))>=0) {
1086 < #ifdef DEBUG
1087 <        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 < #endif
1089 <        if (extendMatch(seq.chars(), rlen, 1,
1090 <                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091 <            return true;
1092 <        }
1093 <      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 <      GStr bstr=adapter5.substr(-fdlen);
1095 <      if ((fi=seq.index(bstr))>=0) {
1096 < #ifdef DEBUG
1097 <        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 < #endif
1099 <        if (extendMatch(seq.chars(), rlen, fi,
1100 <                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 <           return true;
1102 <        }
1103 <      } //tried to matching at most 11 bases first
910 > bool trimmed=false;
911 > for (int ai=0;ai<adapters5.Count();ai++) {
912 >  if (adapters5[ai].is_empty()) continue;
913 >  int a5len=adapters5[ai].length();
914 >  GStr& adapter5=adapters5[ai];
915  
916 <  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
917 <  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 <  int wordSize=3;
1108 <  int hlen=12;
1109 <  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 <  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 <  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 <  for (int iw=0;iw<=hlen;iw++) {
1113 <    int apstart=a5len-iw-wordSize;
1114 <    fi=0;
1115 <    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 <    int qv=get3mer_value(adapter5.chars()+apstart);
1117 <    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 <    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 < #ifdef DEBUG
1120 <      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 < #endif
1122 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 <                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 <      fi++;
1125 <      if (fi>imax) break;
1126 <      }
1127 <    } //for each wmer in the last hlen bases of the adaptor
1128 < //if we're here we couldn't find a good extension
1129 <  return false; //no adapter parts found
1130 < }//for each 5' adapter
916 >  }//for each 5' adapter
917 >  return trimmed;
918   }
919  
920   //convert qvs to/from phred64 from/to phread33
# Line 1236 | Line 1023
1023     w5=0;
1024     w3=wseq.length()-1;
1025     }
1026 < if (a3len>0) {
1026 > if (adapters3.Count()>0) {
1027    if (trim_adapter3(wseq, w5, w3)) {
1028       int trimlen=wseq.length()-(w3-w5+1);
1029       num_trimmed3++;
# Line 1253 | Line 1040
1040           wqv=wqv.substr(w5, w3-w5+1);
1041        }//some adapter was trimmed
1042     } //adapter trimming
1043 < if (a5len>0) {
1043 > if (adapters5.Count()>0) {
1044    if (trim_adapter5(wseq, w5, w3)) {
1045       int trimlen=wseq.length()-(w3-w5+1);
1046       num_trimmed5++;
# Line 1433 | Line 1220
1220          i++;
1221          }
1222        if (l[i]!=0) {
1223 <        adapters.Add(new CAdapters(NULL, &(l[i])));
1223 >        GStr s(&(l[i]));
1224 >        adapters3.Add(s);
1225          continue;
1226          }
1227        }
# Line 1443 | Line 1231
1231        GStr a5,a3;
1232        if (s.nextToken(a5))
1233           s.nextToken(a3);
1234 <      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1235 <                                a3.is_empty()?NULL:a3.chars()));
1234 >      a5.upper();
1235 >      a3.upper();
1236 >      adapters5.Add(a5);
1237 >      adapters3.Add(a3);
1238        }
1239     }
1240 <   return adapters.Count();
1240 >   return adapters5.Count()+adapters3.Count();
1241   }
1242  
1243   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines