ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 604 | Line 604
604   const int a_dropoff_score=7;
605   const int a_min_score=12; //at least 6 bases full match
606  
607 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
607 > // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
608   //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
609   //check minimum score and
610 < //for 3' adapter trimming:
610 > //for 3' adaptor trimming:
611   //     require that the right end of the alignment, for either the adaptor OR the read, must be
612   //     at a distance < 3 from that sequence's right end
613 < // for 5' adapter trimming:
613 > // for 5' adaptor trimming:
614   //     require that the left end of the alignment, for either the adaptor OR the read, must
615   //     be at a coordinate < 3 from that sequence's start
616  
# Line 683 | Line 683
683    int mmovh5=(a_l<b_l)? a_l : b_l;
684    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
685       if (a_l<a_ovh3) {
686 <        //adapter closer to the left end (typical for 5' adapter)
686 >        //adaptor closer to the left end (typical for 5' adaptor)
687          l5=a_r+1;
688          l3=alen-1;
689          }
690        else {
691 <        //adapter matching at the right end (typical for 3' adapter)
691 >        //adaptor matching at the right end (typical for 3' adaptor)
692          l5=0;
693          l3=a_l-1;
694          }
# Line 788 | Line 788
788                  diagstrips->qmatch=new GXSeed(ai,bi,len);
789                  return diagstrips;
790                  }
791 <         if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
792 <         if (bi<bimin && len<9) continue; //collectSeeds_R
791 >         if (bi>bimax && bi<bimin && len<9)
792 >                 //skip mid-sequence seeds that are not high scoring
793 >             continue;
794  
795           GXSeed* newseed=new GXSeed(ai,bi,len);
796           seeds.Add(newseed);
# Line 889 | Line 890
890                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
891      //check this extension here and bail out if it's not a good extension
892      if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
893 <        q_alnstart+1-q_ext_l>trim->boundary &&
894 <        s_alnstart+1-s_ext_l>trim->boundary) {
893 >        q_alnstart+1-q_ext_l>1 &&
894 >        s_alnstart+1-s_ext_l>trim->l_boundary) {
895        delete ed_script_rev;
896        if (freeAlnMem) delete gxmem;
897        return NULL;
# Line 916 | Line 917
917      //assuming s_max is really at the right end of s_seq
918      if (trim!=NULL && trim->type==galn_TrimRight &&
919          s_ext_r+(trim->seedlen>>1) < trim->safelen &&
920 <            q_alnstart+q_ext_r<q_max-3 &&
921 <            s_alnstart+s_ext_r<trim->boundary) {
920 >            q_alnstart+q_ext_r<q_max-2 &&
921 >            s_alnstart+s_ext_r<trim->r_boundary) {
922        delete ed_script_fwd;
923        if (freeAlnMem) delete gxmem;
924        return NULL;
# Line 972 | Line 973
973       reward, penalty, xdrop, gxmem, trim, editscript);
974   }
975  
976 < GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
976 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type,
977 >                                     CGreedyAlignData* gxmem, int min_pid) {
978    bool editscript=false;
979    #ifdef GDEBUG
980     editscript=true;
981 <   GMessage("==========> matching Right (3') end : %s\n", sd.aseq);
981 >   if (trim_type==galn_TrimLeft) {
982 >         GMessage("=======> searching left (5') end : %s\n", sd.aseq);
983 >     }
984 >   else if (trim_type==galn_TrimRight) {
985 >     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
986 >     }
987 >   else if (trim_type==galn_TrimEither) {
988 >     GMessage("==========> searching  both ends : %s\n", sd.aseq);
989 >     }
990    #endif
991 <  CAlnTrim trimInfo(galn_TrimRight, sd.bseq, sd.blen, sd.amlen);
991 >  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, sd.amlen);
992    GList<GXSeed> rseeds(true,true,false);
983
993    GXBandSet* alnbands=collectSeeds(rseeds, sd);
994    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
995    //did we find a shortcut?
# Line 1003 | Line 1012
1012         GXBand& band=*(alnbands->Get(b));
1013         band.seeds.setSorted(cmpSeedScore);
1014         anchor_seeds.Add(band.seeds.First());
1015 <       band.tested=true;
1015 >       //band.tested=true;
1016         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1017         }
1018      //#ifdef GDEBUG
# Line 1022 | Line 1031
1031   #endif
1032      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1033                              sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1034 <    if (alninfo && alninfo->pid>=min_pid &&
1026 <        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1034 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1035               galns.AddIfNew(alninfo, true);
1036          else delete alninfo;
1037      }
1038 <  if (galns.Count()==0 && alnbands->tmatch_r) {
1039 <      //last resort seed
1040 <      GXSeed& aseed=*alnbands->tmatch_r;
1041 <      int halfseed=aseed.len>>1;
1042 <      int a1=aseed.a_ofs+halfseed+1;
1043 <      int a2=aseed.b_ofs+halfseed+1;
1044 <      trimInfo.seedlen=aseed.len;
1038 >
1039 >  if (galns.Count()==0) {
1040 >         //last resort: look for weaker terminal seeds
1041 >          GPVec<GXSeed> tmatches(2,false);
1042 >          if (trim_type!=galn_TrimRight) {
1043 >                 if (alnbands->tmatch_l)
1044 >                    tmatches.Add(alnbands->tmatch_l);
1045 >             }
1046 >          if (trim_type!=galn_TrimLeft) {
1047 >                 if (alnbands->tmatch_r)
1048 >                    tmatches.Add(alnbands->tmatch_r);
1049 >             }
1050 >          for (int i=0;i<tmatches.Count();i++) {
1051 >                GXSeed& aseed=*tmatches[i];
1052 >                int halfseed=aseed.len>>1;
1053 >                int a1=aseed.a_ofs+halfseed+1;
1054 >                int a2=aseed.b_ofs+halfseed+1;
1055 >                trimInfo.seedlen=aseed.len;
1056   #ifdef GDEBUG
1057 <    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1057 >    GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1058             aseed.len);
1059   #endif
1060 <      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1060 >        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1061                                  sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1062 <      if (alninfo && alninfo->pid>=min_pid &&
1044 <           trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1062 >        if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1063                   galns.AddIfNew(alninfo, true);
1064 <            else delete alninfo;
1064 >             else delete alninfo;
1065 >        }//for each terminal seed
1066        }
1067 <
1067 >  //---- found all alignments
1068 >  delete alnbands;
1069    /*
1070 <  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1071 <  //we should also try some seeds closer to the 3' end
1072 <  if (galns.Count()==0) {
1073 <    anchor_seeds.Clear();
1074 <    alnbands->setSorted(cmpDiagBands_R);
1075 <    int max_top_bands=4;
1076 <    int top_band_count=0;
1077 <    //#ifdef GDEBUG
1078 <    //GMessage(":::>> Retrying adjusting sort order.\n");
1059 <    //#endif
1060 <    if (alnbands->tmatch) {
1061 <      //anchor_seeds.setSorted(false);
1062 <      anchor_seeds.Add(alnbands->tmatch);
1063 <      }
1064 <    for (int b=0;b<alnbands->Count();b++) {
1065 <       if (alnbands->Get(b)->score<4) break;
1066 <       //#ifdef GDEBUG
1067 <       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1068 <       //#endif
1069 <       if (alnbands->Get(b)->tested) continue;
1070 <       top_band_count++;
1071 <       GXBand& band=*(alnbands->Get(b));
1072 <       band.seeds.setSorted(cmpSeedScore);
1073 <       anchor_seeds.Add(band.seeds.First());
1074 <       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1075 <       }
1076 <    //#ifdef GDEBUG
1077 <    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1078 <    //#endif
1079 <    for (int i=0;i<anchor_seeds.Count();i++) {
1080 <      GXSeed& aseed=*anchor_seeds[i];
1081 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1082 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1083 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1084 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1085 <      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1086 <               galns.AddIfNew(alninfo, true);
1087 <          else delete alninfo;
1070 >  #ifdef GDEBUG
1071 >  //print all valid alignments found
1072 >  for (int i=0;i<galns.Count();i++) {
1073 >    GXAlnInfo* alninfo=galns[i];
1074 >    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1075 >                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1076 >    if (alninfo->gapinfo!=NULL) {
1077 >      GMessage("Alignment:\n");
1078 >      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1079        }
1080 <    } */
1081 <
1082 <  //---- done
1092 <  delete alnbands;
1080 >    }
1081 >  #endif
1082 >  */
1083    if (galns.Count()) {
1084      GXAlnInfo* bestaln=galns.Shift();
1085      #ifdef GDEBUG
# Line 1099 | Line 1089
1089          bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1090          }
1091      #endif
1102
1092      return bestaln;
1093      }
1094    else return NULL;
1095   }
1096 <
1097 < GXAlnInfo* match_LeftEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1096 > /*
1097 > GXAlnInfo* match_Left(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1098    bool editscript=false;
1099    #ifdef GDEBUG
1100     editscript=true;
1101     GMessage("==========> matching Left (5') end : %s\n", sd.aseq);
1102    #endif
1103 <  CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.amlen);
1103 >  CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.alen, sd.amlen);
1104    GList<GXSeed> rseeds(true,true,false);
1105    GXBandSet* alnbands = collectSeeds(rseeds, sd);
1106    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
# Line 1134 | Line 1123
1123         GXBand& band=*(alnbands->Get(b));
1124         band.seeds.setSorted(cmpSeedScore);
1125         anchor_seeds.Add(band.seeds.First());
1126 <       band.tested=true;
1126 >       //band.tested=true;
1127         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1128         }
1129      //#ifdef GDEBUG
# Line 1153 | Line 1142
1142   #endif
1143      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1144                              sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1145 <    if (alninfo && alninfo->pid>=min_pid
1157 <           && trimInfo.validate(alninfo->sl, alninfo->sr,
1158 <                    alninfo->pid, sd.alen-alninfo->qr))
1145 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1146              galns.AddIfNew(alninfo, true);
1147         else delete alninfo;
1148      }
# Line 1167 | Line 1154
1154        trimInfo.seedlen=aseed.len;
1155        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1156                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1157 <      if (alninfo && alninfo->pid>=min_pid &&
1171 <        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, sd.alen-alninfo->qr))
1157 >      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1158           galns.Add(alninfo);
1159        }
1174  /*
1175  #ifdef GDEBUG
1176  //print valid alignments found
1177  for (int i=0;i<galns.Count();i++) {
1178    GXAlnInfo* alninfo=galns[i];
1179    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1180                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1181    if (alninfo->gapinfo!=NULL) {
1182      GMessage("Alignment:\n");
1183      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1184      }
1185    }
1186  #endif
1187  */
1160    //---- done
1161    delete alnbands;
1162    if (galns.Count()) {
# Line 1201 | Line 1173
1173      }
1174    else return NULL;
1175   }
1176 + */

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines