ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 700 | Line 700
700    return false;
701   }
702  
703 <
704 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
705 <        GVec<uint16>* amers[], const char* seqb, int b_len) {
706 < int bimin=GMAX(0,(b_len-a_len-6));
707 < GXSeedTable gx(a_len, b_len);
708 < GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
709 < for (int bi=0;bi<=b_len-6;bi++) {
703 > /*
704 > GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd) {
705 > int bimin=GMAX(0,(sd.blen-sd.alen-6));
706 > GXSeedTable gx(sd.alen, sd.blen);
707 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
708 > //for (int bi=0;bi<=b_len-6;bi++) {
709 > for (int bi=sd.blen-6;bi>=0;bi--) {
710     //for each 6-mer of seqb
711 <   uint16 bv = get6mer((char*)&seqb[bi]);
712 <   GVec<uint16>* alocs = amers[bv];
711 >   uint16 bv = get6mer((char*)&(sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713     if (alocs==NULL) continue;
714     //extend each hit
715     for (int h=0;h<alocs->Count();h++) {
# Line 720 | Line 720
720             continue;
721           for (int i=0;i<6;i++)
722              gx.x(ai+i,bi+i)=1;
723 <         //see if we can extend to the right
724 <         int aix=ai+6;
725 <         int bix=bi+6;
723 >         //see if we can extend to the left
724 >         int aix=ai-1;
725 >         int bix=bi-1;
726           int len=6;
727 <         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
727 >         while (bix>=0 && aix>=0 && sd.aseq[aix]==sd.bseq[bix]) {
728                   gx.x(aix,bix)=1;
729 <                 aix++;bix++;
729 >                 ai=aix;
730 >                 bi=bix;
731 >                 aix--;bix--;
732                   len++;
733                   }
734 <        /*if (len>22) {
735 <                //heuristics: very likely the best we can get
734 >        if (len>sd.amlen) {
735 >                //heuristics: likely the best we can get
736                  //quick match shortcut
737                  diagstrips->qmatch=new GXSeed(ai,bi,len);
738                  return diagstrips;
739 <                } */
739 >                }
740           if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
741           GXSeed* newseed=new GXSeed(ai,bi,len);
742           seeds.Add(newseed);
743           diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
744           //special last resort terminal match to be used if no better alignment is there
745 <         if (ai<2 && bi+len>b_len-2 &&
745 >         if (ai<2 && bi+len>sd.blen-2 &&
746                   (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
747           }
748    }
# Line 750 | Line 752
752   diagstrips->setSorted(true); //sort by score
753   return diagstrips;
754   }
755 <
756 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
757 <        GVec<uint16>* amers[], const char* seqb, int b_len) {
758 < //overlap of left (5') end of seqb
757 < //hash the last 12 bases of seqa:
758 < int bimax=GMIN((a_len+2), (b_len-6));
755 > */
756 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd) {
757 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
758 > int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
759   //gx.init(a_maxlen, b_maxlen);
760 < GXSeedTable gx(a_len, b_len);
761 < GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
762 < for (int bi=0;bi<=b_len-6;bi++) {
760 > GXSeedTable gx(sd.alen, sd.blen);
761 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
762 > for (int bi=0;bi<=sd.blen-6;bi++) {
763     //for each 6-mer of seqb
764 <   uint16 bv = get6mer((char*)&seqb[bi]);
765 <   GVec<uint16>* alocs = amers[bv];
764 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
765 >   GVec<uint16>* alocs = sd.amers[bv];
766     if (alocs==NULL) continue;
767     //extend each hit
768     for (int h=0;h<alocs->Count();h++) {
# Line 777 | Line 777
777           int aix=ai+6;
778           int bix=bi+6;
779           int len=6;
780 <         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
780 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
781                   gx.x(aix,bix)=1;
782                   aix++;bix++;
783                   len++;
784                   }
785 <         /* if (len>22) {
785 >         if (len>sd.amlen) {
786                  //heuristics: very likely the best we can get
787                  //quick match shortcut
788                  diagstrips->qmatch=new GXSeed(ai,bi,len);
789                  return diagstrips;
790 <                } */
790 >                }
791           if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
792 +         if (bi<bimin && len<9) continue; //collectSeeds_R
793 +
794           GXSeed* newseed=new GXSeed(ai,bi,len);
795           seeds.Add(newseed);
796           diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
797       //special last resort terminal match to be used if no better alignment is there
798 <     if (bi<2 && ai+len>=a_len-1 &&
799 <                 (!diagstrips->tmatch || diagstrips->tmatch->len<len))
800 <                   diagstrips->tmatch=newseed;
798 >     if (bi<2 && ai+len>=sd.alen-1 &&
799 >                 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
800 >                      diagstrips->tmatch_l=newseed;
801 >     //collectSeeds_R:
802 >         if (ai<2 && bi+len>sd.blen-2 &&
803 >                 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
804 >                  diagstrips->tmatch_r=newseed;
805       }
806   } //for each 6-mer of the read
807   for (int i=0;i<diagstrips->Count();i++) {
# Line 865 | Line 871
871    const char* s=s_seq+s_alnstart;
872    int s_avail=s_max-s_alnstart;
873    if (penalty<0) penalty=-penalty;
874 <  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
874 >  int MIN_GREEDY_SCORE=6*reward; //minimum score for an alignment to be reported for 0 diffs
875    GXAlnInfo* alninfo=NULL;
876    bool freeAlnMem=(gxmem==NULL);
877    if (freeAlnMem) {
# Line 875 | Line 881
881    int retscore = 0;
882    int numdiffs = 0;
883    if (trim!=NULL && trim->type==galn_TrimLeft) {
884 +    //intent: trimming the left side
885      if (editscript)
886         ed_script_rev=new GXEditScript();
887  
888      int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //check this extension here and bail out if it's not a good extension
891 <    if (s_alnstart+1-s_ext_l>trim->boundary) {
891 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
892 >        q_alnstart+1-q_ext_l>trim->boundary &&
893 >        s_alnstart+1-s_ext_l>trim->boundary) {
894        delete ed_script_rev;
895        if (freeAlnMem) delete gxmem;
896        return NULL;
# Line 906 | Line 915
915      //check extension here and bail out if not a good right extension
916      //assuming s_max is really at the right end of s_seq
917      if (trim!=NULL && trim->type==galn_TrimRight &&
918 <           s_alnstart+s_ext_r<trim->boundary) {
918 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
919 >            q_alnstart+q_ext_r<q_max-3 &&
920 >            s_alnstart+s_ext_r<trim->boundary) {
921        delete ed_script_fwd;
922        if (freeAlnMem) delete gxmem;
923        return NULL;
# Line 961 | Line 972
972       reward, penalty, xdrop, gxmem, trim, editscript);
973   }
974  
975 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
965 <                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
975 > GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
976    bool editscript=false;
977    #ifdef GDEBUG
978     editscript=true;
979 <   GMessage("==========> matching Right (3') end : %s\n", seqa);
979 >   GMessage("==========> matching Right (3') end : %s\n", sd.aseq);
980    #endif
981 <
972 <  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
981 >  CAlnTrim trimInfo(galn_TrimRight, sd.bseq, sd.blen, sd.amlen);
982    GList<GXSeed> rseeds(true,true,false);
983 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
983 >
984 >  GXBandSet* alnbands=collectSeeds(rseeds, sd);
985    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
986    //did we find a shortcut?
987    if (alnbands->qmatch) {
# Line 1005 | Line 1015
1015      GXSeed& aseed=*anchor_seeds[i];
1016      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1017      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1018 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1019 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1018 >    trimInfo.seedlen=aseed.len;
1019 > #ifdef GDEBUG
1020 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1021 >           aseed.len);
1022 > #endif
1023 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1024 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1025      if (alninfo && alninfo->pid>=min_pid &&
1026 <        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1026 >        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1027               galns.AddIfNew(alninfo, true);
1028          else delete alninfo;
1029      }
1030 <  if (galns.Count()==0 && alnbands->tmatch) {
1030 >  if (galns.Count()==0 && alnbands->tmatch_r) {
1031        //last resort seed
1032 <      GXSeed& aseed=*alnbands->tmatch;
1033 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1034 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1035 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1036 <                                seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1032 >      GXSeed& aseed=*alnbands->tmatch_r;
1033 >      int halfseed=aseed.len>>1;
1034 >      int a1=aseed.a_ofs+halfseed+1;
1035 >      int a2=aseed.b_ofs+halfseed+1;
1036 >      trimInfo.seedlen=aseed.len;
1037 > #ifdef GDEBUG
1038 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1039 >           aseed.len);
1040 > #endif
1041 >      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1042 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1043        if (alninfo && alninfo->pid>=min_pid &&
1044 <           trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1044 >           trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1045                   galns.AddIfNew(alninfo, true);
1046              else delete alninfo;
1047        }
# Line 1075 | Line 1096
1096        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1097            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1098        if (bestaln->gapinfo!=NULL) {
1099 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1099 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1100          }
1101      #endif
1102  
# Line 1084 | Line 1105
1105    else return NULL;
1106   }
1107  
1108 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
1088 <                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1108 > GXAlnInfo* match_LeftEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1109    bool editscript=false;
1110    #ifdef GDEBUG
1111     editscript=true;
1112 <   GMessage("==========> matching Left (5') end : %s\n", seqa);
1112 >   GMessage("==========> matching Left (5') end : %s\n", sd.aseq);
1113    #endif
1114 <  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1114 >  CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.amlen);
1115    GList<GXSeed> rseeds(true,true,false);
1116 <  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1116 >  GXBandSet* alnbands = collectSeeds(rseeds, sd);
1117    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1118    if (alnbands->qmatch) {
1119      #ifdef GDEBUG
# Line 1126 | Line 1146
1146      GXSeed& aseed=*anchor_seeds[i];
1147      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1148      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1149 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1150 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1149 >    trimInfo.seedlen=aseed.len;
1150 > #ifdef GDEBUG
1151 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1152 >           aseed.len);
1153 > #endif
1154 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1155 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1156      if (alninfo && alninfo->pid>=min_pid
1157 <           && trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1157 >           && trimInfo.validate(alninfo->sl, alninfo->sr,
1158 >                    alninfo->pid, sd.alen-alninfo->qr))
1159              galns.AddIfNew(alninfo, true);
1160         else delete alninfo;
1161      }
1162 <  if (galns.Count()==0 && alnbands->tmatch) {
1162 >  if (galns.Count()==0 && alnbands->tmatch_l) {
1163        //last resort seed
1164 <      GXSeed& aseed=*alnbands->tmatch;
1164 >      GXSeed& aseed=*alnbands->tmatch_l;
1165        int a1=aseed.a_ofs+(aseed.len>>1)+1;
1166        int a2=aseed.b_ofs+(aseed.len>>1)+1;
1167 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1168 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1167 >      trimInfo.seedlen=aseed.len;
1168 >      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1169 >                              sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1170        if (alninfo && alninfo->pid>=min_pid &&
1171 <        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1171 >        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, sd.alen-alninfo->qr))
1172           galns.Add(alninfo);
1173        }
1174    /*
# Line 1166 | Line 1193
1193        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1194            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1195        if (bestaln->gapinfo!=NULL) {
1196 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1196 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen,
1197 >                                          sd.bseq, sd.blen);
1198          }
1199      #endif
1200      return bestaln;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines