ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 103 | Line 103
103  
104   uint16 get6mer(char* p) {
105    uint16 r=gdna2bit(p,3);
106 <  if (*p) {
107 <   uint16 v=gdna2bit(p,3);
108 <   r |= (v<<8);
109 <   }
106 >  r <<= 6;
107 >  r |= gdna2bit(p,3);
108    return r;
109   }
110  
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128 +
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
131  
# Line 686 | Line 702
702  
703  
704   GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
705 <        GVec<uint16> amers[], const char* seqb, int b_len) {
706 < //overlap of right (3') end of seqb
707 < //hash the first 12 bases of seqa:
708 < int aimin=0;
709 < int aimax=GMIN(9,(a_len-4));
710 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
711 < int bimax=b_len-4;
712 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
713 < int b_maxlen=b_len; //number of cols in the diagonal matrix
714 < GXSeedTable gx(a_maxlen, b_maxlen);
715 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
716 < for (int ai=aimin;ai<=aimax;ai++) {
717 <    int32* av=(int32 *)(&seqa[ai]);
718 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
719 <    for (int bi=bimin;bi<=bimax;bi++) {
720 <       if (gx.x(ai,bi))
721 <            continue; //already have a previous seed covering this region of this diagonal
722 <       int32* bv=(int32 *)(&seqb[bi]);
723 <       if (*av == *bv) {
724 <          //word match
725 <          //see if we can extend to the right
726 <            gx.x(ai+1,bi+1)=1;
727 <            gx.x(ai+2,bi+2)=1;
728 <            gx.x(ai+3,bi+3)=1;
729 <          int aix=ai+4;
730 <          int bix=bi+4;
731 <          int len=4;
732 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
733 <               {
734 <                 gx.x(aix,bix)=1;
735 <                 aix++;bix++;
736 <                 len++;
737 <               }
738 <          if (len>7) {
739 <              //heuristics: very likely the best we can get
740 <              int qlen=len;
741 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
742 <                aix++;
743 <                bix++;
744 <                qlen++;
745 <                }
746 <              if (qlen>10) { //found a quick match shortcut
731 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
732 <                 return diagstrips;
733 <                 }
734 <              }
735 <          GXSeed* newseed=new GXSeed(ai,bi,len);
736 <          seeds.Add(newseed);
737 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
738 <          //special last resort terminal match to be used if no better alignment is there
739 <          if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
740 <          }
741 <       }
742 <    }//for each 4-mer at the beginning of seqa
705 >        GVec<uint16>* amers[], const char* seqb, int b_len) {
706 > int bimin=GMAX(0,(b_len-a_len-6));
707 > GXSeedTable gx(a_len, b_len);
708 > GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
709 > for (int bi=0;bi<=b_len-6;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*)&seqb[bi]);
712 >   GVec<uint16>* alocs = amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         for (int i=0;i<6;i++)
722 >            gx.x(ai+i,bi+i)=1;
723 >         //see if we can extend to the right
724 >         int aix=ai+6;
725 >         int bix=bi+6;
726 >         int len=6;
727 >         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
728 >                 gx.x(aix,bix)=1;
729 >                 aix++;bix++;
730 >                 len++;
731 >                 }
732 >        /*if (len>22) {
733 >                //heuristics: very likely the best we can get
734 >                //quick match shortcut
735 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
736 >                return diagstrips;
737 >                } */
738 >         if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
739 >         GXSeed* newseed=new GXSeed(ai,bi,len);
740 >         seeds.Add(newseed);
741 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
742 >         //special last resort terminal match to be used if no better alignment is there
743 >         if (ai<2 && bi+len>b_len-2 &&
744 >                 (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
745 >         }
746 >  }
747   for (int i=0;i<diagstrips->Count();i++) {
748      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
749      }
# Line 748 | Line 752
752   }
753  
754   GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
755 <        GVec<uint16> amers[], const char* seqb, int b_len) {
755 >        GVec<uint16>* amers[], const char* seqb, int b_len) {
756   //overlap of left (5') end of seqb
757   //hash the last 12 bases of seqa:
758 < int aimin=GMAX(0,(a_len-16));
755 < int aimax=a_len-4;
756 < int bimin=0;
757 < int bimax=GMIN((a_len-2), (b_len-4));
758 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
759 < int b_maxlen=b_len; //number of cols in the diagonal matrix
758 > int bimax=GMIN((a_len+2), (b_len-6));
759   //gx.init(a_maxlen, b_maxlen);
760 < GXSeedTable gx(a_maxlen, b_maxlen);
761 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
762 < for (int ai=aimin;ai<=aimax;ai++) {
763 <    int32* av=(int32 *)(&seqa[ai]);
764 <    for (int bi=bimin;bi<=bimax;bi++) {
765 <       if (gx.x(ai,bi))
766 <          continue; //already have a previous seed covering this region of this diagonal
767 <       int32* bv=(int32 *)(&seqb[bi]);
768 <       if (*av == *bv) {
769 <          //word match
770 <          //see if we can extend to the right
771 <        gx.x(ai+1,bi+1)=1;
772 <        gx.x(ai+2,bi+2)=1;
773 <        gx.x(ai+3,bi+3)=1;
774 <          int aix=ai+4;
775 <          int bix=bi+4;
776 <          int len=4;
777 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
778 <               {
779 <                 gx.x(aix,bix)=1;
780 <                 aix++;bix++;
781 <                 len++;
782 <               }
783 <          if (len>7) {
784 <              //heuristics: very likely the best we can get
785 <              int qlen=len;
786 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
787 <                aix++;
788 <                bix++;
789 <                qlen++;
790 <                }
791 <              if (qlen>10) { //found a quick match shortcut
792 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
793 <                 return diagstrips;
794 <                 }
795 <              }
796 <          GXSeed* newseed=new GXSeed(ai,bi,len);
797 <          seeds.Add(newseed);
798 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
799 <          //special last resort terminal match to be used if no better alignment is there
800 <          if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
802 <          }
803 <       }
804 <    }//for each 4-mer at the end of seqa
760 > GXSeedTable gx(a_len, b_len);
761 > GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
762 > for (int bi=0;bi<=b_len-6;bi++) {
763 >   //for each 6-mer of seqb
764 >   uint16 bv = get6mer((char*)&seqb[bi]);
765 >   GVec<uint16>* alocs = amers[bv];
766 >   if (alocs==NULL) continue;
767 >   //extend each hit
768 >   for (int h=0;h<alocs->Count();h++) {
769 >         int ai=alocs->Get(h);
770 >         //word match
771 >         if (gx.x(ai,bi))
772 >           //already have a previous seed covering this region of this diagonal
773 >           continue;
774 >         for (int i=0;i<6;i++)
775 >            gx.x(ai+i,bi+i)=1;
776 >         //see if we can extend to the right
777 >         int aix=ai+6;
778 >         int bix=bi+6;
779 >         int len=6;
780 >         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
781 >                 gx.x(aix,bix)=1;
782 >                 aix++;bix++;
783 >                 len++;
784 >                 }
785 >         /* if (len>22) {
786 >                //heuristics: very likely the best we can get
787 >                //quick match shortcut
788 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
789 >                return diagstrips;
790 >                } */
791 >         if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
792 >         GXSeed* newseed=new GXSeed(ai,bi,len);
793 >         seeds.Add(newseed);
794 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
795 >     //special last resort terminal match to be used if no better alignment is there
796 >     if (bi<2 && ai+len>=a_len-1 &&
797 >                 (!diagstrips->tmatch || diagstrips->tmatch->len<len))
798 >                   diagstrips->tmatch=newseed;
799 >     }
800 > } //for each 6-mer of the read
801   for (int i=0;i<diagstrips->Count();i++) {
802      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
803      }
# Line 819 | Line 815
815    else return (s2->len-s1->len);
816   }
817  
818 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
819 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
820 +  GXSeed* s1=(GXSeed*)p1;
821 +  GXSeed* s2=(GXSeed*)p2;
822 +  if (s1->len==s2->len) {
823 +     return (s2->b_ofs-s1->b_ofs);
824 +     }
825 +  else return (s2->len-s1->len);
826 + }
827 +
828 +
829   int cmpSeedDiag(const pointer p1, const pointer p2) {
830    GXSeed* s1=(GXSeed*)p1;
831    GXSeed* s2=(GXSeed*)p2;
# Line 954 | Line 961
961       reward, penalty, xdrop, gxmem, trim, editscript);
962   }
963  
964 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
964 > GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
965                   const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
966    bool editscript=false;
967    #ifdef GDEBUG
968     editscript=true;
969 <  // GMessage("==========> matching Right (3') end ========\n");
969 >   GMessage("==========> matching Right (3') end : %s\n", seqa);
970    #endif
971  
972    CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
# Line 968 | Line 975
975    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
976    //did we find a shortcut?
977    if (alnbands->qmatch) {
978 <    //#ifdef GDEBUG
979 <    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
980 <    //#endif
978 >    #ifdef GDEBUG
979 >     GMessage("::: Found a quick long match at %d, len %d\n",
980 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
981 >    #endif
982      anchor_seeds.Add(alnbands->qmatch);
983      }
984    else {
985      int max_top_bands=5;
986      int top_band_count=0;
987      for (int b=0;b<alnbands->Count();b++) {
988 <       if (alnbands->Get(b)->score<4) break;
988 >       if (alnbands->Get(b)->score<6) break;
989         //#ifdef GDEBUG
990         //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
991         //#endif
# Line 992 | Line 1000
1000      //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1001      //#endif
1002      }
995
1003    GList<GXAlnInfo> galns(true,true,false);
1004    for (int i=0;i<anchor_seeds.Count();i++) {
1005      GXSeed& aseed=*anchor_seeds[i];
# Line 1000 | Line 1007
1007      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1008      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1009                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1010 <    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1010 >    if (alninfo && alninfo->pid>=min_pid &&
1011 >        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1012               galns.AddIfNew(alninfo, true);
1013          else delete alninfo;
1014      }
1015 +  if (galns.Count()==0 && alnbands->tmatch) {
1016 +      //last resort seed
1017 +      GXSeed& aseed=*alnbands->tmatch;
1018 +      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1019 +      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1020 +      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021 +                                seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 +      if (alninfo && alninfo->pid>=min_pid &&
1023 +           trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1024 +                 galns.AddIfNew(alninfo, true);
1025 +            else delete alninfo;
1026 +      }
1027 +
1028 +  /*
1029    //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1030    //we should also try some seeds closer to the 3' end
1031    if (galns.Count()==0) {
# Line 1043 | Line 1065
1065                 galns.AddIfNew(alninfo, true);
1066            else delete alninfo;
1067        }
1068 <    }
1068 >    } */
1069 >
1070    //---- done
1071    delete alnbands;
1072    if (galns.Count()) {
# Line 1061 | Line 1084
1084    else return NULL;
1085   }
1086  
1087 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
1087 > GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
1088                   const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1089    bool editscript=false;
1090    #ifdef GDEBUG
1091     editscript=true;
1092 <   //GMessage("==========> matching Left (5') end ========\n");
1092 >   GMessage("==========> matching Left (5') end : %s\n", seqa);
1093    #endif
1094    CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1095    GList<GXSeed> rseeds(true,true,false);
1096    GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1074  GList<GXAlnInfo> galns(true,true,false);
1097    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1076
1098    if (alnbands->qmatch) {
1099 <    //#ifdef GDEBUG
1100 <    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1101 <    //#endif
1099 >    #ifdef GDEBUG
1100 >     GMessage("::: Found a quick long match at %d, len %d\n",
1101 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
1102 >    #endif
1103      anchor_seeds.Add(alnbands->qmatch);
1104      }
1105    else {
1106      int max_top_bands=5;
1107      int top_band_count=0;
1108      for (int b=0;b<alnbands->Count();b++) {
1109 <       if (alnbands->Get(b)->score<4) break;
1109 >       if (alnbands->Get(b)->score<6) break;
1110         //#ifdef GDEBUG
1111         //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1112         //#endif
# Line 1092 | Line 1114
1114         GXBand& band=*(alnbands->Get(b));
1115         band.seeds.setSorted(cmpSeedScore);
1116         anchor_seeds.Add(band.seeds.First());
1117 +       band.tested=true;
1118         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1119         }
1120      //#ifdef GDEBUG
1121      //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1122      //#endif
1123      }
1124 <  for (int i=0;i<anchor_seeds.Count();i++) {
1124 > GList<GXAlnInfo> galns(true,true,false);
1125 > for (int i=0;i<anchor_seeds.Count();i++) {
1126      GXSeed& aseed=*anchor_seeds[i];
1127      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1128      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1129      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1130                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1131      if (alninfo && alninfo->pid>=min_pid
1132 <           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1132 >           && trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1133              galns.AddIfNew(alninfo, true);
1134         else delete alninfo;
1135      }
1136    if (galns.Count()==0 && alnbands->tmatch) {
1137 +      //last resort seed
1138        GXSeed& aseed=*alnbands->tmatch;
1139        int a1=aseed.a_ofs+(aseed.len>>1)+1;
1140        int a2=aseed.b_ofs+(aseed.len>>1)+1;
1141        GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1142                                seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1143 <      if (alninfo) galns.Add(alninfo);
1143 >      if (alninfo && alninfo->pid>=min_pid &&
1144 >        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1145 >         galns.Add(alninfo);
1146        }
1120  #ifdef GDEBUG
1147    /*
1148 +  #ifdef GDEBUG
1149 +  //print valid alignments found
1150    for (int i=0;i<galns.Count();i++) {
1151      GXAlnInfo* alninfo=galns[i];
1152      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 1128 | Line 1156
1156        alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1157        }
1158      }
1131  */
1159    #endif
1160 +  */
1161    //---- done
1162    delete alnbands;
1163    if (galns.Count()) {

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines