ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 184 | Line 184
184                         bool reverse, int xdrop_threshold,
185                         int match_cost, int mismatch_cost,
186                         int& seq1_align_len, int& seq2_align_len,
187 <                       SGreedyAlignMem& aux_data,
187 >                       CGreedyAlignData& aux_data,
188                         GXEditScript *edit_block) {
189                         //GapPrelimEditBlock *edit_block,
190                         //bool& fence_hit, SGreedySeed *seed) {
# Line 230 | Line 230
230         nificantly better than this */
231  
232      max_dist = GMIN(GREEDY_MAX_COST,
233 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
233 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
234  
235      /* the main loop assumes that the index of all diagonals is
236         biased to lie in the middle of allocated bookkeeping
# Line 536 | Line 536
536   int q_max=strlen(q_seq); //query
537   int s_max=strlen(s_seq); //subj
538   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 <          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
539 >          s_seq,  s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540   }
541  
542   struct GXSeedTable {
# Line 578 | Line 578
578   const int a_m_score=2; //match score
579   const int a_mis_score=-3; //mismatch
580   const int a_dropoff_score=7;
581 + const int a_min_score=12; //at least 6 bases full match
582  
583   // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584   //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
# Line 676 | Line 677
677   }
678  
679  
679 bool extendUngapped_R(const char* seqa, int a_len, int &aix,
680                      const char* seqb, int b_len, int &bix, int &len) {
681 //must start with a match
682 if (seqa[aix]!=seqb[bix]) GError("Error at extendUngapped_R: no initial match!\n");
683
684 }
685
680   GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681   //overlap of right (3') end of seqb
682   //hash the first 12 bases of seqa:
# Line 716 | Line 710
710                   aix++;bix++;
711                   len++;
712                 }
713 <          if (len>8) {
713 >          if (len>7) {
714                //heuristics: very likely the best we can get
715                int qlen=len;
716                while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
# Line 724 | Line 718
718                  bix++;
719                  qlen++;
720                  }
721 <              if (qlen>10) {
721 >              if (qlen>10) { //found a quick match shortcut
722                   diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723                   return diagstrips;
724                   }
# Line 732 | Line 726
726            GXSeed* newseed=new GXSeed(ai,bi,len);
727            seeds.Add(newseed);
728            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 +          //special last resort terminal match to be used if no better alignment is there
730 +          if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
731            }
732         }
733      }//for each 4-mer at the beginning of seqa
# Line 775 | Line 771
771                   aix++;bix++;
772                   len++;
773                 }
774 +          if (len>7) {
775 +              //heuristics: very likely the best we can get
776 +              int qlen=len;
777 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
778 +                aix++;
779 +                bix++;
780 +                qlen++;
781 +                }
782 +              if (qlen>10) { //found a quick match shortcut
783 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
784 +                 return diagstrips;
785 +                 }
786 +              }
787            GXSeed* newseed=new GXSeed(ai,bi,len);
788            seeds.Add(newseed);
789            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
790 +          //special last resort terminal match to be used if no better alignment is there
791 +          if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
792            }
793         }
794      }//for each 4-mer at the end of seqa
# Line 788 | Line 799
799   return diagstrips;
800   }
801  
791
792
802   int cmpSeedScore(const pointer p1, const pointer p2) {
803    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
804    GXSeed* s1=(GXSeed*)p1;
# Line 806 | Line 815
815    return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
816   }
817  
818 +
819 + int cmpDiagBands_R(const pointer p1, const pointer p2) {
820 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
821 +  GXBand* b1=(GXBand*)p1;
822 +  GXBand* b2=(GXBand*)p2;
823 +  if (b1->score==b2->score) {
824 +     return (b2->w_min_b-b1->w_min_b);
825 +     }
826 +  else return (b2->score-b1->score);
827 + }
828 +
829 +
830 +
831   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
832 <                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
833 <                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
832 >    const char* s_seq, int s_alnstart, int s_max,
833 >    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
834 >    CAlnTrim* trim, bool editscript) {
835    GXEditScript* ed_script_fwd = NULL;
836    GXEditScript* ed_script_rev = NULL;
837    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 825 | Line 848
848    const char* s=s_seq+s_alnstart;
849    int s_avail=s_max-s_alnstart;
850    if (penalty<0) penalty=-penalty;
851 <  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
851 >  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
852    GXAlnInfo* alninfo=NULL;
853    bool freeAlnMem=(gxmem==NULL);
854    if (freeAlnMem) {
855 <     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
855 >     gxmem=new CGreedyAlignData(reward, penalty, xdrop);
856       }
857    else gxmem->reset();
858    int retscore = 0;
859    int numdiffs = 0;
860 <  if (trimtype==galn_TrimLeft) {
860 >  if (trim!=NULL && trim->type==galn_TrimLeft) {
861      if (editscript)
862         ed_script_rev=new GXEditScript();
863  
864 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
864 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
865                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
866      //check this extension here and bail out if it's not a good extension
867 <    if (s_alnstart+1-s_ext_l>3) {
867 >    if (s_alnstart+1-s_ext_l>trim->boundary) {
868        delete ed_script_rev;
869        if (freeAlnMem) delete gxmem;
870        return NULL;
# Line 853 | Line 876
876                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
877      numdiffs=numdiffs_r+numdiffs_l;
878      //convert num diffs to actual score
879 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
879 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
880      if (editscript)
881         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882      }
# Line 865 | Line 888
888                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
889      //check extension here and bail out if not a good right extension
890      //assuming s_max is really at the right end of s_seq
891 <    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
891 >    if (trim!=NULL && trim->type==galn_TrimRight &&
892 >           s_alnstart+s_ext_r<trim->boundary) {
893        delete ed_script_fwd;
894        if (freeAlnMem) delete gxmem;
895        return NULL;
# Line 876 | Line 900
900                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
901      //convert num diffs to actual score
902      numdiffs=numdiffs_r+numdiffs_l;
903 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
903 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
904      if (editscript)
905         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
906    }
# Line 887 | Line 911
911      alninfo->score=retscore;
912      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
913   #ifdef GDEBUG
914 <    if (ed_script_rev) {
915 <       GMessage("Final Edit script ::: ");
916 <       printEditScript(ed_script_rev);
917 <       }
914 >    //if (ed_script_rev) {
915 >    //   GMessage("Final Edit script ::: ");
916 >    //   printEditScript(ed_script_rev);
917 >    //   }
918   #endif
919      alninfo->editscript=ed_script_rev;
920      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
# Line 905 | Line 929
929    return alninfo;
930   }
931  
932 + GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
933 +                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
934 +                       CAlnTrim* trim, bool editscript) {
935 + int reward=2;
936 + int penalty=3;
937 + int xdrop=8;
938 + if (gxmem) {
939 +   reward=gxmem->match_reward;
940 +   penalty=gxmem->mismatch_penalty;
941 +   xdrop=gxmem->x_drop;
942 +   }
943 + return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
944 +     reward, penalty, xdrop, gxmem, trim, editscript);
945 + }
946 +
947   GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
948 <            SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
948 >            CGreedyAlignData* gxmem, int min_pid) {
949    bool editscript=false;
950    #ifdef GDEBUG
951     editscript=true;
952 <   GMessage("==========> matching Right (3') end ========\n");
952 >  // GMessage("==========> matching Right (3') end ========\n");
953    #endif
954 +
955 +  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
956    GList<GXSeed> rseeds(true,true,false);
957    GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
958 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
959    //did we find a shortcut?
960    if (alnbands->qmatch) {
961 <
961 >    //#ifdef GDEBUG
962 >    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
963 >    //#endif
964 >    anchor_seeds.Add(alnbands->qmatch);
965 >    }
966 >  else {
967 >    int max_top_bands=5;
968 >    int top_band_count=0;
969 >    for (int b=0;b<alnbands->Count();b++) {
970 >       if (alnbands->Get(b)->score<4) break;
971 >       //#ifdef GDEBUG
972 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
973 >       //#endif
974 >       top_band_count++;
975 >       GXBand& band=*(alnbands->Get(b));
976 >       band.seeds.setSorted(cmpSeedScore);
977 >       anchor_seeds.Add(band.seeds.First());
978 >       band.tested=true;
979 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
980 >       }
981 >    //#ifdef GDEBUG
982 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
983 >    //#endif
984      }
921  GList<GXAlnInfo> galns(true,true,false);
922  int max_top_bands=5;
923  int top_band_count=0;
924  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
925  for (int b=0;b<alnbands->Count();b++) {
926     if (alnbands->Get(b)->score<4) break;
927     top_band_count++;
928     GXBand& band=*(alnbands->Get(b));
929     band.seeds.setSorted(cmpSeedScore);
930     anchor_seeds.Add(band.seeds.First());
931     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
932     }
933  #ifdef GDEBUG
934  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
935  #endif
985  
986 +  GList<GXAlnInfo> galns(true,true,false);
987    for (int i=0;i<anchor_seeds.Count();i++) {
988      GXSeed& aseed=*anchor_seeds[i];
989      int a1=aseed.a_ofs+(aseed.len>>1)+1;
990      int a2=aseed.b_ofs+(aseed.len>>1)+1;
991      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
992 <                            seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
993 <                            match_reward, mismatch_penalty, Xdrop);
944 <    if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
992 >                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
993 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
994               galns.AddIfNew(alninfo, true);
995          else delete alninfo;
996      }
997 <
998 <  #ifdef GDEBUG
999 <  for (int i=0;i<galns.Count();i++) {
1000 <    GXAlnInfo* alninfo=galns[i];
1001 <    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1002 <                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1003 <    if (alninfo->gapinfo!=NULL) {
1004 <      GMessage("Alignment:\n");
1005 <      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
997 >  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
998 >  //we should also try some seeds closer to the 3' end
999 >  if (galns.Count()==0) {
1000 >    anchor_seeds.Clear();
1001 >    alnbands->setSorted(cmpDiagBands_R);
1002 >    int max_top_bands=4;
1003 >    int top_band_count=0;
1004 >    //#ifdef GDEBUG
1005 >    //GMessage(":::>> Retrying adjusting sort order.\n");
1006 >    //#endif
1007 >    if (alnbands->tmatch) {
1008 >      //anchor_seeds.setSorted(false);
1009 >      anchor_seeds.Add(alnbands->tmatch);
1010 >      }
1011 >    for (int b=0;b<alnbands->Count();b++) {
1012 >       if (alnbands->Get(b)->score<4) break;
1013 >       //#ifdef GDEBUG
1014 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1015 >       //#endif
1016 >       if (alnbands->Get(b)->tested) continue;
1017 >       top_band_count++;
1018 >       GXBand& band=*(alnbands->Get(b));
1019 >       band.seeds.setSorted(cmpSeedScore);
1020 >       anchor_seeds.Add(band.seeds.First());
1021 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1022 >       }
1023 >    //#ifdef GDEBUG
1024 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1025 >    //#endif
1026 >    for (int i=0;i<anchor_seeds.Count();i++) {
1027 >      GXSeed& aseed=*anchor_seeds[i];
1028 >      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1029 >      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1030 >      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1031 >                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1032 >      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1033 >               galns.AddIfNew(alninfo, true);
1034 >          else delete alninfo;
1035        }
1036      }
959  #endif
1037    //---- done
1038    delete alnbands;
1039    if (galns.Count()) {
1040      GXAlnInfo* bestaln=galns.Shift();
1041 +    #ifdef GDEBUG
1042 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1043 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1044 +      if (bestaln->gapinfo!=NULL) {
1045 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1046 +        }
1047 +    #endif
1048 +
1049      return bestaln;
1050      }
1051    else return NULL;
1052   }
1053  
1054   GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1055 <          SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
1055 >          CGreedyAlignData* gxmem, int min_pid) {
1056    bool editscript=false;
1057    #ifdef GDEBUG
1058     editscript=true;
1059 <   GMessage("==========> matching Left (5') end ========\n");
1059 >   //GMessage("==========> matching Left (5') end ========\n");
1060    #endif
1061 +  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1062    GList<GXSeed> rseeds(true,true,false);
1063    GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1064    GList<GXAlnInfo> galns(true,true,false);
979  int max_top_bands=5;
980  int top_band_count=0;
1065    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
982  for (int b=0;b<alnbands->Count();b++) {
983     if (alnbands->Get(b)->score<4) break;
984     top_band_count++;
985     GXBand& band=*(alnbands->Get(b));
986     band.seeds.setSorted(cmpSeedScore);
987     anchor_seeds.Add(band.seeds.First());
988     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
989     }
990  #ifdef GDEBUG
991  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
992  #endif
1066  
1067 +  if (alnbands->qmatch) {
1068 +    //#ifdef GDEBUG
1069 +    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1070 +    //#endif
1071 +    anchor_seeds.Add(alnbands->qmatch);
1072 +    }
1073 +  else {
1074 +    int max_top_bands=5;
1075 +    int top_band_count=0;
1076 +    for (int b=0;b<alnbands->Count();b++) {
1077 +       if (alnbands->Get(b)->score<4) break;
1078 +       //#ifdef GDEBUG
1079 +       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1080 +       //#endif
1081 +       top_band_count++;
1082 +       GXBand& band=*(alnbands->Get(b));
1083 +       band.seeds.setSorted(cmpSeedScore);
1084 +       anchor_seeds.Add(band.seeds.First());
1085 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1086 +       }
1087 +    //#ifdef GDEBUG
1088 +    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1089 +    //#endif
1090 +    }
1091    for (int i=0;i<anchor_seeds.Count();i++) {
1092      GXSeed& aseed=*anchor_seeds[i];
1093      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1094      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1095      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1096 <                            seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
1097 <                            match_reward, mismatch_penalty, Xdrop-2);
1098 <    if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
1002 <         //TODO: at 5' end we should use a higher pid threshold
1003 <         // we could also decrease the X-drop value!
1096 >                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1097 >    if (alninfo && alninfo->pid>=min_pid
1098 >           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1099              galns.AddIfNew(alninfo, true);
1100         else delete alninfo;
1101      }
1102 <
1102 >  if (galns.Count()==0 && alnbands->tmatch) {
1103 >      GXSeed& aseed=*alnbands->tmatch;
1104 >      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1105 >      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1106 >      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1107 >                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1108 >      if (alninfo) galns.Add(alninfo);
1109 >      }
1110    #ifdef GDEBUG
1111 +  /*
1112    for (int i=0;i<galns.Count();i++) {
1113      GXAlnInfo* alninfo=galns[i];
1114      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 1015 | Line 1118
1118        alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1119        }
1120      }
1121 +  */
1122    #endif
1123    //---- done
1124    delete alnbands;
1125    if (galns.Count()) {
1126      GXAlnInfo* bestaln=galns.Shift();
1127 +    #ifdef GDEBUG
1128 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1129 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1130 +      if (bestaln->gapinfo!=NULL) {
1131 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1132 +        }
1133 +    #endif
1134      return bestaln;
1135      }
1136    else return NULL;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines