ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 205 | Line 205
205  
206   int GXGreedyExtend(const char* seq1, int len1,
207                         const char* seq2, int len2,
208 <                       bool reverse, int xdrop_threshold,
209 <                       int match_cost, int mismatch_cost,
208 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209                         int& seq1_align_len, int& seq2_align_len,
210                         CGreedyAlignData& aux_data,
211                         GXEditScript *edit_block) {
# Line 275 | Line 274
274         xdrop_offset gives the distance backwards in the score
275         array to look */
276  
277 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
279 <                           (match_cost + mismatch_cost) + 1;
277 >    xdrop_offset = aux_data.xdrop_ofs;
278  
279      // find the offset of the first mismatch between seq1 and seq2
280  
# Line 327 | Line 325
325      // fill in the initial offsets of the distance matrix
326  
327      last_seq2_off[0][diag_origin] = seq1_index;
328 <    max_score[0] = seq1_index * match_cost;
328 >    max_score[0] = seq1_index * aux_data.match_reward;
329      diag_lower = diag_origin - 1;
330      diag_upper = diag_origin + 1;
331      end1_reached = end2_reached = false;
# Line 353 | Line 351
351          // compute the score for distance d corresponding to the X-dropoff criterion
352  
353          xdrop_score = max_score[d - xdrop_offset] +
354 <                      (match_cost + mismatch_cost) * d - xdrop_threshold;
355 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
354 >                      (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 >        xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356  
357          // for each diagonal of interest
358          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
# Line 431 | Line 429
429          }  // end loop over diagonals
430  
431          // compute the maximum score possible for distance d
432 <        curr_score = curr_extent * (match_cost / 2) -
433 <                        d * (match_cost + mismatch_cost);
432 >        curr_score = curr_extent * (aux_data.match_reward / 2) -
433 >                        d * (aux_data.match_reward + aux_data.mismatch_penalty);
434          // if this is the best score seen so far, update the
435          // statistics of the best alignment
436          if (curr_score > max_score[d - 1]) {
# Line 700 | Line 698
698    return false;
699   }
700  
701 < /*
704 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd) {
705 < int bimin=GMAX(0,(sd.blen-sd.alen-6));
706 < GXSeedTable gx(sd.alen, sd.blen);
707 < GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
708 < //for (int bi=0;bi<=b_len-6;bi++) {
709 < for (int bi=sd.blen-6;bi>=0;bi--) {
710 <   //for each 6-mer of seqb
711 <   uint16 bv = get6mer((char*)&(sd.bseq[bi]));
712 <   GVec<uint16>* alocs = sd.amers[bv];
713 <   if (alocs==NULL) continue;
714 <   //extend each hit
715 <   for (int h=0;h<alocs->Count();h++) {
716 <         int ai=alocs->Get(h);
717 <         //word match
718 <         if (gx.x(ai,bi))
719 <           //already have a previous seed covering this region of this diagonal
720 <           continue;
721 <         for (int i=0;i<6;i++)
722 <            gx.x(ai+i,bi+i)=1;
723 <         //see if we can extend to the left
724 <         int aix=ai-1;
725 <         int bix=bi-1;
726 <         int len=6;
727 <         while (bix>=0 && aix>=0 && sd.aseq[aix]==sd.bseq[bix]) {
728 <                 gx.x(aix,bix)=1;
729 <                 ai=aix;
730 <                 bi=bix;
731 <                 aix--;bix--;
732 <                 len++;
733 <                 }
734 <        if (len>sd.amlen) {
735 <                //heuristics: likely the best we can get
736 <                //quick match shortcut
737 <                diagstrips->qmatch=new GXSeed(ai,bi,len);
738 <                return diagstrips;
739 <                }
740 <         if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
741 <         GXSeed* newseed=new GXSeed(ai,bi,len);
742 <         seeds.Add(newseed);
743 <         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
744 <         //special last resort terminal match to be used if no better alignment is there
745 <         if (ai<2 && bi+len>sd.blen-2 &&
746 <                 (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
747 <         }
748 <  }
749 < for (int i=0;i<diagstrips->Count();i++) {
750 <    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
751 <    }
752 < diagstrips->setSorted(true); //sort by score
753 < return diagstrips;
754 < }
755 < */
756 < GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd) {
757 < int bimax=GMIN((sd.alen+2), (sd.blen-6));
701 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702   int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 + int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 + int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 + int b_end = (trim_intent==galn_TrimLeft) ?  bimax : sd.blen-6;
706   //gx.init(a_maxlen, b_maxlen);
707   GXSeedTable gx(sd.alen, sd.blen);
708   GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 < for (int bi=0;bi<=sd.blen-6;bi++) {
709 > for (int bi=b_start;bi<=b_end;bi++) {
710     //for each 6-mer of seqb
711     uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712     GVec<uint16>* alocs = sd.amers[bv];
# Line 771 | Line 718
718           if (gx.x(ai,bi))
719             //already have a previous seed covering this region of this diagonal
720             continue;
721 +         if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 +               continue; //improper positioning for 5' trimming
723 +         if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 +               continue; //improper positioning for 5' trimming
725 +
726           //TODO: there could be Ns in this seed, should we count/adjust score?
727           for (int i=0;i<6;i++)
728              gx.x(ai+i,bi+i)=1;
# Line 790 | Line 742
742                  return diagstrips;
743                  }
744           if (bi>bimax && bi<bimin && len<9)
745 <                 //skip mid-sequence seeds that are not high scoring
745 >                 //ignore mid-sequence seeds that are not high scoring
746               continue;
747  
748           GXSeed* newseed=new GXSeed(ai,bi,len);
749           seeds.Add(newseed);
750           diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 <     //special last resort terminal match to be used if no better alignment is there
751 >     //keep last resort terminal match to be used if no better alignment is there
752       if (bi<2 && ai+len>=sd.alen-1 &&
753                   (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754                        diagstrips->tmatch_l=newseed;
# Line 875 | Line 827
827    const char* s=s_seq+s_alnstart;
828    int s_avail=s_max-s_alnstart;
829    if (penalty<0) penalty=-penalty;
878  int MIN_GREEDY_SCORE=6*reward; //minimum score for an alignment to be reported for 0 diffs
830    GXAlnInfo* alninfo=NULL;
831    bool freeAlnMem=(gxmem==NULL);
832    if (freeAlnMem) {
833       gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 +     reward=gxmem->match_reward;
835 +     penalty=gxmem->mismatch_penalty;
836 +     xdrop=gxmem->x_drop;
837       }
838 <  else gxmem->reset();
838 >   else
839 >         gxmem->reset();
840 >  int minMatch= trim ? trim->minMatch : 6;
841 >  int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842    int retscore = 0;
843    int numdiffs = 0;
844    if (trim!=NULL && trim->type==galn_TrimLeft) {
# Line 889 | Line 846
846      if (editscript)
847         ed_script_rev=new GXEditScript();
848  
849 <    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
850 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
849 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 >                    s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851      //check this extension here and bail out if it's not a good extension
852      if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853          q_alnstart+1-q_ext_l>1 &&
# Line 902 | Line 859
859  
860      if (editscript)
861         ed_script_fwd=new GXEditScript();
862 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
863 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
862 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864      numdiffs=numdiffs_r+numdiffs_l;
865      //convert num diffs to actual score
866 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
866 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867      if (editscript)
868         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869      }
# Line 914 | Line 871
871      if (editscript) {
872         ed_script_fwd=new GXEditScript();
873         }
874 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
875 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
874 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876      //check extension here and bail out if not a good right extension
877      //assuming s_max is really at the right end of s_seq
878      if (trim!=NULL && trim->type==galn_TrimRight &&
# Line 928 | Line 885
885        }
886      if (editscript)
887         ed_script_rev=new GXEditScript();
888 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
888 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 >                                  s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //convert num diffs to actual score
891      numdiffs=numdiffs_r+numdiffs_l;
892 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
892 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893      if (editscript)
894         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895    }
939
896    if (retscore>=MIN_GREEDY_SCORE) {
897 <    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
897 >    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899      alninfo->score=retscore;
900 +    if (gxmem->scaled) alninfo->score >>= 1;
901      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902   #ifdef GDEBUG
903      //if (ed_script_rev) {
# Line 952 | Line 909
909      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910      }
911    else {
912 <    if (freeAlnMem) delete gxmem;
912 >    //if (freeAlnMem) delete gxmem;
913      delete ed_script_rev;
914      delete alninfo;
915      alninfo=NULL;
916      }
917 +  if (freeAlnMem) delete gxmem;
918    delete ed_script_fwd;
919    return alninfo;
920   }
# Line 965 | Line 923
923                         const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924                         CAlnTrim* trim, bool editscript) {
925   int reward=2;
926 < int penalty=3;
927 < int xdrop=8;
926 > int penalty=10;
927 > int xdrop=32;
928   if (gxmem) {
929     reward=gxmem->match_reward;
930     penalty=gxmem->mismatch_penalty;
# Line 976 | Line 934
934       reward, penalty, xdrop, gxmem, trim, editscript);
935   }
936  
937 < GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type,
937 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938                                       CGreedyAlignData* gxmem, int min_pid) {
939    bool editscript=false;
940    #ifdef GDEBUG
# Line 991 | Line 949
949       GMessage("==========> searching  both ends : %s\n", sd.aseq);
950       }
951    #endif
952 <  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, sd.amlen);
952 >  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953    GList<GXSeed> rseeds(true,true,false);
954 <  GXBandSet* alnbands=collectSeeds(rseeds, sd);
954 >  GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956    //did we find a shortcut?
957    if (alnbands->qmatch) {
# Line 1096 | Line 1054
1054      }
1055    else return NULL;
1056   }
1099 /*
1100 GXAlnInfo* match_Left(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1101  bool editscript=false;
1102  #ifdef GDEBUG
1103   editscript=true;
1104   GMessage("==========> matching Left (5') end : %s\n", sd.aseq);
1105  #endif
1106  CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.alen, sd.amlen);
1107  GList<GXSeed> rseeds(true,true,false);
1108  GXBandSet* alnbands = collectSeeds(rseeds, sd);
1109  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1110  if (alnbands->qmatch) {
1111    #ifdef GDEBUG
1112     GMessage("::: Found a quick long match at %d, len %d\n",
1113          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
1114    #endif
1115    anchor_seeds.Add(alnbands->qmatch);
1116    }
1117  else {
1118    int max_top_bands=5;
1119    int top_band_count=0;
1120    for (int b=0;b<alnbands->Count();b++) {
1121       if (alnbands->Get(b)->score<6) break;
1122       //#ifdef GDEBUG
1123       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1124       //#endif
1125       top_band_count++;
1126       GXBand& band=*(alnbands->Get(b));
1127       band.seeds.setSorted(cmpSeedScore);
1128       anchor_seeds.Add(band.seeds.First());
1129       //band.tested=true;
1130       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1131       }
1132    //#ifdef GDEBUG
1133    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1134    //#endif
1135    }
1136 GList<GXAlnInfo> galns(true,true,false);
1137 for (int i=0;i<anchor_seeds.Count();i++) {
1138    GXSeed& aseed=*anchor_seeds[i];
1139    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1140    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1141    trimInfo.seedlen=aseed.len;
1142 #ifdef GDEBUG
1143    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1144           aseed.len);
1145 #endif
1146    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1147                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1148    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1149            galns.AddIfNew(alninfo, true);
1150       else delete alninfo;
1151    }
1152  if (galns.Count()==0 && alnbands->tmatch_l) {
1153      //last resort seed
1154      GXSeed& aseed=*alnbands->tmatch_l;
1155      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1156      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1157      trimInfo.seedlen=aseed.len;
1158      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1159                              sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1160      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1161         galns.Add(alninfo);
1162      }
1163  //---- done
1164  delete alnbands;
1165  if (galns.Count()) {
1166    GXAlnInfo* bestaln=galns.Shift();
1167    #ifdef GDEBUG
1168      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1169          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1170      if (bestaln->gapinfo!=NULL) {
1171        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen,
1172                                          sd.bseq, sd.blen);
1173        }
1174    #endif
1175    return bestaln;
1176    }
1177  else return NULL;
1178 }
1179 */

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines