ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.cpp
(Generate patch)
# Line 533 | Line 533
533   int q_max=strlen(q_seq); //query
534   int s_max=strlen(s_seq); //subj
535   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
536 <          s_seq,  s_alnstart, s_max, editscript, reward, penalty, xdrop);
536 >          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
537   }
538  
539   struct GXSeedTable {
# Line 573 | Line 573
573   };
574  
575  
576 static GXSeedTable gx(12,255);
577
576   GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
577   //overlap of right (3') end of seqb
578   //hash the first 12 bases of seqa:
579 + int aimin=0;
580   int aimax=GMIN(9,(a_len-4));
581   int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
582 + int bimax=b_len-4;
583   int a_maxlen=aimax+4; //number of rows in the diagonal matrix
584   int b_maxlen=b_len; //number of cols in the diagonal matrix
585 < gx.init(a_maxlen, b_maxlen);
585 > GXSeedTable gx(a_maxlen, b_maxlen);
586   GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
587 < for (int ai=0;ai<aimax;ai++) {
587 > for (int ai=aimin;ai<=aimax;ai++) {
588      int32* av=(int32 *)(&seqa[ai]);
589 <    for (int bi=b_len-5;bi>=bimin;bi--) {
589 >    //for (int bi=b_len-4;bi>=bimin;bi--) {
590 >    for (int bi=bimin;bi<=bimax;bi++) {
591         if (gx.x(ai,bi))
592              continue; //already have a previous seed covering this region of this diagonal
593         int32* bv=(int32 *)(&seqb[bi]);
594         if (*av == *bv) {
595            //word match
596            //see if we can extend to the right
597 <          gx.x(ai+1,bi+1)=1;
598 <          gx.x(ai+2,bi+2)=1;
599 <          gx.x(ai+3,bi+3)=1;
597 >            gx.x(ai+1,bi+1)=1;
598 >            gx.x(ai+2,bi+2)=1;
599 >            gx.x(ai+3,bi+3)=1;
600            int aix=ai+4;
601            int bix=bi+4;
602            int len=4;
# Line 618 | Line 619
619   return diagstrips;
620   }
621  
622 + GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
623 + //overlap of left (5') end of seqb
624 + //hash the last 12 bases of seqa:
625 + int aimin=GMAX(0,(a_len-16));
626 + int aimax=a_len-4;
627 + int bimin=0;
628 + int bimax=GMIN((a_len-2), (b_len-4));
629 + int a_maxlen=aimax+4; //number of rows in the diagonal matrix
630 + int b_maxlen=b_len; //number of cols in the diagonal matrix
631 + //gx.init(a_maxlen, b_maxlen);
632 + GXSeedTable gx(a_maxlen, b_maxlen);
633 + GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
634 + for (int ai=aimin;ai<=aimax;ai++) {
635 +    int32* av=(int32 *)(&seqa[ai]);
636 +    for (int bi=bimin;bi<=bimax;bi++) {
637 +       if (gx.x(ai,bi))
638 +          continue; //already have a previous seed covering this region of this diagonal
639 +       int32* bv=(int32 *)(&seqb[bi]);
640 +       if (*av == *bv) {
641 +          //word match
642 +          //see if we can extend to the right
643 +        gx.x(ai+1,bi+1)=1;
644 +        gx.x(ai+2,bi+2)=1;
645 +        gx.x(ai+3,bi+3)=1;
646 +          int aix=ai+4;
647 +          int bix=bi+4;
648 +          int len=4;
649 +          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
650 +               {
651 +                 gx.x(aix,bix)=1;
652 +                 aix++;bix++;
653 +                 len++;
654 +               }
655 +          GXSeed* newseed=new GXSeed(ai,bi,len);
656 +          seeds.Add(newseed);
657 +          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
658 +          }
659 +       }
660 +    }//for each 4-mer at the end of seqa
661 + for (int i=0;i<diagstrips->Count();i++) {
662 +    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
663 +    }
664 + diagstrips->setSorted(true); //sort by score
665 + return diagstrips;
666 + }
667 +
668 +
669 +
670   int cmpSeedScore(const pointer p1, const pointer p2) {
671    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
672    GXSeed* s1=(GXSeed*)p1;
# Line 635 | Line 684
684   }
685  
686   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
687 <                       const char* s_seq, int s_alnstart, int s_max,
688 <                        bool editscript, int reward, int penalty, int xdrop) {
640 <
687 >                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
688 >                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
689    GXEditScript* ed_script_fwd = NULL;
690    GXEditScript* ed_script_rev = NULL;
691    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 648 | Line 696
696      GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
697    if (q_seq[q_alnstart]!=s_seq[s_alnstart])
698      GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
651  if (editscript) {
652     ed_script_fwd=new GXEditScript();
653     }
699    int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
700    const char* q=q_seq+q_alnstart;
701    int q_avail=q_max-q_alnstart;
# Line 658 | Line 703
703    int s_avail=s_max-s_alnstart;
704    if (penalty<0) penalty=-penalty;
705    int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
706 +  GXAlnInfo* alninfo=NULL;
707 +  bool freeAlnMem=(gxmem==NULL);
708 +  if (freeAlnMem) {
709 +     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
710 +     }
711 +  else gxmem->reset();
712 +  int retscore = 0;
713 +  int numdiffs = 0;
714 +  if (trimtype==galn_TrimLeft) {
715 +    if (editscript)
716 +       ed_script_rev=new GXEditScript();
717 +
718 +    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
719 +                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
720 +    //check this extension here and bail out if it's not a good extension
721 +    if (s_alnstart+1-s_ext_l>3) {
722 +      delete ed_script_rev;
723 +      if (freeAlnMem) delete gxmem;
724 +      return NULL;
725 +      }
726  
727 <  SGreedyAlignMem* gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
728 <  int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
729 <                          reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
730 < #ifdef GDEBUG
731 <  GMessage("fwd numdiffs=%d (%d-%d : %d-%d)\n", numdiffs_r, q_alnstart+1,q_alnstart+q_ext_r, s_alnstart+1, s_alnstart+s_ext_r);
732 <  if (editscript) {
733 <    GMessage("fwd edit script: ");
734 <    printEditScript(ed_script_fwd);
727 >    if (editscript)
728 >       ed_script_fwd=new GXEditScript();
729 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
730 >                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
731 >    numdiffs=numdiffs_r+numdiffs_l;
732 >    //convert num diffs to actual score
733 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
734 >    if (editscript)
735 >       ed_script_rev->Append(ed_script_fwd); //combine the two extensions
736      }
737 < #endif
738 <  //TODO: only extend to the left if the right-extension is above the minimum score
739 <  //if (score<MIN_EXT_R_SCORE) {
740 <  // delete abmp;
741 <  // delete ed_script_fwd;
742 <  // return;
743 <  // }
744 <  // ---- extend to the left of anchor point now
745 <  if (editscript)
746 <     ed_script_rev=new GXEditScript();
747 <  //int numdiffs_l = GXGreedyAlign(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
748 <  //            reward, penalty, &s_ext_l, &q_ext_l, abmp, ed_script_rev);
749 <  int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
750 <                 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
751 < #ifdef GDEBUG
752 <  GMessage("rev numdiffs=%d (%d-%d vs %d-%d)\n", numdiffs_l, q_alnstart+1-q_ext_l, q_alnstart+1, s_alnstart+1-s_ext_l, s_alnstart+1);
753 <  GMessage("rev edit script: ");
754 <  if (ed_script_rev) {
755 <     printEditScript(ed_script_rev);
756 <     }
757 < #endif
758 <  GXAlnInfo* alninfo=NULL;
759 <  // In basic case the greedy algorithm returns number of
760 <  // differences, hence we need to convert it to score    
695 <  int numdiffs=numdiffs_r+numdiffs_l;
696 <  int retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
697 < #ifdef GDEBUG
698 <  GMessage("Final score: %d\n",retscore);
699 < #endif
700 <  if (ed_script_rev)
701 <     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
737 >  else {
738 >    if (editscript) {
739 >       ed_script_fwd=new GXEditScript();
740 >       }
741 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
742 >                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
743 >    //check extension here and bail out if not a good right extension
744 >    //assuming s_max is really at the right end of s_seq
745 >    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
746 >      delete ed_script_fwd;
747 >      if (freeAlnMem) delete gxmem;
748 >      return NULL;
749 >      }
750 >    if (editscript)
751 >       ed_script_rev=new GXEditScript();
752 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
753 >                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
754 >    //convert num diffs to actual score
755 >    numdiffs=numdiffs_r+numdiffs_l;
756 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
757 >    if (editscript)
758 >       ed_script_rev->Append(ed_script_fwd); //combine the two extensions
759 >  }
760 >
761    if (retscore>=MIN_GREEDY_SCORE) {
762      alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
763      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
# Line 712 | Line 771
771   #endif
772      alninfo->editscript=ed_script_rev;
773      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
715    alninfo->alnmem=gxmem;
774      }
775    else {
776 <    delete gxmem;
776 >    if (freeAlnMem) delete gxmem;
777      delete ed_script_rev;
778      delete alninfo;
779      alninfo=NULL;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines