533 |
|
int q_max=strlen(q_seq); //query |
534 |
|
int s_max=strlen(s_seq); //subj |
535 |
|
return GreedyAlignRegion(q_seq, q_alnstart, q_max, |
536 |
< |
s_seq, s_alnstart, s_max, editscript, reward, penalty, xdrop); |
536 |
> |
s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop); |
537 |
|
} |
538 |
|
|
539 |
|
struct GXSeedTable { |
573 |
|
}; |
574 |
|
|
575 |
|
|
576 |
– |
static GXSeedTable gx(12,255); |
577 |
– |
|
576 |
|
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
577 |
|
//overlap of right (3') end of seqb |
578 |
|
//hash the first 12 bases of seqa: |
579 |
+ |
int aimin=0; |
580 |
|
int aimax=GMIN(9,(a_len-4)); |
581 |
|
int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end |
582 |
+ |
int bimax=b_len-4; |
583 |
|
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
584 |
|
int b_maxlen=b_len; //number of cols in the diagonal matrix |
585 |
< |
gx.init(a_maxlen, b_maxlen); |
585 |
> |
GXSeedTable gx(a_maxlen, b_maxlen); |
586 |
|
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
587 |
< |
for (int ai=0;ai<aimax;ai++) { |
587 |
> |
for (int ai=aimin;ai<=aimax;ai++) { |
588 |
|
int32* av=(int32 *)(&seqa[ai]); |
589 |
< |
for (int bi=b_len-5;bi>=bimin;bi--) { |
589 |
> |
//for (int bi=b_len-4;bi>=bimin;bi--) { |
590 |
> |
for (int bi=bimin;bi<=bimax;bi++) { |
591 |
|
if (gx.x(ai,bi)) |
592 |
|
continue; //already have a previous seed covering this region of this diagonal |
593 |
|
int32* bv=(int32 *)(&seqb[bi]); |
594 |
|
if (*av == *bv) { |
595 |
|
//word match |
596 |
|
//see if we can extend to the right |
597 |
< |
gx.x(ai+1,bi+1)=1; |
598 |
< |
gx.x(ai+2,bi+2)=1; |
599 |
< |
gx.x(ai+3,bi+3)=1; |
597 |
> |
gx.x(ai+1,bi+1)=1; |
598 |
> |
gx.x(ai+2,bi+2)=1; |
599 |
> |
gx.x(ai+3,bi+3)=1; |
600 |
|
int aix=ai+4; |
601 |
|
int bix=bi+4; |
602 |
|
int len=4; |
619 |
|
return diagstrips; |
620 |
|
} |
621 |
|
|
622 |
+ |
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
623 |
+ |
//overlap of left (5') end of seqb |
624 |
+ |
//hash the last 12 bases of seqa: |
625 |
+ |
int aimin=GMAX(0,(a_len-16)); |
626 |
+ |
int aimax=a_len-4; |
627 |
+ |
int bimin=0; |
628 |
+ |
int bimax=GMIN((a_len-2), (b_len-4)); |
629 |
+ |
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
630 |
+ |
int b_maxlen=b_len; //number of cols in the diagonal matrix |
631 |
+ |
//gx.init(a_maxlen, b_maxlen); |
632 |
+ |
GXSeedTable gx(a_maxlen, b_maxlen); |
633 |
+ |
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
634 |
+ |
for (int ai=aimin;ai<=aimax;ai++) { |
635 |
+ |
int32* av=(int32 *)(&seqa[ai]); |
636 |
+ |
for (int bi=bimin;bi<=bimax;bi++) { |
637 |
+ |
if (gx.x(ai,bi)) |
638 |
+ |
continue; //already have a previous seed covering this region of this diagonal |
639 |
+ |
int32* bv=(int32 *)(&seqb[bi]); |
640 |
+ |
if (*av == *bv) { |
641 |
+ |
//word match |
642 |
+ |
//see if we can extend to the right |
643 |
+ |
gx.x(ai+1,bi+1)=1; |
644 |
+ |
gx.x(ai+2,bi+2)=1; |
645 |
+ |
gx.x(ai+3,bi+3)=1; |
646 |
+ |
int aix=ai+4; |
647 |
+ |
int bix=bi+4; |
648 |
+ |
int len=4; |
649 |
+ |
while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix]) |
650 |
+ |
{ |
651 |
+ |
gx.x(aix,bix)=1; |
652 |
+ |
aix++;bix++; |
653 |
+ |
len++; |
654 |
+ |
} |
655 |
+ |
GXSeed* newseed=new GXSeed(ai,bi,len); |
656 |
+ |
seeds.Add(newseed); |
657 |
+ |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
658 |
+ |
} |
659 |
+ |
} |
660 |
+ |
}//for each 4-mer at the end of seqa |
661 |
+ |
for (int i=0;i<diagstrips->Count();i++) { |
662 |
+ |
diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds |
663 |
+ |
} |
664 |
+ |
diagstrips->setSorted(true); //sort by score |
665 |
+ |
return diagstrips; |
666 |
+ |
} |
667 |
+ |
|
668 |
+ |
|
669 |
+ |
|
670 |
|
int cmpSeedScore(const pointer p1, const pointer p2) { |
671 |
|
//return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); |
672 |
|
GXSeed* s1=(GXSeed*)p1; |
684 |
|
} |
685 |
|
|
686 |
|
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
687 |
< |
const char* s_seq, int s_alnstart, int s_max, |
688 |
< |
bool editscript, int reward, int penalty, int xdrop) { |
640 |
< |
|
687 |
> |
const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype, |
688 |
> |
bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) { |
689 |
|
GXEditScript* ed_script_fwd = NULL; |
690 |
|
GXEditScript* ed_script_rev = NULL; |
691 |
|
if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 ) |
696 |
|
GError("GreedyAlign() Error: attempt to use an empty sequence string!\n"); |
697 |
|
if (q_seq[q_alnstart]!=s_seq[s_alnstart]) |
698 |
|
GError("GreedyAlign() Error: improper anchor (mismatch)!\n"); |
651 |
– |
if (editscript) { |
652 |
– |
ed_script_fwd=new GXEditScript(); |
653 |
– |
} |
699 |
|
int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0; |
700 |
|
const char* q=q_seq+q_alnstart; |
701 |
|
int q_avail=q_max-q_alnstart; |
703 |
|
int s_avail=s_max-s_alnstart; |
704 |
|
if (penalty<0) penalty=-penalty; |
705 |
|
int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported |
706 |
+ |
GXAlnInfo* alninfo=NULL; |
707 |
+ |
bool freeAlnMem=(gxmem==NULL); |
708 |
+ |
if (freeAlnMem) { |
709 |
+ |
gxmem=new SGreedyAlignMem(reward, penalty, xdrop); |
710 |
+ |
} |
711 |
+ |
else gxmem->reset(); |
712 |
+ |
int retscore = 0; |
713 |
+ |
int numdiffs = 0; |
714 |
+ |
if (trimtype==galn_TrimLeft) { |
715 |
+ |
if (editscript) |
716 |
+ |
ed_script_rev=new GXEditScript(); |
717 |
+ |
|
718 |
+ |
int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
719 |
+ |
reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); |
720 |
+ |
//check this extension here and bail out if it's not a good extension |
721 |
+ |
if (s_alnstart+1-s_ext_l>3) { |
722 |
+ |
delete ed_script_rev; |
723 |
+ |
if (freeAlnMem) delete gxmem; |
724 |
+ |
return NULL; |
725 |
+ |
} |
726 |
|
|
727 |
< |
SGreedyAlignMem* gxmem=new SGreedyAlignMem(reward, penalty, xdrop); |
728 |
< |
int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop, |
729 |
< |
reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); |
730 |
< |
#ifdef GDEBUG |
731 |
< |
GMessage("fwd numdiffs=%d (%d-%d : %d-%d)\n", numdiffs_r, q_alnstart+1,q_alnstart+q_ext_r, s_alnstart+1, s_alnstart+s_ext_r); |
732 |
< |
if (editscript) { |
733 |
< |
GMessage("fwd edit script: "); |
734 |
< |
printEditScript(ed_script_fwd); |
727 |
> |
if (editscript) |
728 |
> |
ed_script_fwd=new GXEditScript(); |
729 |
> |
int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop, |
730 |
> |
reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); |
731 |
> |
numdiffs=numdiffs_r+numdiffs_l; |
732 |
> |
//convert num diffs to actual score |
733 |
> |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty); |
734 |
> |
if (editscript) |
735 |
> |
ed_script_rev->Append(ed_script_fwd); //combine the two extensions |
736 |
|
} |
737 |
< |
#endif |
738 |
< |
//TODO: only extend to the left if the right-extension is above the minimum score |
739 |
< |
//if (score<MIN_EXT_R_SCORE) { |
740 |
< |
// delete abmp; |
741 |
< |
// delete ed_script_fwd; |
742 |
< |
// return; |
743 |
< |
// } |
744 |
< |
// ---- extend to the left of anchor point now |
745 |
< |
if (editscript) |
746 |
< |
ed_script_rev=new GXEditScript(); |
747 |
< |
//int numdiffs_l = GXGreedyAlign(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
748 |
< |
// reward, penalty, &s_ext_l, &q_ext_l, abmp, ed_script_rev); |
749 |
< |
int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
750 |
< |
reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); |
751 |
< |
#ifdef GDEBUG |
752 |
< |
GMessage("rev numdiffs=%d (%d-%d vs %d-%d)\n", numdiffs_l, q_alnstart+1-q_ext_l, q_alnstart+1, s_alnstart+1-s_ext_l, s_alnstart+1); |
753 |
< |
GMessage("rev edit script: "); |
754 |
< |
if (ed_script_rev) { |
755 |
< |
printEditScript(ed_script_rev); |
756 |
< |
} |
757 |
< |
#endif |
758 |
< |
GXAlnInfo* alninfo=NULL; |
759 |
< |
// In basic case the greedy algorithm returns number of |
760 |
< |
// differences, hence we need to convert it to score |
695 |
< |
int numdiffs=numdiffs_r+numdiffs_l; |
696 |
< |
int retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty); |
697 |
< |
#ifdef GDEBUG |
698 |
< |
GMessage("Final score: %d\n",retscore); |
699 |
< |
#endif |
700 |
< |
if (ed_script_rev) |
701 |
< |
ed_script_rev->Append(ed_script_fwd); //combine the two extensions |
737 |
> |
else { |
738 |
> |
if (editscript) { |
739 |
> |
ed_script_fwd=new GXEditScript(); |
740 |
> |
} |
741 |
> |
int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop, |
742 |
> |
reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); |
743 |
> |
//check extension here and bail out if not a good right extension |
744 |
> |
//assuming s_max is really at the right end of s_seq |
745 |
> |
if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) { |
746 |
> |
delete ed_script_fwd; |
747 |
> |
if (freeAlnMem) delete gxmem; |
748 |
> |
return NULL; |
749 |
> |
} |
750 |
> |
if (editscript) |
751 |
> |
ed_script_rev=new GXEditScript(); |
752 |
> |
int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
753 |
> |
reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); |
754 |
> |
//convert num diffs to actual score |
755 |
> |
numdiffs=numdiffs_r+numdiffs_l; |
756 |
> |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty); |
757 |
> |
if (editscript) |
758 |
> |
ed_script_rev->Append(ed_script_fwd); //combine the two extensions |
759 |
> |
} |
760 |
> |
|
761 |
|
if (retscore>=MIN_GREEDY_SCORE) { |
762 |
|
alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r); |
763 |
|
int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r); |
771 |
|
#endif |
772 |
|
alninfo->editscript=ed_script_rev; |
773 |
|
alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1); |
715 |
– |
alninfo->alnmem=gxmem; |
774 |
|
} |
775 |
|
else { |
776 |
< |
delete gxmem; |
776 |
> |
if (freeAlnMem) delete gxmem; |
777 |
|
delete ed_script_rev; |
778 |
|
delete alninfo; |
779 |
|
alninfo=NULL; |