ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 103 | Line 103
103  
104   uint16 get6mer(char* p) {
105    uint16 r=gdna2bit(p,3);
106 <  if (*p) {
107 <   uint16 v=gdna2bit(p,3);
108 <   r |= (v<<8);
109 <   }
106 >  r <<= 6;
107 >  r |= gdna2bit(p,3);
108    return r;
109   }
110  
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128 +
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
131  
# Line 189 | Line 205
205  
206   int GXGreedyExtend(const char* seq1, int len1,
207                         const char* seq2, int len2,
208 <                       bool reverse, int xdrop_threshold,
193 <                       int match_cost, int mismatch_cost,
208 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209                         int& seq1_align_len, int& seq2_align_len,
210                         CGreedyAlignData& aux_data,
211                         GXEditScript *edit_block) {
# Line 259 | Line 274
274         xdrop_offset gives the distance backwards in the score
275         array to look */
276  
277 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
263 <                           (match_cost + mismatch_cost) + 1;
277 >    xdrop_offset = aux_data.xdrop_ofs;
278  
279      // find the offset of the first mismatch between seq1 and seq2
280  
# Line 311 | Line 325
325      // fill in the initial offsets of the distance matrix
326  
327      last_seq2_off[0][diag_origin] = seq1_index;
328 <    max_score[0] = seq1_index * match_cost;
328 >    max_score[0] = seq1_index * aux_data.match_reward;
329      diag_lower = diag_origin - 1;
330      diag_upper = diag_origin + 1;
331      end1_reached = end2_reached = false;
# Line 337 | Line 351
351          // compute the score for distance d corresponding to the X-dropoff criterion
352  
353          xdrop_score = max_score[d - xdrop_offset] +
354 <                      (match_cost + mismatch_cost) * d - xdrop_threshold;
355 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
354 >                      (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 >        xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356  
357          // for each diagonal of interest
358          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
# Line 415 | Line 429
429          }  // end loop over diagonals
430  
431          // compute the maximum score possible for distance d
432 <        curr_score = curr_extent * (match_cost / 2) -
433 <                        d * (match_cost + mismatch_cost);
432 >        curr_score = curr_extent * (aux_data.match_reward / 2) -
433 >                        d * (aux_data.match_reward + aux_data.mismatch_penalty);
434          // if this is the best score seen so far, update the
435          // statistics of the best alignment
436          if (curr_score > max_score[d - 1]) {
# Line 449 | Line 463
463          if (edit_block == NULL) {
464             // if no traceback is specified, the next row of
465             //   last_seq2_off can reuse previously allocated memory
466 <           //TODO FIXME The following assumes two arrays of
466 >           //WARNING The following assumes two arrays of
467             //  at least max_dist+4 int's have already been allocated
468              last_seq2_off[d + 1] = last_seq2_off[d - 1];
469              }
# Line 458 | Line 472
472              // so a new row must be allocated
473              last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
474              // move the origin for this row backwards
475 <            //TODO FIXME: dubious pointer arithmetic ?!
475 >            // dubious pointer arithmetic ?!
476              //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
477              }
478      }   // end loop over distinct distances
# Line 588 | Line 602
602   const int a_dropoff_score=7;
603   const int a_min_score=12; //at least 6 bases full match
604  
605 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
605 > // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606   //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607   //check minimum score and
608 < //for 3' adapter trimming:
608 > //for 3' adaptor trimming:
609   //     require that the right end of the alignment for either the adaptor OR the read must be
610   //     < 3 distance from its right end
611 < // for 5' adapter trimming:
611 > // for 5' adaptor trimming:
612   //     require that the left end of the alignment for either the adaptor OR the read must
613   //     be at coordinate < 3 from start
614  
# Line 667 | Line 681
681    int mmovh5=(a_l<b_l)? a_l : b_l;
682    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683       if (a_l<a_ovh3) {
684 <        //adapter closer to the left end (typical for 5' adapter)
684 >        //adaptor closer to the left end (typical for 5' adaptor)
685          l5=a_r+1;
686          l3=alen-1;
687          }
688        else {
689 <        //adapter matching at the right end (typical for 3' adapter)
689 >        //adaptor matching at the right end (typical for 3' adaptor)
690          l5=0;
691          l3=a_l-1;
692          }
# Line 684 | Line 698
698    return false;
699   }
700  
701 <
702 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
703 <        GVec<uint16> amers[], const char* seqb, int b_len) {
704 < //overlap of right (3') end of seqb
705 < //hash the first 12 bases of seqa:
692 < int aimin=0;
693 < int aimax=GMIN(9,(a_len-4));
694 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
695 < int bimax=b_len-4;
696 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
697 < int b_maxlen=b_len; //number of cols in the diagonal matrix
698 < GXSeedTable gx(a_maxlen, b_maxlen);
699 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
700 < for (int ai=aimin;ai<=aimax;ai++) {
701 <    int32* av=(int32 *)(&seqa[ai]);
702 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
703 <    for (int bi=bimin;bi<=bimax;bi++) {
704 <       if (gx.x(ai,bi))
705 <            continue; //already have a previous seed covering this region of this diagonal
706 <       int32* bv=(int32 *)(&seqb[bi]);
707 <       if (*av == *bv) {
708 <          //word match
709 <          //see if we can extend to the right
710 <            gx.x(ai+1,bi+1)=1;
711 <            gx.x(ai+2,bi+2)=1;
712 <            gx.x(ai+3,bi+3)=1;
713 <          int aix=ai+4;
714 <          int bix=bi+4;
715 <          int len=4;
716 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
717 <               {
718 <                 gx.x(aix,bix)=1;
719 <                 aix++;bix++;
720 <                 len++;
721 <               }
722 <          if (len>7) {
723 <              //heuristics: very likely the best we can get
724 <              int qlen=len;
725 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
726 <                aix++;
727 <                bix++;
728 <                qlen++;
729 <                }
730 <              if (qlen>10) { //found a quick match shortcut
731 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
732 <                 return diagstrips;
733 <                 }
734 <              }
735 <          GXSeed* newseed=new GXSeed(ai,bi,len);
736 <          seeds.Add(newseed);
737 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
738 <          //special last resort terminal match to be used if no better alignment is there
739 <          if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
740 <          }
741 <       }
742 <    }//for each 4-mer at the beginning of seqa
743 < for (int i=0;i<diagstrips->Count();i++) {
744 <    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
745 <    }
746 < diagstrips->setSorted(true); //sort by score
747 < return diagstrips;
748 < }
749 <
750 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
751 <        GVec<uint16> amers[], const char* seqb, int b_len) {
752 < //overlap of left (5') end of seqb
753 < //hash the last 12 bases of seqa:
754 < int aimin=GMAX(0,(a_len-16));
755 < int aimax=a_len-4;
756 < int bimin=0;
757 < int bimax=GMIN((a_len-2), (b_len-4));
758 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
759 < int b_maxlen=b_len; //number of cols in the diagonal matrix
701 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 > int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 > int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 > int b_end = (trim_intent==galn_TrimLeft) ?  bimax : sd.blen-6;
706   //gx.init(a_maxlen, b_maxlen);
707 < GXSeedTable gx(a_maxlen, b_maxlen);
708 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
709 < for (int ai=aimin;ai<=aimax;ai++) {
710 <    int32* av=(int32 *)(&seqa[ai]);
711 <    for (int bi=bimin;bi<=bimax;bi++) {
712 <       if (gx.x(ai,bi))
713 <          continue; //already have a previous seed covering this region of this diagonal
714 <       int32* bv=(int32 *)(&seqb[bi]);
715 <       if (*av == *bv) {
716 <          //word match
717 <          //see if we can extend to the right
718 <        gx.x(ai+1,bi+1)=1;
719 <        gx.x(ai+2,bi+2)=1;
720 <        gx.x(ai+3,bi+3)=1;
721 <          int aix=ai+4;
722 <          int bix=bi+4;
723 <          int len=4;
724 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
725 <               {
726 <                 gx.x(aix,bix)=1;
727 <                 aix++;bix++;
728 <                 len++;
729 <               }
730 <          if (len>7) {
731 <              //heuristics: very likely the best we can get
732 <              int qlen=len;
733 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
734 <                aix++;
735 <                bix++;
736 <                qlen++;
737 <                }
738 <              if (qlen>10) { //found a quick match shortcut
739 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
740 <                 return diagstrips;
741 <                 }
742 <              }
743 <          GXSeed* newseed=new GXSeed(ai,bi,len);
744 <          seeds.Add(newseed);
745 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
746 <          //special last resort terminal match to be used if no better alignment is there
747 <          if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
748 <          }
749 <       }
750 <    }//for each 4-mer at the end of seqa
707 > GXSeedTable gx(sd.alen, sd.blen);
708 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 > for (int bi=b_start;bi<=b_end;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 >               continue; //improper positioning for 5' trimming
723 >         if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 >               continue; //improper positioning for 5' trimming
725 >
726 >         //TODO: there could be Ns in this seed, should we count/adjust score?
727 >         for (int i=0;i<6;i++)
728 >            gx.x(ai+i,bi+i)=1;
729 >         //see if we can extend to the right
730 >         int aix=ai+6;
731 >         int bix=bi+6;
732 >         int len=6;
733 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 >                 gx.x(aix,bix)=1;
735 >                 aix++;bix++;
736 >                 len++;
737 >                 }
738 >         if (len>sd.amlen) {
739 >                //heuristics: very likely the best we can get
740 >                //quick match shortcut
741 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
742 >                return diagstrips;
743 >                }
744 >         if (bi>bimax && bi<bimin && len<9)
745 >                 //ignore mid-sequence seeds that are not high scoring
746 >             continue;
747 >
748 >         GXSeed* newseed=new GXSeed(ai,bi,len);
749 >         seeds.Add(newseed);
750 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 >     //keep last resort terminal match to be used if no better alignment is there
752 >     if (bi<2 && ai+len>=sd.alen-1 &&
753 >                 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754 >                      diagstrips->tmatch_l=newseed;
755 >     //collectSeeds_R:
756 >         if (ai<2 && bi+len>sd.blen-2 &&
757 >                 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758 >                  diagstrips->tmatch_r=newseed;
759 >     }
760 > } //for each 6-mer of the read
761   for (int i=0;i<diagstrips->Count();i++) {
762      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763      }
# Line 819 | Line 775
775    else return (s2->len-s1->len);
776   }
777  
778 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
779 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780 +  GXSeed* s1=(GXSeed*)p1;
781 +  GXSeed* s2=(GXSeed*)p2;
782 +  if (s1->len==s2->len) {
783 +     return (s2->b_ofs-s1->b_ofs);
784 +     }
785 +  else return (s2->len-s1->len);
786 + }
787 +
788 +
789   int cmpSeedDiag(const pointer p1, const pointer p2) {
790    GXSeed* s1=(GXSeed*)p1;
791    GXSeed* s2=(GXSeed*)p2;
# Line 850 | Line 817
817    s_alnstart--;
818    if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819      GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 <  if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 <    GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
820 >  /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 >    GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822 >           q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823 >           */
824    int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825    const char* q=q_seq+q_alnstart;
826    int q_avail=q_max-q_alnstart;
827    const char* s=s_seq+s_alnstart;
828    int s_avail=s_max-s_alnstart;
829    if (penalty<0) penalty=-penalty;
861  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
830    GXAlnInfo* alninfo=NULL;
831    bool freeAlnMem=(gxmem==NULL);
832    if (freeAlnMem) {
833       gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 +     reward=gxmem->match_reward;
835 +     penalty=gxmem->mismatch_penalty;
836 +     xdrop=gxmem->x_drop;
837       }
838 <  else gxmem->reset();
838 >   else
839 >         gxmem->reset();
840 >  int minMatch= trim ? trim->minMatch : 6;
841 >  int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842    int retscore = 0;
843    int numdiffs = 0;
844    if (trim!=NULL && trim->type==galn_TrimLeft) {
845 +    //intent: trimming the left side
846      if (editscript)
847         ed_script_rev=new GXEditScript();
848  
849 <    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
850 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
849 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 >                    s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851      //check this extension here and bail out if it's not a good extension
852 <    if (s_alnstart+1-s_ext_l>trim->boundary) {
852 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 >        q_alnstart+1-q_ext_l>1 &&
854 >        s_alnstart+1-s_ext_l>trim->l_boundary) {
855        delete ed_script_rev;
856        if (freeAlnMem) delete gxmem;
857        return NULL;
# Line 882 | Line 859
859  
860      if (editscript)
861         ed_script_fwd=new GXEditScript();
862 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
863 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
862 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864      numdiffs=numdiffs_r+numdiffs_l;
865      //convert num diffs to actual score
866 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
866 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867      if (editscript)
868         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869      }
# Line 894 | Line 871
871      if (editscript) {
872         ed_script_fwd=new GXEditScript();
873         }
874 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
875 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
874 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876      //check extension here and bail out if not a good right extension
877      //assuming s_max is really at the right end of s_seq
878      if (trim!=NULL && trim->type==galn_TrimRight &&
879 <           s_alnstart+s_ext_r<trim->boundary) {
879 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 >            q_alnstart+q_ext_r<q_max-2 &&
881 >            s_alnstart+s_ext_r<trim->r_boundary) {
882        delete ed_script_fwd;
883        if (freeAlnMem) delete gxmem;
884        return NULL;
885        }
886      if (editscript)
887         ed_script_rev=new GXEditScript();
888 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
888 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 >                                  s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //convert num diffs to actual score
891      numdiffs=numdiffs_r+numdiffs_l;
892 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
892 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893      if (editscript)
894         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895    }
917
896    if (retscore>=MIN_GREEDY_SCORE) {
897 <    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
897 >    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899      alninfo->score=retscore;
900 +    if (gxmem->scaled) alninfo->score >>= 1;
901      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902   #ifdef GDEBUG
903      //if (ed_script_rev) {
# Line 930 | Line 909
909      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910      }
911    else {
912 <    if (freeAlnMem) delete gxmem;
912 >    //if (freeAlnMem) delete gxmem;
913      delete ed_script_rev;
914      delete alninfo;
915      alninfo=NULL;
916      }
917 +  if (freeAlnMem) delete gxmem;
918    delete ed_script_fwd;
919    return alninfo;
920   }
# Line 943 | Line 923
923                         const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924                         CAlnTrim* trim, bool editscript) {
925   int reward=2;
926 < int penalty=3;
927 < int xdrop=8;
926 > int penalty=10;
927 > int xdrop=32;
928   if (gxmem) {
929     reward=gxmem->match_reward;
930     penalty=gxmem->mismatch_penalty;
# Line 954 | Line 934
934       reward, penalty, xdrop, gxmem, trim, editscript);
935   }
936  
937 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
938 <                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
937 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 >                                     CGreedyAlignData* gxmem, int min_pid) {
939    bool editscript=false;
940    #ifdef GDEBUG
941     editscript=true;
942 <  // GMessage("==========> matching Right (3') end ========\n");
942 >   if (trim_type==galn_TrimLeft) {
943 >         GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944 >     }
945 >   else if (trim_type==galn_TrimRight) {
946 >     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947 >     }
948 >   else if (trim_type==galn_TrimEither) {
949 >     GMessage("==========> searching  both ends : %s\n", sd.aseq);
950 >     }
951    #endif
952 <
965 <  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
952 >  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953    GList<GXSeed> rseeds(true,true,false);
954 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
954 >  GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956    //did we find a shortcut?
957    if (alnbands->qmatch) {
958 <    //#ifdef GDEBUG
959 <    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
960 <    //#endif
958 >    #ifdef GDEBUG
959 >     GMessage("::: Found a quick long match at %d, len %d\n",
960 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961 >    #endif
962      anchor_seeds.Add(alnbands->qmatch);
963      }
964    else {
965      int max_top_bands=5;
966      int top_band_count=0;
967      for (int b=0;b<alnbands->Count();b++) {
968 <       if (alnbands->Get(b)->score<4) break;
968 >       if (alnbands->Get(b)->score<6) break;
969         //#ifdef GDEBUG
970         //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971         //#endif
# Line 985 | Line 973
973         GXBand& band=*(alnbands->Get(b));
974         band.seeds.setSorted(cmpSeedScore);
975         anchor_seeds.Add(band.seeds.First());
976 <       band.tested=true;
976 >       //band.tested=true;
977         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978         }
979      //#ifdef GDEBUG
980      //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981      //#endif
982      }
995
983    GList<GXAlnInfo> galns(true,true,false);
984    for (int i=0;i<anchor_seeds.Count();i++) {
985      GXSeed& aseed=*anchor_seeds[i];
986      int a1=aseed.a_ofs+(aseed.len>>1)+1;
987      int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
989 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
990 <    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
988 >    trimInfo.seedlen=aseed.len;
989 > #ifdef GDEBUG
990 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991 >           aseed.len);
992 > #endif
993 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996               galns.AddIfNew(alninfo, true);
997          else delete alninfo;
998      }
999 <  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1008 <  //we should also try some seeds closer to the 3' end
999 >
1000    if (galns.Count()==0) {
1001 <    anchor_seeds.Clear();
1002 <    alnbands->setSorted(cmpDiagBands_R);
1003 <    int max_top_bands=4;
1004 <    int top_band_count=0;
1005 <    //#ifdef GDEBUG
1006 <    //GMessage(":::>> Retrying adjusting sort order.\n");
1007 <    //#endif
1008 <    if (alnbands->tmatch) {
1009 <      //anchor_seeds.setSorted(false);
1010 <      anchor_seeds.Add(alnbands->tmatch);
1011 <      }
1012 <    for (int b=0;b<alnbands->Count();b++) {
1013 <       if (alnbands->Get(b)->score<4) break;
1014 <       //#ifdef GDEBUG
1015 <       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1016 <       //#endif
1017 <       if (alnbands->Get(b)->tested) continue;
1018 <       top_band_count++;
1019 <       GXBand& band=*(alnbands->Get(b));
1020 <       band.seeds.setSorted(cmpSeedScore);
1021 <       anchor_seeds.Add(band.seeds.First());
1022 <       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1023 <       }
1024 <    //#ifdef GDEBUG
1025 <    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1026 <    //#endif
1036 <    for (int i=0;i<anchor_seeds.Count();i++) {
1037 <      GXSeed& aseed=*anchor_seeds[i];
1038 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1039 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1040 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1041 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1042 <      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1043 <               galns.AddIfNew(alninfo, true);
1044 <          else delete alninfo;
1001 >         //last resort: look for weaker terminal seeds
1002 >          GPVec<GXSeed> tmatches(2,false);
1003 >          if (trim_type!=galn_TrimRight) {
1004 >                 if (alnbands->tmatch_l)
1005 >                    tmatches.Add(alnbands->tmatch_l);
1006 >             }
1007 >          if (trim_type!=galn_TrimLeft) {
1008 >                 if (alnbands->tmatch_r)
1009 >                    tmatches.Add(alnbands->tmatch_r);
1010 >             }
1011 >          for (int i=0;i<tmatches.Count();i++) {
1012 >                GXSeed& aseed=*tmatches[i];
1013 >                int halfseed=aseed.len>>1;
1014 >                int a1=aseed.a_ofs+halfseed+1;
1015 >                int a2=aseed.b_ofs+halfseed+1;
1016 >                trimInfo.seedlen=aseed.len;
1017 > #ifdef GDEBUG
1018 >    GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 >           aseed.len);
1020 > #endif
1021 >        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 >        if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 >                 galns.AddIfNew(alninfo, true);
1025 >             else delete alninfo;
1026 >        }//for each terminal seed
1027        }
1028 <    }
1047 <  //---- done
1028 >  //---- found all alignments
1029    delete alnbands;
1049  if (galns.Count()) {
1050    GXAlnInfo* bestaln=galns.Shift();
1051    #ifdef GDEBUG
1052      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1053          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1054      if (bestaln->gapinfo!=NULL) {
1055        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1056        }
1057    #endif
1058
1059    return bestaln;
1060    }
1061  else return NULL;
1062 }
1063
1064 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
1065                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1066  bool editscript=false;
1067  #ifdef GDEBUG
1068   editscript=true;
1069   //GMessage("==========> matching Left (5') end ========\n");
1070  #endif
1071  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1072  GList<GXSeed> rseeds(true,true,false);
1073  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1074  GList<GXAlnInfo> galns(true,true,false);
1075  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1076
1077  if (alnbands->qmatch) {
1078    //#ifdef GDEBUG
1079    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1080    //#endif
1081    anchor_seeds.Add(alnbands->qmatch);
1082    }
1083  else {
1084    int max_top_bands=5;
1085    int top_band_count=0;
1086    for (int b=0;b<alnbands->Count();b++) {
1087       if (alnbands->Get(b)->score<4) break;
1088       //#ifdef GDEBUG
1089       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1090       //#endif
1091       top_band_count++;
1092       GXBand& band=*(alnbands->Get(b));
1093       band.seeds.setSorted(cmpSeedScore);
1094       anchor_seeds.Add(band.seeds.First());
1095       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1096       }
1097    //#ifdef GDEBUG
1098    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1099    //#endif
1100    }
1101  for (int i=0;i<anchor_seeds.Count();i++) {
1102    GXSeed& aseed=*anchor_seeds[i];
1103    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107    if (alninfo && alninfo->pid>=min_pid
1108           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1109            galns.AddIfNew(alninfo, true);
1110       else delete alninfo;
1111    }
1112  if (galns.Count()==0 && alnbands->tmatch) {
1113      GXSeed& aseed=*alnbands->tmatch;
1114      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1115      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1116      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1117                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1118      if (alninfo) galns.Add(alninfo);
1119      }
1120  #ifdef GDEBUG
1030    /*
1031 +  #ifdef GDEBUG
1032 +  //print all valid alignments found
1033    for (int i=0;i<galns.Count();i++) {
1034      GXAlnInfo* alninfo=galns[i];
1035      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 1128 | Line 1039
1039        alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1040        }
1041      }
1131  */
1042    #endif
1043 <  //---- done
1134 <  delete alnbands;
1043 >  */
1044    if (galns.Count()) {
1045      GXAlnInfo* bestaln=galns.Shift();
1046      #ifdef GDEBUG
1047        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049        if (bestaln->gapinfo!=NULL) {
1050 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1050 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051          }
1052      #endif
1053      return bestaln;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines