ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 53 | Line 53
53    if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54        *row1 = flast_d[d-1][diag-1];
55        return diag-1;
56 <      }
56 >      }
57    if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58        *row1 = flast_d[d-1][diag];
59        return diag;
# Line 65 | Line 65
65   void GapXEditScript::print() { //debug
66        GapXEditScript* p=this;
67        do {
68 <        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 <        } while ((p=p->next)!=NULL);
68 >        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 >        } while ((p=p->next)!=NULL);
70        GMessage("\n");
71      }
72  
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 101 | Line 101
101   }
102  
103  
104 + uint16 get6mer(char* p) {
105 +  uint16 r=gdna2bit(p,3);
106 +  r <<= 6;
107 +  r |= gdna2bit(p,3);
108 +  return r;
109 + }
110 +
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128  
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
# Line 129 | Line 153
153               //    }
154        }
155      else { //forward lookup
156 <             while (seq1_index < len1 && seq2_index < len2 &&
156 >             while (seq1_index < len1 && seq2_index < len2 &&
157                     //seq1[seq1_index] < 4 &&
158                     seq1[seq1_index] == seq2[seq2_index]) {
159                  ++seq1_index;
# Line 184 | Line 208
208                         bool reverse, int xdrop_threshold,
209                         int match_cost, int mismatch_cost,
210                         int& seq1_align_len, int& seq2_align_len,
211 <                       SGreedyAlignMem& aux_data,
211 >                       CGreedyAlignData& aux_data,
212                         GXEditScript *edit_block) {
213                         //GapPrelimEditBlock *edit_block,
214                         //bool& fence_hit, SGreedySeed *seed) {
# Line 204 | Line 228
228      int longest_match_run;
229      bool end1_reached, end2_reached;
230      GXMemPool* mem_pool;
231 <
231 >
232      /* ordinary dynamic programming alignment, for each offset
233         in seq1, walks through offsets in seq2 until an X-dropoff
234 <       test fails, saving the best score encountered along
234 >       test fails, saving the best score encountered along
235         the way. Instead of score, this code tracks the 'distance'
236         (number of mismatches plus number of gaps) between seq1
237         and seq2. Instead of walking through sequence offsets, it
238         walks through diagonals that can achieve a given distance.
239 <    
239 >
240         Note that in what follows, the numbering of diagonals implies
241 <       a dot matrix where increasing seq1 offsets go to the right on
241 >       a dot matrix where increasing seq1 offsets go to the right on
242         the x axis, and increasing seq2 offsets go up the y axis.
243         The gapped alignment thus proceeds up and to the right in
244         the graph, and diagonals are numbered increasing to the right */
# Line 223 | Line 247
247      best_diag = 0;
248  
249      /* set the number of distinct distances the algorithm will
250 <       examine in the search for an optimal alignment. The
251 <       heuristic worst-case running time of the algorithm is
250 >       examine in the search for an optimal alignment. The
251 >       heuristic worst-case running time of the algorithm is
252         O(max_dist**2 + (len1+len2)); for sequences which are
253         very similar, the average running time will be sig-
254         nificantly better than this */
255  
256      max_dist = GMIN(GREEDY_MAX_COST,
257 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
257 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
258  
259      /* the main loop assumes that the index of all diagonals is
260 <       biased to lie in the middle of allocated bookkeeping
260 >       biased to lie in the middle of allocated bookkeeping
261         structures */
262  
263      diag_origin = max_dist + 2;
# Line 251 | Line 275
275         xdrop_offset gives the distance backwards in the score
276         array to look */
277  
278 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
278 >    xdrop_offset = (xdrop_threshold + match_cost / 2) /
279                             (match_cost + mismatch_cost) + 1;
280 <    
280 >
281      // find the offset of the first mismatch between seq1 and seq2
282  
283      index = s_FindFirstMismatch(seq1, len1,  seq2, len2, 0, 0, reverse);
# Line 277 | Line 301
301          if (edit_block != NULL)
302              //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
303              edit_block->opRep(index);
304 <        return 0;
304 >        return 0;
305      }
306  
307      // set up the memory pool
# Line 287 | Line 311
311        }
312      else if (mem_pool == NULL) {
313         aux_data.space = mem_pool = new GXMemPool();
314 <    }
314 >    }
315      else {
316          mem_pool->refresh();
317      }
318 <    
318 >
319      /* set up the array of per-distance maximum scores. There
320         are max_diags + xdrop_offset distances to track, the first
321         xdrop_offset of which are 0 */
# Line 299 | Line 323
323      max_score = aux_data.max_score + xdrop_offset;
324      for (index = 0; index < xdrop_offset; index++)
325          aux_data.max_score[index] = 0;
326 <    
326 >
327      // fill in the initial offsets of the distance matrix
328  
329      last_seq2_off[0][diag_origin] = seq1_index;
# Line 328 | Line 352
352  
353          // compute the score for distance d corresponding to the X-dropoff criterion
354  
355 <        xdrop_score = max_score[d - xdrop_offset] +
355 >        xdrop_score = max_score[d - xdrop_offset] +
356                        (match_cost + mismatch_cost) * d - xdrop_threshold;
357 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
357 >        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
358  
359          // for each diagonal of interest
360          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
361              /* find the largest offset into seq2 that increases
362                 the distance from d-1 to d (i.e. keeps the alignment
363 <               from getting worse for as long as possible), then
363 >               from getting worse for as long as possible), then
364                 choose the offset into seq1 that will keep the
365 <               resulting diagonal fixed at k
366 <            
365 >               resulting diagonal fixed at k
366 >
367                 Note that this requires kInvalidOffset+1 to be smaller
368                 than any valid offset into seq2, i.e. to be negative */
369  
# Line 360 | Line 384
384                  continue;
385              }
386              diag_upper = k;
387 <            
388 <            /* slide down diagonal k until a mismatch
387 >
388 >            /* slide down diagonal k until a mismatch
389                 occurs. As long as only matches are encountered,
390                 the current distance d will not change */
391  
# Line 397 | Line 421
421                 can each be of size at most max_diags+2 */
422  
423              if (seq2_index == len2) {
424 <                diag_lower = k + 1;
424 >                diag_lower = k + 1;
425                  end2_reached = true;
426              }
427              if (seq1_index == len1) {
428 <                diag_upper = k - 1;
428 >                diag_upper = k - 1;
429                  end1_reached = true;
430              }
431          }  // end loop over diagonals
432  
433          // compute the maximum score possible for distance d
434 <        curr_score = curr_extent * (match_cost / 2) -
434 >        curr_score = curr_extent * (match_cost / 2) -
435                          d * (match_cost + mismatch_cost);
436          // if this is the best score seen so far, update the
437          // statistics of the best alignment
# Line 417 | Line 441
441              best_diag = curr_diag;
442              seq2_align_len = curr_seq2_index;
443              seq1_align_len = curr_seq2_index + best_diag - diag_origin;
444 <        }
444 >        }
445          else {
446              max_score[d] = max_score[d - 1];
447          }
# Line 428 | Line 452
452          if (diag_lower > diag_upper)
453              break;
454  
455 <        /* set up for the next distance to examine. Because the
456 <           bounds increase by at most one for each distance,
457 <           diag_lower and diag_upper can each be of size at
455 >        /* set up for the next distance to examine. Because the
456 >           bounds increase by at most one for each distance,
457 >           diag_lower and diag_upper can each be of size at
458             most max_diags+2 */
459  
460          if (!end2_reached)
461 <            diag_lower--;
461 >            diag_lower--;
462          if (!end1_reached)
463              diag_upper++;
464  
# Line 455 | Line 479
479              }
480      }   // end loop over distinct distances
481  
482 <    
482 >
483      if (edit_block == NULL)
484          return best_dist;
485  
486      //----  perform traceback
487 <    d = best_dist;
487 >    d = best_dist;
488      seq1_index = seq1_align_len;
489      seq2_index = seq2_align_len;
490      // for all positive distances
# Line 476 | Line 500
500             traceback operation. best_diag starts off with the
501             value computed during the alignment process */
502  
503 <        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
503 >        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
504                                             best_diag, &new_seq2_index);
505  
506          if (new_diag == best_diag) {
# Line 484 | Line 508
508              if (seq2_index - new_seq2_index > 0) {
509                    edit_block->opRep(seq2_index - new_seq2_index);
510              }
511 <        }
511 >        }
512          else if (new_diag < best_diag) {
513              // smaller diagonal: issue a group of substitutions
514              //   and then a gap in seq2 */
# Line 493 | Line 517
517              }
518              //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
519              edit_block->opIns(1);
520 <        }
520 >        }
521          else {
522              // larger diagonal: issue a group of substitutions
523              //   and then a gap in seq1
# Line 502 | Line 526
526              }
527              edit_block->opDel(1);
528          }
529 <        d--;
530 <        best_diag = new_diag;
531 <        seq2_index = new_seq2_index;
529 >        d--;
530 >        best_diag = new_diag;
531 >        seq2_index = new_seq2_index;
532      }
533   //done:
534      // handle the final group of substitutions back to distance zero,
# Line 526 | Line 550
550        unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
551        if (op_type == 3)
552           GError("Error: printEditScript encountered op_type 3 ?!\n");
553 <      GMessage("%d%c ", num, xgapcodes[op_type]);
553 >      GMessage("%d%c ", num, xgapcodes[op_type]);
554        }
555      GMessage("\n");
556    }
# Line 536 | Line 560
560   int q_max=strlen(q_seq); //query
561   int s_max=strlen(s_seq); //subj
562   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
563 <          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
563 >          s_seq,  s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
564   }
565  
566   struct GXSeedTable {
# Line 575 | Line 599
599  
600   };
601  
602 <
603 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
604 < //overlap of right (3') end of seqb
605 < //hash the first 12 bases of seqa:
606 < int aimin=0;
607 < int aimax=GMIN(9,(a_len-4));
608 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
609 < int bimax=b_len-4;
610 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
611 < int b_maxlen=b_len; //number of cols in the diagonal matrix
612 < GXSeedTable gx(a_maxlen, b_maxlen);
613 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
614 < for (int ai=aimin;ai<=aimax;ai++) {
615 <    int32* av=(int32 *)(&seqa[ai]);
616 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
617 <    for (int bi=bimin;bi<=bimax;bi++) {
618 <       if (gx.x(ai,bi))
619 <            continue; //already have a previous seed covering this region of this diagonal
620 <       int32* bv=(int32 *)(&seqb[bi]);
621 <       if (*av == *bv) {
622 <          //word match
623 <          //see if we can extend to the right
624 <            gx.x(ai+1,bi+1)=1;
625 <            gx.x(ai+2,bi+2)=1;
626 <            gx.x(ai+3,bi+3)=1;
627 <          int aix=ai+4;
628 <          int bix=bi+4;
629 <          int len=4;
630 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
631 <               {
632 <                 gx.x(aix,bix)=1;
633 <                 aix++;bix++;
634 <                 len++;
635 <               }
636 <          GXSeed* newseed=new GXSeed(ai,bi,len);
637 <          seeds.Add(newseed);
638 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
639 <          }
602 > const int a_m_score=2; //match score
603 > const int a_mis_score=-3; //mismatch
604 > const int a_dropoff_score=7;
605 > const int a_min_score=12; //at least 6 bases full match
606 >
607 > // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
608 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
609 > //check minimum score and
610 > //for 3' adapter trimming:
611 > //     require that the right end of the alignment for either the adaptor OR the read must be
612 > //     < 3 distance from its right end
613 > // for 5' adapter trimming:
614 > //     require that the left end of the alignment for either the adaptor OR the read must
615 > //     be at coordinate < 3 from start
616 >
617 > bool extendUngapped(const char* a, int alen, int ai,
618 >                 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
619 > //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
620 > //if (debug) {
621 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
622 > //  }
623 > int a_l=ai; //alignment coordinates on a
624 > int a_r=ai+mlen-1;
625 > int b_l=bi; //alignment coordinates on b
626 > int b_r=bi+mlen-1;
627 > int ai_maxscore=ai;
628 > int bi_maxscore=bi;
629 > int score=mlen*a_m_score;
630 > int maxscore=score;
631 > int mism5score=a_mis_score;
632 > if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
633 > //try to extend to the left first, if possible
634 > while (ai>0 && bi>0) {
635 >   ai--;
636 >   bi--;
637 >   score+= (a[ai]==b[bi])? a_m_score : mism5score;
638 >   if (score>maxscore) {
639 >       ai_maxscore=ai;
640 >       bi_maxscore=bi;
641 >       maxscore=score;
642         }
643 <    }//for each 4-mer at the beginning of seqa
643 >     else if (maxscore-score>a_dropoff_score) break;
644 >   }
645 > a_l=ai_maxscore;
646 > b_l=bi_maxscore;
647 > //now extend to the right
648 > ai_maxscore=a_r;
649 > bi_maxscore=b_r;
650 > ai=a_r;
651 > bi=b_r;
652 > score=maxscore;
653 > //sometimes there are extra As at the end of the read, ignore those
654 > if (a[alen-2]=='A' && a[alen-1]=='A') {
655 >    alen-=2;
656 >    while (a[alen-1]=='A' && alen>ai) alen--;
657 >    }
658 > while (ai<alen-1 && bi<blen-1) {
659 >   ai++;
660 >   bi++;
661 >   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
662 >   if (a[ai]==b[bi]) { //match
663 >      score+=a_m_score;
664 >      if (ai>=alen-2) {
665 >           score+=a_m_score-(alen-ai-1);
666 >           }
667 >      }
668 >    else { //mismatch
669 >      score+=a_mis_score;
670 >      }
671 >   if (score>maxscore) {
672 >       ai_maxscore=ai;
673 >       bi_maxscore=bi;
674 >       maxscore=score;
675 >       }
676 >     else if (maxscore-score>a_dropoff_score) break;
677 >   }
678 >  a_r=ai_maxscore;
679 >  b_r=bi_maxscore;
680 >  int a_ovh3=alen-a_r-1;
681 >  int b_ovh3=blen-b_r-1;
682 >  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
683 >  int mmovh5=(a_l<b_l)? a_l : b_l;
684 >  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
685 >     if (a_l<a_ovh3) {
686 >        //adapter closer to the left end (typical for 5' adapter)
687 >        l5=a_r+1;
688 >        l3=alen-1;
689 >        }
690 >      else {
691 >        //adapter matching at the right end (typical for 3' adapter)
692 >        l5=0;
693 >        l3=a_l-1;
694 >        }
695 >     return true;
696 >     }
697 >  //do not trim:
698 >  l5=0;
699 >  l3=alen-1;
700 >  return false;
701 > }
702 >
703 >
704 > GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
705 >        GVec<uint16>* amers[], const char* seqb, int b_len) {
706 > int bimin=GMAX(0,(b_len-a_len-6));
707 > GXSeedTable gx(a_len, b_len);
708 > GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
709 > for (int bi=0;bi<=b_len-6;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*)&seqb[bi]);
712 >   GVec<uint16>* alocs = amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         for (int i=0;i<6;i++)
722 >            gx.x(ai+i,bi+i)=1;
723 >         //see if we can extend to the right
724 >         int aix=ai+6;
725 >         int bix=bi+6;
726 >         int len=6;
727 >         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
728 >                 gx.x(aix,bix)=1;
729 >                 aix++;bix++;
730 >                 len++;
731 >                 }
732 >        /*if (len>22) {
733 >                //heuristics: very likely the best we can get
734 >                //quick match shortcut
735 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
736 >                return diagstrips;
737 >                } */
738 >         if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
739 >         GXSeed* newseed=new GXSeed(ai,bi,len);
740 >         seeds.Add(newseed);
741 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
742 >         //special last resort terminal match to be used if no better alignment is there
743 >         if (ai<2 && bi+len>b_len-2 &&
744 >                 (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
745 >         }
746 >  }
747   for (int i=0;i<diagstrips->Count();i++) {
748      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
749      }
# Line 622 | Line 751
751   return diagstrips;
752   }
753  
754 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
754 > GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
755 >        GVec<uint16>* amers[], const char* seqb, int b_len) {
756   //overlap of left (5') end of seqb
757   //hash the last 12 bases of seqa:
758 < int aimin=GMAX(0,(a_len-16));
629 < int aimax=a_len-4;
630 < int bimin=0;
631 < int bimax=GMIN((a_len-2), (b_len-4));
632 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
633 < int b_maxlen=b_len; //number of cols in the diagonal matrix
758 > int bimax=GMIN((a_len+2), (b_len-6));
759   //gx.init(a_maxlen, b_maxlen);
760 < GXSeedTable gx(a_maxlen, b_maxlen);
761 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
762 < for (int ai=aimin;ai<=aimax;ai++) {
763 <    int32* av=(int32 *)(&seqa[ai]);
764 <    for (int bi=bimin;bi<=bimax;bi++) {
765 <       if (gx.x(ai,bi))
766 <          continue; //already have a previous seed covering this region of this diagonal
767 <       int32* bv=(int32 *)(&seqb[bi]);
768 <       if (*av == *bv) {
769 <          //word match
770 <          //see if we can extend to the right
771 <        gx.x(ai+1,bi+1)=1;
772 <        gx.x(ai+2,bi+2)=1;
773 <        gx.x(ai+3,bi+3)=1;
774 <          int aix=ai+4;
775 <          int bix=bi+4;
776 <          int len=4;
777 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
778 <               {
779 <                 gx.x(aix,bix)=1;
780 <                 aix++;bix++;
781 <                 len++;
782 <               }
783 <          GXSeed* newseed=new GXSeed(ai,bi,len);
784 <          seeds.Add(newseed);
785 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
786 <          }
787 <       }
788 <    }//for each 4-mer at the end of seqa
760 > GXSeedTable gx(a_len, b_len);
761 > GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
762 > for (int bi=0;bi<=b_len-6;bi++) {
763 >   //for each 6-mer of seqb
764 >   uint16 bv = get6mer((char*)&seqb[bi]);
765 >   GVec<uint16>* alocs = amers[bv];
766 >   if (alocs==NULL) continue;
767 >   //extend each hit
768 >   for (int h=0;h<alocs->Count();h++) {
769 >         int ai=alocs->Get(h);
770 >         //word match
771 >         if (gx.x(ai,bi))
772 >           //already have a previous seed covering this region of this diagonal
773 >           continue;
774 >         for (int i=0;i<6;i++)
775 >            gx.x(ai+i,bi+i)=1;
776 >         //see if we can extend to the right
777 >         int aix=ai+6;
778 >         int bix=bi+6;
779 >         int len=6;
780 >         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
781 >                 gx.x(aix,bix)=1;
782 >                 aix++;bix++;
783 >                 len++;
784 >                 }
785 >         /* if (len>22) {
786 >                //heuristics: very likely the best we can get
787 >                //quick match shortcut
788 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
789 >                return diagstrips;
790 >                } */
791 >         if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
792 >         GXSeed* newseed=new GXSeed(ai,bi,len);
793 >         seeds.Add(newseed);
794 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
795 >     //special last resort terminal match to be used if no better alignment is there
796 >     if (bi<2 && ai+len>=a_len-1 &&
797 >                 (!diagstrips->tmatch || diagstrips->tmatch->len<len))
798 >                   diagstrips->tmatch=newseed;
799 >     }
800 > } //for each 6-mer of the read
801   for (int i=0;i<diagstrips->Count();i++) {
802      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
803      }
# Line 668 | Line 805
805   return diagstrips;
806   }
807  
671
672
808   int cmpSeedScore(const pointer p1, const pointer p2) {
809    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
810    GXSeed* s1=(GXSeed*)p1;
# Line 680 | Line 815
815    else return (s2->len-s1->len);
816   }
817  
818 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
819 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
820 +  GXSeed* s1=(GXSeed*)p1;
821 +  GXSeed* s2=(GXSeed*)p2;
822 +  if (s1->len==s2->len) {
823 +     return (s2->b_ofs-s1->b_ofs);
824 +     }
825 +  else return (s2->len-s1->len);
826 + }
827 +
828 +
829   int cmpSeedDiag(const pointer p1, const pointer p2) {
830    GXSeed* s1=(GXSeed*)p1;
831    GXSeed* s2=(GXSeed*)p2;
832    return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
833   }
834  
835 +
836 + int cmpDiagBands_R(const pointer p1, const pointer p2) {
837 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
838 +  GXBand* b1=(GXBand*)p1;
839 +  GXBand* b2=(GXBand*)p2;
840 +  if (b1->score==b2->score) {
841 +     return (b2->w_min_b-b1->w_min_b);
842 +     }
843 +  else return (b2->score-b1->score);
844 + }
845 +
846 +
847 +
848   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
849 <                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
850 <                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
849 >    const char* s_seq, int s_alnstart, int s_max,
850 >    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
851 >    CAlnTrim* trim, bool editscript) {
852    GXEditScript* ed_script_fwd = NULL;
853    GXEditScript* ed_script_rev = NULL;
854    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 705 | Line 865
865    const char* s=s_seq+s_alnstart;
866    int s_avail=s_max-s_alnstart;
867    if (penalty<0) penalty=-penalty;
868 <  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
868 >  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
869    GXAlnInfo* alninfo=NULL;
870    bool freeAlnMem=(gxmem==NULL);
871    if (freeAlnMem) {
872 <     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
872 >     gxmem=new CGreedyAlignData(reward, penalty, xdrop);
873       }
874    else gxmem->reset();
875    int retscore = 0;
876    int numdiffs = 0;
877 <  if (trimtype==galn_TrimLeft) {
877 >  if (trim!=NULL && trim->type==galn_TrimLeft) {
878      if (editscript)
879         ed_script_rev=new GXEditScript();
880  
881 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
881 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
882                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
883      //check this extension here and bail out if it's not a good extension
884 <    if (s_alnstart+1-s_ext_l>3) {
884 >    if (s_alnstart+1-s_ext_l>trim->boundary) {
885        delete ed_script_rev;
886        if (freeAlnMem) delete gxmem;
887        return NULL;
# Line 733 | Line 893
893                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
894      numdiffs=numdiffs_r+numdiffs_l;
895      //convert num diffs to actual score
896 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
896 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
897      if (editscript)
898         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
899      }
# Line 745 | Line 905
905                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
906      //check extension here and bail out if not a good right extension
907      //assuming s_max is really at the right end of s_seq
908 <    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
908 >    if (trim!=NULL && trim->type==galn_TrimRight &&
909 >           s_alnstart+s_ext_r<trim->boundary) {
910        delete ed_script_fwd;
911        if (freeAlnMem) delete gxmem;
912        return NULL;
# Line 756 | Line 917
917                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
918      //convert num diffs to actual score
919      numdiffs=numdiffs_r+numdiffs_l;
920 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
920 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
921      if (editscript)
922         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
923    }
# Line 767 | Line 928
928      alninfo->score=retscore;
929      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
930   #ifdef GDEBUG
931 <    if (ed_script_rev) {
932 <       GMessage("Final Edit script ::: ");
933 <       printEditScript(ed_script_rev);
934 <       }
931 >    //if (ed_script_rev) {
932 >    //   GMessage("Final Edit script ::: ");
933 >    //   printEditScript(ed_script_rev);
934 >    //   }
935   #endif
936      alninfo->editscript=ed_script_rev;
937      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
# Line 785 | Line 946
946    return alninfo;
947   }
948  
949 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
950 <            SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
949 > GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
950 >                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
951 >                       CAlnTrim* trim, bool editscript) {
952 > int reward=2;
953 > int penalty=3;
954 > int xdrop=8;
955 > if (gxmem) {
956 >   reward=gxmem->match_reward;
957 >   penalty=gxmem->mismatch_penalty;
958 >   xdrop=gxmem->x_drop;
959 >   }
960 > return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
961 >     reward, penalty, xdrop, gxmem, trim, editscript);
962 > }
963 >
964 > GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
965 >                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
966    bool editscript=false;
967    #ifdef GDEBUG
968     editscript=true;
969 <   GMessage("==========> matching Right (3') end ========\n");
969 >   GMessage("==========> matching Right (3') end : %s\n", seqa);
970    #endif
971 +
972 +  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
973    GList<GXSeed> rseeds(true,true,false);
974 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
797 <  GList<GXAlnInfo> galns(true,true,false);
798 <  int max_top_bands=5;
799 <  int top_band_count=0;
974 >  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
975    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
976 <  for (int b=0;b<alnbands->Count();b++) {
977 <     if (alnbands->Get(b)->score<4) break;
978 <     top_band_count++;
979 <     GXBand& band=*(alnbands->Get(b));
980 <     band.seeds.setSorted(cmpSeedScore);
981 <     anchor_seeds.Add(band.seeds.First());
982 <     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
983 <     }
984 <  #ifdef GDEBUG
985 <  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
986 <  #endif
987 <
976 >  //did we find a shortcut?
977 >  if (alnbands->qmatch) {
978 >    #ifdef GDEBUG
979 >     GMessage("::: Found a quick long match at %d, len %d\n",
980 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
981 >    #endif
982 >    anchor_seeds.Add(alnbands->qmatch);
983 >    }
984 >  else {
985 >    int max_top_bands=5;
986 >    int top_band_count=0;
987 >    for (int b=0;b<alnbands->Count();b++) {
988 >       if (alnbands->Get(b)->score<6) break;
989 >       //#ifdef GDEBUG
990 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
991 >       //#endif
992 >       top_band_count++;
993 >       GXBand& band=*(alnbands->Get(b));
994 >       band.seeds.setSorted(cmpSeedScore);
995 >       anchor_seeds.Add(band.seeds.First());
996 >       band.tested=true;
997 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
998 >       }
999 >    //#ifdef GDEBUG
1000 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1001 >    //#endif
1002 >    }
1003 >  GList<GXAlnInfo> galns(true,true,false);
1004    for (int i=0;i<anchor_seeds.Count();i++) {
1005      GXSeed& aseed=*anchor_seeds[i];
1006      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1007      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1008      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1009 <                            seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
1010 <                            match_reward, mismatch_penalty, Xdrop);
1011 <    if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
1009 >                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1010 >    if (alninfo && alninfo->pid>=min_pid &&
1011 >        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1012               galns.AddIfNew(alninfo, true);
1013          else delete alninfo;
1014      }
1015 +  if (galns.Count()==0 && alnbands->tmatch) {
1016 +      //last resort seed
1017 +      GXSeed& aseed=*alnbands->tmatch;
1018 +      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1019 +      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1020 +      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021 +                                seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 +      if (alninfo && alninfo->pid>=min_pid &&
1023 +           trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1024 +                 galns.AddIfNew(alninfo, true);
1025 +            else delete alninfo;
1026 +      }
1027  
1028 <  #ifdef GDEBUG
1029 <  for (int i=0;i<galns.Count();i++) {
1030 <    GXAlnInfo* alninfo=galns[i];
1031 <    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1032 <                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1033 <    if (alninfo->gapinfo!=NULL) {
1034 <      GMessage("Alignment:\n");
1035 <      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1028 >  /*
1029 >  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1030 >  //we should also try some seeds closer to the 3' end
1031 >  if (galns.Count()==0) {
1032 >    anchor_seeds.Clear();
1033 >    alnbands->setSorted(cmpDiagBands_R);
1034 >    int max_top_bands=4;
1035 >    int top_band_count=0;
1036 >    //#ifdef GDEBUG
1037 >    //GMessage(":::>> Retrying adjusting sort order.\n");
1038 >    //#endif
1039 >    if (alnbands->tmatch) {
1040 >      //anchor_seeds.setSorted(false);
1041 >      anchor_seeds.Add(alnbands->tmatch);
1042        }
1043 <    }
1044 <  #endif
1043 >    for (int b=0;b<alnbands->Count();b++) {
1044 >       if (alnbands->Get(b)->score<4) break;
1045 >       //#ifdef GDEBUG
1046 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1047 >       //#endif
1048 >       if (alnbands->Get(b)->tested) continue;
1049 >       top_band_count++;
1050 >       GXBand& band=*(alnbands->Get(b));
1051 >       band.seeds.setSorted(cmpSeedScore);
1052 >       anchor_seeds.Add(band.seeds.First());
1053 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1054 >       }
1055 >    //#ifdef GDEBUG
1056 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1057 >    //#endif
1058 >    for (int i=0;i<anchor_seeds.Count();i++) {
1059 >      GXSeed& aseed=*anchor_seeds[i];
1060 >      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1061 >      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1062 >      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1063 >                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1064 >      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1065 >               galns.AddIfNew(alninfo, true);
1066 >          else delete alninfo;
1067 >      }
1068 >    } */
1069 >
1070    //---- done
1071    delete alnbands;
1072    if (galns.Count()) {
1073      GXAlnInfo* bestaln=galns.Shift();
1074 +    #ifdef GDEBUG
1075 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1076 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1077 +      if (bestaln->gapinfo!=NULL) {
1078 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1079 +        }
1080 +    #endif
1081 +
1082      return bestaln;
1083      }
1084    else return NULL;
1085   }
1086  
1087 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1088 <          SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
1087 > GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
1088 >                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1089    bool editscript=false;
1090    #ifdef GDEBUG
1091     editscript=true;
1092 <   GMessage("==========> matching Left (5') end ========\n");
1092 >   GMessage("==========> matching Left (5') end : %s\n", seqa);
1093    #endif
1094 +  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1095    GList<GXSeed> rseeds(true,true,false);
1096 <  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
854 <  GList<GXAlnInfo> galns(true,true,false);
855 <  int max_top_bands=5;
856 <  int top_band_count=0;
1096 >  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1097    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1098 <  for (int b=0;b<alnbands->Count();b++) {
1099 <     if (alnbands->Get(b)->score<4) break;
1100 <     top_band_count++;
1101 <     GXBand& band=*(alnbands->Get(b));
1102 <     band.seeds.setSorted(cmpSeedScore);
1103 <     anchor_seeds.Add(band.seeds.First());
1104 <     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1105 <     }
1106 <  #ifdef GDEBUG
1107 <  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1108 <  #endif
1109 <
1110 <  for (int i=0;i<anchor_seeds.Count();i++) {
1098 >  if (alnbands->qmatch) {
1099 >    #ifdef GDEBUG
1100 >     GMessage("::: Found a quick long match at %d, len %d\n",
1101 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
1102 >    #endif
1103 >    anchor_seeds.Add(alnbands->qmatch);
1104 >    }
1105 >  else {
1106 >    int max_top_bands=5;
1107 >    int top_band_count=0;
1108 >    for (int b=0;b<alnbands->Count();b++) {
1109 >       if (alnbands->Get(b)->score<6) break;
1110 >       //#ifdef GDEBUG
1111 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1112 >       //#endif
1113 >       top_band_count++;
1114 >       GXBand& band=*(alnbands->Get(b));
1115 >       band.seeds.setSorted(cmpSeedScore);
1116 >       anchor_seeds.Add(band.seeds.First());
1117 >       band.tested=true;
1118 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1119 >       }
1120 >    //#ifdef GDEBUG
1121 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1122 >    //#endif
1123 >    }
1124 > GList<GXAlnInfo> galns(true,true,false);
1125 > for (int i=0;i<anchor_seeds.Count();i++) {
1126      GXSeed& aseed=*anchor_seeds[i];
1127      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1128      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1129      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1130 <                            seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
1131 <                            match_reward, mismatch_penalty, Xdrop-2);
1132 <    if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
878 <         //TODO: at 5' end we should use a higher pid threshold
879 <         // we could also decrease the X-drop value!
1130 >                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1131 >    if (alninfo && alninfo->pid>=min_pid
1132 >           && trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1133              galns.AddIfNew(alninfo, true);
1134         else delete alninfo;
1135      }
1136 <
1136 >  if (galns.Count()==0 && alnbands->tmatch) {
1137 >      //last resort seed
1138 >      GXSeed& aseed=*alnbands->tmatch;
1139 >      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1140 >      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1141 >      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1142 >                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1143 >      if (alninfo && alninfo->pid>=min_pid &&
1144 >        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1145 >         galns.Add(alninfo);
1146 >      }
1147 >  /*
1148    #ifdef GDEBUG
1149 +  //print valid alignments found
1150    for (int i=0;i<galns.Count();i++) {
1151      GXAlnInfo* alninfo=galns[i];
1152      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 892 | Line 1157
1157        }
1158      }
1159    #endif
1160 +  */
1161    //---- done
1162    delete alnbands;
1163    if (galns.Count()) {
1164      GXAlnInfo* bestaln=galns.Shift();
1165 +    #ifdef GDEBUG
1166 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1167 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1168 +      if (bestaln->gapinfo!=NULL) {
1169 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1170 +        }
1171 +    #endif
1172      return bestaln;
1173      }
1174    else return NULL;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines