ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 53 | Line 53
53    if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54        *row1 = flast_d[d-1][diag-1];
55        return diag-1;
56 <      }
56 >      }
57    if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58        *row1 = flast_d[d-1][diag];
59        return diag;
# Line 65 | Line 65
65   void GapXEditScript::print() { //debug
66        GapXEditScript* p=this;
67        do {
68 <        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 <        } while ((p=p->next)!=NULL);
68 >        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 >        } while ((p=p->next)!=NULL);
70        GMessage("\n");
71      }
72  
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 101 | Line 101
101   }
102  
103  
104 + uint16 get6mer(char* p) {
105 +  uint16 r=gdna2bit(p,3);
106 +  r <<= 6;
107 +  r |= gdna2bit(p,3);
108 +  return r;
109 + }
110 +
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128  
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
# Line 129 | Line 153
153               //    }
154        }
155      else { //forward lookup
156 <             while (seq1_index < len1 && seq2_index < len2 &&
156 >             while (seq1_index < len1 && seq2_index < len2 &&
157                     //seq1[seq1_index] < 4 &&
158                     seq1[seq1_index] == seq2[seq2_index]) {
159                  ++seq1_index;
# Line 181 | Line 205
205  
206   int GXGreedyExtend(const char* seq1, int len1,
207                         const char* seq2, int len2,
208 <                       bool reverse, int xdrop_threshold,
185 <                       int match_cost, int mismatch_cost,
208 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209                         int& seq1_align_len, int& seq2_align_len,
210                         CGreedyAlignData& aux_data,
211                         GXEditScript *edit_block) {
# Line 204 | Line 227
227      int longest_match_run;
228      bool end1_reached, end2_reached;
229      GXMemPool* mem_pool;
230 <
230 >
231      /* ordinary dynamic programming alignment, for each offset
232         in seq1, walks through offsets in seq2 until an X-dropoff
233 <       test fails, saving the best score encountered along
233 >       test fails, saving the best score encountered along
234         the way. Instead of score, this code tracks the 'distance'
235         (number of mismatches plus number of gaps) between seq1
236         and seq2. Instead of walking through sequence offsets, it
237         walks through diagonals that can achieve a given distance.
238 <    
238 >
239         Note that in what follows, the numbering of diagonals implies
240 <       a dot matrix where increasing seq1 offsets go to the right on
240 >       a dot matrix where increasing seq1 offsets go to the right on
241         the x axis, and increasing seq2 offsets go up the y axis.
242         The gapped alignment thus proceeds up and to the right in
243         the graph, and diagonals are numbered increasing to the right */
# Line 223 | Line 246
246      best_diag = 0;
247  
248      /* set the number of distinct distances the algorithm will
249 <       examine in the search for an optimal alignment. The
250 <       heuristic worst-case running time of the algorithm is
249 >       examine in the search for an optimal alignment. The
250 >       heuristic worst-case running time of the algorithm is
251         O(max_dist**2 + (len1+len2)); for sequences which are
252         very similar, the average running time will be sig-
253         nificantly better than this */
254  
255      max_dist = GMIN(GREEDY_MAX_COST,
256 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
256 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
257  
258      /* the main loop assumes that the index of all diagonals is
259 <       biased to lie in the middle of allocated bookkeeping
259 >       biased to lie in the middle of allocated bookkeeping
260         structures */
261  
262      diag_origin = max_dist + 2;
# Line 251 | Line 274
274         xdrop_offset gives the distance backwards in the score
275         array to look */
276  
277 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
278 <                           (match_cost + mismatch_cost) + 1;
256 <    
277 >    xdrop_offset = aux_data.xdrop_ofs;
278 >
279      // find the offset of the first mismatch between seq1 and seq2
280  
281      index = s_FindFirstMismatch(seq1, len1,  seq2, len2, 0, 0, reverse);
# Line 277 | Line 299
299          if (edit_block != NULL)
300              //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
301              edit_block->opRep(index);
302 <        return 0;
302 >        return 0;
303      }
304  
305      // set up the memory pool
# Line 287 | Line 309
309        }
310      else if (mem_pool == NULL) {
311         aux_data.space = mem_pool = new GXMemPool();
312 <    }
312 >    }
313      else {
314          mem_pool->refresh();
315      }
316 <    
316 >
317      /* set up the array of per-distance maximum scores. There
318         are max_diags + xdrop_offset distances to track, the first
319         xdrop_offset of which are 0 */
# Line 299 | Line 321
321      max_score = aux_data.max_score + xdrop_offset;
322      for (index = 0; index < xdrop_offset; index++)
323          aux_data.max_score[index] = 0;
324 <    
324 >
325      // fill in the initial offsets of the distance matrix
326  
327      last_seq2_off[0][diag_origin] = seq1_index;
328 <    max_score[0] = seq1_index * match_cost;
328 >    max_score[0] = seq1_index * aux_data.match_reward;
329      diag_lower = diag_origin - 1;
330      diag_upper = diag_origin + 1;
331      end1_reached = end2_reached = false;
# Line 328 | Line 350
350  
351          // compute the score for distance d corresponding to the X-dropoff criterion
352  
353 <        xdrop_score = max_score[d - xdrop_offset] +
354 <                      (match_cost + mismatch_cost) * d - xdrop_threshold;
355 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
353 >        xdrop_score = max_score[d - xdrop_offset] +
354 >                      (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 >        xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356  
357          // for each diagonal of interest
358          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
359              /* find the largest offset into seq2 that increases
360                 the distance from d-1 to d (i.e. keeps the alignment
361 <               from getting worse for as long as possible), then
361 >               from getting worse for as long as possible), then
362                 choose the offset into seq1 that will keep the
363 <               resulting diagonal fixed at k
364 <            
363 >               resulting diagonal fixed at k
364 >
365                 Note that this requires kInvalidOffset+1 to be smaller
366                 than any valid offset into seq2, i.e. to be negative */
367  
# Line 360 | Line 382
382                  continue;
383              }
384              diag_upper = k;
385 <            
386 <            /* slide down diagonal k until a mismatch
385 >
386 >            /* slide down diagonal k until a mismatch
387                 occurs. As long as only matches are encountered,
388                 the current distance d will not change */
389  
# Line 397 | Line 419
419                 can each be of size at most max_diags+2 */
420  
421              if (seq2_index == len2) {
422 <                diag_lower = k + 1;
422 >                diag_lower = k + 1;
423                  end2_reached = true;
424              }
425              if (seq1_index == len1) {
426 <                diag_upper = k - 1;
426 >                diag_upper = k - 1;
427                  end1_reached = true;
428              }
429          }  // end loop over diagonals
430  
431          // compute the maximum score possible for distance d
432 <        curr_score = curr_extent * (match_cost / 2) -
433 <                        d * (match_cost + mismatch_cost);
432 >        curr_score = curr_extent * (aux_data.match_reward / 2) -
433 >                        d * (aux_data.match_reward + aux_data.mismatch_penalty);
434          // if this is the best score seen so far, update the
435          // statistics of the best alignment
436          if (curr_score > max_score[d - 1]) {
# Line 417 | Line 439
439              best_diag = curr_diag;
440              seq2_align_len = curr_seq2_index;
441              seq1_align_len = curr_seq2_index + best_diag - diag_origin;
442 <        }
442 >        }
443          else {
444              max_score[d] = max_score[d - 1];
445          }
# Line 428 | Line 450
450          if (diag_lower > diag_upper)
451              break;
452  
453 <        /* set up for the next distance to examine. Because the
454 <           bounds increase by at most one for each distance,
455 <           diag_lower and diag_upper can each be of size at
453 >        /* set up for the next distance to examine. Because the
454 >           bounds increase by at most one for each distance,
455 >           diag_lower and diag_upper can each be of size at
456             most max_diags+2 */
457  
458          if (!end2_reached)
459 <            diag_lower--;
459 >            diag_lower--;
460          if (!end1_reached)
461              diag_upper++;
462  
# Line 455 | Line 477
477              }
478      }   // end loop over distinct distances
479  
480 <    
480 >
481      if (edit_block == NULL)
482          return best_dist;
483  
484      //----  perform traceback
485 <    d = best_dist;
485 >    d = best_dist;
486      seq1_index = seq1_align_len;
487      seq2_index = seq2_align_len;
488      // for all positive distances
# Line 476 | Line 498
498             traceback operation. best_diag starts off with the
499             value computed during the alignment process */
500  
501 <        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
501 >        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
502                                             best_diag, &new_seq2_index);
503  
504          if (new_diag == best_diag) {
# Line 484 | Line 506
506              if (seq2_index - new_seq2_index > 0) {
507                    edit_block->opRep(seq2_index - new_seq2_index);
508              }
509 <        }
509 >        }
510          else if (new_diag < best_diag) {
511              // smaller diagonal: issue a group of substitutions
512              //   and then a gap in seq2 */
# Line 493 | Line 515
515              }
516              //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
517              edit_block->opIns(1);
518 <        }
518 >        }
519          else {
520              // larger diagonal: issue a group of substitutions
521              //   and then a gap in seq1
# Line 502 | Line 524
524              }
525              edit_block->opDel(1);
526          }
527 <        d--;
528 <        best_diag = new_diag;
529 <        seq2_index = new_seq2_index;
527 >        d--;
528 >        best_diag = new_diag;
529 >        seq2_index = new_seq2_index;
530      }
531   //done:
532      // handle the final group of substitutions back to distance zero,
# Line 526 | Line 548
548        unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
549        if (op_type == 3)
550           GError("Error: printEditScript encountered op_type 3 ?!\n");
551 <      GMessage("%d%c ", num, xgapcodes[op_type]);
551 >      GMessage("%d%c ", num, xgapcodes[op_type]);
552        }
553      GMessage("\n");
554    }
# Line 580 | Line 602
602   const int a_dropoff_score=7;
603   const int a_min_score=12; //at least 6 bases full match
604  
605 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
605 > // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606   //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607   //check minimum score and
608 < //for 3' adapter trimming:
608 > //for 3' adaptor trimming:
609   //     require that the right end of the alignment for either the adaptor OR the read must be
610   //     < 3 distance from its right end
611 < // for 5' adapter trimming:
611 > // for 5' adaptor trimming:
612   //     require that the left end of the alignment for either the adaptor OR the read must
613   //     be at coordinate < 3 from start
614  
# Line 659 | Line 681
681    int mmovh5=(a_l<b_l)? a_l : b_l;
682    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683       if (a_l<a_ovh3) {
684 <        //adapter closer to the left end (typical for 5' adapter)
684 >        //adaptor closer to the left end (typical for 5' adaptor)
685          l5=a_r+1;
686          l3=alen-1;
687          }
688        else {
689 <        //adapter matching at the right end (typical for 3' adapter)
689 >        //adaptor matching at the right end (typical for 3' adaptor)
690          l5=0;
691          l3=a_l-1;
692          }
# Line 676 | Line 698
698    return false;
699   }
700  
701 <
702 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
703 < //overlap of right (3') end of seqb
704 < //hash the first 12 bases of seqa:
705 < int aimin=0;
684 < int aimax=GMIN(9,(a_len-4));
685 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686 < int bimax=b_len-4;
687 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688 < int b_maxlen=b_len; //number of cols in the diagonal matrix
689 < GXSeedTable gx(a_maxlen, b_maxlen);
690 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691 < for (int ai=aimin;ai<=aimax;ai++) {
692 <    int32* av=(int32 *)(&seqa[ai]);
693 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
694 <    for (int bi=bimin;bi<=bimax;bi++) {
695 <       if (gx.x(ai,bi))
696 <            continue; //already have a previous seed covering this region of this diagonal
697 <       int32* bv=(int32 *)(&seqb[bi]);
698 <       if (*av == *bv) {
699 <          //word match
700 <          //see if we can extend to the right
701 <            gx.x(ai+1,bi+1)=1;
702 <            gx.x(ai+2,bi+2)=1;
703 <            gx.x(ai+3,bi+3)=1;
704 <          int aix=ai+4;
705 <          int bix=bi+4;
706 <          int len=4;
707 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708 <               {
709 <                 gx.x(aix,bix)=1;
710 <                 aix++;bix++;
711 <                 len++;
712 <               }
713 <          if (len>7) {
714 <              //heuristics: very likely the best we can get
715 <              int qlen=len;
716 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717 <                aix++;
718 <                bix++;
719 <                qlen++;
720 <                }
721 <              if (qlen>10) { //found a quick match shortcut
722 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723 <                 return diagstrips;
724 <                 }
725 <              }
726 <          GXSeed* newseed=new GXSeed(ai,bi,len);
727 <          seeds.Add(newseed);
728 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 <          //special last resort terminal match to be used if no better alignment is there
730 <          if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
731 <          }
732 <       }
733 <    }//for each 4-mer at the beginning of seqa
734 < for (int i=0;i<diagstrips->Count();i++) {
735 <    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
736 <    }
737 < diagstrips->setSorted(true); //sort by score
738 < return diagstrips;
739 < }
740 <
741 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
742 < //overlap of left (5') end of seqb
743 < //hash the last 12 bases of seqa:
744 < int aimin=GMAX(0,(a_len-16));
745 < int aimax=a_len-4;
746 < int bimin=0;
747 < int bimax=GMIN((a_len-2), (b_len-4));
748 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
749 < int b_maxlen=b_len; //number of cols in the diagonal matrix
701 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 > int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 > int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 > int b_end = (trim_intent==galn_TrimLeft) ?  bimax : sd.blen-6;
706   //gx.init(a_maxlen, b_maxlen);
707 < GXSeedTable gx(a_maxlen, b_maxlen);
708 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
709 < for (int ai=aimin;ai<=aimax;ai++) {
710 <    int32* av=(int32 *)(&seqa[ai]);
711 <    for (int bi=bimin;bi<=bimax;bi++) {
712 <       if (gx.x(ai,bi))
713 <          continue; //already have a previous seed covering this region of this diagonal
714 <       int32* bv=(int32 *)(&seqb[bi]);
715 <       if (*av == *bv) {
716 <          //word match
717 <          //see if we can extend to the right
718 <        gx.x(ai+1,bi+1)=1;
719 <        gx.x(ai+2,bi+2)=1;
720 <        gx.x(ai+3,bi+3)=1;
721 <          int aix=ai+4;
722 <          int bix=bi+4;
723 <          int len=4;
724 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
725 <               {
726 <                 gx.x(aix,bix)=1;
727 <                 aix++;bix++;
728 <                 len++;
729 <               }
730 <          if (len>7) {
731 <              //heuristics: very likely the best we can get
732 <              int qlen=len;
733 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
734 <                aix++;
735 <                bix++;
736 <                qlen++;
737 <                }
738 <              if (qlen>10) { //found a quick match shortcut
739 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
740 <                 return diagstrips;
741 <                 }
742 <              }
743 <          GXSeed* newseed=new GXSeed(ai,bi,len);
744 <          seeds.Add(newseed);
745 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
746 <          //special last resort terminal match to be used if no better alignment is there
747 <          if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
748 <          }
749 <       }
750 <    }//for each 4-mer at the end of seqa
707 > GXSeedTable gx(sd.alen, sd.blen);
708 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 > for (int bi=b_start;bi<=b_end;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 >               continue; //improper positioning for 5' trimming
723 >         if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 >               continue; //improper positioning for 5' trimming
725 >
726 >         //TODO: there could be Ns in this seed, should we count/adjust score?
727 >         for (int i=0;i<6;i++)
728 >            gx.x(ai+i,bi+i)=1;
729 >         //see if we can extend to the right
730 >         int aix=ai+6;
731 >         int bix=bi+6;
732 >         int len=6;
733 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 >                 gx.x(aix,bix)=1;
735 >                 aix++;bix++;
736 >                 len++;
737 >                 }
738 >         if (len>sd.amlen) {
739 >                //heuristics: very likely the best we can get
740 >                //quick match shortcut
741 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
742 >                return diagstrips;
743 >                }
744 >         if (bi>bimax && bi<bimin && len<9)
745 >                 //ignore mid-sequence seeds that are not high scoring
746 >             continue;
747 >
748 >         GXSeed* newseed=new GXSeed(ai,bi,len);
749 >         seeds.Add(newseed);
750 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 >     //keep last resort terminal match to be used if no better alignment is there
752 >     if (bi<2 && ai+len>=sd.alen-1 &&
753 >                 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754 >                      diagstrips->tmatch_l=newseed;
755 >     //collectSeeds_R:
756 >         if (ai<2 && bi+len>sd.blen-2 &&
757 >                 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758 >                  diagstrips->tmatch_r=newseed;
759 >     }
760 > } //for each 6-mer of the read
761   for (int i=0;i<diagstrips->Count();i++) {
762      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763      }
# Line 809 | Line 775
775    else return (s2->len-s1->len);
776   }
777  
778 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
779 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780 +  GXSeed* s1=(GXSeed*)p1;
781 +  GXSeed* s2=(GXSeed*)p2;
782 +  if (s1->len==s2->len) {
783 +     return (s2->b_ofs-s1->b_ofs);
784 +     }
785 +  else return (s2->len-s1->len);
786 + }
787 +
788 +
789   int cmpSeedDiag(const pointer p1, const pointer p2) {
790    GXSeed* s1=(GXSeed*)p1;
791    GXSeed* s2=(GXSeed*)p2;
# Line 840 | Line 817
817    s_alnstart--;
818    if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819      GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 <  if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 <    GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
820 >  /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 >    GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822 >           q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823 >           */
824    int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825    const char* q=q_seq+q_alnstart;
826    int q_avail=q_max-q_alnstart;
827    const char* s=s_seq+s_alnstart;
828    int s_avail=s_max-s_alnstart;
829    if (penalty<0) penalty=-penalty;
851  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
830    GXAlnInfo* alninfo=NULL;
831    bool freeAlnMem=(gxmem==NULL);
832    if (freeAlnMem) {
833       gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 +     reward=gxmem->match_reward;
835 +     penalty=gxmem->mismatch_penalty;
836 +     xdrop=gxmem->x_drop;
837       }
838 <  else gxmem->reset();
838 >   else
839 >         gxmem->reset();
840 >  int minMatch= trim ? trim->minMatch : 6;
841 >  int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842    int retscore = 0;
843    int numdiffs = 0;
844    if (trim!=NULL && trim->type==galn_TrimLeft) {
845 +    //intent: trimming the left side
846      if (editscript)
847         ed_script_rev=new GXEditScript();
848  
849 <    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
850 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
849 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 >                    s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851      //check this extension here and bail out if it's not a good extension
852 <    if (s_alnstart+1-s_ext_l>trim->boundary) {
852 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 >        q_alnstart+1-q_ext_l>1 &&
854 >        s_alnstart+1-s_ext_l>trim->l_boundary) {
855        delete ed_script_rev;
856        if (freeAlnMem) delete gxmem;
857        return NULL;
# Line 872 | Line 859
859  
860      if (editscript)
861         ed_script_fwd=new GXEditScript();
862 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
863 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
862 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864      numdiffs=numdiffs_r+numdiffs_l;
865      //convert num diffs to actual score
866 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
866 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867      if (editscript)
868         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869      }
# Line 884 | Line 871
871      if (editscript) {
872         ed_script_fwd=new GXEditScript();
873         }
874 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
875 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
874 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876      //check extension here and bail out if not a good right extension
877      //assuming s_max is really at the right end of s_seq
878 <    if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
878 >    if (trim!=NULL && trim->type==galn_TrimRight &&
879 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 >            q_alnstart+q_ext_r<q_max-2 &&
881 >            s_alnstart+s_ext_r<trim->r_boundary) {
882        delete ed_script_fwd;
883        if (freeAlnMem) delete gxmem;
884        return NULL;
885        }
886      if (editscript)
887         ed_script_rev=new GXEditScript();
888 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
888 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 >                                  s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //convert num diffs to actual score
891      numdiffs=numdiffs_r+numdiffs_l;
892 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
892 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893      if (editscript)
894         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895    }
906
896    if (retscore>=MIN_GREEDY_SCORE) {
897 <    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
897 >    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899      alninfo->score=retscore;
900 +    if (gxmem->scaled) alninfo->score >>= 1;
901      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902   #ifdef GDEBUG
903      //if (ed_script_rev) {
# Line 919 | Line 909
909      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910      }
911    else {
912 <    if (freeAlnMem) delete gxmem;
912 >    //if (freeAlnMem) delete gxmem;
913      delete ed_script_rev;
914      delete alninfo;
915      alninfo=NULL;
916      }
917 +  if (freeAlnMem) delete gxmem;
918    delete ed_script_fwd;
919    return alninfo;
920   }
# Line 932 | Line 923
923                         const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924                         CAlnTrim* trim, bool editscript) {
925   int reward=2;
926 < int penalty=3;
927 < int xdrop=8;
926 > int penalty=10;
927 > int xdrop=32;
928   if (gxmem) {
929     reward=gxmem->match_reward;
930     penalty=gxmem->mismatch_penalty;
# Line 943 | Line 934
934       reward, penalty, xdrop, gxmem, trim, editscript);
935   }
936  
937 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
938 <            CGreedyAlignData* gxmem, int min_pid) {
937 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 >                                     CGreedyAlignData* gxmem, int min_pid) {
939    bool editscript=false;
940    #ifdef GDEBUG
941     editscript=true;
942 <  // GMessage("==========> matching Right (3') end ========\n");
942 >   if (trim_type==galn_TrimLeft) {
943 >         GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944 >     }
945 >   else if (trim_type==galn_TrimRight) {
946 >     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947 >     }
948 >   else if (trim_type==galn_TrimEither) {
949 >     GMessage("==========> searching  both ends : %s\n", sd.aseq);
950 >     }
951    #endif
952 <
954 <  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
952 >  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953    GList<GXSeed> rseeds(true,true,false);
954 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
954 >  GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956    //did we find a shortcut?
957    if (alnbands->qmatch) {
958 <    //#ifdef GDEBUG
959 <    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
960 <    //#endif
958 >    #ifdef GDEBUG
959 >     GMessage("::: Found a quick long match at %d, len %d\n",
960 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961 >    #endif
962      anchor_seeds.Add(alnbands->qmatch);
963      }
964    else {
965      int max_top_bands=5;
966      int top_band_count=0;
967      for (int b=0;b<alnbands->Count();b++) {
968 <       if (alnbands->Get(b)->score<4) break;
968 >       if (alnbands->Get(b)->score<6) break;
969         //#ifdef GDEBUG
970         //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971         //#endif
# Line 974 | Line 973
973         GXBand& band=*(alnbands->Get(b));
974         band.seeds.setSorted(cmpSeedScore);
975         anchor_seeds.Add(band.seeds.First());
976 <       band.tested=true;
976 >       //band.tested=true;
977         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978         }
979      //#ifdef GDEBUG
980      //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981      //#endif
982      }
984
983    GList<GXAlnInfo> galns(true,true,false);
984    for (int i=0;i<anchor_seeds.Count();i++) {
985      GXSeed& aseed=*anchor_seeds[i];
986      int a1=aseed.a_ofs+(aseed.len>>1)+1;
987      int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
989 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
990 <    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
988 >    trimInfo.seedlen=aseed.len;
989 > #ifdef GDEBUG
990 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991 >           aseed.len);
992 > #endif
993 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996               galns.AddIfNew(alninfo, true);
997          else delete alninfo;
998      }
999 <  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
997 <  //we should also try some seeds closer to the 3' end
999 >
1000    if (galns.Count()==0) {
1001 <    anchor_seeds.Clear();
1002 <    alnbands->setSorted(cmpDiagBands_R);
1003 <    int max_top_bands=4;
1004 <    int top_band_count=0;
1005 <    //#ifdef GDEBUG
1006 <    //GMessage(":::>> Retrying adjusting sort order.\n");
1007 <    //#endif
1008 <    if (alnbands->tmatch) {
1009 <      //anchor_seeds.setSorted(false);
1010 <      anchor_seeds.Add(alnbands->tmatch);
1011 <      }
1012 <    for (int b=0;b<alnbands->Count();b++) {
1013 <       if (alnbands->Get(b)->score<4) break;
1014 <       //#ifdef GDEBUG
1015 <       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1016 <       //#endif
1017 <       if (alnbands->Get(b)->tested) continue;
1018 <       top_band_count++;
1019 <       GXBand& band=*(alnbands->Get(b));
1020 <       band.seeds.setSorted(cmpSeedScore);
1021 <       anchor_seeds.Add(band.seeds.First());
1022 <       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1023 <       }
1024 <    //#ifdef GDEBUG
1025 <    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1026 <    //#endif
1025 <    for (int i=0;i<anchor_seeds.Count();i++) {
1026 <      GXSeed& aseed=*anchor_seeds[i];
1027 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1028 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1029 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1030 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1031 <      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1032 <               galns.AddIfNew(alninfo, true);
1033 <          else delete alninfo;
1001 >         //last resort: look for weaker terminal seeds
1002 >          GPVec<GXSeed> tmatches(2,false);
1003 >          if (trim_type!=galn_TrimRight) {
1004 >                 if (alnbands->tmatch_l)
1005 >                    tmatches.Add(alnbands->tmatch_l);
1006 >             }
1007 >          if (trim_type!=galn_TrimLeft) {
1008 >                 if (alnbands->tmatch_r)
1009 >                    tmatches.Add(alnbands->tmatch_r);
1010 >             }
1011 >          for (int i=0;i<tmatches.Count();i++) {
1012 >                GXSeed& aseed=*tmatches[i];
1013 >                int halfseed=aseed.len>>1;
1014 >                int a1=aseed.a_ofs+halfseed+1;
1015 >                int a2=aseed.b_ofs+halfseed+1;
1016 >                trimInfo.seedlen=aseed.len;
1017 > #ifdef GDEBUG
1018 >    GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 >           aseed.len);
1020 > #endif
1021 >        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 >        if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 >                 galns.AddIfNew(alninfo, true);
1025 >             else delete alninfo;
1026 >        }//for each terminal seed
1027        }
1028 <    }
1036 <  //---- done
1028 >  //---- found all alignments
1029    delete alnbands;
1038  if (galns.Count()) {
1039    GXAlnInfo* bestaln=galns.Shift();
1040    #ifdef GDEBUG
1041      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1042          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1043      if (bestaln->gapinfo!=NULL) {
1044        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1045        }
1046    #endif
1047
1048    return bestaln;
1049    }
1050  else return NULL;
1051 }
1052
1053 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1054          CGreedyAlignData* gxmem, int min_pid) {
1055  bool editscript=false;
1056  #ifdef GDEBUG
1057   editscript=true;
1058   //GMessage("==========> matching Left (5') end ========\n");
1059  #endif
1060  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1061  GList<GXSeed> rseeds(true,true,false);
1062  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1063  GList<GXAlnInfo> galns(true,true,false);
1064  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1065
1066  if (alnbands->qmatch) {
1067    //#ifdef GDEBUG
1068    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1069    //#endif
1070    anchor_seeds.Add(alnbands->qmatch);
1071    }
1072  else {
1073    int max_top_bands=5;
1074    int top_band_count=0;
1075    for (int b=0;b<alnbands->Count();b++) {
1076       if (alnbands->Get(b)->score<4) break;
1077       //#ifdef GDEBUG
1078       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1079       //#endif
1080       top_band_count++;
1081       GXBand& band=*(alnbands->Get(b));
1082       band.seeds.setSorted(cmpSeedScore);
1083       anchor_seeds.Add(band.seeds.First());
1084       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1085       }
1086    //#ifdef GDEBUG
1087    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1088    //#endif
1089    }
1090  for (int i=0;i<anchor_seeds.Count();i++) {
1091    GXSeed& aseed=*anchor_seeds[i];
1092    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1093    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1094    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1095                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1096    if (alninfo && alninfo->pid>=min_pid
1097           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1098            galns.AddIfNew(alninfo, true);
1099       else delete alninfo;
1100    }
1101  if (galns.Count()==0 && alnbands->tmatch) {
1102      GXSeed& aseed=*alnbands->tmatch;
1103      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107      if (alninfo) galns.Add(alninfo);
1108      }
1109  #ifdef GDEBUG
1030    /*
1031 +  #ifdef GDEBUG
1032 +  //print all valid alignments found
1033    for (int i=0;i<galns.Count();i++) {
1034      GXAlnInfo* alninfo=galns[i];
1035      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 1117 | Line 1039
1039        alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1040        }
1041      }
1120  */
1042    #endif
1043 <  //---- done
1123 <  delete alnbands;
1043 >  */
1044    if (galns.Count()) {
1045      GXAlnInfo* bestaln=galns.Shift();
1046      #ifdef GDEBUG
1047        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049        if (bestaln->gapinfo!=NULL) {
1050 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1050 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051          }
1052      #endif
1053      return bestaln;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines