ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 53 | Line 53
53    if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54        *row1 = flast_d[d-1][diag-1];
55        return diag-1;
56 <      }
56 >      }
57    if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58        *row1 = flast_d[d-1][diag];
59        return diag;
# Line 65 | Line 65
65   void GapXEditScript::print() { //debug
66        GapXEditScript* p=this;
67        do {
68 <        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 <        } while ((p=p->next)!=NULL);
68 >        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 >        } while ((p=p->next)!=NULL);
70        GMessage("\n");
71      }
72  
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 101 | Line 101
101   }
102  
103  
104 + uint16 get6mer(char* p) {
105 +  uint16 r=gdna2bit(p,3);
106 +  r <<= 6;
107 +  r |= gdna2bit(p,3);
108 +  return r;
109 + }
110 +
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128  
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
# Line 129 | Line 153
153               //    }
154        }
155      else { //forward lookup
156 <             while (seq1_index < len1 && seq2_index < len2 &&
156 >             while (seq1_index < len1 && seq2_index < len2 &&
157                     //seq1[seq1_index] < 4 &&
158                     seq1[seq1_index] == seq2[seq2_index]) {
159                  ++seq1_index;
# Line 181 | Line 205
205  
206   int GXGreedyExtend(const char* seq1, int len1,
207                         const char* seq2, int len2,
208 <                       bool reverse, int xdrop_threshold,
185 <                       int match_cost, int mismatch_cost,
208 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209                         int& seq1_align_len, int& seq2_align_len,
210                         CGreedyAlignData& aux_data,
211                         GXEditScript *edit_block) {
# Line 204 | Line 227
227      int longest_match_run;
228      bool end1_reached, end2_reached;
229      GXMemPool* mem_pool;
230 <
230 >
231      /* ordinary dynamic programming alignment, for each offset
232         in seq1, walks through offsets in seq2 until an X-dropoff
233 <       test fails, saving the best score encountered along
233 >       test fails, saving the best score encountered along
234         the way. Instead of score, this code tracks the 'distance'
235         (number of mismatches plus number of gaps) between seq1
236         and seq2. Instead of walking through sequence offsets, it
237         walks through diagonals that can achieve a given distance.
238 <    
238 >
239         Note that in what follows, the numbering of diagonals implies
240 <       a dot matrix where increasing seq1 offsets go to the right on
240 >       a dot matrix where increasing seq1 offsets go to the right on
241         the x axis, and increasing seq2 offsets go up the y axis.
242         The gapped alignment thus proceeds up and to the right in
243         the graph, and diagonals are numbered increasing to the right */
# Line 223 | Line 246
246      best_diag = 0;
247  
248      /* set the number of distinct distances the algorithm will
249 <       examine in the search for an optimal alignment. The
250 <       heuristic worst-case running time of the algorithm is
249 >       examine in the search for an optimal alignment. The
250 >       heuristic worst-case running time of the algorithm is
251         O(max_dist**2 + (len1+len2)); for sequences which are
252         very similar, the average running time will be sig-
253         nificantly better than this */
254  
255      max_dist = GMIN(GREEDY_MAX_COST,
256 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
256 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
257  
258      /* the main loop assumes that the index of all diagonals is
259 <       biased to lie in the middle of allocated bookkeeping
259 >       biased to lie in the middle of allocated bookkeeping
260         structures */
261  
262      diag_origin = max_dist + 2;
# Line 251 | Line 274
274         xdrop_offset gives the distance backwards in the score
275         array to look */
276  
277 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
278 <                           (match_cost + mismatch_cost) + 1;
256 <    
277 >    xdrop_offset = aux_data.xdrop_ofs;
278 >
279      // find the offset of the first mismatch between seq1 and seq2
280  
281      index = s_FindFirstMismatch(seq1, len1,  seq2, len2, 0, 0, reverse);
# Line 277 | Line 299
299          if (edit_block != NULL)
300              //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
301              edit_block->opRep(index);
302 <        return 0;
302 >        return 0;
303      }
304  
305      // set up the memory pool
# Line 287 | Line 309
309        }
310      else if (mem_pool == NULL) {
311         aux_data.space = mem_pool = new GXMemPool();
312 <    }
312 >    }
313      else {
314          mem_pool->refresh();
315      }
316 <    
316 >
317      /* set up the array of per-distance maximum scores. There
318         are max_diags + xdrop_offset distances to track, the first
319         xdrop_offset of which are 0 */
# Line 299 | Line 321
321      max_score = aux_data.max_score + xdrop_offset;
322      for (index = 0; index < xdrop_offset; index++)
323          aux_data.max_score[index] = 0;
324 <    
324 >
325      // fill in the initial offsets of the distance matrix
326  
327      last_seq2_off[0][diag_origin] = seq1_index;
328 <    max_score[0] = seq1_index * match_cost;
328 >    max_score[0] = seq1_index * aux_data.match_reward;
329      diag_lower = diag_origin - 1;
330      diag_upper = diag_origin + 1;
331      end1_reached = end2_reached = false;
# Line 328 | Line 350
350  
351          // compute the score for distance d corresponding to the X-dropoff criterion
352  
353 <        xdrop_score = max_score[d - xdrop_offset] +
354 <                      (match_cost + mismatch_cost) * d - xdrop_threshold;
355 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
353 >        xdrop_score = max_score[d - xdrop_offset] +
354 >                      (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 >        xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356  
357          // for each diagonal of interest
358          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
359              /* find the largest offset into seq2 that increases
360                 the distance from d-1 to d (i.e. keeps the alignment
361 <               from getting worse for as long as possible), then
361 >               from getting worse for as long as possible), then
362                 choose the offset into seq1 that will keep the
363 <               resulting diagonal fixed at k
364 <            
363 >               resulting diagonal fixed at k
364 >
365                 Note that this requires kInvalidOffset+1 to be smaller
366                 than any valid offset into seq2, i.e. to be negative */
367  
# Line 360 | Line 382
382                  continue;
383              }
384              diag_upper = k;
385 <            
386 <            /* slide down diagonal k until a mismatch
385 >
386 >            /* slide down diagonal k until a mismatch
387                 occurs. As long as only matches are encountered,
388                 the current distance d will not change */
389  
# Line 397 | Line 419
419                 can each be of size at most max_diags+2 */
420  
421              if (seq2_index == len2) {
422 <                diag_lower = k + 1;
422 >                diag_lower = k + 1;
423                  end2_reached = true;
424              }
425              if (seq1_index == len1) {
426 <                diag_upper = k - 1;
426 >                diag_upper = k - 1;
427                  end1_reached = true;
428              }
429          }  // end loop over diagonals
430  
431          // compute the maximum score possible for distance d
432 <        curr_score = curr_extent * (match_cost / 2) -
433 <                        d * (match_cost + mismatch_cost);
432 >        curr_score = curr_extent * (aux_data.match_reward / 2) -
433 >                        d * (aux_data.match_reward + aux_data.mismatch_penalty);
434          // if this is the best score seen so far, update the
435          // statistics of the best alignment
436          if (curr_score > max_score[d - 1]) {
# Line 417 | Line 439
439              best_diag = curr_diag;
440              seq2_align_len = curr_seq2_index;
441              seq1_align_len = curr_seq2_index + best_diag - diag_origin;
442 <        }
442 >        }
443          else {
444              max_score[d] = max_score[d - 1];
445          }
# Line 428 | Line 450
450          if (diag_lower > diag_upper)
451              break;
452  
453 <        /* set up for the next distance to examine. Because the
454 <           bounds increase by at most one for each distance,
455 <           diag_lower and diag_upper can each be of size at
453 >        /* set up for the next distance to examine. Because the
454 >           bounds increase by at most one for each distance,
455 >           diag_lower and diag_upper can each be of size at
456             most max_diags+2 */
457  
458          if (!end2_reached)
459 <            diag_lower--;
459 >            diag_lower--;
460          if (!end1_reached)
461              diag_upper++;
462  
# Line 455 | Line 477
477              }
478      }   // end loop over distinct distances
479  
480 <    
480 >
481      if (edit_block == NULL)
482          return best_dist;
483  
484      //----  perform traceback
485 <    d = best_dist;
485 >    d = best_dist;
486      seq1_index = seq1_align_len;
487      seq2_index = seq2_align_len;
488      // for all positive distances
# Line 476 | Line 498
498             traceback operation. best_diag starts off with the
499             value computed during the alignment process */
500  
501 <        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
501 >        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
502                                             best_diag, &new_seq2_index);
503  
504          if (new_diag == best_diag) {
# Line 484 | Line 506
506              if (seq2_index - new_seq2_index > 0) {
507                    edit_block->opRep(seq2_index - new_seq2_index);
508              }
509 <        }
509 >        }
510          else if (new_diag < best_diag) {
511              // smaller diagonal: issue a group of substitutions
512              //   and then a gap in seq2 */
# Line 493 | Line 515
515              }
516              //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
517              edit_block->opIns(1);
518 <        }
518 >        }
519          else {
520              // larger diagonal: issue a group of substitutions
521              //   and then a gap in seq1
# Line 502 | Line 524
524              }
525              edit_block->opDel(1);
526          }
527 <        d--;
528 <        best_diag = new_diag;
529 <        seq2_index = new_seq2_index;
527 >        d--;
528 >        best_diag = new_diag;
529 >        seq2_index = new_seq2_index;
530      }
531   //done:
532      // handle the final group of substitutions back to distance zero,
# Line 526 | Line 548
548        unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
549        if (op_type == 3)
550           GError("Error: printEditScript encountered op_type 3 ?!\n");
551 <      GMessage("%d%c ", num, xgapcodes[op_type]);
551 >      GMessage("%d%c ", num, xgapcodes[op_type]);
552        }
553      GMessage("\n");
554    }
# Line 580 | Line 602
602   const int a_dropoff_score=7;
603   const int a_min_score=12; //at least 6 bases full match
604  
605 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
605 > // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606   //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607   //check minimum score and
608 < //for 3' adapter trimming:
608 > //for 3' adaptor trimming:
609   //     require that the right end of the alignment for either the adaptor OR the read must be
610   //     < 3 distance from its right end
611 < // for 5' adapter trimming:
611 > // for 5' adaptor trimming:
612   //     require that the left end of the alignment for either the adaptor OR the read must
613   //     be at coordinate < 3 from start
614  
# Line 659 | Line 681
681    int mmovh5=(a_l<b_l)? a_l : b_l;
682    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683       if (a_l<a_ovh3) {
684 <        //adapter closer to the left end (typical for 5' adapter)
684 >        //adaptor closer to the left end (typical for 5' adaptor)
685          l5=a_r+1;
686          l3=alen-1;
687          }
688        else {
689 <        //adapter matching at the right end (typical for 3' adapter)
689 >        //adaptor matching at the right end (typical for 3' adaptor)
690          l5=0;
691          l3=a_l-1;
692          }
# Line 676 | Line 698
698    return false;
699   }
700  
701 <
702 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
703 < //overlap of right (3') end of seqb
704 < //hash the first 12 bases of seqa:
705 < int aimin=0;
684 < int aimax=GMIN(9,(a_len-4));
685 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686 < int bimax=b_len-4;
687 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688 < int b_maxlen=b_len; //number of cols in the diagonal matrix
689 < GXSeedTable gx(a_maxlen, b_maxlen);
690 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691 < for (int ai=aimin;ai<=aimax;ai++) {
692 <    int32* av=(int32 *)(&seqa[ai]);
693 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
694 <    for (int bi=bimin;bi<=bimax;bi++) {
695 <       if (gx.x(ai,bi))
696 <            continue; //already have a previous seed covering this region of this diagonal
697 <       int32* bv=(int32 *)(&seqb[bi]);
698 <       if (*av == *bv) {
699 <          //word match
700 <          //see if we can extend to the right
701 <            gx.x(ai+1,bi+1)=1;
702 <            gx.x(ai+2,bi+2)=1;
703 <            gx.x(ai+3,bi+3)=1;
704 <          int aix=ai+4;
705 <          int bix=bi+4;
706 <          int len=4;
707 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708 <               {
709 <                 gx.x(aix,bix)=1;
710 <                 aix++;bix++;
711 <                 len++;
712 <               }
713 <          if (len>7) {
714 <              //heuristics: very likely the best we can get
715 <              int qlen=len;
716 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717 <                aix++;
718 <                bix++;
719 <                qlen++;
720 <                }
721 <              if (qlen>10) { //found a quick match shortcut
722 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723 <                 return diagstrips;
724 <                 }
725 <              }
726 <          GXSeed* newseed=new GXSeed(ai,bi,len);
727 <          seeds.Add(newseed);
728 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 <          }
730 <       }
731 <    }//for each 4-mer at the beginning of seqa
732 < for (int i=0;i<diagstrips->Count();i++) {
733 <    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
734 <    }
735 < diagstrips->setSorted(true); //sort by score
736 < return diagstrips;
737 < }
738 <
739 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
740 < //overlap of left (5') end of seqb
741 < //hash the last 12 bases of seqa:
742 < int aimin=GMAX(0,(a_len-16));
743 < int aimax=a_len-4;
744 < int bimin=0;
745 < int bimax=GMIN((a_len-2), (b_len-4));
746 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
747 < int b_maxlen=b_len; //number of cols in the diagonal matrix
701 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 > int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 > int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 > int b_end = (trim_intent==galn_TrimLeft) ?  bimax : sd.blen-6;
706   //gx.init(a_maxlen, b_maxlen);
707 < GXSeedTable gx(a_maxlen, b_maxlen);
708 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
709 < for (int ai=aimin;ai<=aimax;ai++) {
710 <    int32* av=(int32 *)(&seqa[ai]);
711 <    for (int bi=bimin;bi<=bimax;bi++) {
712 <       if (gx.x(ai,bi))
713 <          continue; //already have a previous seed covering this region of this diagonal
714 <       int32* bv=(int32 *)(&seqb[bi]);
715 <       if (*av == *bv) {
716 <          //word match
717 <          //see if we can extend to the right
718 <        gx.x(ai+1,bi+1)=1;
719 <        gx.x(ai+2,bi+2)=1;
720 <        gx.x(ai+3,bi+3)=1;
721 <          int aix=ai+4;
722 <          int bix=bi+4;
723 <          int len=4;
724 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
725 <               {
726 <                 gx.x(aix,bix)=1;
727 <                 aix++;bix++;
728 <                 len++;
729 <               }
730 <          if (len>7) {
731 <              //heuristics: very likely the best we can get
732 <              int qlen=len;
733 <              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
734 <                aix++;
735 <                bix++;
736 <                qlen++;
737 <                }
738 <              if (qlen>10) { //found a quick match shortcut
739 <                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
740 <                 return diagstrips;
741 <                 }
742 <              }
743 <          GXSeed* newseed=new GXSeed(ai,bi,len);
744 <          seeds.Add(newseed);
745 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
746 <          }
747 <       }
748 <    }//for each 4-mer at the end of seqa
707 > GXSeedTable gx(sd.alen, sd.blen);
708 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 > for (int bi=b_start;bi<=b_end;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 >               continue; //improper positioning for 5' trimming
723 >         if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 >               continue; //improper positioning for 5' trimming
725 >
726 >         //TODO: there could be Ns in this seed, should we count/adjust score?
727 >         for (int i=0;i<6;i++)
728 >            gx.x(ai+i,bi+i)=1;
729 >         //see if we can extend to the right
730 >         int aix=ai+6;
731 >         int bix=bi+6;
732 >         int len=6;
733 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 >                 gx.x(aix,bix)=1;
735 >                 aix++;bix++;
736 >                 len++;
737 >                 }
738 >         if (len>sd.amlen) {
739 >                //heuristics: very likely the best we can get
740 >                //quick match shortcut
741 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
742 >                return diagstrips;
743 >                }
744 >         if (bi>bimax && bi<bimin && len<9)
745 >                 //ignore mid-sequence seeds that are not high scoring
746 >             continue;
747 >
748 >         GXSeed* newseed=new GXSeed(ai,bi,len);
749 >         seeds.Add(newseed);
750 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 >     //keep last resort terminal match to be used if no better alignment is there
752 >     if (bi<2 && ai+len>=sd.alen-1 &&
753 >                 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754 >                      diagstrips->tmatch_l=newseed;
755 >     //collectSeeds_R:
756 >         if (ai<2 && bi+len>sd.blen-2 &&
757 >                 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758 >                  diagstrips->tmatch_r=newseed;
759 >     }
760 > } //for each 6-mer of the read
761   for (int i=0;i<diagstrips->Count();i++) {
762      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763      }
# Line 805 | Line 775
775    else return (s2->len-s1->len);
776   }
777  
778 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
779 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780 +  GXSeed* s1=(GXSeed*)p1;
781 +  GXSeed* s2=(GXSeed*)p2;
782 +  if (s1->len==s2->len) {
783 +     return (s2->b_ofs-s1->b_ofs);
784 +     }
785 +  else return (s2->len-s1->len);
786 + }
787 +
788 +
789   int cmpSeedDiag(const pointer p1, const pointer p2) {
790    GXSeed* s1=(GXSeed*)p1;
791    GXSeed* s2=(GXSeed*)p2;
# Line 836 | Line 817
817    s_alnstart--;
818    if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819      GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 <  if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 <    GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
820 >  /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 >    GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822 >           q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823 >           */
824    int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825    const char* q=q_seq+q_alnstart;
826    int q_avail=q_max-q_alnstart;
827    const char* s=s_seq+s_alnstart;
828    int s_avail=s_max-s_alnstart;
829    if (penalty<0) penalty=-penalty;
847  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
830    GXAlnInfo* alninfo=NULL;
831    bool freeAlnMem=(gxmem==NULL);
832    if (freeAlnMem) {
833       gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 +     reward=gxmem->match_reward;
835 +     penalty=gxmem->mismatch_penalty;
836 +     xdrop=gxmem->x_drop;
837       }
838 <  else gxmem->reset();
838 >   else
839 >         gxmem->reset();
840 >  int minMatch= trim ? trim->minMatch : 6;
841 >  int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842    int retscore = 0;
843    int numdiffs = 0;
844    if (trim!=NULL && trim->type==galn_TrimLeft) {
845 +    //intent: trimming the left side
846      if (editscript)
847         ed_script_rev=new GXEditScript();
848  
849 <    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
850 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
849 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 >                    s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851      //check this extension here and bail out if it's not a good extension
852 <    if (s_alnstart+1-s_ext_l>trim->boundary) {
852 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 >        q_alnstart+1-q_ext_l>1 &&
854 >        s_alnstart+1-s_ext_l>trim->l_boundary) {
855        delete ed_script_rev;
856        if (freeAlnMem) delete gxmem;
857        return NULL;
# Line 868 | Line 859
859  
860      if (editscript)
861         ed_script_fwd=new GXEditScript();
862 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
863 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
862 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864      numdiffs=numdiffs_r+numdiffs_l;
865      //convert num diffs to actual score
866 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
866 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867      if (editscript)
868         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869      }
# Line 880 | Line 871
871      if (editscript) {
872         ed_script_fwd=new GXEditScript();
873         }
874 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
875 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
874 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876      //check extension here and bail out if not a good right extension
877      //assuming s_max is really at the right end of s_seq
878 <    if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
878 >    if (trim!=NULL && trim->type==galn_TrimRight &&
879 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 >            q_alnstart+q_ext_r<q_max-2 &&
881 >            s_alnstart+s_ext_r<trim->r_boundary) {
882        delete ed_script_fwd;
883        if (freeAlnMem) delete gxmem;
884        return NULL;
885        }
886      if (editscript)
887         ed_script_rev=new GXEditScript();
888 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
888 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 >                                  s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //convert num diffs to actual score
891      numdiffs=numdiffs_r+numdiffs_l;
892 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
892 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893      if (editscript)
894         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895    }
902
896    if (retscore>=MIN_GREEDY_SCORE) {
897 <    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
897 >    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899      alninfo->score=retscore;
900 +    if (gxmem->scaled) alninfo->score >>= 1;
901      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902   #ifdef GDEBUG
903 <    if (ed_script_rev) {
904 <       GMessage("Final Edit script ::: ");
905 <       printEditScript(ed_script_rev);
906 <       }
903 >    //if (ed_script_rev) {
904 >    //   GMessage("Final Edit script ::: ");
905 >    //   printEditScript(ed_script_rev);
906 >    //   }
907   #endif
908      alninfo->editscript=ed_script_rev;
909      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910      }
911    else {
912 <    if (freeAlnMem) delete gxmem;
912 >    //if (freeAlnMem) delete gxmem;
913      delete ed_script_rev;
914      delete alninfo;
915      alninfo=NULL;
916      }
917 +  if (freeAlnMem) delete gxmem;
918    delete ed_script_fwd;
919    return alninfo;
920   }
# Line 928 | Line 923
923                         const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924                         CAlnTrim* trim, bool editscript) {
925   int reward=2;
926 < int penalty=3;
927 < int xdrop=8;
926 > int penalty=10;
927 > int xdrop=32;
928   if (gxmem) {
929     reward=gxmem->match_reward;
930     penalty=gxmem->mismatch_penalty;
# Line 939 | Line 934
934       reward, penalty, xdrop, gxmem, trim, editscript);
935   }
936  
937 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
938 <            CGreedyAlignData* gxmem, int min_pid) {
937 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 >                                     CGreedyAlignData* gxmem, int min_pid) {
939    bool editscript=false;
940    #ifdef GDEBUG
941     editscript=true;
942 <   GMessage("==========> matching Right (3') end ========\n");
942 >   if (trim_type==galn_TrimLeft) {
943 >         GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944 >     }
945 >   else if (trim_type==galn_TrimRight) {
946 >     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947 >     }
948 >   else if (trim_type==galn_TrimEither) {
949 >     GMessage("==========> searching  both ends : %s\n", sd.aseq);
950 >     }
951    #endif
952 <
950 <  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
952 >  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953    GList<GXSeed> rseeds(true,true,false);
954 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
954 >  GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956    //did we find a shortcut?
957    if (alnbands->qmatch) {
958      #ifdef GDEBUG
959 <     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
959 >     GMessage("::: Found a quick long match at %d, len %d\n",
960 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961      #endif
962      anchor_seeds.Add(alnbands->qmatch);
963      }
# Line 962 | Line 965
965      int max_top_bands=5;
966      int top_band_count=0;
967      for (int b=0;b<alnbands->Count();b++) {
968 <       if (alnbands->Get(b)->score<4) break;
969 <       #ifdef GDEBUG
970 <       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971 <       #endif
968 >       if (alnbands->Get(b)->score<6) break;
969 >       //#ifdef GDEBUG
970 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971 >       //#endif
972         top_band_count++;
973         GXBand& band=*(alnbands->Get(b));
974         band.seeds.setSorted(cmpSeedScore);
975         anchor_seeds.Add(band.seeds.First());
976 <       band.tested=true;
976 >       //band.tested=true;
977         if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978         }
979 <    #ifdef GDEBUG
980 <    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981 <    #endif
979 >    //#ifdef GDEBUG
980 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981 >    //#endif
982      }
980
983    GList<GXAlnInfo> galns(true,true,false);
984    for (int i=0;i<anchor_seeds.Count();i++) {
985      GXSeed& aseed=*anchor_seeds[i];
986      int a1=aseed.a_ofs+(aseed.len>>1)+1;
987      int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
989 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
990 <    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
988 >    trimInfo.seedlen=aseed.len;
989 > #ifdef GDEBUG
990 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991 >           aseed.len);
992 > #endif
993 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996               galns.AddIfNew(alninfo, true);
997          else delete alninfo;
998      }
999 <  //special case: due to the scoring scheme bias towards the 5' end of the read,
993 <  //also try some seeds at the 3' end
999 >
1000    if (galns.Count()==0) {
1001 <    alnbands->setSorted(cmpDiagBands_R);
1002 <    int max_top_bands=4;
1003 <    int top_band_count=0;
1004 <    #ifdef GDEBUG
1005 <    GMessage(":::>> Retrying adjusting sort order.\n");
1006 <    #endif
1007 <    for (int b=0;b<alnbands->Count();b++) {
1008 <       if (alnbands->Get(b)->score<4) break;
1009 <       #ifdef GDEBUG
1010 <       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1011 <       #endif
1012 <       if (alnbands->Get(b)->tested) continue;
1013 <       top_band_count++;
1014 <       GXBand& band=*(alnbands->Get(b));
1015 <       band.seeds.setSorted(cmpSeedScore);
1016 <       anchor_seeds.Add(band.seeds.First());
1017 <       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1018 <       }
1019 <    #ifdef GDEBUG
1020 <    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1021 <    #endif
1022 <    for (int i=0;i<anchor_seeds.Count();i++) {
1023 <      GXSeed& aseed=*anchor_seeds[i];
1024 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1025 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1026 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 <      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1023 <               galns.AddIfNew(alninfo, true);
1024 <          else delete alninfo;
1001 >         //last resort: look for weaker terminal seeds
1002 >          GPVec<GXSeed> tmatches(2,false);
1003 >          if (trim_type!=galn_TrimRight) {
1004 >                 if (alnbands->tmatch_l)
1005 >                    tmatches.Add(alnbands->tmatch_l);
1006 >             }
1007 >          if (trim_type!=galn_TrimLeft) {
1008 >                 if (alnbands->tmatch_r)
1009 >                    tmatches.Add(alnbands->tmatch_r);
1010 >             }
1011 >          for (int i=0;i<tmatches.Count();i++) {
1012 >                GXSeed& aseed=*tmatches[i];
1013 >                int halfseed=aseed.len>>1;
1014 >                int a1=aseed.a_ofs+halfseed+1;
1015 >                int a2=aseed.b_ofs+halfseed+1;
1016 >                trimInfo.seedlen=aseed.len;
1017 > #ifdef GDEBUG
1018 >    GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 >           aseed.len);
1020 > #endif
1021 >        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 >        if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 >                 galns.AddIfNew(alninfo, true);
1025 >             else delete alninfo;
1026 >        }//for each terminal seed
1027        }
1028 <    }
1028 >  //---- found all alignments
1029 >  delete alnbands;
1030 >  /*
1031    #ifdef GDEBUG
1032 +  //print all valid alignments found
1033    for (int i=0;i<galns.Count();i++) {
1034      GXAlnInfo* alninfo=galns[i];
1035      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 1035 | Line 1040
1040        }
1041      }
1042    #endif
1043 <  //---- done
1039 <  delete alnbands;
1043 >  */
1044    if (galns.Count()) {
1045      GXAlnInfo* bestaln=galns.Shift();
1042    return bestaln;
1043    }
1044  else return NULL;
1045 }
1046
1047 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1048          CGreedyAlignData* gxmem, int min_pid) {
1049  bool editscript=false;
1050  #ifdef GDEBUG
1051   editscript=true;
1052   GMessage("==========> matching Left (5') end ========\n");
1053  #endif
1054  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1055  GList<GXSeed> rseeds(true,true,false);
1056  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1057  GList<GXAlnInfo> galns(true,true,false);
1058  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1059
1060  if (alnbands->qmatch) {
1061    #ifdef GDEBUG
1062     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1063    #endif
1064    anchor_seeds.Add(alnbands->qmatch);
1065    }
1066  else {
1067    int max_top_bands=5;
1068    int top_band_count=0;
1069    for (int b=0;b<alnbands->Count();b++) {
1070       if (alnbands->Get(b)->score<4) break;
1071       #ifdef GDEBUG
1072       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1073       #endif
1074       top_band_count++;
1075       GXBand& band=*(alnbands->Get(b));
1076       band.seeds.setSorted(cmpSeedScore);
1077       anchor_seeds.Add(band.seeds.First());
1078       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1079       }
1046      #ifdef GDEBUG
1047 <    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1047 >      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048 >          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049 >      if (bestaln->gapinfo!=NULL) {
1050 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051 >        }
1052      #endif
1083    }
1084  for (int i=0;i<anchor_seeds.Count();i++) {
1085    GXSeed& aseed=*anchor_seeds[i];
1086    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1087    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1088    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1089                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1090    if (alninfo && alninfo->pid>=min_pid
1091           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1092            galns.AddIfNew(alninfo, true);
1093       else delete alninfo;
1094    }
1095
1096  #ifdef GDEBUG
1097  for (int i=0;i<galns.Count();i++) {
1098    GXAlnInfo* alninfo=galns[i];
1099    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1100                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1101    if (alninfo->gapinfo!=NULL) {
1102      GMessage("Alignment:\n");
1103      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1104      }
1105    }
1106  #endif
1107  //---- done
1108  delete alnbands;
1109  if (galns.Count()) {
1110    GXAlnInfo* bestaln=galns.Shift();
1053      return bestaln;
1054      }
1055    else return NULL;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines