ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 53 | Line 53
53    if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54        *row1 = flast_d[d-1][diag-1];
55        return diag-1;
56 <      }
56 >      }
57    if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58        *row1 = flast_d[d-1][diag];
59        return diag;
# Line 65 | Line 65
65   void GapXEditScript::print() { //debug
66        GapXEditScript* p=this;
67        do {
68 <        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 <        } while ((p=p->next)!=NULL);
68 >        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 >        } while ((p=p->next)!=NULL);
70        GMessage("\n");
71      }
72  
# Line 93 | Line 93
93      else
94          g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95      if (g > 1) {
96 <        *a /= g;
96 >                *a /= g;
97          *b /= g;
98          *c /= g;
99      }
# Line 101 | Line 101
101   }
102  
103  
104 + uint16 get6mer(char* p) {
105 +  uint16 r=gdna2bit(p,3);
106 +  r <<= 6;
107 +  r |= gdna2bit(p,3);
108 +  return r;
109 + }
110 +
111 +
112 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 + for (uint16 i=0; i <= slen-6; i++) {
114 +   char* p = (char*)(s+i);
115 +   uint16 v=get6mer(p);
116 +   if (amers[v]==NULL) {
117 +      amers[v]=new GVec<uint16>(1);
118 +      }
119 +   amers[v]->Add(i);
120 + }
121 + }
122 +
123 + GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 +  //careful: this is broken if start+5 falls beyond the end of the string!
125 +  uint16 r=get6mer(start);
126 +  return amers[r];
127 + }
128  
129   //signal that a diagonal is invalid
130   static const int kInvalidOffset = -2;
# Line 129 | Line 153
153               //    }
154        }
155      else { //forward lookup
156 <             while (seq1_index < len1 && seq2_index < len2 &&
156 >             while (seq1_index < len1 && seq2_index < len2 &&
157                     //seq1[seq1_index] < 4 &&
158                     seq1[seq1_index] == seq2[seq2_index]) {
159                  ++seq1_index;
# Line 181 | Line 205
205  
206   int GXGreedyExtend(const char* seq1, int len1,
207                         const char* seq2, int len2,
208 <                       bool reverse, int xdrop_threshold,
185 <                       int match_cost, int mismatch_cost,
208 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209                         int& seq1_align_len, int& seq2_align_len,
210 <                       SGreedyAlignMem& aux_data,
210 >                       CGreedyAlignData& aux_data,
211                         GXEditScript *edit_block) {
212                         //GapPrelimEditBlock *edit_block,
213                         //bool& fence_hit, SGreedySeed *seed) {
# Line 204 | Line 227
227      int longest_match_run;
228      bool end1_reached, end2_reached;
229      GXMemPool* mem_pool;
230 <
230 >
231      /* ordinary dynamic programming alignment, for each offset
232         in seq1, walks through offsets in seq2 until an X-dropoff
233 <       test fails, saving the best score encountered along
233 >       test fails, saving the best score encountered along
234         the way. Instead of score, this code tracks the 'distance'
235         (number of mismatches plus number of gaps) between seq1
236         and seq2. Instead of walking through sequence offsets, it
237         walks through diagonals that can achieve a given distance.
238 <    
238 >
239         Note that in what follows, the numbering of diagonals implies
240 <       a dot matrix where increasing seq1 offsets go to the right on
240 >       a dot matrix where increasing seq1 offsets go to the right on
241         the x axis, and increasing seq2 offsets go up the y axis.
242         The gapped alignment thus proceeds up and to the right in
243         the graph, and diagonals are numbered increasing to the right */
# Line 223 | Line 246
246      best_diag = 0;
247  
248      /* set the number of distinct distances the algorithm will
249 <       examine in the search for an optimal alignment. The
250 <       heuristic worst-case running time of the algorithm is
249 >       examine in the search for an optimal alignment. The
250 >       heuristic worst-case running time of the algorithm is
251         O(max_dist**2 + (len1+len2)); for sequences which are
252         very similar, the average running time will be sig-
253         nificantly better than this */
254  
255      max_dist = GMIN(GREEDY_MAX_COST,
256 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
256 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
257  
258      /* the main loop assumes that the index of all diagonals is
259 <       biased to lie in the middle of allocated bookkeeping
259 >       biased to lie in the middle of allocated bookkeeping
260         structures */
261  
262      diag_origin = max_dist + 2;
# Line 251 | Line 274
274         xdrop_offset gives the distance backwards in the score
275         array to look */
276  
277 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
278 <                           (match_cost + mismatch_cost) + 1;
256 <    
277 >    xdrop_offset = aux_data.xdrop_ofs;
278 >
279      // find the offset of the first mismatch between seq1 and seq2
280  
281      index = s_FindFirstMismatch(seq1, len1,  seq2, len2, 0, 0, reverse);
# Line 277 | Line 299
299          if (edit_block != NULL)
300              //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
301              edit_block->opRep(index);
302 <        return 0;
302 >        return 0;
303      }
304  
305      // set up the memory pool
# Line 287 | Line 309
309        }
310      else if (mem_pool == NULL) {
311         aux_data.space = mem_pool = new GXMemPool();
312 <    }
312 >    }
313      else {
314          mem_pool->refresh();
315      }
316 <    
316 >
317      /* set up the array of per-distance maximum scores. There
318         are max_diags + xdrop_offset distances to track, the first
319         xdrop_offset of which are 0 */
# Line 299 | Line 321
321      max_score = aux_data.max_score + xdrop_offset;
322      for (index = 0; index < xdrop_offset; index++)
323          aux_data.max_score[index] = 0;
324 <    
324 >
325      // fill in the initial offsets of the distance matrix
326  
327      last_seq2_off[0][diag_origin] = seq1_index;
328 <    max_score[0] = seq1_index * match_cost;
328 >    max_score[0] = seq1_index * aux_data.match_reward;
329      diag_lower = diag_origin - 1;
330      diag_upper = diag_origin + 1;
331      end1_reached = end2_reached = false;
# Line 328 | Line 350
350  
351          // compute the score for distance d corresponding to the X-dropoff criterion
352  
353 <        xdrop_score = max_score[d - xdrop_offset] +
354 <                      (match_cost + mismatch_cost) * d - xdrop_threshold;
355 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
353 >        xdrop_score = max_score[d - xdrop_offset] +
354 >                      (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 >        xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356  
357          // for each diagonal of interest
358          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
359              /* find the largest offset into seq2 that increases
360                 the distance from d-1 to d (i.e. keeps the alignment
361 <               from getting worse for as long as possible), then
361 >               from getting worse for as long as possible), then
362                 choose the offset into seq1 that will keep the
363 <               resulting diagonal fixed at k
364 <            
363 >               resulting diagonal fixed at k
364 >
365                 Note that this requires kInvalidOffset+1 to be smaller
366                 than any valid offset into seq2, i.e. to be negative */
367  
# Line 360 | Line 382
382                  continue;
383              }
384              diag_upper = k;
385 <            
386 <            /* slide down diagonal k until a mismatch
385 >
386 >            /* slide down diagonal k until a mismatch
387                 occurs. As long as only matches are encountered,
388                 the current distance d will not change */
389  
# Line 397 | Line 419
419                 can each be of size at most max_diags+2 */
420  
421              if (seq2_index == len2) {
422 <                diag_lower = k + 1;
422 >                diag_lower = k + 1;
423                  end2_reached = true;
424              }
425              if (seq1_index == len1) {
426 <                diag_upper = k - 1;
426 >                diag_upper = k - 1;
427                  end1_reached = true;
428              }
429          }  // end loop over diagonals
430  
431          // compute the maximum score possible for distance d
432 <        curr_score = curr_extent * (match_cost / 2) -
433 <                        d * (match_cost + mismatch_cost);
432 >        curr_score = curr_extent * (aux_data.match_reward / 2) -
433 >                        d * (aux_data.match_reward + aux_data.mismatch_penalty);
434          // if this is the best score seen so far, update the
435          // statistics of the best alignment
436          if (curr_score > max_score[d - 1]) {
# Line 417 | Line 439
439              best_diag = curr_diag;
440              seq2_align_len = curr_seq2_index;
441              seq1_align_len = curr_seq2_index + best_diag - diag_origin;
442 <        }
442 >        }
443          else {
444              max_score[d] = max_score[d - 1];
445          }
# Line 428 | Line 450
450          if (diag_lower > diag_upper)
451              break;
452  
453 <        /* set up for the next distance to examine. Because the
454 <           bounds increase by at most one for each distance,
455 <           diag_lower and diag_upper can each be of size at
453 >        /* set up for the next distance to examine. Because the
454 >           bounds increase by at most one for each distance,
455 >           diag_lower and diag_upper can each be of size at
456             most max_diags+2 */
457  
458          if (!end2_reached)
459 <            diag_lower--;
459 >            diag_lower--;
460          if (!end1_reached)
461              diag_upper++;
462  
# Line 455 | Line 477
477              }
478      }   // end loop over distinct distances
479  
480 <    
480 >
481      if (edit_block == NULL)
482          return best_dist;
483  
484      //----  perform traceback
485 <    d = best_dist;
485 >    d = best_dist;
486      seq1_index = seq1_align_len;
487      seq2_index = seq2_align_len;
488      // for all positive distances
# Line 476 | Line 498
498             traceback operation. best_diag starts off with the
499             value computed during the alignment process */
500  
501 <        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
501 >        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
502                                             best_diag, &new_seq2_index);
503  
504          if (new_diag == best_diag) {
# Line 484 | Line 506
506              if (seq2_index - new_seq2_index > 0) {
507                    edit_block->opRep(seq2_index - new_seq2_index);
508              }
509 <        }
509 >        }
510          else if (new_diag < best_diag) {
511              // smaller diagonal: issue a group of substitutions
512              //   and then a gap in seq2 */
# Line 493 | Line 515
515              }
516              //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
517              edit_block->opIns(1);
518 <        }
518 >        }
519          else {
520              // larger diagonal: issue a group of substitutions
521              //   and then a gap in seq1
# Line 502 | Line 524
524              }
525              edit_block->opDel(1);
526          }
527 <        d--;
528 <        best_diag = new_diag;
529 <        seq2_index = new_seq2_index;
527 >        d--;
528 >        best_diag = new_diag;
529 >        seq2_index = new_seq2_index;
530      }
531   //done:
532      // handle the final group of substitutions back to distance zero,
# Line 526 | Line 548
548        unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
549        if (op_type == 3)
550           GError("Error: printEditScript encountered op_type 3 ?!\n");
551 <      GMessage("%d%c ", num, xgapcodes[op_type]);
551 >      GMessage("%d%c ", num, xgapcodes[op_type]);
552        }
553      GMessage("\n");
554    }
# Line 536 | Line 558
558   int q_max=strlen(q_seq); //query
559   int s_max=strlen(s_seq); //subj
560   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
561 <          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
561 >          s_seq,  s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
562   }
563  
564   struct GXSeedTable {
# Line 575 | Line 597
597  
598   };
599  
600 <
601 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
602 < //overlap of right (3') end of seqb
603 < //hash the first 12 bases of seqa:
604 < int aimin=0;
605 < int aimax=GMIN(9,(a_len-4));
606 < int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
607 < int bimax=b_len-4;
608 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
609 < int b_maxlen=b_len; //number of cols in the diagonal matrix
610 < GXSeedTable gx(a_maxlen, b_maxlen);
611 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
612 < for (int ai=aimin;ai<=aimax;ai++) {
613 <    int32* av=(int32 *)(&seqa[ai]);
614 <    //for (int bi=b_len-4;bi>=bimin;bi--) {
615 <    for (int bi=bimin;bi<=bimax;bi++) {
616 <       if (gx.x(ai,bi))
617 <            continue; //already have a previous seed covering this region of this diagonal
618 <       int32* bv=(int32 *)(&seqb[bi]);
619 <       if (*av == *bv) {
620 <          //word match
621 <          //see if we can extend to the right
622 <            gx.x(ai+1,bi+1)=1;
623 <            gx.x(ai+2,bi+2)=1;
624 <            gx.x(ai+3,bi+3)=1;
625 <          int aix=ai+4;
626 <          int bix=bi+4;
627 <          int len=4;
628 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
629 <               {
630 <                 gx.x(aix,bix)=1;
631 <                 aix++;bix++;
632 <                 len++;
633 <               }
634 <          GXSeed* newseed=new GXSeed(ai,bi,len);
635 <          seeds.Add(newseed);
636 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
637 <          }
600 > const int a_m_score=2; //match score
601 > const int a_mis_score=-3; //mismatch
602 > const int a_dropoff_score=7;
603 > const int a_min_score=12; //at least 6 bases full match
604 >
605 > // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607 > //check minimum score and
608 > //for 3' adaptor trimming:
609 > //     require that the right end of the alignment for either the adaptor OR the read must be
610 > //     < 3 distance from its right end
611 > // for 5' adaptor trimming:
612 > //     require that the left end of the alignment for either the adaptor OR the read must
613 > //     be at coordinate < 3 from start
614 >
615 > bool extendUngapped(const char* a, int alen, int ai,
616 >                 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
617 > //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
618 > //if (debug) {
619 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
620 > //  }
621 > int a_l=ai; //alignment coordinates on a
622 > int a_r=ai+mlen-1;
623 > int b_l=bi; //alignment coordinates on b
624 > int b_r=bi+mlen-1;
625 > int ai_maxscore=ai;
626 > int bi_maxscore=bi;
627 > int score=mlen*a_m_score;
628 > int maxscore=score;
629 > int mism5score=a_mis_score;
630 > if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
631 > //try to extend to the left first, if possible
632 > while (ai>0 && bi>0) {
633 >   ai--;
634 >   bi--;
635 >   score+= (a[ai]==b[bi])? a_m_score : mism5score;
636 >   if (score>maxscore) {
637 >       ai_maxscore=ai;
638 >       bi_maxscore=bi;
639 >       maxscore=score;
640         }
641 <    }//for each 4-mer at the beginning of seqa
642 < for (int i=0;i<diagstrips->Count();i++) {
643 <    diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
644 <    }
645 < diagstrips->setSorted(true); //sort by score
646 < return diagstrips;
647 < }
641 >     else if (maxscore-score>a_dropoff_score) break;
642 >   }
643 > a_l=ai_maxscore;
644 > b_l=bi_maxscore;
645 > //now extend to the right
646 > ai_maxscore=a_r;
647 > bi_maxscore=b_r;
648 > ai=a_r;
649 > bi=b_r;
650 > score=maxscore;
651 > //sometimes there are extra As at the end of the read, ignore those
652 > if (a[alen-2]=='A' && a[alen-1]=='A') {
653 >    alen-=2;
654 >    while (a[alen-1]=='A' && alen>ai) alen--;
655 >    }
656 > while (ai<alen-1 && bi<blen-1) {
657 >   ai++;
658 >   bi++;
659 >   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
660 >   if (a[ai]==b[bi]) { //match
661 >      score+=a_m_score;
662 >      if (ai>=alen-2) {
663 >           score+=a_m_score-(alen-ai-1);
664 >           }
665 >      }
666 >    else { //mismatch
667 >      score+=a_mis_score;
668 >      }
669 >   if (score>maxscore) {
670 >       ai_maxscore=ai;
671 >       bi_maxscore=bi;
672 >       maxscore=score;
673 >       }
674 >     else if (maxscore-score>a_dropoff_score) break;
675 >   }
676 >  a_r=ai_maxscore;
677 >  b_r=bi_maxscore;
678 >  int a_ovh3=alen-a_r-1;
679 >  int b_ovh3=blen-b_r-1;
680 >  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
681 >  int mmovh5=(a_l<b_l)? a_l : b_l;
682 >  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683 >     if (a_l<a_ovh3) {
684 >        //adaptor closer to the left end (typical for 5' adaptor)
685 >        l5=a_r+1;
686 >        l3=alen-1;
687 >        }
688 >      else {
689 >        //adaptor matching at the right end (typical for 3' adaptor)
690 >        l5=0;
691 >        l3=a_l-1;
692 >        }
693 >     return true;
694 >     }
695 >  //do not trim:
696 >  l5=0;
697 >  l3=alen-1;
698 >  return false;
699 > }
700  
701 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
702 < //overlap of left (5') end of seqb
703 < //hash the last 12 bases of seqa:
704 < int aimin=GMAX(0,(a_len-16));
705 < int aimax=a_len-4;
630 < int bimin=0;
631 < int bimax=GMIN((a_len-2), (b_len-4));
632 < int a_maxlen=aimax+4; //number of rows in the diagonal matrix
633 < int b_maxlen=b_len; //number of cols in the diagonal matrix
701 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 > int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 > int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 > int b_end = (trim_intent==galn_TrimLeft) ?  bimax : sd.blen-6;
706   //gx.init(a_maxlen, b_maxlen);
707 < GXSeedTable gx(a_maxlen, b_maxlen);
708 < GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
709 < for (int ai=aimin;ai<=aimax;ai++) {
710 <    int32* av=(int32 *)(&seqa[ai]);
711 <    for (int bi=bimin;bi<=bimax;bi++) {
712 <       if (gx.x(ai,bi))
713 <          continue; //already have a previous seed covering this region of this diagonal
714 <       int32* bv=(int32 *)(&seqb[bi]);
715 <       if (*av == *bv) {
716 <          //word match
717 <          //see if we can extend to the right
718 <        gx.x(ai+1,bi+1)=1;
719 <        gx.x(ai+2,bi+2)=1;
720 <        gx.x(ai+3,bi+3)=1;
721 <          int aix=ai+4;
722 <          int bix=bi+4;
723 <          int len=4;
724 <          while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
725 <               {
726 <                 gx.x(aix,bix)=1;
727 <                 aix++;bix++;
728 <                 len++;
729 <               }
730 <          GXSeed* newseed=new GXSeed(ai,bi,len);
731 <          seeds.Add(newseed);
732 <          diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
733 <          }
734 <       }
735 <    }//for each 4-mer at the end of seqa
707 > GXSeedTable gx(sd.alen, sd.blen);
708 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 > for (int bi=b_start;bi<=b_end;bi++) {
710 >   //for each 6-mer of seqb
711 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713 >   if (alocs==NULL) continue;
714 >   //extend each hit
715 >   for (int h=0;h<alocs->Count();h++) {
716 >         int ai=alocs->Get(h);
717 >         //word match
718 >         if (gx.x(ai,bi))
719 >           //already have a previous seed covering this region of this diagonal
720 >           continue;
721 >         if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 >               continue; //improper positioning for 5' trimming
723 >         if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 >               continue; //improper positioning for 5' trimming
725 >
726 >         //TODO: there could be Ns in this seed, should we count/adjust score?
727 >         for (int i=0;i<6;i++)
728 >            gx.x(ai+i,bi+i)=1;
729 >         //see if we can extend to the right
730 >         int aix=ai+6;
731 >         int bix=bi+6;
732 >         int len=6;
733 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 >                 gx.x(aix,bix)=1;
735 >                 aix++;bix++;
736 >                 len++;
737 >                 }
738 >         if (len>sd.amlen) {
739 >                //heuristics: very likely the best we can get
740 >                //quick match shortcut
741 >                diagstrips->qmatch=new GXSeed(ai,bi,len);
742 >                return diagstrips;
743 >                }
744 >         if (bi>bimax && bi<bimin && len<9)
745 >                 //ignore mid-sequence seeds that are not high scoring
746 >             continue;
747 >
748 >         GXSeed* newseed=new GXSeed(ai,bi,len);
749 >         seeds.Add(newseed);
750 >         diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 >     //keep last resort terminal match to be used if no better alignment is there
752 >     if (bi<2 && ai+len>=sd.alen-1 &&
753 >                 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754 >                      diagstrips->tmatch_l=newseed;
755 >     //collectSeeds_R:
756 >         if (ai<2 && bi+len>sd.blen-2 &&
757 >                 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758 >                  diagstrips->tmatch_r=newseed;
759 >     }
760 > } //for each 6-mer of the read
761   for (int i=0;i<diagstrips->Count();i++) {
762      diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763      }
# Line 668 | Line 765
765   return diagstrips;
766   }
767  
671
672
768   int cmpSeedScore(const pointer p1, const pointer p2) {
769    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
770    GXSeed* s1=(GXSeed*)p1;
# Line 680 | Line 775
775    else return (s2->len-s1->len);
776   }
777  
778 + int cmpSeedScore_R(const pointer p1, const pointer p2) {
779 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780 +  GXSeed* s1=(GXSeed*)p1;
781 +  GXSeed* s2=(GXSeed*)p2;
782 +  if (s1->len==s2->len) {
783 +     return (s2->b_ofs-s1->b_ofs);
784 +     }
785 +  else return (s2->len-s1->len);
786 + }
787 +
788 +
789   int cmpSeedDiag(const pointer p1, const pointer p2) {
790    GXSeed* s1=(GXSeed*)p1;
791    GXSeed* s2=(GXSeed*)p2;
792    return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
793   }
794  
795 +
796 + int cmpDiagBands_R(const pointer p1, const pointer p2) {
797 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
798 +  GXBand* b1=(GXBand*)p1;
799 +  GXBand* b2=(GXBand*)p2;
800 +  if (b1->score==b2->score) {
801 +     return (b2->w_min_b-b1->w_min_b);
802 +     }
803 +  else return (b2->score-b1->score);
804 + }
805 +
806 +
807 +
808   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
809 <                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
810 <                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
809 >    const char* s_seq, int s_alnstart, int s_max,
810 >    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
811 >    CAlnTrim* trim, bool editscript) {
812    GXEditScript* ed_script_fwd = NULL;
813    GXEditScript* ed_script_rev = NULL;
814    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 697 | Line 817
817    s_alnstart--;
818    if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819      GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 <  if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 <    GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
820 >  /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 >    GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822 >           q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823 >           */
824    int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825    const char* q=q_seq+q_alnstart;
826    int q_avail=q_max-q_alnstart;
827    const char* s=s_seq+s_alnstart;
828    int s_avail=s_max-s_alnstart;
829    if (penalty<0) penalty=-penalty;
708  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
830    GXAlnInfo* alninfo=NULL;
831    bool freeAlnMem=(gxmem==NULL);
832    if (freeAlnMem) {
833 <     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
833 >     gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 >     reward=gxmem->match_reward;
835 >     penalty=gxmem->mismatch_penalty;
836 >     xdrop=gxmem->x_drop;
837       }
838 <  else gxmem->reset();
838 >   else
839 >         gxmem->reset();
840 >  int minMatch= trim ? trim->minMatch : 6;
841 >  int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842    int retscore = 0;
843    int numdiffs = 0;
844 <  if (trimtype==galn_TrimLeft) {
844 >  if (trim!=NULL && trim->type==galn_TrimLeft) {
845 >    //intent: trimming the left side
846      if (editscript)
847         ed_script_rev=new GXEditScript();
848  
849 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
850 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
849 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 >                    s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851      //check this extension here and bail out if it's not a good extension
852 <    if (s_alnstart+1-s_ext_l>3) {
852 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 >        q_alnstart+1-q_ext_l>1 &&
854 >        s_alnstart+1-s_ext_l>trim->l_boundary) {
855        delete ed_script_rev;
856        if (freeAlnMem) delete gxmem;
857        return NULL;
# Line 729 | Line 859
859  
860      if (editscript)
861         ed_script_fwd=new GXEditScript();
862 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
863 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
862 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864      numdiffs=numdiffs_r+numdiffs_l;
865      //convert num diffs to actual score
866 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
866 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867      if (editscript)
868         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869      }
# Line 741 | Line 871
871      if (editscript) {
872         ed_script_fwd=new GXEditScript();
873         }
874 <    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
875 <                            reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
874 >    int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 >                                     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876      //check extension here and bail out if not a good right extension
877      //assuming s_max is really at the right end of s_seq
878 <    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
878 >    if (trim!=NULL && trim->type==galn_TrimRight &&
879 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 >            q_alnstart+q_ext_r<q_max-2 &&
881 >            s_alnstart+s_ext_r<trim->r_boundary) {
882        delete ed_script_fwd;
883        if (freeAlnMem) delete gxmem;
884        return NULL;
885        }
886      if (editscript)
887         ed_script_rev=new GXEditScript();
888 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 <                   reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
888 >    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 >                                  s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890      //convert num diffs to actual score
891      numdiffs=numdiffs_r+numdiffs_l;
892 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
892 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893      if (editscript)
894         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895    }
763
896    if (retscore>=MIN_GREEDY_SCORE) {
897 <    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
897 >    alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898      int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899      alninfo->score=retscore;
900 +    if (gxmem->scaled) alninfo->score >>= 1;
901      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902   #ifdef GDEBUG
903 <    if (ed_script_rev) {
904 <       GMessage("Final Edit script ::: ");
905 <       printEditScript(ed_script_rev);
906 <       }
903 >    //if (ed_script_rev) {
904 >    //   GMessage("Final Edit script ::: ");
905 >    //   printEditScript(ed_script_rev);
906 >    //   }
907   #endif
908      alninfo->editscript=ed_script_rev;
909      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910      }
911    else {
912 <    if (freeAlnMem) delete gxmem;
912 >    //if (freeAlnMem) delete gxmem;
913      delete ed_script_rev;
914      delete alninfo;
915      alninfo=NULL;
916      }
917 +  if (freeAlnMem) delete gxmem;
918    delete ed_script_fwd;
919    return alninfo;
920   }
921  
922 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
923 <            SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
922 > GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
923 >                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924 >                       CAlnTrim* trim, bool editscript) {
925 > int reward=2;
926 > int penalty=10;
927 > int xdrop=32;
928 > if (gxmem) {
929 >   reward=gxmem->match_reward;
930 >   penalty=gxmem->mismatch_penalty;
931 >   xdrop=gxmem->x_drop;
932 >   }
933 > return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
934 >     reward, penalty, xdrop, gxmem, trim, editscript);
935 > }
936 >
937 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 >                                     CGreedyAlignData* gxmem, int min_pid) {
939    bool editscript=false;
940    #ifdef GDEBUG
941     editscript=true;
942 <   GMessage("==========> matching Right (3') end ========\n");
942 >   if (trim_type==galn_TrimLeft) {
943 >         GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944 >     }
945 >   else if (trim_type==galn_TrimRight) {
946 >     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947 >     }
948 >   else if (trim_type==galn_TrimEither) {
949 >     GMessage("==========> searching  both ends : %s\n", sd.aseq);
950 >     }
951    #endif
952 +  CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953    GList<GXSeed> rseeds(true,true,false);
954 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
797 <  GList<GXAlnInfo> galns(true,true,false);
798 <  int max_top_bands=5;
799 <  int top_band_count=0;
954 >  GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956 <  for (int b=0;b<alnbands->Count();b++) {
957 <     if (alnbands->Get(b)->score<4) break;
958 <     top_band_count++;
959 <     GXBand& band=*(alnbands->Get(b));
960 <     band.seeds.setSorted(cmpSeedScore);
961 <     anchor_seeds.Add(band.seeds.First());
962 <     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
963 <     }
964 <  #ifdef GDEBUG
965 <  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
966 <  #endif
967 <
956 >  //did we find a shortcut?
957 >  if (alnbands->qmatch) {
958 >    #ifdef GDEBUG
959 >     GMessage("::: Found a quick long match at %d, len %d\n",
960 >          alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961 >    #endif
962 >    anchor_seeds.Add(alnbands->qmatch);
963 >    }
964 >  else {
965 >    int max_top_bands=5;
966 >    int top_band_count=0;
967 >    for (int b=0;b<alnbands->Count();b++) {
968 >       if (alnbands->Get(b)->score<6) break;
969 >       //#ifdef GDEBUG
970 >       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971 >       //#endif
972 >       top_band_count++;
973 >       GXBand& band=*(alnbands->Get(b));
974 >       band.seeds.setSorted(cmpSeedScore);
975 >       anchor_seeds.Add(band.seeds.First());
976 >       //band.tested=true;
977 >       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978 >       }
979 >    //#ifdef GDEBUG
980 >    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981 >    //#endif
982 >    }
983 >  GList<GXAlnInfo> galns(true,true,false);
984    for (int i=0;i<anchor_seeds.Count();i++) {
985      GXSeed& aseed=*anchor_seeds[i];
986      int a1=aseed.a_ofs+(aseed.len>>1)+1;
987      int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
989 <                            seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
990 <                            match_reward, mismatch_penalty, Xdrop);
991 <    if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
988 >    trimInfo.seedlen=aseed.len;
989 > #ifdef GDEBUG
990 >    GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991 >           aseed.len);
992 > #endif
993 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 >    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996               galns.AddIfNew(alninfo, true);
997          else delete alninfo;
998      }
999  
1000 <  #ifdef GDEBUG
1001 <  for (int i=0;i<galns.Count();i++) {
1002 <    GXAlnInfo* alninfo=galns[i];
1003 <    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1004 <                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1005 <    if (alninfo->gapinfo!=NULL) {
1006 <      GMessage("Alignment:\n");
1007 <      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1000 >  if (galns.Count()==0) {
1001 >         //last resort: look for weaker terminal seeds
1002 >          GPVec<GXSeed> tmatches(2,false);
1003 >          if (trim_type!=galn_TrimRight) {
1004 >                 if (alnbands->tmatch_l)
1005 >                    tmatches.Add(alnbands->tmatch_l);
1006 >             }
1007 >          if (trim_type!=galn_TrimLeft) {
1008 >                 if (alnbands->tmatch_r)
1009 >                    tmatches.Add(alnbands->tmatch_r);
1010 >             }
1011 >          for (int i=0;i<tmatches.Count();i++) {
1012 >                GXSeed& aseed=*tmatches[i];
1013 >                int halfseed=aseed.len>>1;
1014 >                int a1=aseed.a_ofs+halfseed+1;
1015 >                int a2=aseed.b_ofs+halfseed+1;
1016 >                trimInfo.seedlen=aseed.len;
1017 > #ifdef GDEBUG
1018 >    GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 >           aseed.len);
1020 > #endif
1021 >        GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 >        if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 >                 galns.AddIfNew(alninfo, true);
1025 >             else delete alninfo;
1026 >        }//for each terminal seed
1027        }
1028 <    }
835 <  #endif
836 <  //---- done
1028 >  //---- found all alignments
1029    delete alnbands;
1030 <  if (galns.Count()) {
839 <    GXAlnInfo* bestaln=galns.Shift();
840 <    return bestaln;
841 <    }
842 <  else return NULL;
843 < }
844 <
845 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
846 <          SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
847 <  bool editscript=false;
848 <  #ifdef GDEBUG
849 <   editscript=true;
850 <   GMessage("==========> matching Left (5') end ========\n");
851 <  #endif
852 <  GList<GXSeed> rseeds(true,true,false);
853 <  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
854 <  GList<GXAlnInfo> galns(true,true,false);
855 <  int max_top_bands=5;
856 <  int top_band_count=0;
857 <  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
858 <  for (int b=0;b<alnbands->Count();b++) {
859 <     if (alnbands->Get(b)->score<4) break;
860 <     top_band_count++;
861 <     GXBand& band=*(alnbands->Get(b));
862 <     band.seeds.setSorted(cmpSeedScore);
863 <     anchor_seeds.Add(band.seeds.First());
864 <     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
865 <     }
866 <  #ifdef GDEBUG
867 <  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
868 <  #endif
869 <
870 <  for (int i=0;i<anchor_seeds.Count();i++) {
871 <    GXSeed& aseed=*anchor_seeds[i];
872 <    int a1=aseed.a_ofs+(aseed.len>>1)+1;
873 <    int a2=aseed.b_ofs+(aseed.len>>1)+1;
874 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
875 <                            seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
876 <                            match_reward, mismatch_penalty, Xdrop-2);
877 <    if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
878 <         //TODO: at 5' end we should use a higher pid threshold
879 <         // we could also decrease the X-drop value!
880 <            galns.AddIfNew(alninfo, true);
881 <       else delete alninfo;
882 <    }
883 <
1030 >  /*
1031    #ifdef GDEBUG
1032 +  //print all valid alignments found
1033    for (int i=0;i<galns.Count();i++) {
1034      GXAlnInfo* alninfo=galns[i];
1035      GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
# Line 892 | Line 1040
1040        }
1041      }
1042    #endif
1043 <  //---- done
896 <  delete alnbands;
1043 >  */
1044    if (galns.Count()) {
1045      GXAlnInfo* bestaln=galns.Shift();
1046 +    #ifdef GDEBUG
1047 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049 +      if (bestaln->gapinfo!=NULL) {
1050 +        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051 +        }
1052 +    #endif
1053      return bestaln;
1054      }
1055    else return NULL;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines