ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 53 | Line 53
53    if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54        *row1 = flast_d[d-1][diag-1];
55        return diag-1;
56 <      }
56 >      }
57    if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58        *row1 = flast_d[d-1][diag];
59        return diag;
# Line 65 | Line 65
65   void GapXEditScript::print() { //debug
66        GapXEditScript* p=this;
67        do {
68 <        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 <        } while ((p=p->next)!=NULL);
68 >        GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 >        } while ((p=p->next)!=NULL);
70        GMessage("\n");
71      }
72  
# Line 101 | Line 101
101   }
102  
103  
104 + uint16 get6mer(char* p) {
105 +  uint16 r=gdna2bit(p,3);
106 +  if (*p) {
107 +   uint16 v=gdna2bit(p,3);
108 +   r |= (v<<8);
109 +   }
110 +  return r;
111 + }
112  
113   //signal that a diagonal is invalid
114   static const int kInvalidOffset = -2;
# Line 129 | Line 137
137               //    }
138        }
139      else { //forward lookup
140 <             while (seq1_index < len1 && seq2_index < len2 &&
140 >             while (seq1_index < len1 && seq2_index < len2 &&
141                     //seq1[seq1_index] < 4 &&
142                     seq1[seq1_index] == seq2[seq2_index]) {
143                  ++seq1_index;
# Line 184 | Line 192
192                         bool reverse, int xdrop_threshold,
193                         int match_cost, int mismatch_cost,
194                         int& seq1_align_len, int& seq2_align_len,
195 <                       SGreedyAlignMem& aux_data,
195 >                       CGreedyAlignData& aux_data,
196                         GXEditScript *edit_block) {
197                         //GapPrelimEditBlock *edit_block,
198                         //bool& fence_hit, SGreedySeed *seed) {
# Line 204 | Line 212
212      int longest_match_run;
213      bool end1_reached, end2_reached;
214      GXMemPool* mem_pool;
215 <
215 >
216      /* ordinary dynamic programming alignment, for each offset
217         in seq1, walks through offsets in seq2 until an X-dropoff
218 <       test fails, saving the best score encountered along
218 >       test fails, saving the best score encountered along
219         the way. Instead of score, this code tracks the 'distance'
220         (number of mismatches plus number of gaps) between seq1
221         and seq2. Instead of walking through sequence offsets, it
222         walks through diagonals that can achieve a given distance.
223 <    
223 >
224         Note that in what follows, the numbering of diagonals implies
225 <       a dot matrix where increasing seq1 offsets go to the right on
225 >       a dot matrix where increasing seq1 offsets go to the right on
226         the x axis, and increasing seq2 offsets go up the y axis.
227         The gapped alignment thus proceeds up and to the right in
228         the graph, and diagonals are numbered increasing to the right */
# Line 223 | Line 231
231      best_diag = 0;
232  
233      /* set the number of distinct distances the algorithm will
234 <       examine in the search for an optimal alignment. The
235 <       heuristic worst-case running time of the algorithm is
234 >       examine in the search for an optimal alignment. The
235 >       heuristic worst-case running time of the algorithm is
236         O(max_dist**2 + (len1+len2)); for sequences which are
237         very similar, the average running time will be sig-
238         nificantly better than this */
239  
240      max_dist = GMIN(GREEDY_MAX_COST,
241 <                   len2 / GREEDY_MAX_COST_FRACTION + 1);
241 >                   (len2/GREEDY_MAX_COST_FRACTION + 1));
242  
243      /* the main loop assumes that the index of all diagonals is
244 <       biased to lie in the middle of allocated bookkeeping
244 >       biased to lie in the middle of allocated bookkeeping
245         structures */
246  
247      diag_origin = max_dist + 2;
# Line 251 | Line 259
259         xdrop_offset gives the distance backwards in the score
260         array to look */
261  
262 <    xdrop_offset = (xdrop_threshold + match_cost / 2) /
262 >    xdrop_offset = (xdrop_threshold + match_cost / 2) /
263                             (match_cost + mismatch_cost) + 1;
264 <    
264 >
265      // find the offset of the first mismatch between seq1 and seq2
266  
267      index = s_FindFirstMismatch(seq1, len1,  seq2, len2, 0, 0, reverse);
# Line 277 | Line 285
285          if (edit_block != NULL)
286              //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
287              edit_block->opRep(index);
288 <        return 0;
288 >        return 0;
289      }
290  
291      // set up the memory pool
# Line 287 | Line 295
295        }
296      else if (mem_pool == NULL) {
297         aux_data.space = mem_pool = new GXMemPool();
298 <    }
298 >    }
299      else {
300          mem_pool->refresh();
301      }
302 <    
302 >
303      /* set up the array of per-distance maximum scores. There
304         are max_diags + xdrop_offset distances to track, the first
305         xdrop_offset of which are 0 */
# Line 299 | Line 307
307      max_score = aux_data.max_score + xdrop_offset;
308      for (index = 0; index < xdrop_offset; index++)
309          aux_data.max_score[index] = 0;
310 <    
310 >
311      // fill in the initial offsets of the distance matrix
312  
313      last_seq2_off[0][diag_origin] = seq1_index;
# Line 328 | Line 336
336  
337          // compute the score for distance d corresponding to the X-dropoff criterion
338  
339 <        xdrop_score = max_score[d - xdrop_offset] +
339 >        xdrop_score = max_score[d - xdrop_offset] +
340                        (match_cost + mismatch_cost) * d - xdrop_threshold;
341 <        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
341 >        xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
342  
343          // for each diagonal of interest
344          for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
345              /* find the largest offset into seq2 that increases
346                 the distance from d-1 to d (i.e. keeps the alignment
347 <               from getting worse for as long as possible), then
347 >               from getting worse for as long as possible), then
348                 choose the offset into seq1 that will keep the
349 <               resulting diagonal fixed at k
350 <            
349 >               resulting diagonal fixed at k
350 >
351                 Note that this requires kInvalidOffset+1 to be smaller
352                 than any valid offset into seq2, i.e. to be negative */
353  
# Line 360 | Line 368
368                  continue;
369              }
370              diag_upper = k;
371 <            
372 <            /* slide down diagonal k until a mismatch
371 >
372 >            /* slide down diagonal k until a mismatch
373                 occurs. As long as only matches are encountered,
374                 the current distance d will not change */
375  
# Line 397 | Line 405
405                 can each be of size at most max_diags+2 */
406  
407              if (seq2_index == len2) {
408 <                diag_lower = k + 1;
408 >                diag_lower = k + 1;
409                  end2_reached = true;
410              }
411              if (seq1_index == len1) {
412 <                diag_upper = k - 1;
412 >                diag_upper = k - 1;
413                  end1_reached = true;
414              }
415          }  // end loop over diagonals
416  
417          // compute the maximum score possible for distance d
418 <        curr_score = curr_extent * (match_cost / 2) -
418 >        curr_score = curr_extent * (match_cost / 2) -
419                          d * (match_cost + mismatch_cost);
420          // if this is the best score seen so far, update the
421          // statistics of the best alignment
# Line 417 | Line 425
425              best_diag = curr_diag;
426              seq2_align_len = curr_seq2_index;
427              seq1_align_len = curr_seq2_index + best_diag - diag_origin;
428 <        }
428 >        }
429          else {
430              max_score[d] = max_score[d - 1];
431          }
# Line 428 | Line 436
436          if (diag_lower > diag_upper)
437              break;
438  
439 <        /* set up for the next distance to examine. Because the
440 <           bounds increase by at most one for each distance,
441 <           diag_lower and diag_upper can each be of size at
439 >        /* set up for the next distance to examine. Because the
440 >           bounds increase by at most one for each distance,
441 >           diag_lower and diag_upper can each be of size at
442             most max_diags+2 */
443  
444          if (!end2_reached)
445 <            diag_lower--;
445 >            diag_lower--;
446          if (!end1_reached)
447              diag_upper++;
448  
# Line 455 | Line 463
463              }
464      }   // end loop over distinct distances
465  
466 <    
466 >
467      if (edit_block == NULL)
468          return best_dist;
469  
470      //----  perform traceback
471 <    d = best_dist;
471 >    d = best_dist;
472      seq1_index = seq1_align_len;
473      seq2_index = seq2_align_len;
474      // for all positive distances
# Line 476 | Line 484
484             traceback operation. best_diag starts off with the
485             value computed during the alignment process */
486  
487 <        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
487 >        new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
488                                             best_diag, &new_seq2_index);
489  
490          if (new_diag == best_diag) {
# Line 484 | Line 492
492              if (seq2_index - new_seq2_index > 0) {
493                    edit_block->opRep(seq2_index - new_seq2_index);
494              }
495 <        }
495 >        }
496          else if (new_diag < best_diag) {
497              // smaller diagonal: issue a group of substitutions
498              //   and then a gap in seq2 */
# Line 493 | Line 501
501              }
502              //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
503              edit_block->opIns(1);
504 <        }
504 >        }
505          else {
506              // larger diagonal: issue a group of substitutions
507              //   and then a gap in seq1
# Line 502 | Line 510
510              }
511              edit_block->opDel(1);
512          }
513 <        d--;
514 <        best_diag = new_diag;
515 <        seq2_index = new_seq2_index;
513 >        d--;
514 >        best_diag = new_diag;
515 >        seq2_index = new_seq2_index;
516      }
517   //done:
518      // handle the final group of substitutions back to distance zero,
# Line 526 | Line 534
534        unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
535        if (op_type == 3)
536           GError("Error: printEditScript encountered op_type 3 ?!\n");
537 <      GMessage("%d%c ", num, xgapcodes[op_type]);
537 >      GMessage("%d%c ", num, xgapcodes[op_type]);
538        }
539      GMessage("\n");
540    }
# Line 536 | Line 544
544   int q_max=strlen(q_seq); //query
545   int s_max=strlen(s_seq); //subj
546   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
547 <          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
547 >          s_seq,  s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
548   }
549  
550   struct GXSeedTable {
# Line 575 | Line 583
583  
584   };
585  
586 + const int a_m_score=2; //match score
587 + const int a_mis_score=-3; //mismatch
588 + const int a_dropoff_score=7;
589 + const int a_min_score=12; //at least 6 bases full match
590 +
591 + // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
592 + //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
593 + //check minimum score and
594 + //for 3' adapter trimming:
595 + //     require that the right end of the alignment for either the adaptor OR the read must be
596 + //     < 3 distance from its right end
597 + // for 5' adapter trimming:
598 + //     require that the left end of the alignment for either the adaptor OR the read must
599 + //     be at coordinate < 3 from start
600 +
601 + bool extendUngapped(const char* a, int alen, int ai,
602 +                 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
603 + //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
604 + //if (debug) {
605 + //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
606 + //  }
607 + int a_l=ai; //alignment coordinates on a
608 + int a_r=ai+mlen-1;
609 + int b_l=bi; //alignment coordinates on b
610 + int b_r=bi+mlen-1;
611 + int ai_maxscore=ai;
612 + int bi_maxscore=bi;
613 + int score=mlen*a_m_score;
614 + int maxscore=score;
615 + int mism5score=a_mis_score;
616 + if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
617 + //try to extend to the left first, if possible
618 + while (ai>0 && bi>0) {
619 +   ai--;
620 +   bi--;
621 +   score+= (a[ai]==b[bi])? a_m_score : mism5score;
622 +   if (score>maxscore) {
623 +       ai_maxscore=ai;
624 +       bi_maxscore=bi;
625 +       maxscore=score;
626 +       }
627 +     else if (maxscore-score>a_dropoff_score) break;
628 +   }
629 + a_l=ai_maxscore;
630 + b_l=bi_maxscore;
631 + //now extend to the right
632 + ai_maxscore=a_r;
633 + bi_maxscore=b_r;
634 + ai=a_r;
635 + bi=b_r;
636 + score=maxscore;
637 + //sometimes there are extra As at the end of the read, ignore those
638 + if (a[alen-2]=='A' && a[alen-1]=='A') {
639 +    alen-=2;
640 +    while (a[alen-1]=='A' && alen>ai) alen--;
641 +    }
642 + while (ai<alen-1 && bi<blen-1) {
643 +   ai++;
644 +   bi++;
645 +   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
646 +   if (a[ai]==b[bi]) { //match
647 +      score+=a_m_score;
648 +      if (ai>=alen-2) {
649 +           score+=a_m_score-(alen-ai-1);
650 +           }
651 +      }
652 +    else { //mismatch
653 +      score+=a_mis_score;
654 +      }
655 +   if (score>maxscore) {
656 +       ai_maxscore=ai;
657 +       bi_maxscore=bi;
658 +       maxscore=score;
659 +       }
660 +     else if (maxscore-score>a_dropoff_score) break;
661 +   }
662 +  a_r=ai_maxscore;
663 +  b_r=bi_maxscore;
664 +  int a_ovh3=alen-a_r-1;
665 +  int b_ovh3=blen-b_r-1;
666 +  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
667 +  int mmovh5=(a_l<b_l)? a_l : b_l;
668 +  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
669 +     if (a_l<a_ovh3) {
670 +        //adapter closer to the left end (typical for 5' adapter)
671 +        l5=a_r+1;
672 +        l3=alen-1;
673 +        }
674 +      else {
675 +        //adapter matching at the right end (typical for 3' adapter)
676 +        l5=0;
677 +        l3=a_l-1;
678 +        }
679 +     return true;
680 +     }
681 +  //do not trim:
682 +  l5=0;
683 +  l3=alen-1;
684 +  return false;
685 + }
686 +
687  
688 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
688 > GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
689 >        GVec<uint16> amers[], const char* seqb, int b_len) {
690   //overlap of right (3') end of seqb
691   //hash the first 12 bases of seqa:
692   int aimin=0;
# Line 609 | Line 719
719                   aix++;bix++;
720                   len++;
721                 }
722 +          if (len>7) {
723 +              //heuristics: very likely the best we can get
724 +              int qlen=len;
725 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
726 +                aix++;
727 +                bix++;
728 +                qlen++;
729 +                }
730 +              if (qlen>10) { //found a quick match shortcut
731 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
732 +                 return diagstrips;
733 +                 }
734 +              }
735            GXSeed* newseed=new GXSeed(ai,bi,len);
736            seeds.Add(newseed);
737            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
738 +          //special last resort terminal match to be used if no better alignment is there
739 +          if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
740            }
741         }
742      }//for each 4-mer at the beginning of seqa
# Line 622 | Line 747
747   return diagstrips;
748   }
749  
750 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
750 > GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
751 >        GVec<uint16> amers[], const char* seqb, int b_len) {
752   //overlap of left (5') end of seqb
753   //hash the last 12 bases of seqa:
754   int aimin=GMAX(0,(a_len-16));
# Line 655 | Line 781
781                   aix++;bix++;
782                   len++;
783                 }
784 +          if (len>7) {
785 +              //heuristics: very likely the best we can get
786 +              int qlen=len;
787 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
788 +                aix++;
789 +                bix++;
790 +                qlen++;
791 +                }
792 +              if (qlen>10) { //found a quick match shortcut
793 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
794 +                 return diagstrips;
795 +                 }
796 +              }
797            GXSeed* newseed=new GXSeed(ai,bi,len);
798            seeds.Add(newseed);
799            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
800 +          //special last resort terminal match to be used if no better alignment is there
801 +          if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
802            }
803         }
804      }//for each 4-mer at the end of seqa
# Line 668 | Line 809
809   return diagstrips;
810   }
811  
671
672
812   int cmpSeedScore(const pointer p1, const pointer p2) {
813    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
814    GXSeed* s1=(GXSeed*)p1;
# Line 686 | Line 825
825    return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
826   }
827  
828 +
829 + int cmpDiagBands_R(const pointer p1, const pointer p2) {
830 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
831 +  GXBand* b1=(GXBand*)p1;
832 +  GXBand* b2=(GXBand*)p2;
833 +  if (b1->score==b2->score) {
834 +     return (b2->w_min_b-b1->w_min_b);
835 +     }
836 +  else return (b2->score-b1->score);
837 + }
838 +
839 +
840 +
841   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
842 <                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
843 <                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
842 >    const char* s_seq, int s_alnstart, int s_max,
843 >    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
844 >    CAlnTrim* trim, bool editscript) {
845    GXEditScript* ed_script_fwd = NULL;
846    GXEditScript* ed_script_rev = NULL;
847    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 705 | Line 858
858    const char* s=s_seq+s_alnstart;
859    int s_avail=s_max-s_alnstart;
860    if (penalty<0) penalty=-penalty;
861 <  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
861 >  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
862    GXAlnInfo* alninfo=NULL;
863    bool freeAlnMem=(gxmem==NULL);
864    if (freeAlnMem) {
865 <     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
865 >     gxmem=new CGreedyAlignData(reward, penalty, xdrop);
866       }
867    else gxmem->reset();
868    int retscore = 0;
869    int numdiffs = 0;
870 <  if (trimtype==galn_TrimLeft) {
870 >  if (trim!=NULL && trim->type==galn_TrimLeft) {
871      if (editscript)
872         ed_script_rev=new GXEditScript();
873  
874 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
874 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
875                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
876      //check this extension here and bail out if it's not a good extension
877 <    if (s_alnstart+1-s_ext_l>3) {
877 >    if (s_alnstart+1-s_ext_l>trim->boundary) {
878        delete ed_script_rev;
879        if (freeAlnMem) delete gxmem;
880        return NULL;
# Line 733 | Line 886
886                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
887      numdiffs=numdiffs_r+numdiffs_l;
888      //convert num diffs to actual score
889 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
889 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
890      if (editscript)
891         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
892      }
# Line 745 | Line 898
898                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
899      //check extension here and bail out if not a good right extension
900      //assuming s_max is really at the right end of s_seq
901 <    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
901 >    if (trim!=NULL && trim->type==galn_TrimRight &&
902 >           s_alnstart+s_ext_r<trim->boundary) {
903        delete ed_script_fwd;
904        if (freeAlnMem) delete gxmem;
905        return NULL;
# Line 756 | Line 910
910                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
911      //convert num diffs to actual score
912      numdiffs=numdiffs_r+numdiffs_l;
913 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
913 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
914      if (editscript)
915         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
916    }
# Line 767 | Line 921
921      alninfo->score=retscore;
922      alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
923   #ifdef GDEBUG
924 <    if (ed_script_rev) {
925 <       GMessage("Final Edit script ::: ");
926 <       printEditScript(ed_script_rev);
927 <       }
924 >    //if (ed_script_rev) {
925 >    //   GMessage("Final Edit script ::: ");
926 >    //   printEditScript(ed_script_rev);
927 >    //   }
928   #endif
929      alninfo->editscript=ed_script_rev;
930      alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
# Line 784 | Line 938
938    delete ed_script_fwd;
939    return alninfo;
940   }
941 +
942 + GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
943 +                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
944 +                       CAlnTrim* trim, bool editscript) {
945 + int reward=2;
946 + int penalty=3;
947 + int xdrop=8;
948 + if (gxmem) {
949 +   reward=gxmem->match_reward;
950 +   penalty=gxmem->mismatch_penalty;
951 +   xdrop=gxmem->x_drop;
952 +   }
953 + return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
954 +     reward, penalty, xdrop, gxmem, trim, editscript);
955 + }
956 +
957 + GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
958 +                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
959 +  bool editscript=false;
960 +  #ifdef GDEBUG
961 +   editscript=true;
962 +  // GMessage("==========> matching Right (3') end ========\n");
963 +  #endif
964 +
965 +  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
966 +  GList<GXSeed> rseeds(true,true,false);
967 +  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
968 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
969 +  //did we find a shortcut?
970 +  if (alnbands->qmatch) {
971 +    //#ifdef GDEBUG
972 +    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
973 +    //#endif
974 +    anchor_seeds.Add(alnbands->qmatch);
975 +    }
976 +  else {
977 +    int max_top_bands=5;
978 +    int top_band_count=0;
979 +    for (int b=0;b<alnbands->Count();b++) {
980 +       if (alnbands->Get(b)->score<4) break;
981 +       //#ifdef GDEBUG
982 +       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
983 +       //#endif
984 +       top_band_count++;
985 +       GXBand& band=*(alnbands->Get(b));
986 +       band.seeds.setSorted(cmpSeedScore);
987 +       anchor_seeds.Add(band.seeds.First());
988 +       band.tested=true;
989 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
990 +       }
991 +    //#ifdef GDEBUG
992 +    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
993 +    //#endif
994 +    }
995 +
996 +  GList<GXAlnInfo> galns(true,true,false);
997 +  for (int i=0;i<anchor_seeds.Count();i++) {
998 +    GXSeed& aseed=*anchor_seeds[i];
999 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1000 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1001 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1002 +                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1003 +    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1004 +             galns.AddIfNew(alninfo, true);
1005 +        else delete alninfo;
1006 +    }
1007 +  //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1008 +  //we should also try some seeds closer to the 3' end
1009 +  if (galns.Count()==0) {
1010 +    anchor_seeds.Clear();
1011 +    alnbands->setSorted(cmpDiagBands_R);
1012 +    int max_top_bands=4;
1013 +    int top_band_count=0;
1014 +    //#ifdef GDEBUG
1015 +    //GMessage(":::>> Retrying adjusting sort order.\n");
1016 +    //#endif
1017 +    if (alnbands->tmatch) {
1018 +      //anchor_seeds.setSorted(false);
1019 +      anchor_seeds.Add(alnbands->tmatch);
1020 +      }
1021 +    for (int b=0;b<alnbands->Count();b++) {
1022 +       if (alnbands->Get(b)->score<4) break;
1023 +       //#ifdef GDEBUG
1024 +       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1025 +       //#endif
1026 +       if (alnbands->Get(b)->tested) continue;
1027 +       top_band_count++;
1028 +       GXBand& band=*(alnbands->Get(b));
1029 +       band.seeds.setSorted(cmpSeedScore);
1030 +       anchor_seeds.Add(band.seeds.First());
1031 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1032 +       }
1033 +    //#ifdef GDEBUG
1034 +    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1035 +    //#endif
1036 +    for (int i=0;i<anchor_seeds.Count();i++) {
1037 +      GXSeed& aseed=*anchor_seeds[i];
1038 +      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1039 +      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1040 +      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1041 +                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1042 +      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1043 +               galns.AddIfNew(alninfo, true);
1044 +          else delete alninfo;
1045 +      }
1046 +    }
1047 +  //---- done
1048 +  delete alnbands;
1049 +  if (galns.Count()) {
1050 +    GXAlnInfo* bestaln=galns.Shift();
1051 +    #ifdef GDEBUG
1052 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1053 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1054 +      if (bestaln->gapinfo!=NULL) {
1055 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1056 +        }
1057 +    #endif
1058 +
1059 +    return bestaln;
1060 +    }
1061 +  else return NULL;
1062 + }
1063 +
1064 + GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
1065 +                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1066 +  bool editscript=false;
1067 +  #ifdef GDEBUG
1068 +   editscript=true;
1069 +   //GMessage("==========> matching Left (5') end ========\n");
1070 +  #endif
1071 +  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1072 +  GList<GXSeed> rseeds(true,true,false);
1073 +  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1074 +  GList<GXAlnInfo> galns(true,true,false);
1075 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1076 +
1077 +  if (alnbands->qmatch) {
1078 +    //#ifdef GDEBUG
1079 +    // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1080 +    //#endif
1081 +    anchor_seeds.Add(alnbands->qmatch);
1082 +    }
1083 +  else {
1084 +    int max_top_bands=5;
1085 +    int top_band_count=0;
1086 +    for (int b=0;b<alnbands->Count();b++) {
1087 +       if (alnbands->Get(b)->score<4) break;
1088 +       //#ifdef GDEBUG
1089 +       //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1090 +       //#endif
1091 +       top_band_count++;
1092 +       GXBand& band=*(alnbands->Get(b));
1093 +       band.seeds.setSorted(cmpSeedScore);
1094 +       anchor_seeds.Add(band.seeds.First());
1095 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1096 +       }
1097 +    //#ifdef GDEBUG
1098 +    //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1099 +    //#endif
1100 +    }
1101 +  for (int i=0;i<anchor_seeds.Count();i++) {
1102 +    GXSeed& aseed=*anchor_seeds[i];
1103 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106 +                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107 +    if (alninfo && alninfo->pid>=min_pid
1108 +           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1109 +            galns.AddIfNew(alninfo, true);
1110 +       else delete alninfo;
1111 +    }
1112 +  if (galns.Count()==0 && alnbands->tmatch) {
1113 +      GXSeed& aseed=*alnbands->tmatch;
1114 +      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1115 +      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1116 +      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1117 +                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1118 +      if (alninfo) galns.Add(alninfo);
1119 +      }
1120 +  #ifdef GDEBUG
1121 +  /*
1122 +  for (int i=0;i<galns.Count();i++) {
1123 +    GXAlnInfo* alninfo=galns[i];
1124 +    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1125 +                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1126 +    if (alninfo->gapinfo!=NULL) {
1127 +      GMessage("Alignment:\n");
1128 +      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1129 +      }
1130 +    }
1131 +  */
1132 +  #endif
1133 +  //---- done
1134 +  delete alnbands;
1135 +  if (galns.Count()) {
1136 +    GXAlnInfo* bestaln=galns.Shift();
1137 +    #ifdef GDEBUG
1138 +      GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1139 +          bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1140 +      if (bestaln->gapinfo!=NULL) {
1141 +        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1142 +        }
1143 +    #endif
1144 +    return bestaln;
1145 +    }
1146 +  else return NULL;
1147 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines