ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
(Generate patch)
# Line 5 | Line 5
5  
6   #include "GBase.h"
7   #include "GList.hh"
8 < #include <string.h>
8 > #include "gdna.h"
9  
10 < #define GDEBUG 1
10 > //#define GDEBUG 1
11  
12   enum {
13      gxEDIT_OP_MASK = 0x3,
# Line 98 | Line 98
98  
99          if (GX_EDITOP_GET(oplast) == op) {
100              uint32 l=ops[opnum-1];
101 <            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
101 >            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102                 (GX_EDITOP_VAL(l) + k));
103              }
104          else {
# Line 215 | Line 215
215  
216   };
217  
218 < #define GREEDY_MAX_COST_FRACTION 5
218 > #define GREEDY_MAX_COST_FRACTION 8
219   /* (was 2) sequence_length / (this number) is a measure of how hard the
220    alignment code will work to find the optimal alignment; in fact
221    this gives a worst case bound on the number of loop iterations */
# Line 346 | Line 346
346         a_len=0;
347             b_len=0;
348             if (ed_script==NULL) return;
349 <           for (uint32 i=0; i<ed_script->opnum; i++) {
349 >           for (uint32 i=0; i<ed_script->opnum; i++) {
350                    int num=((ed_script->ops[i]) >> 2);
351                    char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
352                    if (op_type == 3 || op_type < 0 )
# Line 371 | Line 371
371             }
372  
373   #ifdef GDEBUG
374 <        void printAlignment(FILE* f, const char* sa, int sa_len,
374 >        void printAlignment(FILE* f, const char* sa, int sa_len,
375                       const char* sb, int sb_len) {
376                  //print seq A
377             char al[1024]; //display buffer for seq A
# Line 436 | Line 436
436         }
437   #endif
438    };
439 <  
439 >
440   struct GXAlnInfo {
441   const char *qseq;
442   int ql,qr;
# Line 444 | Line 444
444   int sl,sr;
445   int score;
446   double pid;
447 + bool strong;
448   GXEditScript* editscript;
449   CAlnGapInfo* gapinfo;
450   GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
# Line 456 | Line 457
457      sr=s_r;
458      score=sc;
459      pid=percid;
460 +    strong=false;
461      editscript=NULL;
462      gapinfo=NULL;
463      }
# Line 466 | Line 468
468    bool operator<(GXAlnInfo& d) {
469      return ((score==d.score)? pid>d.pid : score>d.score);
470      }
469  bool operator>(GXAlnInfo& d) {
470    return ((score==d.score)? pid<d.pid : score<d.score);
471    }
471    bool operator==(GXAlnInfo& d) {
472      return (score==d.score && pid==d.pid);
473      }
# Line 484 | Line 483
483     bool operator<(GXSeed& d){
484        return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
485        }
487   bool operator>(GXSeed& d){
488      return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
489      }
486     bool operator==(GXSeed& d){
487        return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
488        }
# Line 554 | Line 550
550          int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
551          int max_gap=b_gap;
552          int min_gap=a_gap;
553 <        if (min_gap>max_gap) swap(max_gap, min_gap);
553 >        if (min_gap>max_gap) Gswap(max_gap, min_gap);
554          int _penalty=0;
555          if (min_gap<0) { //overlap
556                 if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); }
# Line 574 | Line 570
570       //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
571       return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score);
572       }
577  bool operator>(GXBand& d){
578     //return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
579    return ((score==d.score) ? w_min_b>d.w_min_b : score<d.score);
580     }
573    bool operator==(GXBand& d){
574      //return (score==d.score && seeds.Count()==d.seeds.Count());
575       return (score==d.score && w_min_b==d.w_min_b);
# Line 588 | Line 580
580   class GXBandSet:public GList<GXBand> {
581    public:
582     GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
583 <   GXSeed* tmatch; //terminal match to be used if there is no better alignment
583 >   GXSeed* tmatch_r; //terminal match to be used if there is no better alignment
584 >   GXSeed* tmatch_l; //terminal match to be used if there is no better alignment
585     int idxoffset; //global anti-diagonal->index offset (a_len-1)
586     //used to convert a diagonal to an index
587     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
# Line 602 | Line 595
595     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
596        idxoffset=a_len-1;
597        qmatch=NULL;
598 <      tmatch=NULL; //terminal match to be used if everything else fails
598 >      tmatch_l=NULL; //terminal match to be used if everything else fails
599 >      tmatch_r=NULL;
600            //diag will range from -a_len+1 to b_len-1, so after adjustment
601            //by idxoffset we get a max of a_len+b_len-2
602        int bcount=a_len+b_len-1;
603        for (int i=0;i<bcount;i++)
604 <                   this->Add(new GXBand(i-idxoffset));
604 >              this->Add(new GXBand(i-idxoffset));
605             //unsorted, this should set fList[i]
606        }
607     ~GXBandSet() {
# Line 622 | Line 616
616       }
617   };
618  
619 + struct GXSeqData {
620 +  const char* aseq;
621 +  int alen;
622 +  const char* bseq;
623 +  int blen;
624 +  GVec<uint16>** amers;
625 +  int amlen; //minimum alignment length that's sufficient to
626 +             //trigger the quick extension heuristics
627 +  GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0,
628 +  GVec<uint16>* mers[]=NULL):aseq(sa), alen(la),
629 +     bseq(sb),  blen(lb), amers(mers), amlen(0) {
630 +   calc_amlen();
631 +   calc_bmlen();
632 +   }
633 +  void calc_amlen() {
634 +    if (alen) {
635 +       int ah=iround(double(alen)*0.8);
636 +       if (ah<12) ah=12;
637 +       if (amlen>ah) amlen=ah;
638 +       }
639 +    }
640 +  void calc_bmlen() {
641 +    if (blen) {
642 +      int bh = iround(double(alen)*0.6);
643 +      if (bh<12) bh=12;
644 +      if (amlen>bh) amlen=bh;
645 +      }
646 +    }
647 +  void update(const char* sa, int la, GVec<uint16>** mers,
648 +          const char* sb, int lb, int mlen=0) {
649 +     aseq=sa;
650 +     alen=la;
651 +     amers=mers;
652 +     if (mlen) {
653 +       amlen=mlen;
654 +       }
655 +       else calc_amlen();
656 +     if (sb==bseq && blen==lb) return;
657 +     bseq=sb;
658 +     blen=lb;
659 +     calc_bmlen();
660 +     }
661 +  /*
662 +  void update_b(const char* sb, int lb) {
663 +     if (sb==bseq && blen==lb) return;
664 +     bseq=sb;
665 +     blen=lb;
666 +     calc_bmlen();
667 +     }*/
668 + };
669 +
670 + uint16 get6mer(char* p);
671 + void table6mers(const char* s, int slen, GVec<uint16>* amers[]);
672  
673 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb
673 > //GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 3' end of seqb
674  
675 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb
675 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 5' end of seqb
676  
677   void printEditScript(GXEditScript* ed_script);
678  
# Line 640 | Line 687
687  
688  
689   enum GAlnTrimType {
690 <  galn_NoTrim=0,
690 >  //Describes trimming intent
691 >  galn_None=0, //no trimming, just alignment
692    galn_TrimLeft,
693 <  galn_TrimRight
693 >  galn_TrimRight,
694 >  galn_TrimEither //adaptor should be trimmed from either end
695    };
696  
697   struct CAlnTrim {
698    GAlnTrimType type;
699 <  int boundary; //base index (either left or right) excluding terminal poly-A stretches
700 <  void prepare(GAlnTrimType trim_type, const char* s, int s_len) {
701 <    type=trim_type;
702 <    boundary=0;
703 <    if (type==galn_TrimLeft) {
699 >  int l_boundary; //base index (either left or right) excluding terminal poly-A stretches
700 >  int r_boundary; //base index (either left or right) excluding terminal poly-A stretches
701 >  int alen; //query/adaptor seq length (for validate())
702 >  int safelen; //alignment length > amlen should be automatically validated
703 >  int seedlen;
704 >  void prepare(const char* s, int s_len) {
705 >    //type=trim_type;
706 >    //amlen=smlen;
707 >    l_boundary=0;
708 >    r_boundary=0;
709 >    //if (type==galn_TrimLeft) {
710          int s_lbound=0;
711          if (s[0]=='A' && s[1]=='A' && s[2]=='A') {
712             s_lbound=3;
# Line 661 | Line 716
716             s_lbound=4;
717             while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
718             }
719 <        boundary=s_lbound+3;
720 <        return;
721 <        }
722 <    if (type==galn_TrimRight) {
719 >        l_boundary=s_lbound+3;
720 >    //    return;
721 >    //    }
722 >    //if (type==galn_TrimRight) {
723         int r=s_len-1;
724         if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') {
725            r-=3;
# Line 674 | Line 729
729            r-=4;
730            while (r>0 && s[r]=='A') r--;
731            }
732 <       boundary=r-3;
733 <       }
732 >       r_boundary=r-3;
733 >    //   }
734      }
735  
736 <  CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len) {
737 <    prepare(trim_type, s, s_len);
736 >  CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int smlen):
737 >                   type(trim_type), l_boundary(0), r_boundary(0), alen(a_len), safelen(smlen) {
738 >    prepare(s, s_len);
739      }
740  
741 <  bool validate(int sl, int sr, int alnpid, int adist) {
742 <   int alnlen=sr-sl+1;
741 >  bool validate_R(int sr, int admax, int badj, int adist) {
742 >        if (adist>admax) return false;
743 >        return (sr>=r_boundary+badj);
744 >   }
745 >
746 >  bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) {
747 >        if (adist>admax) return false;
748 >    //left match should be more stringent (5')
749 >    if (alnpid<93) {
750 >      if (alnlen<13) return false;
751 >      admax=0;
752 >      badj++;
753 >      }
754 >    return (sl<=l_boundary-badj);
755 >  }
756 >
757 >  bool validate(GXAlnInfo* alninfo) {
758 >   int alnlen=alninfo->sr - alninfo->sl + 1;
759 >   if (alninfo->pid>90.0 && alnlen>safelen) {
760 >           //special case: heavy match, could be in the middle
761 >           if (alninfo->pid>95) alninfo->strong=true;
762 >           return true;
763 >       }
764 >   int sl=alninfo->sl;
765 >   int sr=alninfo->sr;
766     sl--;sr--; //boundary is 0-based
767 <   int badj=0;
768 <   int admax=3;
769 <   if (alnlen<11) {
767 >   int badj=0; //default boundary is 3 bases distance to end
768 >   int admax=1;
769 >   if (alnlen<13) {
770        //stricter boundary check
771 +      if (alninfo->pid<90) return false;
772        badj=2;
773 <      admax=1;
694 <      if (alnlen<=6) badj++;
773 >      if (alnlen<=7) { badj++; admax=0; }
774        }
775 <   if (adist>admax) return false;
775 >   if (type==galn_TrimLeft) {
776 >         return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr);
777 >     }
778 >   else if (type==galn_TrimRight) {
779 >         return validate_R(sr, admax, badj, alninfo->ql-1);
780 >     }
781 >   else if (type==galn_TrimEither) {
782 >     return (validate_R(sr, admax, badj, alninfo->ql-1) ||
783 >           validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr));
784 >     }
785 >   return true;
786 >   /*
787     if (type==galn_TrimRight) {
788        return (sr>=boundary+badj);
789        }
790     else {
791 <      //left side should be more stringent
792 <      if (alnpid<91) {
793 <        if (alnlen<11) return false;
791 >      //left match should be more stringent (5')
792 >      if (alnpid<93) {
793 >        if (alnlen<13) return false;
794 >        admax=0;
795          badj++;
796          }
797        return (sl<=boundary-badj);
798        }
799 +    */
800     }
709
801   };
802  
803  
# Line 722 | Line 813
813   GXAlnInfo* GreedyAlign(const char* q_seq,  int q_alnstart, const char* s_seq, int s_alnstart,
814          bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
815  
816 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
817 <          CGreedyAlignData* gxmem=NULL, int min_pid=83);
818 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
728 <          CGreedyAlignData* gxmem=NULL, int min_pid=73);
816 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type,
817 >                                CGreedyAlignData* gxmem=NULL, int min_pid=90);
818 > //GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90);
819   #endif

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines