ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
(Generate patch)
# Line 5 | Line 5
5  
6   #include "GBase.h"
7   #include "GList.hh"
8 < #include <string.h>
8 > #include "gdna.h"
9  
10 < #define GDEBUG 1
10 > //#define GDEBUG 1
11  
12   enum {
13      gxEDIT_OP_MASK = 0x3,
# Line 98 | Line 98
98  
99          if (GX_EDITOP_GET(oplast) == op) {
100              uint32 l=ops[opnum-1];
101 <            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
101 >            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102                 (GX_EDITOP_VAL(l) + k));
103              }
104          else {
# Line 215 | Line 215
215  
216   };
217  
218 < #define GREEDY_MAX_COST_FRACTION 5
218 > #define GREEDY_MAX_COST_FRACTION 8
219   /* (was 2) sequence_length / (this number) is a measure of how hard the
220    alignment code will work to find the optimal alignment; in fact
221    this gives a worst case bound on the number of loop iterations */
# Line 235 | Line 235
235     int* max_score;                   // array of maximum scores
236     GXMemPool* space;                // local memory pool for SGreedyOffset structs
237     //
238 +   bool scaled; //scores are all x2
239     int match_reward;
240     int mismatch_penalty;
241     int x_drop;
242 +   int xdrop_ofs;
243     // Allocate memory for the greedy gapped alignment algorithm
244     CGreedyAlignData(int reward, int penalty, int xdrop) {
245 +          scaled=false;
246 +          xdrop_ofs = 0;
247        //int max_d, diff_d;
248        if (penalty<0) penalty=-penalty;
249        if (reward % 2) {
250 <         //scale params
250 >         //scale params up
251 >         scaled=true;
252           match_reward = reward << 1;
253           mismatch_penalty = (penalty << 1);
254           x_drop = xdrop<<1;
# Line 253 | Line 258
258           mismatch_penalty = penalty;
259           x_drop=xdrop;
260           }
261 +      xdrop_ofs=(x_drop + (match_reward>>1)) /
262 +          (match_reward + mismatch_penalty) + 1;
263        //if (gap_open == 0 && gap_extend == 0)
264        //   gap_extend = (reward >> 1) + penalty;
265        const int max_dbseq_length=255; //adjust this accordingly
# Line 293 | Line 300
300       GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
301       if (!max_score) GError(GX_GALLOC_ERROR);
302       }
303 +
304     ~CGreedyAlignData() {
305       if (last_seq2_off) {
306           GFREE(last_seq2_off[0]);
# Line 346 | Line 354
354         a_len=0;
355             b_len=0;
356             if (ed_script==NULL) return;
357 <           for (uint32 i=0; i<ed_script->opnum; i++) {
357 >           for (uint32 i=0; i<ed_script->opnum; i++) {
358                    int num=((ed_script->ops[i]) >> 2);
359                    char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
360                    if (op_type == 3 || op_type < 0 )
# Line 371 | Line 379
379             }
380  
381   #ifdef GDEBUG
382 <        void printAlignment(FILE* f, const char* sa, int sa_len,
382 >        void printAlignment(FILE* f, const char* sa, int sa_len,
383                       const char* sb, int sb_len) {
384                  //print seq A
385             char al[1024]; //display buffer for seq A
# Line 436 | Line 444
444         }
445   #endif
446    };
447 <  
447 >
448   struct GXAlnInfo {
449   const char *qseq;
450   int ql,qr;
# Line 444 | Line 452
452   int sl,sr;
453   int score;
454   double pid;
455 + bool strong;
456   GXEditScript* editscript;
457   CAlnGapInfo* gapinfo;
458   GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
# Line 456 | Line 465
465      sr=s_r;
466      score=sc;
467      pid=percid;
468 +    strong=false;
469      editscript=NULL;
470      gapinfo=NULL;
471      }
# Line 466 | Line 476
476    bool operator<(GXAlnInfo& d) {
477      return ((score==d.score)? pid>d.pid : score>d.score);
478      }
469  bool operator>(GXAlnInfo& d) {
470    return ((score==d.score)? pid<d.pid : score<d.score);
471    }
479    bool operator==(GXAlnInfo& d) {
480      return (score==d.score && pid==d.pid);
481      }
# Line 484 | Line 491
491     bool operator<(GXSeed& d){
492        return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
493        }
487   bool operator>(GXSeed& d){
488      return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
489      }
494     bool operator==(GXSeed& d){
495        return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
496        }
# Line 554 | Line 558
558          int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
559          int max_gap=b_gap;
560          int min_gap=a_gap;
561 <        if (min_gap>max_gap) swap(max_gap, min_gap);
561 >        if (min_gap>max_gap) Gswap(max_gap, min_gap);
562          int _penalty=0;
563          if (min_gap<0) { //overlap
564                 if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); }
# Line 574 | Line 578
578       //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
579       return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score);
580       }
577  bool operator>(GXBand& d){
578     //return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
579    return ((score==d.score) ? w_min_b>d.w_min_b : score<d.score);
580     }
581    bool operator==(GXBand& d){
582      //return (score==d.score && seeds.Count()==d.seeds.Count());
583       return (score==d.score && w_min_b==d.w_min_b);
# Line 588 | Line 588
588   class GXBandSet:public GList<GXBand> {
589    public:
590     GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
591 <   GXSeed* tmatch; //terminal match to be used if there is no better alignment
591 >   GXSeed* tmatch_r; //terminal match to be used if there is no better alignment
592 >   GXSeed* tmatch_l; //terminal match to be used if there is no better alignment
593     int idxoffset; //global anti-diagonal->index offset (a_len-1)
594     //used to convert a diagonal to an index
595     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
# Line 602 | Line 603
603     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
604        idxoffset=a_len-1;
605        qmatch=NULL;
606 <      tmatch=NULL; //terminal match to be used if everything else fails
606 >      tmatch_l=NULL; //terminal match to be used if everything else fails
607 >      tmatch_r=NULL;
608            //diag will range from -a_len+1 to b_len-1, so after adjustment
609            //by idxoffset we get a max of a_len+b_len-2
610        int bcount=a_len+b_len-1;
611        for (int i=0;i<bcount;i++)
612 <                   this->Add(new GXBand(i-idxoffset));
612 >              this->Add(new GXBand(i-idxoffset));
613             //unsorted, this should set fList[i]
614        }
615     ~GXBandSet() {
# Line 622 | Line 624
624       }
625   };
626  
627 + inline int calc_safelen(int alen) {
628 +         int r=iround(double(alen*0.6));
629 +         return (r<22)? 22 : r;
630 +  }
631 +
632 + struct GXSeqData {
633 +  const char* aseq;
634 +  int alen;
635 +  const char* bseq;
636 +  int blen;
637 +  GVec<uint16>** amers;
638 +  int amlen; //minimum alignment length that's sufficient to
639 +             //trigger the quick extension heuristics
640 +  GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0,
641 +  GVec<uint16>* mers[]=NULL):aseq(sa), alen(la),
642 +     bseq(sb),  blen(lb), amers(mers), amlen(0) {
643 +   calc_amlen();
644 +   calc_bmlen();
645 +   }
646  
647 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb
647 >  void calc_amlen() {
648 >    if (alen) {
649 >       int ah=calc_safelen(alen);
650 >       if (amlen>ah) amlen=ah;
651 >       }
652 >    }
653 >  void calc_bmlen() {
654 >    if (blen) {
655 >      int bh = iround(double(blen)*0.6);
656 >      if (bh<22) bh=22;
657 >      if (amlen>bh) amlen=bh;
658 >      }
659 >    }
660 >  void update(const char* sa, int la, GVec<uint16>** mers,
661 >          const char* sb, int lb, int mlen=0) {
662 >     aseq=sa;
663 >     alen=la;
664 >     amers=mers;
665 >     if (mlen) {
666 >       amlen=mlen;
667 >       }
668 >       else calc_amlen();
669 >     if (sb==bseq && blen==lb) return;
670 >     bseq=sb;
671 >     blen=lb;
672 >     calc_bmlen();
673 >     }
674 >  /*
675 >  void update_b(const char* sb, int lb) {
676 >     if (sb==bseq && blen==lb) return;
677 >     bseq=sb;
678 >     blen=lb;
679 >     calc_bmlen();
680 >     }*/
681 > };
682  
683 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb
683 > uint16 get6mer(char* p);
684 > void table6mers(const char* s, int slen, GVec<uint16>* amers[]);
685  
686   void printEditScript(GXEditScript* ed_script);
687  
688  
689   int GXGreedyExtend(const char* seq1, int len1,
690                         const char* seq2, int len2,
691 <                       bool reverse, int xdrop_threshold,
636 <                       int match_cost, int mismatch_cost,
691 >                       bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
692                         int& seq1_align_len, int& seq2_align_len,
693                         CGreedyAlignData& aux_data,
694                         GXEditScript *edit_block);
695  
696  
697   enum GAlnTrimType {
698 <  galn_NoTrim=0,
698 >  //Describes trimming intent
699 >  galn_None=0, //no trimming, just alignment
700    galn_TrimLeft,
701 <  galn_TrimRight
701 >  galn_TrimRight,
702 >  galn_TrimEither //adaptor should be trimmed from either end
703    };
704  
705   struct CAlnTrim {
706    GAlnTrimType type;
707 <  int boundary; //base index (either left or right) excluding terminal poly-A stretches
708 <  void prepare(GAlnTrimType trim_type, const char* s, int s_len) {
709 <    type=trim_type;
710 <    boundary=0;
711 <    if (type==galn_TrimLeft) {
707 >  int minMatch; //minimum terminal exact match that will be removed from ends
708 >  int l_boundary; //base index (either left or right) excluding terminal poly-A stretches
709 >  int r_boundary; //base index (either left or right) excluding terminal poly-A stretches
710 >  int alen; //query/adaptor seq length (for validate())
711 >  int safelen; //alignment length > amlen should be automatically validated
712 >  int seedlen;
713 >  void prepare(const char* s, int s_len) {
714 >    //type=trim_type;
715 >    //amlen=smlen;
716 >    l_boundary=0;
717 >    r_boundary=0;
718 >    //if (type==galn_TrimLeft) {
719          int s_lbound=0;
720          if (s[0]=='A' && s[1]=='A' && s[2]=='A') {
721             s_lbound=3;
# Line 661 | Line 725
725             s_lbound=4;
726             while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
727             }
728 <        boundary=s_lbound+3;
729 <        return;
730 <        }
731 <    if (type==galn_TrimRight) {
728 >        l_boundary=s_lbound+3;
729 >    //    return;
730 >    //    }
731 >    //if (type==galn_TrimRight) {
732         int r=s_len-1;
733         if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') {
734            r-=3;
# Line 674 | Line 738
738            r-=4;
739            while (r>0 && s[r]=='A') r--;
740            }
741 <       boundary=r-3;
742 <       }
741 >       r_boundary=r-3;
742 >    //   }
743      }
744  
745 <  CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len) {
746 <    prepare(trim_type, s, s_len);
745 >  CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int minEndTrim, int smlen):
746 >                   type(trim_type), minMatch(minEndTrim), l_boundary(0), r_boundary(0),
747 >                   alen(a_len), safelen(smlen) {
748 >    prepare(s, s_len);
749      }
750  
751 <  bool validate(int sl, int sr, int alnpid, int adist) {
752 <   int alnlen=sr-sl+1;
751 >  bool validate_R(int sr, int admax, int badj, int adist) {
752 >        if (adist>admax) return false;
753 >        return (sr>=r_boundary+badj);
754 >   }
755 >
756 >  bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) {
757 >        if (adist>admax) return false;
758 >    //left match should be more stringent (5')
759 >    if (alnpid<93) {
760 >      if (alnlen<13 || alnlen<minMatch) return false;
761 >      admax=0;
762 >      badj++;
763 >      }
764 >    return (sl<=l_boundary-badj);
765 >  }
766 >
767 >  bool validate(GXAlnInfo* alninfo) {
768 >   int alnlen=alninfo->sr - alninfo->sl + 1;
769 >   if (alninfo->pid>90.0 && alnlen>safelen) {
770 >           //special case: heavy match, could be in the middle
771 >           if (alninfo->pid>94)
772 >                    alninfo->strong=true;
773 >           return true;
774 >       }
775 >   int sl=alninfo->sl;
776 >   int sr=alninfo->sr;
777     sl--;sr--; //boundary is 0-based
778 <   int badj=0;
779 <   int admax=3;
780 <   if (alnlen<11) {
778 >   int badj=0; //default boundary is 3 bases distance to end
779 >   int admax=1;
780 >   if (alnlen<13) {
781        //stricter boundary check
782 +      if (alninfo->pid<90) return false;
783        badj=2;
784 <      admax=1;
694 <      if (alnlen<=6) badj++;
784 >      if (alnlen<=7) { badj++; admax=0; }
785        }
786 <   if (adist>admax) return false;
786 >   if (type==galn_TrimLeft) {
787 >         return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr);
788 >     }
789 >   else if (type==galn_TrimRight) {
790 >         return validate_R(sr, admax, badj, alninfo->ql-1);
791 >     }
792 >   else if (type==galn_TrimEither) {
793 >     return (validate_R(sr, admax, badj, alninfo->ql-1) ||
794 >           validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr));
795 >     }
796 >   return true;
797 >   /*
798     if (type==galn_TrimRight) {
799        return (sr>=boundary+badj);
800        }
801     else {
802 <      //left side should be more stringent
803 <      if (alnpid<91) {
804 <        if (alnlen<11) return false;
802 >      //left match should be more stringent (5')
803 >      if (alnpid<93) {
804 >        if (alnlen<13) return false;
805 >        admax=0;
806          badj++;
807          }
808        return (sl<=boundary-badj);
809        }
810 +    */
811     }
709
812   };
813  
814  
815 < // reward MUST be >1, always
815 >
816 >
817 > //GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 3' end of seqb
818 >
819 > GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent); //for overlap at 5' end of seqb
820 >                                                                //=galn_None
821 > // reward MUST be >1 for this function
822   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
823                    const char* s_seq, int s_alnstart, int s_max,
824                    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL,
# Line 720 | Line 828
828                         CAlnTrim* trim=NULL, bool editscript=false);
829  
830   GXAlnInfo* GreedyAlign(const char* q_seq,  int q_alnstart, const char* s_seq, int s_alnstart,
831 <        bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
831 >        bool editscript=false, int reward=2, int penalty=10, int xdrop=32);
832  
833 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
834 <          CGreedyAlignData* gxmem=NULL, int min_pid=83);
835 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
728 <          CGreedyAlignData* gxmem=NULL, int min_pid=73);
833 > GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
834 >                                CGreedyAlignData* gxmem=NULL, int min_pid=90);
835 > //GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90);
836   #endif

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines