ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
(Generate patch)
# Line 5 | Line 5
5  
6   #include "GBase.h"
7   #include "GList.hh"
8 < #include <string.h>
8 > #include "gdna.h"
9  
10   //#define GDEBUG 1
11  
# Line 98 | Line 98
98  
99          if (GX_EDITOP_GET(oplast) == op) {
100              uint32 l=ops[opnum-1];
101 <            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
101 >            ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102                 (GX_EDITOP_VAL(l) + k));
103              }
104          else {
# Line 215 | Line 215
215  
216   };
217  
218 < #define GREEDY_MAX_COST_FRACTION 5
218 > #define GREEDY_MAX_COST_FRACTION 8
219   /* (was 2) sequence_length / (this number) is a measure of how hard the
220    alignment code will work to find the optimal alignment; in fact
221    this gives a worst case bound on the number of loop iterations */
# Line 227 | Line 227
227   #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
228  
229   // all auxiliary memory needed for the greedy extension algorithm
230 < struct SGreedyAlignMem {
230 > class CGreedyAlignData {
231 >   int d_diff;
232 >   int max_d;
233 > public:
234     int** last_seq2_off;              // 2-D array of distances
235     int* max_score;                   // array of maximum scores
236     GXMemPool* space;                // local memory pool for SGreedyOffset structs
237 <   int d_diff;
238 <   int max_d;
237 >   //
238 >   int match_reward;
239 >   int mismatch_penalty;
240 >   int x_drop;
241     // Allocate memory for the greedy gapped alignment algorithm
242 <   SGreedyAlignMem(int reward, int penalty, int Xdrop) {
242 >   CGreedyAlignData(int reward, int penalty, int xdrop) {
243        //int max_d, diff_d;
244        if (penalty<0) penalty=-penalty;
245        if (reward % 2) {
246           //scale params
247 <         reward = reward << 1;
248 <         penalty = (penalty << 1);
249 <         Xdrop = Xdrop<<1;
247 >         match_reward = reward << 1;
248 >         mismatch_penalty = (penalty << 1);
249 >         x_drop = xdrop<<1;
250 >         }
251 >      else {
252 >         match_reward=reward;
253 >         mismatch_penalty = penalty;
254 >         x_drop=xdrop;
255           }
256        //if (gap_open == 0 && gap_extend == 0)
257        //   gap_extend = (reward >> 1) + penalty;
# Line 254 | Line 264
264        space=NULL;           // local memory pool for SGreedyOffset structs
265        //if (score_params.gap_open==0 && score_params.gap_extend==0) {
266           //non-affine, simpler Greedy algorithm
267 <       d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
267 >       d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1;
268         GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
269         if (!last_seq2_off)
270           GError(GX_GALLOC_ERROR);
# Line 283 | Line 293
293       GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
294       if (!max_score) GError(GX_GALLOC_ERROR);
295       }
296 <   ~SGreedyAlignMem() {
296 >   ~CGreedyAlignData() {
297       if (last_seq2_off) {
298           GFREE(last_seq2_off[0]);
299           GFREE(last_seq2_off);
# Line 336 | Line 346
346         a_len=0;
347             b_len=0;
348             if (ed_script==NULL) return;
349 <           for (uint32 i=0; i<ed_script->opnum; i++) {
349 >           for (uint32 i=0; i<ed_script->opnum; i++) {
350                    int num=((ed_script->ops[i]) >> 2);
351                    char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
352                    if (op_type == 3 || op_type < 0 )
# Line 361 | Line 371
371             }
372  
373   #ifdef GDEBUG
374 <        void printAlignment(FILE* f, const char* sa, int sa_len,
374 >        void printAlignment(FILE* f, const char* sa, int sa_len,
375                       const char* sb, int sb_len) {
376                  //print seq A
377             char al[1024]; //display buffer for seq A
# Line 426 | Line 436
436         }
437   #endif
438    };
439 <  
439 >
440   struct GXAlnInfo {
441   const char *qseq;
442   int ql,qr;
# Line 456 | Line 466
466    bool operator<(GXAlnInfo& d) {
467      return ((score==d.score)? pid>d.pid : score>d.score);
468      }
459  bool operator>(GXAlnInfo& d) {
460    return ((score==d.score)? pid<d.pid : score<d.score);
461    }
469    bool operator==(GXAlnInfo& d) {
470      return (score==d.score && pid==d.pid);
471      }
# Line 474 | Line 481
481     bool operator<(GXSeed& d){
482        return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
483        }
477   bool operator>(GXSeed& d){
478      return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
479      }
484     bool operator==(GXSeed& d){
485        return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
486        }
# Line 499 | Line 503
503     //seeds for this, and diag+1 and diag+2 are stored here
504    int min_a, max_a; //maximal coordinates of the bundle
505    int min_b, max_b;
506 +  int w_min_b; //weighted average of left start coordinate
507 +  int avg_len;
508    GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
509    int score; //sum of seed scores (- overlapping_bases/2 - gaps)
510 +  bool tested;
511    GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
512            diag=start_diag;
513            min_a=MAX_INT;
# Line 508 | Line 515
515            max_a=0;
516            max_b=0;
517            score=0;
518 +          avg_len=0;
519 +          w_min_b=0;
520 +          tested=false;
521      if (seed!=NULL) addSeed(seed);
522      }
523    void addSeed(GXSeed* seed) {
524       seeds.Add(seed);
525       score+=seed->len;
526 +     avg_len+=seed->len;
527 +     w_min_b+=seed->b_ofs * seed->len;
528       //if (diag<0) diag=seed->diag; //should NOT be done like this
529       if (seed->a_ofs < min_a) min_a=seed->a_ofs;
530       if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
531       if (seed->b_ofs < min_b) min_b=seed->b_ofs;
532       if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
533       }
534 +
535    void finalize() {
536            //!! to be called only AFTER all seeds have been added
537            // seeds are sorted by b_ofs
538            //penalize seed gaps and overlaps on b sequence
539 +    if (avg_len==0) return;
540 +    w_min_b/=avg_len;
541 +    avg_len>>=1;
542            for (int i=1;i<seeds.Count();i++) {
543          GXSeed& sprev=*seeds[i-1];
544          GXSeed& scur=*seeds[i];
# Line 532 | Line 548
548          int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
549          int max_gap=b_gap;
550          int min_gap=a_gap;
551 <        if (min_gap>max_gap) swap(max_gap, min_gap);
551 >        if (min_gap>max_gap) Gswap(max_gap, min_gap);
552 >        int _penalty=0;
553          if (min_gap<0) { //overlap
554 <               if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
555 <                  else score+=min_gap;
554 >               if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); }
555 >                  else _penalty=-min_gap;
556                 }
557              else { //gap
558 <               score-=max_gap;
558 >               _penalty=max_gap;
559                 }
560 <              }//for each seed
560 >        score-=(_penalty>>1);
561 >        //score-=_penalty;
562 >              }//for each seed
563       }
564  
565    //bands will be sorted by decreasing score eventually, after all seeds are added
566    //more seeds better than one longer seed?
567    bool operator<(GXBand& d){
568 <     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
569 <     }
551 <  bool operator>(GXBand& d){
552 <     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
568 >     //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
569 >     return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score);
570       }
571    bool operator==(GXBand& d){
572 <     //return (score==d.score && seeds.Count()==d.seeds.Count());
573 <    return (score==d.score && seeds.Count()==d.seeds.Count());
572 >    //return (score==d.score && seeds.Count()==d.seeds.Count());
573 >     return (score==d.score && w_min_b==d.w_min_b);
574       }
575  
576   };
# Line 561 | Line 578
578   class GXBandSet:public GList<GXBand> {
579    public:
580     GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
581 +   GXSeed* tmatch; //terminal match to be used if there is no better alignment
582     int idxoffset; //global anti-diagonal->index offset (a_len-1)
583     //used to convert a diagonal to an index
584     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
# Line 574 | Line 592
592     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
593        idxoffset=a_len-1;
594        qmatch=NULL;
595 +      tmatch=NULL; //terminal match to be used if everything else fails
596            //diag will range from -a_len+1 to b_len-1, so after adjustment
597            //by idxoffset we get a max of a_len+b_len-2
598        int bcount=a_len+b_len-1;
# Line 594 | Line 613
613   };
614  
615  
616 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb
616 > uint16 get6mer(char* p);
617 > void table6mers(const char* s, int slen, GVec<uint16>* amers[]);
618  
619 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb
619 > GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, GVec<uint16>* amers[],
620 >         const char* seqb, int b_len); //for overlap at 3' end of seqb
621 >
622 > GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, GVec<uint16>* amers[],
623 >        const char* seqb, int b_len); //for overlap at 5' end of seqb
624  
625   void printEditScript(GXEditScript* ed_script);
626  
# Line 606 | Line 630
630                         bool reverse, int xdrop_threshold,
631                         int match_cost, int mismatch_cost,
632                         int& seq1_align_len, int& seq2_align_len,
633 <                       SGreedyAlignMem& aux_data,
633 >                       CGreedyAlignData& aux_data,
634                         GXEditScript *edit_block);
635  
636  
# Line 616 | Line 640
640    galn_TrimRight
641    };
642  
643 + struct CAlnTrim {
644 +  GAlnTrimType type;
645 +  int boundary; //base index (either left or right) excluding terminal poly-A stretches
646 +  void prepare(GAlnTrimType trim_type, const char* s, int s_len) {
647 +    type=trim_type;
648 +    boundary=0;
649 +    if (type==galn_TrimLeft) {
650 +        int s_lbound=0;
651 +        if (s[0]=='A' && s[1]=='A' && s[2]=='A') {
652 +           s_lbound=3;
653 +           while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
654 +           }
655 +        else if (s[1]=='A' && s[2]=='A' && s[3]=='A') {
656 +           s_lbound=4;
657 +           while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
658 +           }
659 +        boundary=s_lbound+3;
660 +        return;
661 +        }
662 +    if (type==galn_TrimRight) {
663 +       int r=s_len-1;
664 +       if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') {
665 +          r-=3;
666 +          while (r>0 && s[r]=='A') r--;
667 +          }
668 +       else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') {
669 +          r-=4;
670 +          while (r>0 && s[r]=='A') r--;
671 +          }
672 +       boundary=r-3;
673 +       }
674 +    }
675 +
676 +  CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len) {
677 +    prepare(trim_type, s, s_len);
678 +    }
679 +
680 +  bool validate(int sl, int sr, int alnpid, int adist) {
681 +   int alnlen=sr-sl+1;
682 +   sl--;sr--; //boundary is 0-based
683 +   int badj=0; //default boundary is 3 bases distance to end
684 +   int admax=1;
685 +   if (alnlen<13) {
686 +      //stricter boundary check
687 +      if (alnpid<90) return false;
688 +      badj=2;
689 +      if (alnlen<=7) { badj++; admax=0; }
690 +      }
691 +   if (adist>admax) return false;
692 +   if (type==galn_TrimRight) {
693 +      return (sr>=boundary+badj);
694 +      }
695 +   else {
696 +      //left match should be more stringent (5')
697 +      if (alnpid<93) {
698 +        if (alnlen<13) return false;
699 +        admax=0;
700 +        badj++;
701 +        }
702 +      return (sl<=boundary-badj);
703 +      }
704 +   }
705 +
706 + };
707 +
708 +
709   // reward MUST be >1, always
710   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
711 <                  const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype=galn_NoTrim,
712 <                     bool editscript=false, SGreedyAlignMem* gxmem=NULL, int reward=2, int penalty=3, int xdrop=8);
711 >                  const char* s_seq, int s_alnstart, int s_max,
712 >                  int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL,
713 >                  CAlnTrim* trim=NULL, bool editscript=false);
714 > GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
715 >                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
716 >                       CAlnTrim* trim=NULL, bool editscript=false);
717 >
718   GXAlnInfo* GreedyAlign(const char* q_seq,  int q_alnstart, const char* s_seq, int s_alnstart,
719          bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
720  
721 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
722 <          SGreedyAlignMem* gxmem, int match_reward=2, int mismatch_penalty=3, int Xdrop=8);
723 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
724 <            SGreedyAlignMem* gxmem, int match_reward=2, int mismatch_penalty=3, int Xdrop=8);
721 > GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
722 >           const char* seqb, int seqb_len, CGreedyAlignData* gxmem=NULL, int min_pid=83);
723 > GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
724 >           const char* seqb, int seqb_len, CGreedyAlignData* gxmem=NULL, int min_pid=73);
725   #endif

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines