ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 184 | Line 184
184                         bool reverse, int xdrop_threshold,
185                         int match_cost, int mismatch_cost,
186                         int& seq1_align_len, int& seq2_align_len,
187 <                       SGreedyAlignMem& aux_data,
187 >                       CGreedyAlignData& aux_data,
188                         GXEditScript *edit_block) {
189                         //GapPrelimEditBlock *edit_block,
190                         //bool& fence_hit, SGreedySeed *seed) {
# Line 536 | Line 536
536   int q_max=strlen(q_seq); //query
537   int s_max=strlen(s_seq); //subj
538   return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 <          s_seq,  s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
539 >          s_seq,  s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540   }
541  
542   struct GXSeedTable {
# Line 575 | Line 575
575  
576   };
577  
578 + const int a_m_score=2; //match score
579 + const int a_mis_score=-3; //mismatch
580 + const int a_dropoff_score=7;
581 + const int a_min_score=12; //at least 6 bases full match
582 +
583 + // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584 + //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585 + //check minimum score and
586 + //for 3' adapter trimming:
587 + //     require that the right end of the alignment for either the adaptor OR the read must be
588 + //     < 3 distance from its right end
589 + // for 5' adapter trimming:
590 + //     require that the left end of the alignment for either the adaptor OR the read must
591 + //     be at coordinate < 3 from start
592 +
593 + bool extendUngapped(const char* a, int alen, int ai,
594 +                 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595 + //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596 + //if (debug) {
597 + //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598 + //  }
599 + int a_l=ai; //alignment coordinates on a
600 + int a_r=ai+mlen-1;
601 + int b_l=bi; //alignment coordinates on b
602 + int b_r=bi+mlen-1;
603 + int ai_maxscore=ai;
604 + int bi_maxscore=bi;
605 + int score=mlen*a_m_score;
606 + int maxscore=score;
607 + int mism5score=a_mis_score;
608 + if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609 + //try to extend to the left first, if possible
610 + while (ai>0 && bi>0) {
611 +   ai--;
612 +   bi--;
613 +   score+= (a[ai]==b[bi])? a_m_score : mism5score;
614 +   if (score>maxscore) {
615 +       ai_maxscore=ai;
616 +       bi_maxscore=bi;
617 +       maxscore=score;
618 +       }
619 +     else if (maxscore-score>a_dropoff_score) break;
620 +   }
621 + a_l=ai_maxscore;
622 + b_l=bi_maxscore;
623 + //now extend to the right
624 + ai_maxscore=a_r;
625 + bi_maxscore=b_r;
626 + ai=a_r;
627 + bi=b_r;
628 + score=maxscore;
629 + //sometimes there are extra As at the end of the read, ignore those
630 + if (a[alen-2]=='A' && a[alen-1]=='A') {
631 +    alen-=2;
632 +    while (a[alen-1]=='A' && alen>ai) alen--;
633 +    }
634 + while (ai<alen-1 && bi<blen-1) {
635 +   ai++;
636 +   bi++;
637 +   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638 +   if (a[ai]==b[bi]) { //match
639 +      score+=a_m_score;
640 +      if (ai>=alen-2) {
641 +           score+=a_m_score-(alen-ai-1);
642 +           }
643 +      }
644 +    else { //mismatch
645 +      score+=a_mis_score;
646 +      }
647 +   if (score>maxscore) {
648 +       ai_maxscore=ai;
649 +       bi_maxscore=bi;
650 +       maxscore=score;
651 +       }
652 +     else if (maxscore-score>a_dropoff_score) break;
653 +   }
654 +  a_r=ai_maxscore;
655 +  b_r=bi_maxscore;
656 +  int a_ovh3=alen-a_r-1;
657 +  int b_ovh3=blen-b_r-1;
658 +  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659 +  int mmovh5=(a_l<b_l)? a_l : b_l;
660 +  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661 +     if (a_l<a_ovh3) {
662 +        //adapter closer to the left end (typical for 5' adapter)
663 +        l5=a_r+1;
664 +        l3=alen-1;
665 +        }
666 +      else {
667 +        //adapter matching at the right end (typical for 3' adapter)
668 +        l5=0;
669 +        l3=a_l-1;
670 +        }
671 +     return true;
672 +     }
673 +  //do not trim:
674 +  l5=0;
675 +  l3=alen-1;
676 +  return false;
677 + }
678 +
679  
680   GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681   //overlap of right (3') end of seqb
# Line 609 | Line 710
710                   aix++;bix++;
711                   len++;
712                 }
713 +          if (len>7) {
714 +              //heuristics: very likely the best we can get
715 +              int qlen=len;
716 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717 +                aix++;
718 +                bix++;
719 +                qlen++;
720 +                }
721 +              if (qlen>10) { //found a quick match shortcut
722 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723 +                 return diagstrips;
724 +                 }
725 +              }
726            GXSeed* newseed=new GXSeed(ai,bi,len);
727            seeds.Add(newseed);
728            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
# Line 655 | Line 769
769                   aix++;bix++;
770                   len++;
771                 }
772 +          if (len>7) {
773 +              //heuristics: very likely the best we can get
774 +              int qlen=len;
775 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
776 +                aix++;
777 +                bix++;
778 +                qlen++;
779 +                }
780 +              if (qlen>10) { //found a quick match shortcut
781 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
782 +                 return diagstrips;
783 +                 }
784 +              }
785            GXSeed* newseed=new GXSeed(ai,bi,len);
786            seeds.Add(newseed);
787            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
# Line 668 | Line 795
795   return diagstrips;
796   }
797  
671
672
798   int cmpSeedScore(const pointer p1, const pointer p2) {
799    //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
800    GXSeed* s1=(GXSeed*)p1;
# Line 686 | Line 811
811    return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
812   }
813  
814 +
815 + int cmpDiagBands_R(const pointer p1, const pointer p2) {
816 +  //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
817 +  GXBand* b1=(GXBand*)p1;
818 +  GXBand* b2=(GXBand*)p2;
819 +  if (b1->score==b2->score) {
820 +     return (b2->w_min_b-b1->w_min_b);
821 +     }
822 +  else return (b2->score-b1->score);
823 + }
824 +
825 +
826 +
827   GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
828 <                       const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
829 <                        bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
828 >    const char* s_seq, int s_alnstart, int s_max,
829 >    int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
830 >    CAlnTrim* trim, bool editscript) {
831    GXEditScript* ed_script_fwd = NULL;
832    GXEditScript* ed_script_rev = NULL;
833    if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
# Line 705 | Line 844
844    const char* s=s_seq+s_alnstart;
845    int s_avail=s_max-s_alnstart;
846    if (penalty<0) penalty=-penalty;
847 <  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
847 >  int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
848    GXAlnInfo* alninfo=NULL;
849    bool freeAlnMem=(gxmem==NULL);
850    if (freeAlnMem) {
851 <     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
851 >     gxmem=new CGreedyAlignData(reward, penalty, xdrop);
852       }
853    else gxmem->reset();
854    int retscore = 0;
855    int numdiffs = 0;
856 <  if (trimtype==galn_TrimLeft) {
856 >  if (trim!=NULL && trim->type==galn_TrimLeft) {
857      if (editscript)
858         ed_script_rev=new GXEditScript();
859  
860 <    int numdiffs_l =  GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
860 >    int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
861                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
862      //check this extension here and bail out if it's not a good extension
863 <    if (s_alnstart+1-s_ext_l>3) {
863 >    if (s_alnstart+1-s_ext_l>trim->boundary) {
864        delete ed_script_rev;
865        if (freeAlnMem) delete gxmem;
866        return NULL;
# Line 733 | Line 872
872                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
873      numdiffs=numdiffs_r+numdiffs_l;
874      //convert num diffs to actual score
875 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
875 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
876      if (editscript)
877         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
878      }
# Line 745 | Line 884
884                              reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
885      //check extension here and bail out if not a good right extension
886      //assuming s_max is really at the right end of s_seq
887 <    if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
887 >    if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
888        delete ed_script_fwd;
889        if (freeAlnMem) delete gxmem;
890        return NULL;
# Line 756 | Line 895
895                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
896      //convert num diffs to actual score
897      numdiffs=numdiffs_r+numdiffs_l;
898 <    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
898 >    retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
899      if (editscript)
900         ed_script_rev->Append(ed_script_fwd); //combine the two extensions
901    }
# Line 784 | Line 923
923    delete ed_script_fwd;
924    return alninfo;
925   }
926 +
927 + GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
928 +                       const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
929 +                       CAlnTrim* trim, bool editscript) {
930 + int reward=2;
931 + int penalty=3;
932 + int xdrop=8;
933 + if (gxmem) {
934 +   reward=gxmem->match_reward;
935 +   penalty=gxmem->mismatch_penalty;
936 +   xdrop=gxmem->x_drop;
937 +   }
938 + return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
939 +     reward, penalty, xdrop, gxmem, trim, editscript);
940 + }
941 +
942 + GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
943 +            CGreedyAlignData* gxmem, int min_pid) {
944 +  bool editscript=false;
945 +  #ifdef GDEBUG
946 +   editscript=true;
947 +   GMessage("==========> matching Right (3') end ========\n");
948 +  #endif
949 +
950 +  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
951 +  GList<GXSeed> rseeds(true,true,false);
952 +  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
953 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
954 +  //did we find a shortcut?
955 +  if (alnbands->qmatch) {
956 +    #ifdef GDEBUG
957 +     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
958 +    #endif
959 +    anchor_seeds.Add(alnbands->qmatch);
960 +    }
961 +  else {
962 +    int max_top_bands=5;
963 +    int top_band_count=0;
964 +    for (int b=0;b<alnbands->Count();b++) {
965 +       if (alnbands->Get(b)->score<4) break;
966 +       #ifdef GDEBUG
967 +       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
968 +       #endif
969 +       top_band_count++;
970 +       GXBand& band=*(alnbands->Get(b));
971 +       band.seeds.setSorted(cmpSeedScore);
972 +       anchor_seeds.Add(band.seeds.First());
973 +       band.tested=true;
974 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
975 +       }
976 +    #ifdef GDEBUG
977 +    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
978 +    #endif
979 +    }
980 +
981 +  GList<GXAlnInfo> galns(true,true,false);
982 +  for (int i=0;i<anchor_seeds.Count();i++) {
983 +    GXSeed& aseed=*anchor_seeds[i];
984 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
985 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
986 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
987 +                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
988 +    if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
989 +             galns.AddIfNew(alninfo, true);
990 +        else delete alninfo;
991 +    }
992 +  //special case: due to the scoring scheme bias towards the 5' end of the read,
993 +  //also try some seeds at the 3' end
994 +  if (galns.Count()==0) {
995 +    alnbands->setSorted(cmpDiagBands_R);
996 +    int max_top_bands=4;
997 +    int top_band_count=0;
998 +    #ifdef GDEBUG
999 +    GMessage(":::>> Retrying adjusting sort order.\n");
1000 +    #endif
1001 +    for (int b=0;b<alnbands->Count();b++) {
1002 +       if (alnbands->Get(b)->score<4) break;
1003 +       #ifdef GDEBUG
1004 +       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1005 +       #endif
1006 +       if (alnbands->Get(b)->tested) continue;
1007 +       top_band_count++;
1008 +       GXBand& band=*(alnbands->Get(b));
1009 +       band.seeds.setSorted(cmpSeedScore);
1010 +       anchor_seeds.Add(band.seeds.First());
1011 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1012 +       }
1013 +    #ifdef GDEBUG
1014 +    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1015 +    #endif
1016 +    for (int i=0;i<anchor_seeds.Count();i++) {
1017 +      GXSeed& aseed=*anchor_seeds[i];
1018 +      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1019 +      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1020 +      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021 +                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 +      if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1023 +               galns.AddIfNew(alninfo, true);
1024 +          else delete alninfo;
1025 +      }
1026 +    }
1027 +  #ifdef GDEBUG
1028 +  for (int i=0;i<galns.Count();i++) {
1029 +    GXAlnInfo* alninfo=galns[i];
1030 +    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1031 +                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1032 +    if (alninfo->gapinfo!=NULL) {
1033 +      GMessage("Alignment:\n");
1034 +      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1035 +      }
1036 +    }
1037 +  #endif
1038 +  //---- done
1039 +  delete alnbands;
1040 +  if (galns.Count()) {
1041 +    GXAlnInfo* bestaln=galns.Shift();
1042 +    return bestaln;
1043 +    }
1044 +  else return NULL;
1045 + }
1046 +
1047 + GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1048 +          CGreedyAlignData* gxmem, int min_pid) {
1049 +  bool editscript=false;
1050 +  #ifdef GDEBUG
1051 +   editscript=true;
1052 +   GMessage("==========> matching Left (5') end ========\n");
1053 +  #endif
1054 +  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1055 +  GList<GXSeed> rseeds(true,true,false);
1056 +  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1057 +  GList<GXAlnInfo> galns(true,true,false);
1058 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1059 +
1060 +  if (alnbands->qmatch) {
1061 +    #ifdef GDEBUG
1062 +     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1063 +    #endif
1064 +    anchor_seeds.Add(alnbands->qmatch);
1065 +    }
1066 +  else {
1067 +    int max_top_bands=5;
1068 +    int top_band_count=0;
1069 +    for (int b=0;b<alnbands->Count();b++) {
1070 +       if (alnbands->Get(b)->score<4) break;
1071 +       #ifdef GDEBUG
1072 +       GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1073 +       #endif
1074 +       top_band_count++;
1075 +       GXBand& band=*(alnbands->Get(b));
1076 +       band.seeds.setSorted(cmpSeedScore);
1077 +       anchor_seeds.Add(band.seeds.First());
1078 +       if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1079 +       }
1080 +    #ifdef GDEBUG
1081 +    GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1082 +    #endif
1083 +    }
1084 +  for (int i=0;i<anchor_seeds.Count();i++) {
1085 +    GXSeed& aseed=*anchor_seeds[i];
1086 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
1087 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
1088 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1089 +                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1090 +    if (alninfo && alninfo->pid>=min_pid
1091 +           && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1092 +            galns.AddIfNew(alninfo, true);
1093 +       else delete alninfo;
1094 +    }
1095 +
1096 +  #ifdef GDEBUG
1097 +  for (int i=0;i<galns.Count();i++) {
1098 +    GXAlnInfo* alninfo=galns[i];
1099 +    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1100 +                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1101 +    if (alninfo->gapinfo!=NULL) {
1102 +      GMessage("Alignment:\n");
1103 +      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1104 +      }
1105 +    }
1106 +  #endif
1107 +  //---- done
1108 +  delete alnbands;
1109 +  if (galns.Count()) {
1110 +    GXAlnInfo* bestaln=galns.Shift();
1111 +    return bestaln;
1112 +    }
1113 +  else return NULL;
1114 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines