ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 575 | Line 575
575  
576   };
577  
578 + const int a_m_score=2; //match score
579 + const int a_mis_score=-3; //mismatch
580 + const int a_dropoff_score=7;
581 +
582 + // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
583 + //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
584 + //check minimum score and
585 + //for 3' adapter trimming:
586 + //     require that the right end of the alignment for either the adaptor OR the read must be
587 + //     < 3 distance from its right end
588 + // for 5' adapter trimming:
589 + //     require that the left end of the alignment for either the adaptor OR the read must
590 + //     be at coordinate < 3 from start
591 +
592 + bool extendUngapped(const char* a, int alen, int ai,
593 +                 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
594 + //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
595 + //if (debug) {
596 + //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
597 + //  }
598 + int a_l=ai; //alignment coordinates on a
599 + int a_r=ai+mlen-1;
600 + int b_l=bi; //alignment coordinates on b
601 + int b_r=bi+mlen-1;
602 + int ai_maxscore=ai;
603 + int bi_maxscore=bi;
604 + int score=mlen*a_m_score;
605 + int maxscore=score;
606 + int mism5score=a_mis_score;
607 + if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
608 + //try to extend to the left first, if possible
609 + while (ai>0 && bi>0) {
610 +   ai--;
611 +   bi--;
612 +   score+= (a[ai]==b[bi])? a_m_score : mism5score;
613 +   if (score>maxscore) {
614 +       ai_maxscore=ai;
615 +       bi_maxscore=bi;
616 +       maxscore=score;
617 +       }
618 +     else if (maxscore-score>a_dropoff_score) break;
619 +   }
620 + a_l=ai_maxscore;
621 + b_l=bi_maxscore;
622 + //now extend to the right
623 + ai_maxscore=a_r;
624 + bi_maxscore=b_r;
625 + ai=a_r;
626 + bi=b_r;
627 + score=maxscore;
628 + //sometimes there are extra As at the end of the read, ignore those
629 + if (a[alen-2]=='A' && a[alen-1]=='A') {
630 +    alen-=2;
631 +    while (a[alen-1]=='A' && alen>ai) alen--;
632 +    }
633 + while (ai<alen-1 && bi<blen-1) {
634 +   ai++;
635 +   bi++;
636 +   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
637 +   if (a[ai]==b[bi]) { //match
638 +      score+=a_m_score;
639 +      if (ai>=alen-2) {
640 +           score+=a_m_score-(alen-ai-1);
641 +           }
642 +      }
643 +    else { //mismatch
644 +      score+=a_mis_score;
645 +      }
646 +   if (score>maxscore) {
647 +       ai_maxscore=ai;
648 +       bi_maxscore=bi;
649 +       maxscore=score;
650 +       }
651 +     else if (maxscore-score>a_dropoff_score) break;
652 +   }
653 +  a_r=ai_maxscore;
654 +  b_r=bi_maxscore;
655 +  int a_ovh3=alen-a_r-1;
656 +  int b_ovh3=blen-b_r-1;
657 +  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
658 +  int mmovh5=(a_l<b_l)? a_l : b_l;
659 +  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
660 +     if (a_l<a_ovh3) {
661 +        //adapter closer to the left end (typical for 5' adapter)
662 +        l5=a_r+1;
663 +        l3=alen-1;
664 +        }
665 +      else {
666 +        //adapter matching at the right end (typical for 3' adapter)
667 +        l5=0;
668 +        l3=a_l-1;
669 +        }
670 +     return true;
671 +     }
672 +  //do not trim:
673 +  l5=0;
674 +  l3=alen-1;
675 +  return false;
676 + }
677 +
678 +
679 + bool extendUngapped_R(const char* seqa, int a_len, int &aix,
680 +                      const char* seqb, int b_len, int &bix, int &len) {
681 + //must start with a match
682 + if (seqa[aix]!=seqb[bix]) GError("Error at extendUngapped_R: no initial match!\n");
683 +
684 + }
685  
686   GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
687   //overlap of right (3') end of seqb
# Line 609 | Line 716
716                   aix++;bix++;
717                   len++;
718                 }
719 +          if (len>8) {
720 +              //heuristics: very likely the best we can get
721 +              int qlen=len;
722 +              while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
723 +                aix++;
724 +                bix++;
725 +                qlen++;
726 +                }
727 +              if (qlen>10) {
728 +                 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
729 +                 return diagstrips;
730 +                 }
731 +              }
732            GXSeed* newseed=new GXSeed(ai,bi,len);
733            seeds.Add(newseed);
734            diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
# Line 784 | Line 904
904    delete ed_script_fwd;
905    return alninfo;
906   }
907 +
908 + GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
909 +            SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
910 +  bool editscript=false;
911 +  #ifdef GDEBUG
912 +   editscript=true;
913 +   GMessage("==========> matching Right (3') end ========\n");
914 +  #endif
915 +  GList<GXSeed> rseeds(true,true,false);
916 +  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
917 +  //did we find a shortcut?
918 +  if (alnbands->qmatch) {
919 +
920 +    }
921 +  GList<GXAlnInfo> galns(true,true,false);
922 +  int max_top_bands=5;
923 +  int top_band_count=0;
924 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
925 +  for (int b=0;b<alnbands->Count();b++) {
926 +     if (alnbands->Get(b)->score<4) break;
927 +     top_band_count++;
928 +     GXBand& band=*(alnbands->Get(b));
929 +     band.seeds.setSorted(cmpSeedScore);
930 +     anchor_seeds.Add(band.seeds.First());
931 +     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
932 +     }
933 +  #ifdef GDEBUG
934 +  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
935 +  #endif
936 +
937 +  for (int i=0;i<anchor_seeds.Count();i++) {
938 +    GXSeed& aseed=*anchor_seeds[i];
939 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
940 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
941 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
942 +                            seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
943 +                            match_reward, mismatch_penalty, Xdrop);
944 +    if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
945 +             galns.AddIfNew(alninfo, true);
946 +        else delete alninfo;
947 +    }
948 +
949 +  #ifdef GDEBUG
950 +  for (int i=0;i<galns.Count();i++) {
951 +    GXAlnInfo* alninfo=galns[i];
952 +    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
953 +                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
954 +    if (alninfo->gapinfo!=NULL) {
955 +      GMessage("Alignment:\n");
956 +      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
957 +      }
958 +    }
959 +  #endif
960 +  //---- done
961 +  delete alnbands;
962 +  if (galns.Count()) {
963 +    GXAlnInfo* bestaln=galns.Shift();
964 +    return bestaln;
965 +    }
966 +  else return NULL;
967 + }
968 +
969 + GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
970 +          SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
971 +  bool editscript=false;
972 +  #ifdef GDEBUG
973 +   editscript=true;
974 +   GMessage("==========> matching Left (5') end ========\n");
975 +  #endif
976 +  GList<GXSeed> rseeds(true,true,false);
977 +  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
978 +  GList<GXAlnInfo> galns(true,true,false);
979 +  int max_top_bands=5;
980 +  int top_band_count=0;
981 +  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
982 +  for (int b=0;b<alnbands->Count();b++) {
983 +     if (alnbands->Get(b)->score<4) break;
984 +     top_band_count++;
985 +     GXBand& band=*(alnbands->Get(b));
986 +     band.seeds.setSorted(cmpSeedScore);
987 +     anchor_seeds.Add(band.seeds.First());
988 +     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
989 +     }
990 +  #ifdef GDEBUG
991 +  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
992 +  #endif
993 +
994 +  for (int i=0;i<anchor_seeds.Count();i++) {
995 +    GXSeed& aseed=*anchor_seeds[i];
996 +    int a1=aseed.a_ofs+(aseed.len>>1)+1;
997 +    int a2=aseed.b_ofs+(aseed.len>>1)+1;
998 +    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
999 +                            seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
1000 +                            match_reward, mismatch_penalty, Xdrop-2);
1001 +    if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
1002 +         //TODO: at 5' end we should use a higher pid threshold
1003 +         // we could also decrease the X-drop value!
1004 +            galns.AddIfNew(alninfo, true);
1005 +       else delete alninfo;
1006 +    }
1007 +
1008 +  #ifdef GDEBUG
1009 +  for (int i=0;i<galns.Count();i++) {
1010 +    GXAlnInfo* alninfo=galns[i];
1011 +    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1012 +                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1013 +    if (alninfo->gapinfo!=NULL) {
1014 +      GMessage("Alignment:\n");
1015 +      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1016 +      }
1017 +    }
1018 +  #endif
1019 +  //---- done
1020 +  delete alnbands;
1021 +  if (galns.Count()) {
1022 +    GXAlnInfo* bestaln=galns.Shift();
1023 +    return bestaln;
1024 +    }
1025 +  else return NULL;
1026 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines