ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/seqalign.cpp
(Generate patch)
# Line 32 | Line 32
32   return r;
33   }
34  
35 < void loadSeqs(const char* fname, char*& s1, char*& s2,
35 > bool loadSeqs(GLineReader& lr, char*& s1, char*& s2,
36                 int& s1_len, int& s2_len) {
37 GLineReader lr(fname);
37   char* l=NULL;
38   s1=NULL;
39   s2=NULL;
# Line 43 | Line 42
42    if (s1==NULL) {
43      s1=seq_cleanup(l, s1_len);
44      }
45 <   else if (s2==NULL) s2=seq_cleanup(l, s2_len);
45 >   else if (s2==NULL) {
46 >             s2=seq_cleanup(l, s2_len);
47 >             return true;
48 >             }
49             else break;
50    }
51 + if (s1!=NULL) GFREE(s1);
52 + return false;
53   }
54  
55  
56 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len) {
57 < //offsets must be in the middle of a good (best) HSP (ungapped match)
58 < //GXAlnInfo* alninfo=GreedyAlign(seqa, 22, seqb, 16, editscript);
59 < bool editscript=false;
60 < #ifdef GDEBUG
61 < editscript=true;
62 < GMessage("==========> matching Right (3') end ========\n");
63 < #endif
64 < GList<GXSeed> rseeds(true,true,false);
65 < GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
66 < GList<GXAlnInfo> galns(true,true,false);
67 < int max_top_bands=5;
68 < int top_band_count=0;
69 < //GList<GXSeed> anchor_seeds(true,false,true);
70 < GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
71 < for (int b=0;b<alnbands->Count();b++) {
72 <   if (alnbands->Get(b)->score<4) break;
73 <   top_band_count++;
74 <   GXBand& band=*(alnbands->Get(b));
75 <   band.seeds.setSorted(cmpSeedScore);
76 <   anchor_seeds.Add(band.seeds.First());
77 <   //if (band.seeds.Count()>1) anchor_seeds.Add(band.seeds[1]);
78 <   if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
79 <   }
80 < #ifdef GDEBUG
81 < GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
82 < #endif
83 <
84 < for (int i=0;i<anchor_seeds.Count();i++) {
85 <  GXSeed& aseed=*anchor_seeds[i];
86 <  //TODO: refine this by checking the right-extension score first (GreedyExtend_Right?)
87 <  //   because there is no need to check left if the right side doesn't align to the end
88 <  int a1=aseed.a_ofs+(aseed.len>>1)+1;
89 <  int a2=aseed.b_ofs+(aseed.len>>1)+1;
90 < #ifdef GDEBUG
87 <  GMessage("Processing Anchor: (%d,%d)\n",a1,a2);
88 < #endif
89 <  GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
90 <                          seqb, a2, seqb_len, editscript,
91 <                          match_reward, mismatch_penalty, Xdrop);
92 <  if (alninfo) galns.AddIfNew(alninfo, true);
93 <  }
56 > GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
57 >  bool editscript=false;
58 >  #ifdef GDEBUG
59 >   editscript=true;
60 >   GMessage("==========> matching Right (3') end ========\n");
61 >  #endif
62 >  GList<GXSeed> rseeds(true,true,false);
63 >  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
64 >  GList<GXAlnInfo> galns(true,true,false);
65 >  int max_top_bands=5;
66 >  int top_band_count=0;
67 >  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
68 >  for (int b=0;b<alnbands->Count();b++) {
69 >     if (alnbands->Get(b)->score<4) break;
70 >     top_band_count++;
71 >     GXBand& band=*(alnbands->Get(b));
72 >     band.seeds.setSorted(cmpSeedScore);
73 >     anchor_seeds.Add(band.seeds.First());
74 >     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
75 >     }
76 >  #ifdef GDEBUG
77 >  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
78 >  #endif
79 >
80 >  for (int i=0;i<anchor_seeds.Count();i++) {
81 >    GXSeed& aseed=*anchor_seeds[i];
82 >    int a1=aseed.a_ofs+(aseed.len>>1)+1;
83 >    int a2=aseed.b_ofs+(aseed.len>>1)+1;
84 >    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
85 >                            seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
86 >                            match_reward, mismatch_penalty, Xdrop);
87 >    if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
88 >             galns.AddIfNew(alninfo, true);
89 >        else delete alninfo;
90 >    }
91  
92 < #ifdef GDEBUG
93 < for (int i=0;i<galns.Count();i++) {
94 <  GXAlnInfo* alninfo=galns[i];
95 <  GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
96 <                       alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
97 <  if (alninfo->gapinfo!=NULL) {
98 <    GMessage("Alignment:\n");
99 <    alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
92 >  #ifdef GDEBUG
93 >  for (int i=0;i<galns.Count();i++) {
94 >    GXAlnInfo* alninfo=galns[i];
95 >    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
96 >                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
97 >    if (alninfo->gapinfo!=NULL) {
98 >      GMessage("Alignment:\n");
99 >      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
100 >      }
101      }
102 <  }
103 < #endif
104 < //---- done
102 >  #endif
103 >  //---- done
104 >  delete alnbands;
105 >  if (galns.Count()) {
106 >    GXAlnInfo* bestaln=galns.Shift();
107 >    return bestaln;
108 >    }
109 >  else return NULL;
110 > }
111  
112 < delete alnbands;
113 < if (galns.Count()) {
114 <  GXAlnInfo* bestaln=galns.Shift();
115 <  return bestaln;
116 <  }
117 < else return NULL;
112 > GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
113 >  bool editscript=false;
114 >  #ifdef GDEBUG
115 >   editscript=true;
116 >   GMessage("==========> matching Left (5') end ========\n");
117 >  #endif
118 >  GList<GXSeed> rseeds(true,true,false);
119 >  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
120 >  GList<GXAlnInfo> galns(true,true,false);
121 >  int max_top_bands=5;
122 >  int top_band_count=0;
123 >  GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
124 >  for (int b=0;b<alnbands->Count();b++) {
125 >     if (alnbands->Get(b)->score<4) break;
126 >     top_band_count++;
127 >     GXBand& band=*(alnbands->Get(b));
128 >     band.seeds.setSorted(cmpSeedScore);
129 >     anchor_seeds.Add(band.seeds.First());
130 >     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
131 >     }
132 >  #ifdef GDEBUG
133 >  GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
134 >  #endif
135 >
136 >  for (int i=0;i<anchor_seeds.Count();i++) {
137 >    GXSeed& aseed=*anchor_seeds[i];
138 >    int a1=aseed.a_ofs+(aseed.len>>1)+1;
139 >    int a2=aseed.b_ofs+(aseed.len>>1)+1;
140 >    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
141 >                            seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
142 >                            match_reward, mismatch_penalty, Xdrop-2);
143 >    if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
144 >         //TODO: at 5' end we should use a higher pid threshold
145 >         // we could also decrease the X-drop value!
146 >            galns.AddIfNew(alninfo, true);
147 >       else delete alninfo;
148 >    }
149 >
150 >  #ifdef GDEBUG
151 >  for (int i=0;i<galns.Count();i++) {
152 >    GXAlnInfo* alninfo=galns[i];
153 >    GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
154 >                         alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
155 >    if (alninfo->gapinfo!=NULL) {
156 >      GMessage("Alignment:\n");
157 >      alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
158 >      }
159 >    }
160 >  #endif
161 >  //---- done
162 >  delete alnbands;
163 >  if (galns.Count()) {
164 >    GXAlnInfo* bestaln=galns.Shift();
165 >    return bestaln;
166 >    }
167 >  else return NULL;
168   }
169  
170  
171  
172 + void findTrimming(char* aseq, int alen, char* rseq, int rlen, SGreedyAlignMem* gxmem=NULL) {
173 +
174 + GXAlnInfo* r_bestaln=match_RightEnd(aseq, alen, rseq, rlen, gxmem);
175 +
176 + #ifdef GDEBUG
177 +  if (r_bestaln)
178 +  GMessage("******** Best 3' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
179 +        r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
180 +  else GMessage("******** No suitable 3' alignment found.\n");
181 + #endif
182 +  if (r_bestaln)
183 +    fprintf(stdout, "3prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
184 +      r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
185 +
186 +  delete r_bestaln;
187 +
188 +  //testing: reverse both sequences and redo the search at 5' end
189 +  reverseChars(aseq);
190 +  reverseChars(rseq);
191 +
192 +  GXAlnInfo* l_bestaln=match_LeftEnd(aseq, alen, rseq, rlen, gxmem);
193 +
194 + #ifdef GDEBUG
195 +  if (l_bestaln)
196 +    GMessage("******** Best 5' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
197 +        l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
198 +    else GMessage("******** No suitable 5' alignment found.\n");
199 + #endif
200 +  if (l_bestaln)
201 +    fprintf(stdout, "5prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
202 +      l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
203 +
204 +  delete l_bestaln;
205 +
206 +  GFREE(aseq);
207 +  GFREE(rseq);
208 + }
209 +
210   //================= MAIN =====================
211  
212   int main(int argc, char * const argv[]) {
# Line 125 | Line 217
217   if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
218   char* seqa=NULL;
219   char* seqb=NULL;
220 < GStr s=args.getOpt('i');
220 > GStr s=args.getOpt('X');
221 > if (!s.is_empty())
222 >    Xdrop=s.asInt();
223 > s=args.getOpt('M');
224 > if (!s.is_empty())
225 >    match_reward=s.asInt();
226 > s=args.getOpt('N');
227 > if (!s.is_empty())
228 >    mismatch_penalty=s.asInt();
229 >
230 > s=args.getOpt('i');
231   int seqa_len=0, seqb_len=0;
232 < if (!s.is_empty()) loadSeqs(s.chars(), seqa, seqb, seqa_len, seqb_len);
232 > if (!s.is_empty()) {
233 >      GLineReader lr(s.chars());
234 >      SGreedyAlignMem* gxmem=new SGreedyAlignMem(match_reward, mismatch_penalty, Xdrop);
235 >      while (loadSeqs(lr, seqa, seqb, seqa_len, seqb_len)) {
236 >         findTrimming(seqa, seqa_len, seqb, seqb_len, gxmem);
237 >         }
238 >      delete gxmem;
239 >      }
240     else {
241       s=args.getOpt('a');
242       if (!s.is_empty()) seqa=seq_cleanup(s.chars(), seqa_len);
243       s=args.getOpt('b');
244       if (!s.is_empty()) seqb=seq_cleanup(s.chars(), seqb_len);
245 +     if (seqa==NULL || seqa_len<2 || seqb==NULL || seqb_len<2)
246 +        GError("Error: two sequences with length>1 should be given!\n");
247 +     findTrimming(seqa, seqa_len, seqb, seqb_len);
248       }
137 if (seqa==NULL || seqa_len<2 || seqb==NULL || seqb_len<2)
138    GError("Error: two sequences with length>1 should be given!\n");
139 s=args.getOpt('X');
140 if (!s.is_empty())
141   Xdrop=s.asInt();
142 s=args.getOpt('M');
143 if (!s.is_empty())
144   match_reward=s.asInt();
145 s=args.getOpt('N');
146 if (!s.is_empty())
147   mismatch_penalty=s.asInt();
148
249  
150 #ifdef GDEBUG
151 GMessage("seqa: %s\n",seqa);
152 GMessage("seqb: %s\n",seqb);
153 #endif
154 GXAlnInfo* bestaln=match_RightEnd(seqa, seqa_len, seqb, seqb_len);
155 //#ifdef GDEBUG
156 if (bestaln)
157 GMessage("******** Best Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
158       bestaln->ql, bestaln->qr, bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
159 else GMessage("******** No suitable terminal alignment found.\n");
160 //#endif
161 delete bestaln;
162 GFREE(seqa);
163 GFREE(seqb);
250   }
251  
252  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines