ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.cpp
Revision: 77
Committed: Tue Sep 27 21:07:59 2011 UTC (9 years, 8 months ago) by gpertea
File size: 13667 byte(s)
Log Message:
wip - cutting seeds, finalize()

Line File contents
1 #include "GXAlign.h"
2
3 #ifdef GDEBUG
4 char buf[6]={0x1B,'[', 'n','m','m','\0'};
5
6 void color_fg(int c,FILE* f) {
7 if (f!=stderr && f!=stdout) return;
8 sprintf((char *)(&buf[2]),"%dm",c+30);
9 fwrite(buf,1,strlen(buf), f);
10 }
11
12 void color_bg(int c, FILE* f) {
13 if (f!=stderr && f!=stdout) return;
14 sprintf((char *)(&buf[2]),"%dm",c+40);
15 fwrite(buf,1,strlen(buf),f);
16 };
17
18 void color_resetfg(FILE* f) {
19 if (f!=stderr && f!=stdout) return;
20 sprintf((char *)(&buf[2]),"39m");
21 fwrite(buf,1,strlen(buf), f);
22 };
23
24 void color_resetbg(FILE* f) {
25 if (f!=stderr && f!=stdout) return;
26 sprintf((char *)(&buf[2]),"49m");
27 fwrite(buf,1,strlen(buf), f);
28 }
29
30 void color_reset(FILE* f) {
31 if (f!=stderr && f!=stdout) return;
32 sprintf((char *)(&buf[2]),"0m");
33 fwrite(buf,1,strlen(buf), f);
34 };
35
36 void color_normal(FILE* f) {
37 if (f!=stderr && f!=stdout) return;
38 sprintf((char *)(&buf[2]),"22m");
39 fwrite(buf,1,strlen(buf), f);
40 };
41
42 #endif
43
44
45 char xgapcodes[4]={'S','I', 'D', 'X'};
46
47 int get_last(int **flast_d, int d, int diag, int *row1) {
48 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
49 *row1 = flast_d[d-1][diag-1];
50 return diag-1;
51 }
52 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
53 *row1 = flast_d[d-1][diag];
54 return diag;
55 }
56 *row1 = flast_d[d-1][diag+1];
57 return diag+1;
58 }
59
60 void GapXEditScript::print() { //debug
61 GapXEditScript* p=this;
62 do {
63 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
64 } while ((p=p->next)!=NULL);
65 GMessage("\n");
66 }
67
68
69
70 int GXGreedyAlign(const char* s1, int len1,
71 const char* s2, int len2,
72 bool reverse, int xdrop_threshold,
73 int match_cost, int mismatch_cost,
74 int* e1, int* e2,
75 GXAlnMem* abmp, GXEditScript *S) {
76 int col, // column number
77 d, // current distance
78 k, // current diagonal
79 flower, fupper, // boundaries for searching diagonals
80 row, // row number
81 MAX_D, // maximum cost
82 ORIGIN,
83 return_val = 0;
84 int** flast_d = abmp->flast_d; // rows containing the last d
85 int* max_row; // reached for cost d=0, ... len1.
86
87 int X_pen = xdrop_threshold;
88 int M_half = match_cost/2;
89 int Op_cost = mismatch_cost + M_half*2;
90 int D_diff = (X_pen+M_half) / Op_cost + 1;
91 int x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
92 int* max_row_free = abmp->max_row_free;
93 char nlower = 0, nupper = 0;
94 GXAlnSpace* space = abmp->space;
95 int max_len = len2;
96
97 MAX_D = (int) (len1/ERROR_FRACTION + 1);
98 ORIGIN = MAX_D + 2;
99 #ifdef GDEBUG
100 GMessage("D_diff=%d, MAX_D=%d\n", D_diff, MAX_D);
101 #endif
102 *e1 = *e2 = 0;
103 //find first mismatch
104 if (reverse) {
105 for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
106 ; //empty
107 } else {
108 for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
109 ; //empty
110 }
111 *e1 = row;
112 *e2 = row;
113 if (row == len1) {
114 if (S != NULL)
115 S->opRep(row);
116 // hit last row; stop search
117 return 0;
118 }
119 if (S==NULL)
120 space = 0;
121 else if (!space) {
122 space = new GXAlnSpace();
123 abmp->space = space;
124 }
125 else
126 space->refresh();
127
128 max_row = max_row_free + D_diff;
129 for (k = 0; k < D_diff; k++)
130 max_row_free[k] = 0;
131
132 flast_d[0][ORIGIN] = row;
133 max_row[0] = (row + row)*M_half;
134
135 flower = ORIGIN - 1;
136 fupper = ORIGIN + 1;
137
138 d = 1;
139 while (d <= MAX_D) {
140 int fl0, fu0;
141 flast_d[d - 1][flower - 1] = flast_d[d - 1][flower] = -1;
142 flast_d[d - 1][fupper] = flast_d[d - 1][fupper + 1] = -1;
143 x = max_row[d - D_diff] + Op_cost * d - X_pen;
144 x = (int)ceil(x/M_half);
145 cur_max = 0;
146 fl0 = flower;
147 fu0 = fupper;
148 for (k = fl0; k <= fu0; k++) {
149 row = GMAX(flast_d[d - 1][k + 1], flast_d[d - 1][k]) + 1;
150 row = GMAX(row, flast_d[d - 1][k - 1]);
151 col = row + k - ORIGIN;
152 if (row + col >= x)
153 fupper = k;
154 else {
155 if (k == flower)
156 flower++;
157 else
158 flast_d[d][k] = -1;
159 continue;
160 }
161 if (row > max_len || row < 0) {
162 flower = k+1; nlower = 1;
163 } else {
164 // Slide down the diagonal. Don't do this if reached
165 // the end point, which has value 0x0f
166 if (reverse) {
167 if (s2[len2-row] != 0x0f) {
168 while (row < len2 && col < len1 && s2[len2-1-row] == s1[len1-1-col]) {
169 ++row;
170 ++col;
171 }
172 } else {
173 max_len = row;
174 flower = k+1; nlower = 1;
175 }
176 } else if (s2[row-1] != 0x0f) {
177 while (row < len2 && col < len1 && s2[row] == s1[col]) {
178 ++row;
179 ++col;
180 }
181 } else {
182 max_len = row;
183 flower = k+1; nlower = 1;
184 }
185 }
186 flast_d[d][k] = row;
187 if (row + col > cur_max) {
188 cur_max = row + col;
189 b_diag = k;
190 }
191 if (row == len2) {
192 flower = k+1; nlower = 1;
193 }
194 if (col == len1) {
195 fupper = k-1; nupper = 1;
196 }
197 } //for k
198 k = cur_max*M_half - d * Op_cost;
199 if (max_row[d - 1] < k) {
200 max_row[d] = k;
201 return_val = d;
202 best_diag = b_diag;
203 *e2 = flast_d[d][b_diag];
204 *e1 = (*e2)+b_diag-ORIGIN;
205 } else {
206 max_row[d] = max_row[d - 1];
207 }
208 if (flower > fupper)
209 break;
210 d++;
211 if (!nlower) flower--;
212 if (!nupper) fupper++;
213 if (S==NULL)
214 flast_d[d] = flast_d[d - 2];
215 else {
216 // space array consists of GX3Val structures which are
217 // 3 times larger than int, so divide requested amount by 3
218 flast_d[d] = (int*) space->getSpace((fupper-flower+7)/3);
219 if (flast_d[d] != NULL)
220 flast_d[d] = flast_d[d] - flower + 2;
221 else
222 return return_val;
223 }
224 } //while d<=MAX_D
225
226 if (S!=NULL) { /*trace back*/
227 int row1, col1, diag1, diag;
228 d = return_val; diag = best_diag;
229 row = *e2; col = *e1;
230 while (d > 0) {
231 diag1 = get_last(flast_d, d, diag, &row1);
232 col1 = row1+diag1-ORIGIN;
233 if (diag1 == diag) {
234 if (row-row1 > 0) S->opRep(row-row1);
235 } else if (diag1 < diag) {
236 if (row-row1 > 0) S->opRep(row-row1);
237 S->opIns(1);
238 } else {
239 if (row-row1-1> 0) S->opRep(row-row1-1);
240 S->opDel(1);
241 }
242 d--; diag = diag1; col = col1; row = row1;
243 } //while d>0
244 S->opRep(flast_d[0][ORIGIN]);
245 if (!reverse)
246 S->reverse();
247 }
248 return return_val;
249 }
250
251 void printEditScript(GXEditScript* ed_script) {
252 uint i;
253 if (ed_script==NULL || ed_script->opnum == 0)
254 return;
255 for (i=0; i<ed_script->opnum; i++) {
256 int num=((ed_script->ops[i]) >> 2);
257 unsigned char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
258 if (op_type == 3)
259 GError("Error: printEditScript encountered op_type 3 ?!\n");
260 GMessage("%d%c ", num, xgapcodes[op_type]);
261 }
262 GMessage("\n");
263 }
264
265 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
266 bool editscript, int reward, int penalty, int xdrop) {
267 int q_max=strlen(q_seq); //query
268 int s_max=strlen(s_seq); //subj
269 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
270 s_seq, s_alnstart, s_max, editscript, reward, penalty, xdrop);
271 }
272
273 struct GXSeedTable {
274 int a_num, b_num;
275 int a_cap, b_cap;
276 char* xc;
277 GXSeedTable(int a=12, int b=255) {
278 a_cap=0;
279 b_cap=0;
280 a_num=0;
281 b_num=0;
282 xc=NULL;
283 init(a,b);
284 }
285 ~GXSeedTable() {
286 GFREE(xc);
287 }
288 void init(int a, int b) {
289 a_num=a;
290 b_num=b;
291 bool resize=false;
292 if (b_num>b_cap) { resize=true; b_cap=b_num;}
293 if (a_num>a_cap) { resize=true; a_cap=a_num;}
294 if (resize) {
295 GFREE(xc);
296 GCALLOC(xc, (a_num*b_num));
297 }
298 else {
299 //just clear up to a_max, b_max
300 memset((void*)xc, 0, (a_num*b_num));
301 }
302 }
303 char& x(int ax, int by) {
304 return xc[by*a_num+ax];
305 }
306
307 };
308
309
310 static GXSeedTable gx(12,255);
311
312 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
313 //overlap of right (3') end of seqb
314 //hash the first 12 bases of seqa:
315 int aimax=GMIN(9,(a_len-4));
316 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
317 int a_mlen=aimax+4; //number of rows in the diagonal matrix
318 int b_mlen=b_len-4; //number of cols in the diagonal matrix
319 gx.init(a_mlen, b_mlen);
320 GXBandSet* diagstrips=new GXBandSet(a_mlen, b_mlen); //set of overlapping 3-diagonal strips
321 for (int ai=0;ai<aimax;ai++) {
322 int32* av=(int32 *)(&seqa[ai]);
323 for (int bi=b_len-5;bi>=bimin;bi--) {
324 if (gx.x(ai,bi))
325 continue; //already have a previous seed covering this region of this diagonal
326 int32* bv=(int32 *)(&seqb[bi]);
327 if (*av == *bv) {
328 //word match
329 //see if we can extend to the right
330 gx.x(ai+1,bi+1)=1;
331 gx.x(ai+2,bi+2)=1;
332 gx.x(ai+3,bi+3)=1;
333 int aix=ai+4;
334 int bix=bi+4;
335 int len=4;
336 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix])
337 { aix++;bix++; len++; gx.x(aix,bix)=1; }
338 GXSeed* newseed=new GXSeed(ai,bi,len);
339 seeds.Add(newseed);
340 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
341 }
342 }
343 }//for each 4-mer at the beginning of seqa
344 return diagstrips;
345 }
346
347 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
348 const char* s_seq, int s_alnstart, int s_max,
349 bool editscript, int reward, int penalty, int xdrop) {
350
351 GXEditScript* ed_script_fwd = NULL;
352 GXEditScript* ed_script_rev = NULL;
353 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
354 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
355 q_alnstart--;
356 s_alnstart--;
357 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
358 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
359 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
360 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
361 if (editscript) {
362 ed_script_fwd=new GXEditScript();
363 }
364 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
365 //int q_max=strlen(q_seq); //query
366 //int s_max=strlen(s_seq); //subj
367 const char* q=q_seq+q_alnstart;
368 int q_avail=q_max-q_alnstart;
369 const char* s=s_seq+s_alnstart;
370 int s_avail=s_max-s_alnstart;
371 //GMessage("q_offset=%d, s_offset=%d\n", q_off, s_off);
372 //int MIN_GREEDY_SCORE=reward*5; //minimum score for an alignment to be reported
373 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
374 GXAlnMem* abmp=new GXAlnMem(s_max, reward, -penalty, xdrop);
375 // extend to the right
376 // int numdiffs_r=0;
377 int numdiffs_r = GXGreedyAlign(s, s_avail, q, q_avail, false, xdrop,
378 reward, -penalty, &s_ext_r, &q_ext_r, abmp, ed_script_fwd);
379 #ifdef GDEBUG
380 GMessage("fwd numdiffs=%d (%d-%d : %d-%d)\n", numdiffs_r, q_alnstart+1,q_alnstart+q_ext_r, s_alnstart+1, s_alnstart+s_ext_r);
381 if (editscript) {
382 GMessage("fwd edit script: ");
383 printEditScript(ed_script_fwd);
384 }
385 #endif
386 //TODO: only extend to the left if the right-extension is above the minimum score
387 //if (score<MIN_EXT_R_SCORE) {
388 // delete abmp;
389 // delete ed_script_fwd;
390 // return;
391 // }
392 // ---- extend to the left of anchor point now
393 if (editscript)
394 ed_script_rev=new GXEditScript();
395 int numdiffs_l = GXGreedyAlign(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
396 reward, -penalty, &s_ext_l, &q_ext_l, abmp, ed_script_rev);
397 #ifdef GDEBUG
398 GMessage("rev numdiffs=%d (%d-%d vs %d-%d)\n", numdiffs_l, q_alnstart+1-q_ext_l, q_alnstart+1, s_alnstart+1-s_ext_l, s_alnstart+1);
399 GMessage("rev edit script: ");
400 if (ed_script_rev) {
401 printEditScript(ed_script_rev);
402 }
403 #endif
404 GXAlnInfo* alninfo=NULL;
405 // In basic case the greedy algorithm returns number of
406 // differences, hence we need to convert it to score
407 int numdiffs=numdiffs_r+numdiffs_l;
408 int retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
409 #ifdef GDEBUG
410 GMessage("Final score: %d\n",retscore);
411 #endif
412 if (ed_script_rev)
413 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
414 if (retscore>=MIN_GREEDY_SCORE) {
415 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
416 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
417 alninfo->score=retscore;
418 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
419 #ifdef GDEBUG
420 if (ed_script_rev) {
421 GMessage("Final Edit script ::: ");
422 printEditScript(ed_script_rev);
423 }
424 #endif
425 alninfo->editscript=ed_script_rev;
426 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
427 alninfo->abmp=abmp;
428 }
429 else {
430 delete abmp;
431 delete ed_script_rev;
432 }
433 delete ed_script_fwd;
434 return alninfo;
435 }