ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 81
Committed: Thu Sep 29 19:02:24 2011 UTC (9 years, 7 months ago) by gpertea
File size: 15456 byte(s)
Log Message:
fixes, still a bug with flast_d[d] = flast_d[d] - flower + 2;

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5
6 #define GDEBUG 1
7
8 #define MAX_SPACE 1000000
9 #define LARGE 100000000
10
11 enum {
12 EDIT_OP_MASK = 0x3,
13 EDIT_OP_ERR = 0x0,
14 EDIT_OP_INS = 0x1,
15 EDIT_OP_DEL = 0x2,
16 EDIT_OP_REP = 0x3
17 };
18
19 #define GX_EDITOP_VAL(op) ((op) >> 2)
20 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
21 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
22
23 #ifdef GDEBUG
24 enum {c_black=0,
25 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
26 };
27
28 void color_fg(int c, FILE* f=stderr);
29 void color_bg(int c, FILE* f=stderr);
30 void color_resetfg(FILE* f=stderr);
31 void color_resetbg(FILE* f=stderr);
32 void color_reset(FILE* f=stderr);
33 void color_normal(FILE* f=stderr);
34 #endif
35
36 struct GXEditScript{
37 uint32 *ops; // array of edit operations
38 uint32 opsize, opnum; // size of allocation, number in use
39 uint32 oplast; // most recent operation added
40 //methods
41
42 GXEditScript() {
43 init();
44 }
45 ~GXEditScript() {
46 GFREE(ops);
47 }
48 void init() {
49 ops = NULL;
50 opsize = 0;
51 opnum = 0;
52 oplast = 0;
53 getReady(8);
54 }
55
56 int getReady(uint32 n) {
57 uint32 m = n + n/2;
58 if (opsize <= n) {
59 GREALLOC(ops, m*sizeof(uint32));
60 opsize = m;
61 }
62 return 1;
63 }
64
65 int getReady2(uint32 n) {
66 if (opsize - opnum <= n)
67 return getReady(n + opnum);
68 return 1;
69 }
70
71 int Put(uint32 op, uint32 n) {
72 if (!getReady2(2))
73 return 0;
74 oplast = op;
75 ops[opnum] = GX_EDITOP_CONS(op, n);
76 opnum += 1;
77 ops[opnum] = 0; // sentinel
78 return 1;
79 }
80 uint32* First() {
81 return opnum > 0 ? & ops[0] : NULL;
82 }
83
84 uint32* Next(uint32 *op) {
85 // assumes flat address space !
86 if (&ops[0] <= op && op < &ops[opnum-1])
87 return op+1;
88 else
89 return 0;
90 }
91
92 int More(uint32 op, uint32 k) {
93 if (op == EDIT_OP_ERR) {
94 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
95 return -1;
96 }
97
98 if (GX_EDITOP_GET(oplast) == op) {
99 uint32 l=ops[opnum-1];
100 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
101 (GX_EDITOP_VAL(l) + k));
102 }
103 else {
104 Put(op, k);
105 }
106
107 return 0;
108 }
109
110 GXEditScript* Append(GXEditScript *et) {
111 uint32 *op;
112 for (op = et->First(); op; op = et->Next(op))
113 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
114 return this;
115 }
116
117 int opDel(uint32 k) {
118 return More(EDIT_OP_DEL, k);
119 }
120 int opIns(uint32 k) {
121 return More(EDIT_OP_INS, k);
122 }
123 int opRep(uint32 k) {
124 return More(EDIT_OP_REP, k);
125 }
126
127 GXEditScript *reverse() {
128 const uint32 mid = opnum/2;
129 const uint32 end = opnum-1;
130 for (uint32 i = 0; i < mid; ++i) {
131 const uint32 t = ops[i];
132 ops[i] = ops[end-i];
133 ops[end-i] = t;
134 }
135 return this;
136 }
137 };
138
139
140 struct GX3Val {
141 int I, C, D;
142 };
143
144 // ----- pool allocator -----
145 struct GXAlnSpace {
146 GX3Val* space_array;
147 int used, size;
148 GXAlnSpace *next;
149 //methods
150 GXAlnSpace() {
151 uint32 amount=MAX_SPACE;
152 GCALLOC(space_array, sizeof(GX3Val)*amount);
153 if (space_array == NULL) {
154 GError("Failed to allocated GXAlnSpace for greedy extension!\n");
155 return;
156 }
157 used = 0;
158 size = amount;
159 next = NULL;
160 }
161 void refresh() {
162 GXAlnSpace* sp=this;
163 while (sp) {
164 sp->used = 0;
165 sp = sp->next;
166 }
167 }
168 ~GXAlnSpace() {
169 GXAlnSpace* next_sp;
170 GXAlnSpace* sp=this->next;
171 while (sp) {
172 next_sp = sp->next;
173 GFREE(sp->space_array);
174 delete sp;
175 sp = next_sp;
176 }
177 GFREE(space_array);
178 }
179
180 GX3Val* getSpace(int amount) {
181 GX3Val* v;
182 if (amount < 0) return NULL;
183 GXAlnSpace* S=this;
184 while (used+amount > S->size) {
185 if (next == NULL) {
186 next=new GXAlnSpace();
187 }
188 S = S->next;
189 }
190 v = S->space_array+S->used;
191 S->used += amount;
192 return v;
193 }
194 };
195
196 #define ERROR_FRACTION 6 // 1/this : the higher it is, the smaller the diagonal band
197
198 struct GXAlnMem { //GreedyAlignMem
199 int** flast_d; //flast_d[][]
200 int* max_row_free;
201 int* uplow_free;
202 GXAlnSpace* space;
203 int max_d;
204 GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
205 flast_d=NULL;
206 max_row_free=NULL;
207 max_d=0;
208 uplow_free=NULL;
209 space=NULL;
210 max_d = (int)(max_len / ERROR_FRACTION + 1);
211 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
212 GCALLOC(flast_d, (max_d+2)* sizeof(int*));
213 if (flast_d == NULL) {
214 GError("Error: cannot allocate GXAlnMem structure!\n");
215 }
216 flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
217 if (flast_d[0] == NULL) {
218 GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
219 }
220 flast_d[1] = flast_d[0] + max_d + max_d + 6;
221 uplow_free = NULL;
222 max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
223 space = new GXAlnSpace();
224 }
225 ~GXAlnMem() {
226 delete space;
227 GFREE(max_row_free);
228 GFREE(flast_d[0]);
229 GFREE(flast_d);
230 }
231 };
232
233 int GXGreedyAlign(const char* s1, int len1,
234 const char* s2, int len2,
235 bool reverse, int xdrop_threshold,
236 int match_cost, int mismatch_cost,
237 int* e1, int* e2,
238 GXAlnMem* abmp, GXEditScript *S=NULL);
239
240 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
241 #define GAPALIGN_INS ((unsigned char)1)
242 #define GAPALIGN_DEL ((unsigned char)2)
243 #define GAPALIGN_DECLINE ((unsigned char)3)
244
245 struct GapXEditScript {
246 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
247 int num; // Number of operations
248 GapXEditScript* next;
249 GapXEditScript() {
250 op_type=0;
251 num=0;
252 next=NULL;
253 }
254 void print();
255 };
256
257 class CSeqGap { //
258 public:
259 int offset;
260 int len;
261 CSeqGap(int gofs=0,int glen=1) {
262 offset=gofs;
263 len=glen;
264 }
265 };
266
267 class CAlnGapInfo {
268 int a_ofs; //alignment start on seq a (0 based)
269 int b_ofs; //alignment start on seq b (0 based)
270 int a_len; //length of alignment on seq a
271 int b_len; //length of alignment on seq b
272 public:
273 GVec<CSeqGap> a_gaps;
274 GVec<CSeqGap> b_gaps;
275 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
276 a_ofs=astart;
277 b_ofs=bstart;
278 a_len=0;
279 b_len=0;
280 for (uint32 i=0; i<ed_script->opnum; i++) {
281 int num=((ed_script->ops[i]) >> 2);
282 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
283 if (op_type == 3 || op_type < 0 )
284 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
285 CSeqGap gap;
286 switch (op_type) {
287 case GAPALIGN_SUB: a_len+=num;
288 b_len+=num;
289 break;
290 case GAPALIGN_INS: a_len+=num;
291 gap.offset=b_ofs+b_len;
292 gap.len=num;
293 b_gaps.Add(gap);
294 break;
295 case GAPALIGN_DEL: b_len+=num;
296 gap.offset=a_ofs+a_len;
297 gap.len=num;
298 a_gaps.Add(gap);
299 break;
300 }
301 }
302 }
303 #ifdef GDEBUG
304 void printAlignment(FILE* f, const char* sa, const char* sb) {
305 //print seq A
306 char al[1024]; //display buffer for seq A
307 int ap=0; //index in al[] for current character printed
308 int g=0;
309 int aend=a_ofs+a_len;
310 if (a_ofs<b_ofs) {
311 for (int i=0;i<b_ofs-a_ofs;i++) {
312 fprintf(f, " ");
313 al[++ap]=' ';
314 }
315 }
316 for (int i=0;i<aend;i++) {
317 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
318 for (int j=0;j<a_gaps[g].len;j++) {
319 fprintf(f, "-");
320 al[++ap]='-';
321 }
322 g++;
323 }
324 if (i==a_ofs) color_bg(c_blue,f);
325 fprintf(f, "%c", sa[i]);
326 al[++ap]=sa[i];
327 }
328 color_reset(f);
329 fprintf(f, &sa[aend]);
330 fprintf(f, "\n");
331 //print seq B
332 ap=0;
333 g=0;
334 int bend=b_ofs+b_len;
335 if (a_ofs>b_ofs) {
336 for (int i=0;i<a_ofs-b_ofs;i++) {
337 fprintf(f, " ");
338 ap++;
339 }
340 }
341 for (int i=0;i<b_ofs;i++) {
342 fprintf(f, "%c", sb[i]);
343 ap++;
344 }
345 for (int i=b_ofs;i<bend;i++) {
346 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
347 for (int j=0;j<b_gaps[g].len;j++) {
348 fprintf(f, "-");
349 ap++;
350 }
351 g++;
352 }
353 if (i==b_ofs) color_bg(c_blue,f);
354 ap++;
355 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
356 if (mismatch) color_bg(c_red,f);
357 fprintf(f, "%c", sb[i]);
358 if (mismatch) color_bg(c_blue,f);
359 }
360 color_reset(f);
361 fprintf(f, &sb[bend]);
362 fprintf(f, "\n");
363 }
364 #endif
365 };
366
367
368 struct GXAlnInfo {
369 const char *qseq;
370 int ql,qr;
371 const char *sseq;
372 int sl,sr;
373 int score;
374 double pid;
375 GXEditScript* editscript;
376 GXAlnMem* abmp;
377 CAlnGapInfo* gapinfo;
378 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
379 int sc=0, double percid=0) {
380 qseq=q;
381 sseq=s;
382 ql=q_l;
383 qr=q_r;
384 sl=s_l;
385 sr=s_r;
386 score=sc;
387 pid=percid;
388 //editscript=NULL;
389 gapinfo=NULL;
390 }
391 ~GXAlnInfo() {
392 delete editscript;
393 delete abmp;
394 delete gapinfo;
395 }
396 bool operator<(GXAlnInfo& d) {
397 return ((score==d.score)? pid>d.pid : score>d.score);
398 }
399 bool operator>(GXAlnInfo& d) {
400 return ((score==d.score)? pid<d.pid : score<d.score);
401 }
402 bool operator==(GXAlnInfo& d) {
403 return (score==d.score && pid==d.pid);
404 }
405
406 };
407
408
409
410 struct GXSeed {
411 int b_ofs; //0-based coordinate on seq b (x coordinate)
412 int a_ofs; //0-based coordinate on seq a (y coordinate)
413 int len; //length of exact match after extension
414 bool operator<(GXSeed& d){
415 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
416 }
417 bool operator>(GXSeed& d){
418 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
419 }
420 bool operator==(GXSeed& d){
421 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
422 }
423 GXSeed(int aofs=0, int bofs=0, int l=4) {
424 a_ofs=aofs;
425 b_ofs=bofs;
426 len=l;
427 }
428 };
429
430 int cmpSeedDiag(const pointer p1, const pointer p2);
431 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
432
433 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
434 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
435
436 struct GXBand {
437 //bundle of seed matches on 3 adjacent diagonals
438 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
439 //seeds for this, and diag+1 and diag+2 are stored here
440 int min_a, max_a; //maximal coordinates of the bundle
441 int min_b, max_b;
442 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
443 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
444 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
445 diag=start_diag;
446 min_a=MAX_INT;
447 min_b=MAX_INT;
448 max_a=0;
449 max_b=0;
450 score=0;
451 if (seed!=NULL) addSeed(seed);
452 }
453 void addSeed(GXSeed* seed) {
454 seeds.Add(seed);
455 score+=seed->len;
456 //if (diag<0) diag=seed->diag; //should NOT be done like this
457 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
458 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
459 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
460 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
461 }
462 void finalize() {
463 //!! to be called only AFTER all seeds have been added
464 // seeds are sorted by b_ofs
465 //penalize seed gaps and overlaps on b sequence
466 for (int i=1;i<seeds.Count();i++) {
467 GXSeed& sprev=*seeds[i-1];
468 GXSeed& scur=*seeds[i];
469 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
470 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
471 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
472 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
473 int max_gap=b_gap;
474 int min_gap=a_gap;
475 if (min_gap>max_gap) swap(max_gap, min_gap);
476 if (min_gap<0) { //overlap
477 if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
478 else score+=min_gap;
479 }
480 else { //gap
481 score-=max_gap;
482 }
483 }//for each seed
484 }
485
486 //bands will be sorted by decreasing score eventually, after all seeds are added
487 bool operator<(GXBand& d){
488 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
489 }
490 bool operator>(GXBand& d){
491 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
492 }
493 bool operator==(GXBand& d){
494 return (score==d.score && seeds.Count()==d.seeds.Count());
495 }
496
497 };
498
499 class GXBandSet:public GList<GXBand> {
500 public:
501 int idxoffset; //global anti-diagonal->index offset (a_len-1)
502 //used to convert a diagonal to an index
503 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
504 //hence offset is a_len-1
505 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
506 return Get(diag+idxoffset);
507 }
508 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
509 return Get(b_ofs-a_ofs+idxoffset);
510 }
511 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
512 idxoffset=a_len-1;
513 //diag will range from -a_len+1 to b_len-1, so after adjustment
514 //by idxoffset we get a max of a_len+b_len-2
515 int bcount=a_len+b_len-1;
516 for (int i=0;i<bcount;i++)
517 this->Add(new GXBand(i-idxoffset));
518 //unsorted, this should set fList[i]
519 }
520 void addSeed(GXSeed* seed) {
521 //MUST be unsorted !!!
522 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
523 fList[idx]->addSeed(seed);
524 if (idx>0) fList[idx-1]->addSeed(seed);
525 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
526 }
527 };
528
529 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
530 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
531
532 void printEditScript(GXEditScript* ed_script);
533
534
535 // reward MUST be >1, always
536 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
537 const char* s_seq, int s_alnstart, int s_max,
538 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
539 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
540 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
541
542 #endif