ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 84
Committed: Fri Sep 30 03:19:32 2011 UTC (9 years, 7 months ago) by gpertea
File size: 15675 byte(s)
Log Message:
wip bad

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5
6 #define GDEBUG 1
7
8 #define MAX_SPACE 2000000
9
10 enum {
11 EDIT_OP_MASK = 0x3,
12 EDIT_OP_ERR = 0x0,
13 EDIT_OP_INS = 0x1,
14 EDIT_OP_DEL = 0x2,
15 EDIT_OP_REP = 0x3
16 };
17
18 #define GX_EDITOP_VAL(op) ((op) >> 2)
19 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
20 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
21
22 #ifdef GDEBUG
23 enum {c_black=0,
24 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
25 };
26
27 void color_fg(int c, FILE* f=stderr);
28 void color_bg(int c, FILE* f=stderr);
29 void color_resetfg(FILE* f=stderr);
30 void color_resetbg(FILE* f=stderr);
31 void color_reset(FILE* f=stderr);
32 void color_normal(FILE* f=stderr);
33 #endif
34
35 struct GXEditScript{
36 uint32 *ops; // array of edit operations
37 uint32 opsize, opnum; // size of allocation, number in use
38 uint32 oplast; // most recent operation added
39 //methods
40
41 GXEditScript() {
42 init();
43 }
44 ~GXEditScript() {
45 GFREE(ops);
46 }
47 void init() {
48 ops = NULL;
49 opsize = 0;
50 opnum = 0;
51 oplast = 0;
52 getReady(8);
53 }
54
55 int getReady(uint32 n) {
56 uint32 m = n + n/2;
57 if (opsize <= n) {
58 GREALLOC(ops, m*sizeof(uint32));
59 opsize = m;
60 }
61 return 1;
62 }
63
64 int getReady2(uint32 n) {
65 if (opsize - opnum <= n)
66 return getReady(n + opnum);
67 return 1;
68 }
69
70 int Put(uint32 op, uint32 n) {
71 if (!getReady2(2))
72 return 0;
73 oplast = op;
74 ops[opnum] = GX_EDITOP_CONS(op, n);
75 opnum += 1;
76 ops[opnum] = 0; // sentinel
77 return 1;
78 }
79 uint32* First() {
80 return opnum > 0 ? & ops[0] : NULL;
81 }
82
83 uint32* Next(uint32 *op) {
84 // assumes flat address space !
85 if (&ops[0] <= op && op < &ops[opnum-1])
86 return op+1;
87 else
88 return 0;
89 }
90
91 int More(uint32 op, uint32 k) {
92 if (op == EDIT_OP_ERR) {
93 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
94 return -1;
95 }
96
97 if (GX_EDITOP_GET(oplast) == op) {
98 uint32 l=ops[opnum-1];
99 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
100 (GX_EDITOP_VAL(l) + k));
101 }
102 else {
103 Put(op, k);
104 }
105
106 return 0;
107 }
108
109 GXEditScript* Append(GXEditScript *et) {
110 uint32 *op;
111 for (op = et->First(); op; op = et->Next(op))
112 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
113 return this;
114 }
115
116 int opDel(uint32 k) {
117 return More(EDIT_OP_DEL, k);
118 }
119 int opIns(uint32 k) {
120 return More(EDIT_OP_INS, k);
121 }
122 int opRep(uint32 k) {
123 return More(EDIT_OP_REP, k);
124 }
125
126 GXEditScript *reverse() {
127 const uint32 mid = opnum/2;
128 const uint32 end = opnum-1;
129 for (uint32 i = 0; i < mid; ++i) {
130 const uint32 t = ops[i];
131 ops[i] = ops[end-i];
132 ops[end-i] = t;
133 }
134 return this;
135 }
136 };
137
138
139 // ----- pool allocator -----
140 // works as a linked list of allocated memory blocks
141 struct GXMemPool {
142 char* memblock;
143 int used, size;
144 GXMemPool *next;
145 //methods
146 GXMemPool() { //by default allocate a large block here (10M)
147 uint32 amount= MAX_SPACE<<3;
148 GCALLOC(memblock, amount);
149 if (memblock == NULL) {
150 GError("Failed to allocated GXAlnSpace for greedy extension!\n");
151 return;
152 }
153 used = 0;
154 size = amount;
155 next = NULL;
156 }
157 void refresh() {
158 GXMemPool* sp=this;
159 while (sp) {
160 sp->used = 0;
161 sp = sp->next;
162 }
163 }
164 ~GXMemPool() {
165 GXMemPool* next_sp;
166 GXMemPool* sp=this->next;
167 while (sp) {
168 next_sp = sp->next;
169 GFREE(sp->memblock);
170 delete sp;
171 sp = next_sp;
172 }
173 GFREE(memblock);
174 }
175
176 void* getSpace(int amount) { //amount to use or allocate memory, in bytes
177 //can use the first found memory block with enough room,
178 // or allocate a new large block
179 char* v;
180 if (amount < 0) return NULL;
181 GXMemPool* S=this;
182 while (used+amount > S->size) {
183 //no room in current block, get a new mem block
184 if (next == NULL) {
185 next=new GXMemPool(); //allocates a large contiguous memory block
186 }
187 S = S->next;
188 }
189 v = S->memblock+S->used;
190 S->used += amount;
191 //align to next 8-byte boundary
192 int m8 = S->used & 7; //modulo 8
193 if (m8)
194 S->used += 8 - m8;
195 return (void*)v;
196 }
197 };
198
199 #define ERROR_FRACTION 5 // 1/this : the higher it is, the smaller the diagonal band
200
201 struct GXAlnMem { //GreedyAlignMem
202 int** flast_d; //flast_d[][]
203 int* max_row_free;
204 int* uplow_free;
205 GXMemPool* space;
206 int MAX_D;
207 GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
208 flast_d=NULL;
209 max_row_free=NULL;
210 MAX_D=0;
211 uplow_free=NULL;
212 space=NULL;
213 MAX_D = (int)(max_len / ERROR_FRACTION + 1);
214 //MAX_D = 4;
215 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
216 GCALLOC(flast_d, (MAX_D+2)* sizeof(int*));
217 if (flast_d == NULL) {
218 GError("Error: cannot allocate GXAlnMem structure!\n");
219 }
220 flast_d[0] = (int*)malloc(((MAX_D<<1) + 6) * sizeof(int) * 2);
221 //allocates two int rows - flast_d[1] too
222 if (flast_d[0] == NULL) {
223 GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
224 }
225 flast_d[1] = flast_d[0] + (MAX_D<<1) + 6; //int array for next row
226 uplow_free = NULL;
227 max_row_free = (int*) malloc(sizeof(int) * (MAX_D + 1 + d_diff));
228 space = new GXMemPool();
229 }
230 ~GXAlnMem() {
231 delete space;
232 GFREE(max_row_free);
233 GFREE(flast_d[0]);
234 GFREE(flast_d);
235 }
236 };
237
238 int GXGreedyAlign(const char* s1, int len1,
239 const char* s2, int len2,
240 bool reverse, int xdrop_threshold,
241 int match_cost, int mismatch_cost,
242 int* e1, int* e2,
243 GXAlnMem* abmp, GXEditScript *S=NULL);
244
245 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
246 #define GAPALIGN_INS ((unsigned char)1)
247 #define GAPALIGN_DEL ((unsigned char)2)
248 #define GAPALIGN_DECLINE ((unsigned char)3)
249
250 struct GapXEditScript {
251 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
252 int num; // Number of operations
253 GapXEditScript* next;
254 GapXEditScript() {
255 op_type=0;
256 num=0;
257 next=NULL;
258 }
259 void print();
260 };
261
262 class CSeqGap { //
263 public:
264 int offset;
265 int len;
266 CSeqGap(int gofs=0,int glen=1) {
267 offset=gofs;
268 len=glen;
269 }
270 };
271
272 class CAlnGapInfo {
273 int a_ofs; //alignment start on seq a (0 based)
274 int b_ofs; //alignment start on seq b (0 based)
275 int a_len; //length of alignment on seq a
276 int b_len; //length of alignment on seq b
277 public:
278 GVec<CSeqGap> a_gaps;
279 GVec<CSeqGap> b_gaps;
280 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
281 a_ofs=astart;
282 b_ofs=bstart;
283 a_len=0;
284 b_len=0;
285 if (ed_script==NULL) return;
286 for (uint32 i=0; i<ed_script->opnum; i++) {
287 int num=((ed_script->ops[i]) >> 2);
288 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
289 if (op_type == 3 || op_type < 0 )
290 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
291 CSeqGap gap;
292 switch (op_type) {
293 case GAPALIGN_SUB: a_len+=num;
294 b_len+=num;
295 break;
296 case GAPALIGN_INS: a_len+=num;
297 gap.offset=b_ofs+b_len;
298 gap.len=num;
299 b_gaps.Add(gap);
300 break;
301 case GAPALIGN_DEL: b_len+=num;
302 gap.offset=a_ofs+a_len;
303 gap.len=num;
304 a_gaps.Add(gap);
305 break;
306 }
307 }
308 }
309
310 #ifdef GDEBUG
311 void printAlignment(FILE* f, const char* sa, int sa_len,
312 const char* sb, int sb_len) {
313 //print seq A
314 char al[1024]; //display buffer for seq A
315 int ap=0; //index in al[] for current character printed
316 int g=0;
317 int aend=a_ofs+a_len;
318 if (a_ofs<b_ofs) {
319 for (int i=0;i<b_ofs-a_ofs;i++) {
320 fprintf(f, " ");
321 al[++ap]=' ';
322 }
323 }
324 for (int i=0;i<aend;i++) {
325 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
326 for (int j=0;j<a_gaps[g].len;j++) {
327 fprintf(f, "-");
328 al[++ap]='-';
329 }
330 g++;
331 }
332 if (i==a_ofs) color_bg(c_blue,f);
333 fprintf(f, "%c", sa[i]);
334 al[++ap]=sa[i];
335 }
336 color_reset(f);
337 if (aend<sa_len)
338 fprintf(f, &sa[aend]);
339 fprintf(f, "\n");
340 //print seq B
341 ap=0;
342 g=0;
343 int bend=b_ofs+b_len;
344 if (a_ofs>b_ofs) {
345 for (int i=0;i<a_ofs-b_ofs;i++) {
346 fprintf(f, " ");
347 ap++;
348 }
349 }
350 for (int i=0;i<b_ofs;i++) {
351 fprintf(f, "%c", sb[i]);
352 ap++;
353 }
354 for (int i=b_ofs;i<bend;i++) {
355 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
356 for (int j=0;j<b_gaps[g].len;j++) {
357 fprintf(f, "-");
358 ap++;
359 }
360 g++;
361 }
362 if (i==b_ofs) color_bg(c_blue,f);
363 ap++;
364 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
365 if (mismatch) color_bg(c_red,f);
366 fprintf(f, "%c", sb[i]);
367 if (mismatch) color_bg(c_blue,f);
368 }
369 color_reset(f);
370 if (bend<sb_len)
371 fprintf(f, &sb[bend]);
372 fprintf(f, "\n");
373 }
374 #endif
375 };
376
377
378 struct GXAlnInfo {
379 const char *qseq;
380 int ql,qr;
381 const char *sseq;
382 int sl,sr;
383 int score;
384 double pid;
385 GXEditScript* editscript;
386 GXAlnMem* abmp;
387 CAlnGapInfo* gapinfo;
388 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
389 int sc=0, double percid=0) {
390 qseq=q;
391 sseq=s;
392 ql=q_l;
393 qr=q_r;
394 sl=s_l;
395 sr=s_r;
396 score=sc;
397 pid=percid;
398 //editscript=NULL;
399 gapinfo=NULL;
400 }
401 ~GXAlnInfo() {
402 delete editscript;
403 delete abmp;
404 delete gapinfo;
405 }
406 bool operator<(GXAlnInfo& d) {
407 return ((score==d.score)? pid>d.pid : score>d.score);
408 }
409 bool operator>(GXAlnInfo& d) {
410 return ((score==d.score)? pid<d.pid : score<d.score);
411 }
412 bool operator==(GXAlnInfo& d) {
413 return (score==d.score && pid==d.pid);
414 }
415
416 };
417
418
419
420 struct GXSeed {
421 int b_ofs; //0-based coordinate on seq b (x coordinate)
422 int a_ofs; //0-based coordinate on seq a (y coordinate)
423 int len; //length of exact match after extension
424 bool operator<(GXSeed& d){
425 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
426 }
427 bool operator>(GXSeed& d){
428 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
429 }
430 bool operator==(GXSeed& d){
431 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
432 }
433 GXSeed(int aofs=0, int bofs=0, int l=4) {
434 a_ofs=aofs;
435 b_ofs=bofs;
436 len=l;
437 }
438 };
439
440 int cmpSeedDiag(const pointer p1, const pointer p2);
441 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
442
443 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
444 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
445
446 struct GXBand {
447 //bundle of seed matches on 3 adjacent diagonals
448 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
449 //seeds for this, and diag+1 and diag+2 are stored here
450 int min_a, max_a; //maximal coordinates of the bundle
451 int min_b, max_b;
452 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
453 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
454 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
455 diag=start_diag;
456 min_a=MAX_INT;
457 min_b=MAX_INT;
458 max_a=0;
459 max_b=0;
460 score=0;
461 if (seed!=NULL) addSeed(seed);
462 }
463 void addSeed(GXSeed* seed) {
464 seeds.Add(seed);
465 score+=seed->len;
466 //if (diag<0) diag=seed->diag; //should NOT be done like this
467 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
468 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
469 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
470 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
471 }
472 void finalize() {
473 //!! to be called only AFTER all seeds have been added
474 // seeds are sorted by b_ofs
475 //penalize seed gaps and overlaps on b sequence
476 for (int i=1;i<seeds.Count();i++) {
477 GXSeed& sprev=*seeds[i-1];
478 GXSeed& scur=*seeds[i];
479 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
480 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
481 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
482 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
483 int max_gap=b_gap;
484 int min_gap=a_gap;
485 if (min_gap>max_gap) swap(max_gap, min_gap);
486 if (min_gap<0) { //overlap
487 if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
488 else score+=min_gap;
489 }
490 else { //gap
491 score-=max_gap;
492 }
493 }//for each seed
494 }
495
496 //bands will be sorted by decreasing score eventually, after all seeds are added
497 bool operator<(GXBand& d){
498 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
499 }
500 bool operator>(GXBand& d){
501 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
502 }
503 bool operator==(GXBand& d){
504 return (score==d.score && seeds.Count()==d.seeds.Count());
505 }
506
507 };
508
509 class GXBandSet:public GList<GXBand> {
510 public:
511 int idxoffset; //global anti-diagonal->index offset (a_len-1)
512 //used to convert a diagonal to an index
513 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
514 //hence offset is a_len-1
515 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
516 return Get(diag+idxoffset);
517 }
518 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
519 return Get(b_ofs-a_ofs+idxoffset);
520 }
521 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
522 idxoffset=a_len-1;
523 //diag will range from -a_len+1 to b_len-1, so after adjustment
524 //by idxoffset we get a max of a_len+b_len-2
525 int bcount=a_len+b_len-1;
526 for (int i=0;i<bcount;i++)
527 this->Add(new GXBand(i-idxoffset));
528 //unsorted, this should set fList[i]
529 }
530 void addSeed(GXSeed* seed) {
531 //MUST be unsorted !!!
532 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
533 fList[idx]->addSeed(seed);
534 if (idx>0) fList[idx-1]->addSeed(seed);
535 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
536 }
537 };
538
539 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
540 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
541
542 void printEditScript(GXEditScript* ed_script);
543
544
545 // reward MUST be >1, always
546 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
547 const char* s_seq, int s_alnstart, int s_max,
548 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
549 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
550 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
551
552 #endif