ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 83
Committed: Thu Sep 29 21:32:42 2011 UTC (9 years, 10 months ago) by gpertea
File size: 15664 byte(s)
Log Message:
wip

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5
6 #define GDEBUG 1
7
8 #define MAX_SPACE 2000000
9
10 enum {
11 EDIT_OP_MASK = 0x3,
12 EDIT_OP_ERR = 0x0,
13 EDIT_OP_INS = 0x1,
14 EDIT_OP_DEL = 0x2,
15 EDIT_OP_REP = 0x3
16 };
17
18 #define GX_EDITOP_VAL(op) ((op) >> 2)
19 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
20 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
21
22 #ifdef GDEBUG
23 enum {c_black=0,
24 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
25 };
26
27 void color_fg(int c, FILE* f=stderr);
28 void color_bg(int c, FILE* f=stderr);
29 void color_resetfg(FILE* f=stderr);
30 void color_resetbg(FILE* f=stderr);
31 void color_reset(FILE* f=stderr);
32 void color_normal(FILE* f=stderr);
33 #endif
34
35 struct GXEditScript{
36 uint32 *ops; // array of edit operations
37 uint32 opsize, opnum; // size of allocation, number in use
38 uint32 oplast; // most recent operation added
39 //methods
40
41 GXEditScript() {
42 init();
43 }
44 ~GXEditScript() {
45 GFREE(ops);
46 }
47 void init() {
48 ops = NULL;
49 opsize = 0;
50 opnum = 0;
51 oplast = 0;
52 getReady(8);
53 }
54
55 int getReady(uint32 n) {
56 uint32 m = n + n/2;
57 if (opsize <= n) {
58 GREALLOC(ops, m*sizeof(uint32));
59 opsize = m;
60 }
61 return 1;
62 }
63
64 int getReady2(uint32 n) {
65 if (opsize - opnum <= n)
66 return getReady(n + opnum);
67 return 1;
68 }
69
70 int Put(uint32 op, uint32 n) {
71 if (!getReady2(2))
72 return 0;
73 oplast = op;
74 ops[opnum] = GX_EDITOP_CONS(op, n);
75 opnum += 1;
76 ops[opnum] = 0; // sentinel
77 return 1;
78 }
79 uint32* First() {
80 return opnum > 0 ? & ops[0] : NULL;
81 }
82
83 uint32* Next(uint32 *op) {
84 // assumes flat address space !
85 if (&ops[0] <= op && op < &ops[opnum-1])
86 return op+1;
87 else
88 return 0;
89 }
90
91 int More(uint32 op, uint32 k) {
92 if (op == EDIT_OP_ERR) {
93 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
94 return -1;
95 }
96
97 if (GX_EDITOP_GET(oplast) == op) {
98 uint32 l=ops[opnum-1];
99 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
100 (GX_EDITOP_VAL(l) + k));
101 }
102 else {
103 Put(op, k);
104 }
105
106 return 0;
107 }
108
109 GXEditScript* Append(GXEditScript *et) {
110 uint32 *op;
111 for (op = et->First(); op; op = et->Next(op))
112 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
113 return this;
114 }
115
116 int opDel(uint32 k) {
117 return More(EDIT_OP_DEL, k);
118 }
119 int opIns(uint32 k) {
120 return More(EDIT_OP_INS, k);
121 }
122 int opRep(uint32 k) {
123 return More(EDIT_OP_REP, k);
124 }
125
126 GXEditScript *reverse() {
127 const uint32 mid = opnum/2;
128 const uint32 end = opnum-1;
129 for (uint32 i = 0; i < mid; ++i) {
130 const uint32 t = ops[i];
131 ops[i] = ops[end-i];
132 ops[end-i] = t;
133 }
134 return this;
135 }
136 };
137
138
139 // ----- pool allocator -----
140 // works as a linked list of allocated memory blocks
141 struct GXMemPool {
142 char* memblock;
143 int used, size;
144 GXMemPool *next;
145 //methods
146 GXMemPool() { //by default allocate a large block here (10M)
147 uint32 amount= MAX_SPACE<<3;
148 GCALLOC(memblock, amount);
149 if (memblock == NULL) {
150 GError("Failed to allocated GXAlnSpace for greedy extension!\n");
151 return;
152 }
153 used = 0;
154 size = amount;
155 next = NULL;
156 }
157 void refresh() {
158 GXMemPool* sp=this;
159 while (sp) {
160 sp->used = 0;
161 sp = sp->next;
162 }
163 }
164 ~GXMemPool() {
165 GXMemPool* next_sp;
166 GXMemPool* sp=this->next;
167 while (sp) {
168 next_sp = sp->next;
169 GFREE(sp->memblock);
170 delete sp;
171 sp = next_sp;
172 }
173 GFREE(memblock);
174 }
175
176 void* getSpace(int amount) { //amount to use or allocate memory, in bytes
177 //can use the first found memory block with enough room,
178 // or allocate a new large block
179 char* v;
180 if (amount < 0) return NULL;
181 GXMemPool* S=this;
182 while (used+amount > S->size) {
183 //no room in current block, get a new mem block
184 if (next == NULL) {
185 next=new GXMemPool(); //allocates a large contiguous memory block
186 }
187 S = S->next;
188 }
189 v = S->memblock+S->used;
190 S->used += amount;
191 //align to next 8-byte boundary
192 int m8 = S->used & 7; //modulo 8
193 if (m8)
194 S->used += 8 - m8;
195 return (void*)v;
196 }
197 };
198
199 #define ERROR_FRACTION 6 // 1/this : the higher it is, the smaller the diagonal band
200
201 struct GXAlnMem { //GreedyAlignMem
202 int** flast_d; //flast_d[][]
203 int* max_row_free;
204 int* uplow_free;
205 GXMemPool* space;
206 int max_d;
207 GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
208 flast_d=NULL;
209 max_row_free=NULL;
210 max_d=0;
211 uplow_free=NULL;
212 space=NULL;
213 max_d = (int)(max_len / ERROR_FRACTION + 1);
214 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
215 GCALLOC(flast_d, (max_d+2)* sizeof(int*));
216 if (flast_d == NULL) {
217 GError("Error: cannot allocate GXAlnMem structure!\n");
218 }
219 flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
220 //allocates two int rows - flast_d[1] too
221 if (flast_d[0] == NULL) {
222 GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
223 }
224 flast_d[1] = flast_d[0] + max_d + max_d + 6; //int array for next row
225 uplow_free = NULL;
226 max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
227 space = new GXMemPool();
228 }
229 ~GXAlnMem() {
230 delete space;
231 GFREE(max_row_free);
232 GFREE(flast_d[0]);
233 GFREE(flast_d);
234 }
235 };
236
237 int GXGreedyAlign(const char* s1, int len1,
238 const char* s2, int len2,
239 bool reverse, int xdrop_threshold,
240 int match_cost, int mismatch_cost,
241 int* e1, int* e2,
242 GXAlnMem* abmp, GXEditScript *S=NULL);
243
244 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
245 #define GAPALIGN_INS ((unsigned char)1)
246 #define GAPALIGN_DEL ((unsigned char)2)
247 #define GAPALIGN_DECLINE ((unsigned char)3)
248
249 struct GapXEditScript {
250 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
251 int num; // Number of operations
252 GapXEditScript* next;
253 GapXEditScript() {
254 op_type=0;
255 num=0;
256 next=NULL;
257 }
258 void print();
259 };
260
261 class CSeqGap { //
262 public:
263 int offset;
264 int len;
265 CSeqGap(int gofs=0,int glen=1) {
266 offset=gofs;
267 len=glen;
268 }
269 };
270
271 class CAlnGapInfo {
272 int a_ofs; //alignment start on seq a (0 based)
273 int b_ofs; //alignment start on seq b (0 based)
274 int a_len; //length of alignment on seq a
275 int b_len; //length of alignment on seq b
276 public:
277 GVec<CSeqGap> a_gaps;
278 GVec<CSeqGap> b_gaps;
279 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
280 a_ofs=astart;
281 b_ofs=bstart;
282 a_len=0;
283 b_len=0;
284 if (ed_script==NULL) return;
285 for (uint32 i=0; i<ed_script->opnum; i++) {
286 int num=((ed_script->ops[i]) >> 2);
287 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
288 if (op_type == 3 || op_type < 0 )
289 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
290 CSeqGap gap;
291 switch (op_type) {
292 case GAPALIGN_SUB: a_len+=num;
293 b_len+=num;
294 break;
295 case GAPALIGN_INS: a_len+=num;
296 gap.offset=b_ofs+b_len;
297 gap.len=num;
298 b_gaps.Add(gap);
299 break;
300 case GAPALIGN_DEL: b_len+=num;
301 gap.offset=a_ofs+a_len;
302 gap.len=num;
303 a_gaps.Add(gap);
304 break;
305 }
306 }
307 }
308
309 #ifdef GDEBUG
310 void printAlignment(FILE* f, const char* sa, int sa_len,
311 const char* sb, int sb_len) {
312 //print seq A
313 char al[1024]; //display buffer for seq A
314 int ap=0; //index in al[] for current character printed
315 int g=0;
316 int aend=a_ofs+a_len;
317 if (a_ofs<b_ofs) {
318 for (int i=0;i<b_ofs-a_ofs;i++) {
319 fprintf(f, " ");
320 al[++ap]=' ';
321 }
322 }
323 for (int i=0;i<aend;i++) {
324 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
325 for (int j=0;j<a_gaps[g].len;j++) {
326 fprintf(f, "-");
327 al[++ap]='-';
328 }
329 g++;
330 }
331 if (i==a_ofs) color_bg(c_blue,f);
332 fprintf(f, "%c", sa[i]);
333 al[++ap]=sa[i];
334 }
335 color_reset(f);
336 if (aend<sa_len)
337 fprintf(f, &sa[aend]);
338 fprintf(f, "\n");
339 //print seq B
340 ap=0;
341 g=0;
342 int bend=b_ofs+b_len;
343 if (a_ofs>b_ofs) {
344 for (int i=0;i<a_ofs-b_ofs;i++) {
345 fprintf(f, " ");
346 ap++;
347 }
348 }
349 for (int i=0;i<b_ofs;i++) {
350 fprintf(f, "%c", sb[i]);
351 ap++;
352 }
353 for (int i=b_ofs;i<bend;i++) {
354 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
355 for (int j=0;j<b_gaps[g].len;j++) {
356 fprintf(f, "-");
357 ap++;
358 }
359 g++;
360 }
361 if (i==b_ofs) color_bg(c_blue,f);
362 ap++;
363 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
364 if (mismatch) color_bg(c_red,f);
365 fprintf(f, "%c", sb[i]);
366 if (mismatch) color_bg(c_blue,f);
367 }
368 color_reset(f);
369 if (bend<sb_len)
370 fprintf(f, &sb[bend]);
371 fprintf(f, "\n");
372 }
373 #endif
374 };
375
376
377 struct GXAlnInfo {
378 const char *qseq;
379 int ql,qr;
380 const char *sseq;
381 int sl,sr;
382 int score;
383 double pid;
384 GXEditScript* editscript;
385 GXAlnMem* abmp;
386 CAlnGapInfo* gapinfo;
387 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
388 int sc=0, double percid=0) {
389 qseq=q;
390 sseq=s;
391 ql=q_l;
392 qr=q_r;
393 sl=s_l;
394 sr=s_r;
395 score=sc;
396 pid=percid;
397 //editscript=NULL;
398 gapinfo=NULL;
399 }
400 ~GXAlnInfo() {
401 delete editscript;
402 delete abmp;
403 delete gapinfo;
404 }
405 bool operator<(GXAlnInfo& d) {
406 return ((score==d.score)? pid>d.pid : score>d.score);
407 }
408 bool operator>(GXAlnInfo& d) {
409 return ((score==d.score)? pid<d.pid : score<d.score);
410 }
411 bool operator==(GXAlnInfo& d) {
412 return (score==d.score && pid==d.pid);
413 }
414
415 };
416
417
418
419 struct GXSeed {
420 int b_ofs; //0-based coordinate on seq b (x coordinate)
421 int a_ofs; //0-based coordinate on seq a (y coordinate)
422 int len; //length of exact match after extension
423 bool operator<(GXSeed& d){
424 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
425 }
426 bool operator>(GXSeed& d){
427 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
428 }
429 bool operator==(GXSeed& d){
430 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
431 }
432 GXSeed(int aofs=0, int bofs=0, int l=4) {
433 a_ofs=aofs;
434 b_ofs=bofs;
435 len=l;
436 }
437 };
438
439 int cmpSeedDiag(const pointer p1, const pointer p2);
440 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
441
442 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
443 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
444
445 struct GXBand {
446 //bundle of seed matches on 3 adjacent diagonals
447 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
448 //seeds for this, and diag+1 and diag+2 are stored here
449 int min_a, max_a; //maximal coordinates of the bundle
450 int min_b, max_b;
451 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
452 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
453 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
454 diag=start_diag;
455 min_a=MAX_INT;
456 min_b=MAX_INT;
457 max_a=0;
458 max_b=0;
459 score=0;
460 if (seed!=NULL) addSeed(seed);
461 }
462 void addSeed(GXSeed* seed) {
463 seeds.Add(seed);
464 score+=seed->len;
465 //if (diag<0) diag=seed->diag; //should NOT be done like this
466 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
467 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
468 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
469 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
470 }
471 void finalize() {
472 //!! to be called only AFTER all seeds have been added
473 // seeds are sorted by b_ofs
474 //penalize seed gaps and overlaps on b sequence
475 for (int i=1;i<seeds.Count();i++) {
476 GXSeed& sprev=*seeds[i-1];
477 GXSeed& scur=*seeds[i];
478 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
479 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
480 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
481 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
482 int max_gap=b_gap;
483 int min_gap=a_gap;
484 if (min_gap>max_gap) swap(max_gap, min_gap);
485 if (min_gap<0) { //overlap
486 if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
487 else score+=min_gap;
488 }
489 else { //gap
490 score-=max_gap;
491 }
492 }//for each seed
493 }
494
495 //bands will be sorted by decreasing score eventually, after all seeds are added
496 bool operator<(GXBand& d){
497 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
498 }
499 bool operator>(GXBand& d){
500 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
501 }
502 bool operator==(GXBand& d){
503 return (score==d.score && seeds.Count()==d.seeds.Count());
504 }
505
506 };
507
508 class GXBandSet:public GList<GXBand> {
509 public:
510 int idxoffset; //global anti-diagonal->index offset (a_len-1)
511 //used to convert a diagonal to an index
512 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
513 //hence offset is a_len-1
514 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
515 return Get(diag+idxoffset);
516 }
517 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
518 return Get(b_ofs-a_ofs+idxoffset);
519 }
520 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
521 idxoffset=a_len-1;
522 //diag will range from -a_len+1 to b_len-1, so after adjustment
523 //by idxoffset we get a max of a_len+b_len-2
524 int bcount=a_len+b_len-1;
525 for (int i=0;i<bcount;i++)
526 this->Add(new GXBand(i-idxoffset));
527 //unsorted, this should set fList[i]
528 }
529 void addSeed(GXSeed* seed) {
530 //MUST be unsorted !!!
531 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
532 fList[idx]->addSeed(seed);
533 if (idx>0) fList[idx-1]->addSeed(seed);
534 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
535 }
536 };
537
538 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
539 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
540
541 void printEditScript(GXEditScript* ed_script);
542
543
544 // reward MUST be >1, always
545 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
546 const char* s_seq, int s_alnstart, int s_max,
547 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
548 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
549 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
550
551 #endif