ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 77
Committed: Tue Sep 27 21:07:59 2011 UTC (9 years, 8 months ago) by gpertea
File size: 15760 byte(s)
Log Message:
wip - cutting seeds, finalize()

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5
6 #define GDEBUG 1
7
8 #define ERROR_FRACTION 6 // 1/this : the higher it is, the smaller the diagonal band
9 #define MAX_SPACE 1000000
10 #define LARGE 100000000
11
12 enum {
13 EDIT_OP_MASK = 0x3,
14 EDIT_OP_ERR = 0x0,
15 EDIT_OP_INS = 0x1,
16 EDIT_OP_DEL = 0x2,
17 EDIT_OP_REP = 0x3
18 };
19
20 #define GX_EDITOP_VAL(op) ((op) >> 2)
21 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
22 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
23
24 #ifdef GDEBUG
25 enum {c_black=0,
26 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
27 };
28
29 void color_fg(int c, FILE* f=stderr);
30 void color_bg(int c, FILE* f=stderr);
31 void color_resetfg(FILE* f=stderr);
32 void color_resetbg(FILE* f=stderr);
33 void color_reset(FILE* f=stderr);
34 void color_normal(FILE* f=stderr);
35 #endif
36
37 struct GXEditScript{
38 uint32 *ops; // array of edit operations
39 uint32 opsize, opnum; // size of allocation, number in use
40 uint32 oplast; // most recent operation added
41 //methods
42
43 GXEditScript() {
44 init();
45 }
46 ~GXEditScript() {
47 GFREE(ops);
48 }
49 void init() {
50 ops = NULL;
51 opsize = 0;
52 opnum = 0;
53 oplast = 0;
54 getReady(8);
55 }
56
57 int getReady(uint32 n) {
58 uint32 m = n + n/2;
59 if (opsize <= n) {
60 GREALLOC(ops, m*sizeof(uint32));
61 opsize = m;
62 }
63 return 1;
64 }
65
66 int getReady2(uint32 n) {
67 if (opsize - opnum <= n)
68 return getReady(n + opnum);
69 return 1;
70 }
71
72 int Put(uint32 op, uint32 n) {
73 if (!getReady2(2))
74 return 0;
75 oplast = op;
76 ops[opnum] = GX_EDITOP_CONS(op, n);
77 opnum += 1;
78 ops[opnum] = 0; // sentinel
79 return 1;
80 }
81 uint32* First() {
82 return opnum > 0 ? & ops[0] : NULL;
83 }
84
85 uint32* Next(uint32 *op) {
86 // assumes flat address space !
87 if (&ops[0] <= op && op < &ops[opnum-1])
88 return op+1;
89 else
90 return 0;
91 }
92
93 int More(uint32 op, uint32 k) {
94 if (op == EDIT_OP_ERR) {
95 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
96 return -1;
97 }
98
99 if (GX_EDITOP_GET(oplast) == op) {
100 uint32 l=ops[opnum-1];
101 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102 (GX_EDITOP_VAL(l) + k));
103 }
104 else {
105 Put(op, k);
106 }
107
108 return 0;
109 }
110
111 GXEditScript* Append(GXEditScript *et) {
112 uint32 *op;
113 for (op = et->First(); op; op = et->Next(op))
114 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
115 return this;
116 }
117
118 int opDel(uint32 k) {
119 return More(EDIT_OP_DEL, k);
120 }
121 int opIns(uint32 k) {
122 return More(EDIT_OP_INS, k);
123 }
124 int opRep(uint32 k) {
125 return More(EDIT_OP_REP, k);
126 }
127
128 GXEditScript *reverse() {
129 const uint32 mid = opnum/2;
130 const uint32 end = opnum-1;
131 for (uint32 i = 0; i < mid; ++i) {
132 const uint32 t = ops[i];
133 ops[i] = ops[end-i];
134 ops[end-i] = t;
135 }
136 return this;
137 }
138 };
139
140
141 /* ----- pool allocator ----- */
142 struct GX3Val {
143 int I, C, D;
144 };
145
146 struct GXAlnSpace {
147 GX3Val* space_array;
148 int used, size;
149 GXAlnSpace *next;
150 //methods
151 GXAlnSpace() {
152 uint32 amount=MAX_SPACE;
153 GCALLOC(space_array, sizeof(GX3Val)*amount);
154 if (space_array == NULL) {
155 GError("Failed to allocated GXAlnSpace for greedy extension!\n");
156 return;
157 }
158 used = 0;
159 size = amount;
160 next = NULL;
161 }
162 void refresh() {
163 GXAlnSpace* sp=this;
164 while (sp) {
165 sp->used = 0;
166 sp = sp->next;
167 }
168 }
169 ~GXAlnSpace() {
170 GXAlnSpace* next_sp;
171 GXAlnSpace* sp=this->next;
172 while (sp) {
173 next_sp = sp->next;
174 GFREE(sp->space_array);
175 delete sp;
176 sp = next_sp;
177 }
178 }
179 GX3Val* getSpace(int amount) {
180 GX3Val* v;
181 if (amount < 0) return NULL;
182 GXAlnSpace* S=this;
183 while (used+amount > S->size) {
184 if (next == NULL) {
185 next=new GXAlnSpace();
186 }
187 S = S->next;
188 }
189 v = S->space_array+S->used;
190 S->used += amount;
191 return v;
192 }
193 };
194
195 struct GXAlnMem { //GreedyAlignMem
196 int** flast_d;
197 int* max_row_free;
198 int* uplow_free;
199 GXAlnSpace* space;
200 int max_d;
201 GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
202 flast_d=NULL;
203 max_row_free=NULL;
204 max_d=0;
205 uplow_free=NULL;
206 space=NULL;
207 max_d = (int)(max_len / ERROR_FRACTION + 1);
208 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
209 GCALLOC(flast_d, (max_d+2)* sizeof(int));
210 if (flast_d == NULL) {
211 GError("Error: cannot allocate GXAlnMem structure!\n");
212 }
213 flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
214 if (flast_d[0] == NULL) {
215 GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
216 }
217 flast_d[1] = flast_d[0] + max_d + max_d + 6;
218 uplow_free = NULL;
219 max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
220 space = new GXAlnSpace();
221 }
222 ~GXAlnMem() {
223 delete space;
224 GFREE(max_row_free);
225 //GFREE(flast_d[0]); // FIXME
226 //TODO: check with valgrind
227 //for (int i=0;i<max_d+2;i++) {
228 // GFREE(flast_d[i]);
229 // }
230 //GFREE(flast_d);
231 }
232 };
233
234 int GXGreedyAlign(const char* s1, int len1,
235 const char* s2, int len2,
236 bool reverse, int xdrop_threshold,
237 int match_cost, int mismatch_cost,
238 int* e1, int* e2,
239 GXAlnMem* abmp, GXEditScript *S=NULL);
240
241 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
242 #define GAPALIGN_INS ((unsigned char)1)
243 #define GAPALIGN_DEL ((unsigned char)2)
244 #define GAPALIGN_DECLINE ((unsigned char)3)
245
246 struct GapXEditScript {
247 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
248 int num; // Number of operations
249 GapXEditScript* next;
250 GapXEditScript() {
251 op_type=0;
252 num=0;
253 next=NULL;
254 }
255 void print();
256 };
257
258 class CSeqGap { //
259 public:
260 int offset;
261 int len;
262 CSeqGap(int gofs=0,int glen=1) {
263 offset=gofs;
264 len=glen;
265 }
266 };
267
268 class CAlnGapInfo {
269 int a_ofs; //alignment start on seq a (0 based)
270 int b_ofs; //alignment start on seq b (0 based)
271 int a_len; //length of alignment on seq a
272 int b_len; //length of alignment on seq b
273 public:
274 GVec<CSeqGap> a_gaps;
275 GVec<CSeqGap> b_gaps;
276 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
277 a_ofs=astart;
278 b_ofs=bstart;
279 a_len=0;
280 b_len=0;
281 for (uint32 i=0; i<ed_script->opnum; i++) {
282 int num=((ed_script->ops[i]) >> 2);
283 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
284 if (op_type == 3 || op_type < 0 )
285 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
286 CSeqGap gap;
287 switch (op_type) {
288 case GAPALIGN_SUB: a_len+=num;
289 b_len+=num;
290 break;
291 case GAPALIGN_INS: a_len+=num;
292 gap.offset=b_ofs+b_len;
293 gap.len=num;
294 b_gaps.Add(gap);
295 break;
296 case GAPALIGN_DEL: b_len+=num;
297 gap.offset=a_ofs+a_len;
298 gap.len=num;
299 a_gaps.Add(gap);
300 break;
301 }
302 }
303 }
304 #ifdef GDEBUG
305 void printAlignment(FILE* f, const char* sa, const char* sb) {
306 //print seq A
307 char al[1024]; //display buffer for seq A
308 int ap=0; //index in al[] for current character printed
309 int g=0;
310 int aend=a_ofs+a_len;
311 if (a_ofs<b_ofs) {
312 for (int i=0;i<b_ofs-a_ofs;i++) {
313 fprintf(f, " ");
314 al[++ap]=' ';
315 }
316 }
317 for (int i=0;i<aend;i++) {
318 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
319 for (int j=0;j<a_gaps[g].len;j++) {
320 fprintf(f, "-");
321 al[++ap]='-';
322 }
323 g++;
324 }
325 if (i==a_ofs) color_bg(c_blue,f);
326 fprintf(f, "%c", sa[i]);
327 al[++ap]=sa[i];
328 }
329 color_reset(f);
330 fprintf(f, &sa[aend]);
331 fprintf(f, "\n");
332 //print seq B
333 ap=0;
334 g=0;
335 int bend=b_ofs+b_len;
336 if (a_ofs>b_ofs) {
337 for (int i=0;i<a_ofs-b_ofs;i++) {
338 fprintf(f, " ");
339 ap++;
340 }
341 }
342 for (int i=0;i<b_ofs;i++) {
343 fprintf(f, "%c", sb[i]);
344 ap++;
345 }
346 for (int i=b_ofs;i<bend;i++) {
347 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
348 for (int j=0;j<b_gaps[g].len;j++) {
349 fprintf(f, "-");
350 ap++;
351 }
352 g++;
353 }
354 if (i==b_ofs) color_bg(c_blue,f);
355 ap++;
356 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
357 if (mismatch) color_bg(c_red,f);
358 fprintf(f, "%c", sb[i]);
359 if (mismatch) color_bg(c_blue,f);
360 }
361 color_reset(f);
362 fprintf(f, &sb[bend]);
363 fprintf(f, "\n");
364 }
365 #endif
366 };
367
368
369 struct GXAlnInfo {
370 const char *qseq;
371 int ql,qr;
372 const char *sseq;
373 int sl,sr;
374 int score;
375 double pid;
376 GXEditScript* editscript;
377 GXAlnMem* abmp;
378 CAlnGapInfo* gapinfo;
379 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
380 int sc=0, double percid=0) {
381 qseq=q;
382 sseq=s;
383 ql=q_l;
384 qr=q_r;
385 sl=s_l;
386 sr=s_r;
387 score=sc;
388 pid=percid;
389 //editscript=NULL;
390 gapinfo=NULL;
391 }
392 ~GXAlnInfo() {
393 delete editscript;
394 delete abmp;
395 delete gapinfo;
396 }
397 };
398
399
400
401 struct GXSeed {
402 int b_ofs; //0-based coordinate on seq b (x coordinate)
403 int a_ofs; //0-based coordinate on seq a (y coordinate)
404 int len; //length of exact match after extension
405 bool edited; //if it's an edited (cut) seed
406 bool operator<(GXSeed& d){
407 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
408 }
409 bool operator>(GXSeed& d){
410 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
411 }
412 bool operator==(GXSeed& d){
413 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
414 }
415 GXSeed(int aofs=0, int bofs=0, int l=4) {
416 a_ofs=aofs;
417 b_ofs=bofs;
418 edited=false;
419 len=l;
420 }
421 };
422
423 struct GXBand {
424 //bundle of seed matches on 3 adjacent diagonals
425 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
426 //seeds for this, and diag+1 and diag+2 are stored here
427 int min_a, max_a; //maximal coordinates of the bundle
428 int min_b, max_b;
429 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
430 GPVec<GXSeed> cutseeds; //copy-on-cut list of seeds that will replace the seed instances when a seed gets edited
431 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
432 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false),
433 cutseeds(10, true){
434 diag=start_diag;
435 min_a=MAX_INT;
436 min_b=MAX_INT;
437 max_a=0;
438 max_b=0;
439 score=0;
440 if (seed!=NULL) addSeed(seed);
441 }
442 void addSeed(GXSeed* seed) {
443 seeds.Add(seed);
444 score+=seed->len;
445 //if (diag<0) diag=seed->diag; //should NOT be done like this
446 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
447 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
448 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
449 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
450 }
451 void finalize() {
452 //!! to be called only AFTER all seeds have been added
453 // seeds are sorted by b_ofs
454 //penalize gaps and overlaps on b sequence
455 //cut out overlapping regions from shorter seeds
456
457 bool had_cut=false;
458 do {
459 for (int i=1;i<seeds.Count();i++) {
460 GXSeed& sprev=*seeds[i-1];
461 if (sprev.len<=0) continue;
462 GXSeed& scur=*seeds[i];
463 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
464 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
465 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
466 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
467 int max_gap=b_gap;
468 int min_gap=a_gap;
469 if (min_gap>max_gap) swap(max_gap, min_gap);
470 if (min_gap<0) {
471 //cut shorter or previous seed by -min_gap amount
472 GXSeed* cseed=NULL;
473 had_cut=true;
474 if (scur.len<sprev.len) {
475 //cut beginning of scur
476 cseed=new GXSeed(scur);
477 cseed->edited=1;
478 cseed->len+=min_gap; //this can get negative!
479 cseed->a_ofs+=min_gap;//these can get invalid
480 cseed->b_ofs+=min_gap;
481 seeds.Put(i,cseed); //TODO: re-sort?
482 }
483 else { //cut ending of sprev
484 cseed=new GXSeed(sprev);
485 cseed->edited=1;
486 cseed->len+=min_gap; //this can get negative!
487 seeds.Put(i-1,cseed);
488 }
489 }//overlap
490 }
491 } while (had_cut); //repeat until no more cuts
492 }
493 //bands will be sorted by decreasing score eventually, after all seeds are added
494 bool operator<(GXBand& d){
495 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
496 }
497 bool operator>(GXBand& d){
498 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
499 }
500 bool operator==(GXBand& d){
501 return (score==d.score && seeds.Count()==d.seeds.Count());
502 }
503
504 };
505
506 class GXBandSet:public GList<GXBand> {
507 public:
508 int idxoffset; //global anti-diagonal->index offset (a_len-1)
509 //used to convert a diagonal to an index
510 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
511 //hence offset is a_len-1
512 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
513 return Get(diag+idxoffset);
514 }
515 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
516 return Get(b_ofs-a_ofs+idxoffset);
517 }
518 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
519 idxoffset=a_len-1;
520 //diag will range from -a_len+1 to b_len-1, so after adjustment
521 //by idxoffset we get a max of a_len+b_len-2
522 int bcount=a_len+b_len-1;
523 for (int i=0;i<bcount;i++)
524 fList[i]=new GXBand(i-idxoffset);
525 }
526 void addSeed(GXSeed* seed) {
527 //MUST be unsorted !!!
528 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
529 fList[idx]->addSeed(seed);
530 if (idx>0) fList[idx-1]->addSeed(seed);
531 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
532 }
533 };
534
535 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
536 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
537
538 void printEditScript(GXEditScript* ed_script);
539
540 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
541 const char* s_seq, int s_alnstart, int s_max,
542 bool editscript=false, int reward=2, int penalty=-3, int xdrop=22);
543 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
544 bool editscript=false, int reward=2, int penalty=-3, int xdrop=22);
545
546 #endif