ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 86
Committed: Fri Sep 30 21:38:39 2011 UTC (9 years, 7 months ago) by gpertea
File size: 18013 byte(s)
Log Message:
fixed a few more subtle bugs

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5
6 #define GDEBUG 1
7
8 enum {
9 EDIT_OP_MASK = 0x3,
10 EDIT_OP_ERR = 0x0,
11 EDIT_OP_INS = 0x1,
12 EDIT_OP_DEL = 0x2,
13 EDIT_OP_REP = 0x3
14 };
15
16 #define GX_EDITOP_VAL(op) ((op) >> 2)
17 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
18 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
19
20 #ifdef GDEBUG
21 enum {c_black=0,
22 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
23 };
24
25 void color_fg(int c, FILE* f=stderr);
26 void color_bg(int c, FILE* f=stderr);
27 void color_resetfg(FILE* f=stderr);
28 void color_resetbg(FILE* f=stderr);
29 void color_reset(FILE* f=stderr);
30 void color_normal(FILE* f=stderr);
31 #endif
32
33 struct GXEditScript{
34 uint32 *ops; // array of edit operations
35 uint32 opsize, opnum; // size of allocation, number in use
36 uint32 oplast; // most recent operation added
37 //methods
38
39 GXEditScript() {
40 init();
41 }
42 ~GXEditScript() {
43 GFREE(ops);
44 }
45 void init() {
46 ops = NULL;
47 opsize = 0;
48 opnum = 0;
49 oplast = 0;
50 getReady(8);
51 }
52
53 int getReady(uint32 n) {
54 uint32 m = n + n/2;
55 if (opsize <= n) {
56 GREALLOC(ops, m*sizeof(uint32));
57 opsize = m;
58 }
59 return 1;
60 }
61
62 int getReady2(uint32 n) {
63 if (opsize - opnum <= n)
64 return getReady(n + opnum);
65 return 1;
66 }
67
68 int Put(uint32 op, uint32 n) {
69 if (!getReady2(2))
70 return 0;
71 oplast = op;
72 ops[opnum] = GX_EDITOP_CONS(op, n);
73 opnum += 1;
74 ops[opnum] = 0; // sentinel
75 return 1;
76 }
77 uint32* First() {
78 return opnum > 0 ? & ops[0] : NULL;
79 }
80
81 uint32* Next(uint32 *op) {
82 // assumes flat address space !
83 if (&ops[0] <= op && op < &ops[opnum-1])
84 return op+1;
85 else
86 return 0;
87 }
88
89 int More(uint32 op, uint32 k) {
90 if (op == EDIT_OP_ERR) {
91 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
92 return -1;
93 }
94
95 if (GX_EDITOP_GET(oplast) == op) {
96 uint32 l=ops[opnum-1];
97 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
98 (GX_EDITOP_VAL(l) + k));
99 }
100 else {
101 Put(op, k);
102 }
103
104 return 0;
105 }
106
107 GXEditScript* Append(GXEditScript *et) {
108 uint32 *op;
109 for (op = et->First(); op; op = et->Next(op))
110 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
111 return this;
112 }
113
114 int opDel(uint32 k) {
115 return More(EDIT_OP_DEL, k);
116 }
117 int opIns(uint32 k) {
118 return More(EDIT_OP_INS, k);
119 }
120 int opRep(uint32 k) {
121 return More(EDIT_OP_REP, k);
122 }
123
124 GXEditScript *reverse() {
125 const uint32 mid = opnum/2;
126 const uint32 end = opnum-1;
127 for (uint32 i = 0; i < mid; ++i) {
128 const uint32 t = ops[i];
129 ops[i] = ops[end-i];
130 ops[end-i] = t;
131 }
132 return this;
133 }
134 };
135
136
137 /** Bookkeeping structure for greedy alignment. When aligning
138 two sequences, the members of this structure store the
139 largest offset into the second sequence that leads to a
140 high-scoring alignment for a given start point */
141 struct SGreedyOffset {
142 int insert_off; // Best offset for a path ending in an insertion
143 int match_off; // Best offset for a path ending in a match
144 int delete_off; // Best offset for a path ending in a deletion
145 };
146
147 // ----- pool allocator -----
148 // works as a linked list of allocated memory blocks
149 struct GXMemPool {
150 SGreedyOffset* memblock;
151 int used, size;
152 GXMemPool *next;
153 static int kMinSpace;
154 //methods
155 GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
156 num_offsp=GMAX(kMinSpace, num_offsp);
157 GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
158 if (memblock == NULL) {
159 GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
160 return;
161 }
162 used = 0;
163 size = num_offsp;
164 next = NULL;
165 }
166
167 void refresh() {
168 GXMemPool* sp=this;
169 while (sp) {
170 sp->used = 0;
171 sp = sp->next;
172 }
173 }
174 ~GXMemPool() {
175 GXMemPool* next_sp;
176 GXMemPool* sp=this->next;
177 while (sp) {
178 next_sp = sp->next;
179 GFREE(sp->memblock);
180 delete sp;
181 sp = next_sp;
182 }
183 GFREE(memblock);
184 }
185
186 SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
187 //can use the first found memory block with enough room,
188 // or allocate a new large block
189 SGreedyOffset* v;
190 if (num_alloc < 0) return NULL;
191 GXMemPool* S=this;
192 while (used+num_alloc > S->size) {
193 //no room in current block, get a new mem block
194 if (next == NULL) {
195 next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
196 }
197 S = S->next;
198 }
199 v = S->memblock+S->used;
200 S->used += num_alloc;
201 //align to first 8-byte boundary
202 int m8 = S->used & 7; //modulo 8
203 if (m8)
204 S->used += 8 - m8;
205 return v;
206 }
207
208 void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
209 return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
210 }
211
212 };
213
214 #define GREEDY_MAX_COST_FRACTION 5
215 /* (was 2) sequence_length / (this number) is a measure of how hard the
216 alignment code will work to find the optimal alignment; in fact
217 this gives a worst case bound on the number of loop iterations */
218
219 #define GREEDY_MAX_COST 1000
220 // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
221 // (should be increased for large sequences)
222
223 #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
224
225 // all auxiliary memory needed for the greedy extension algorithm
226 struct SGreedyAlignMem {
227 int** last_seq2_off; // 2-D array of distances
228 int* max_score; // array of maximum scores
229 GXMemPool* space; // local memory pool for SGreedyOffset structs
230
231 // Allocate memory for the greedy gapped alignment algorithm
232 SGreedyAlignMem(int reward, int penalty, int Xdrop) {
233 int max_d, d_diff;
234 if (penalty<0) penalty=-penalty;
235 if (reward % 2) {
236 //scale params
237 reward = reward << 1;
238 penalty = (penalty << 1);
239 Xdrop = Xdrop<<1;
240 }
241 //if (gap_open == 0 && gap_extend == 0)
242 // gap_extend = (reward >> 1) + penalty;
243 const int max_dbseq_length=255; //adjust this accordingly
244 max_d = GMIN(GREEDY_MAX_COST,
245 (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
246
247 last_seq2_off=NULL; // 2-D array of distances
248 max_score=NULL; // array of maximum scores
249 space=NULL; // local memory pool for SGreedyOffset structs
250 //if (score_params.gap_open==0 && score_params.gap_extend==0) {
251 //non-affine, simpler Greedy algorithm
252 d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
253 GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
254 if (!last_seq2_off)
255 GError(GX_GALLOC_ERROR);
256 GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
257 //allocates contiguous memory for 2 rows here
258 if (!last_seq2_off[0])
259 GError(GX_GALLOC_ERROR);
260 last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
261
262 GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
263 space = new GXMemPool();
264 if (!max_score || !space)
265 GError(GX_GALLOC_ERROR);
266 } //consructor
267
268 ~SGreedyAlignMem() {
269 if (last_seq2_off) {
270 GFREE(last_seq2_off[0]);
271 GFREE(last_seq2_off);
272 }
273 GFREE(max_score);
274 delete space;
275 }
276
277 };
278
279
280 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
281 #define GAPALIGN_INS ((unsigned char)1)
282 #define GAPALIGN_DEL ((unsigned char)2)
283 #define GAPALIGN_DECLINE ((unsigned char)3)
284
285 struct GapXEditScript {
286 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
287 int num; // Number of operations
288 GapXEditScript* next;
289 GapXEditScript() {
290 op_type=0;
291 num=0;
292 next=NULL;
293 }
294 void print();
295 };
296
297 class CSeqGap { //
298 public:
299 int offset;
300 int len;
301 CSeqGap(int gofs=0,int glen=1) {
302 offset=gofs;
303 len=glen;
304 }
305 };
306
307 class CAlnGapInfo {
308 int a_ofs; //alignment start on seq a (0 based)
309 int b_ofs; //alignment start on seq b (0 based)
310 int a_len; //length of alignment on seq a
311 int b_len; //length of alignment on seq b
312 public:
313 GVec<CSeqGap> a_gaps;
314 GVec<CSeqGap> b_gaps;
315 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
316 a_ofs=astart;
317 b_ofs=bstart;
318 a_len=0;
319 b_len=0;
320 if (ed_script==NULL) return;
321 for (uint32 i=0; i<ed_script->opnum; i++) {
322 int num=((ed_script->ops[i]) >> 2);
323 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
324 if (op_type == 3 || op_type < 0 )
325 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
326 CSeqGap gap;
327 switch (op_type) {
328 case GAPALIGN_SUB: a_len+=num;
329 b_len+=num;
330 break;
331 case GAPALIGN_INS: a_len+=num;
332 gap.offset=b_ofs+b_len;
333 gap.len=num;
334 b_gaps.Add(gap);
335 break;
336 case GAPALIGN_DEL: b_len+=num;
337 gap.offset=a_ofs+a_len;
338 gap.len=num;
339 a_gaps.Add(gap);
340 break;
341 }
342 }
343 }
344
345 #ifdef GDEBUG
346 void printAlignment(FILE* f, const char* sa, int sa_len,
347 const char* sb, int sb_len) {
348 //print seq A
349 char al[1024]; //display buffer for seq A
350 int ap=0; //index in al[] for current character printed
351 int g=0;
352 int aend=a_ofs+a_len;
353 if (a_ofs<b_ofs) {
354 for (int i=0;i<b_ofs-a_ofs;i++) {
355 fprintf(f, " ");
356 al[++ap]=' ';
357 }
358 }
359 for (int i=0;i<aend;i++) {
360 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
361 for (int j=0;j<a_gaps[g].len;j++) {
362 fprintf(f, "-");
363 al[++ap]='-';
364 }
365 g++;
366 }
367 if (i==a_ofs) color_bg(c_blue,f);
368 fprintf(f, "%c", sa[i]);
369 al[++ap]=sa[i];
370 }
371 color_reset(f);
372 if (aend<sa_len)
373 fprintf(f, &sa[aend]);
374 fprintf(f, "\n");
375 //print seq B
376 ap=0;
377 g=0;
378 int bend=b_ofs+b_len;
379 if (a_ofs>b_ofs) {
380 for (int i=0;i<a_ofs-b_ofs;i++) {
381 fprintf(f, " ");
382 ap++;
383 }
384 }
385 for (int i=0;i<b_ofs;i++) {
386 fprintf(f, "%c", sb[i]);
387 ap++;
388 }
389 for (int i=b_ofs;i<bend;i++) {
390 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
391 for (int j=0;j<b_gaps[g].len;j++) {
392 fprintf(f, "-");
393 ap++;
394 }
395 g++;
396 }
397 if (i==b_ofs) color_bg(c_blue,f);
398 ap++;
399 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
400 if (mismatch) color_bg(c_red,f);
401 fprintf(f, "%c", sb[i]);
402 if (mismatch) color_bg(c_blue,f);
403 }
404 color_reset(f);
405 if (bend<sb_len)
406 fprintf(f, &sb[bend]);
407 fprintf(f, "\n");
408 }
409 #endif
410 };
411
412
413 struct GXAlnInfo {
414 const char *qseq;
415 int ql,qr;
416 const char *sseq;
417 int sl,sr;
418 int score;
419 double pid;
420 GXEditScript* editscript;
421 //GXAlnMem* abmp;
422 SGreedyAlignMem* alnmem;
423 CAlnGapInfo* gapinfo;
424 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
425 int sc=0, double percid=0) {
426 qseq=q;
427 sseq=s;
428 ql=q_l;
429 qr=q_r;
430 sl=s_l;
431 sr=s_r;
432 score=sc;
433 pid=percid;
434 //editscript=NULL;
435 gapinfo=NULL;
436 }
437 ~GXAlnInfo() {
438 delete editscript;
439 delete alnmem;
440 delete gapinfo;
441 }
442 bool operator<(GXAlnInfo& d) {
443 return ((score==d.score)? pid>d.pid : score>d.score);
444 }
445 bool operator>(GXAlnInfo& d) {
446 return ((score==d.score)? pid<d.pid : score<d.score);
447 }
448 bool operator==(GXAlnInfo& d) {
449 return (score==d.score && pid==d.pid);
450 }
451
452 };
453
454
455
456 struct GXSeed {
457 int b_ofs; //0-based coordinate on seq b (x coordinate)
458 int a_ofs; //0-based coordinate on seq a (y coordinate)
459 int len; //length of exact match after extension
460 bool operator<(GXSeed& d){
461 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
462 }
463 bool operator>(GXSeed& d){
464 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
465 }
466 bool operator==(GXSeed& d){
467 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
468 }
469 GXSeed(int aofs=0, int bofs=0, int l=4) {
470 a_ofs=aofs;
471 b_ofs=bofs;
472 len=l;
473 }
474 };
475
476 int cmpSeedDiag(const pointer p1, const pointer p2);
477 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
478
479 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
480 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
481
482 struct GXBand {
483 //bundle of seed matches on 3 adjacent diagonals
484 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
485 //seeds for this, and diag+1 and diag+2 are stored here
486 int min_a, max_a; //maximal coordinates of the bundle
487 int min_b, max_b;
488 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
489 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
490 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
491 diag=start_diag;
492 min_a=MAX_INT;
493 min_b=MAX_INT;
494 max_a=0;
495 max_b=0;
496 score=0;
497 if (seed!=NULL) addSeed(seed);
498 }
499 void addSeed(GXSeed* seed) {
500 seeds.Add(seed);
501 score+=seed->len;
502 //if (diag<0) diag=seed->diag; //should NOT be done like this
503 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
504 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
505 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
506 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
507 }
508 void finalize() {
509 //!! to be called only AFTER all seeds have been added
510 // seeds are sorted by b_ofs
511 //penalize seed gaps and overlaps on b sequence
512 for (int i=1;i<seeds.Count();i++) {
513 GXSeed& sprev=*seeds[i-1];
514 GXSeed& scur=*seeds[i];
515 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
516 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
517 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
518 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
519 int max_gap=b_gap;
520 int min_gap=a_gap;
521 if (min_gap>max_gap) swap(max_gap, min_gap);
522 if (min_gap<0) { //overlap
523 if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
524 else score+=min_gap;
525 }
526 else { //gap
527 score-=max_gap;
528 }
529 }//for each seed
530 }
531
532 //bands will be sorted by decreasing score eventually, after all seeds are added
533 //more seeds better than one longer seed?
534 bool operator<(GXBand& d){
535 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
536 }
537 bool operator>(GXBand& d){
538 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
539 }
540 bool operator==(GXBand& d){
541 //return (score==d.score && seeds.Count()==d.seeds.Count());
542 return (score==d.score && seeds.Count()==d.seeds.Count());
543 }
544
545 };
546
547 class GXBandSet:public GList<GXBand> {
548 public:
549 int idxoffset; //global anti-diagonal->index offset (a_len-1)
550 //used to convert a diagonal to an index
551 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
552 //hence offset is a_len-1
553 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
554 return Get(diag+idxoffset);
555 }
556 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
557 return Get(b_ofs-a_ofs+idxoffset);
558 }
559 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
560 idxoffset=a_len-1;
561 //diag will range from -a_len+1 to b_len-1, so after adjustment
562 //by idxoffset we get a max of a_len+b_len-2
563 int bcount=a_len+b_len-1;
564 for (int i=0;i<bcount;i++)
565 this->Add(new GXBand(i-idxoffset));
566 //unsorted, this should set fList[i]
567 }
568 void addSeed(GXSeed* seed) {
569 //MUST be unsorted !!!
570 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
571 fList[idx]->addSeed(seed);
572 if (idx>0) fList[idx-1]->addSeed(seed);
573 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
574 }
575 };
576
577
578 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
579 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
580
581 void printEditScript(GXEditScript* ed_script);
582
583
584 int GXGreedyExtend(const char* seq1, int len1,
585 const char* seq2, int len2,
586 bool reverse, int xdrop_threshold,
587 int match_cost, int mismatch_cost,
588 int& seq1_align_len, int& seq2_align_len,
589 SGreedyAlignMem& aux_data,
590 GXEditScript *edit_block);
591
592 // reward MUST be >1, always
593 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
594 const char* s_seq, int s_alnstart, int s_max,
595 bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
596 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
597 bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
598
599 #endif