ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 92
Committed: Mon Oct 3 21:36:42 2011 UTC (8 years, 1 month ago) by gpertea
File size: 18667 byte(s)
Log Message:
switching to gclib/GAlnExtend

Line File contents
1 #ifndef _GXALIGN_H
2
3 #include "GBase.h"
4 #include "GList.hh"
5 #include <string.h>
6
7 //#define GDEBUG 1
8
9 enum {
10 EDIT_OP_MASK = 0x3,
11 EDIT_OP_ERR = 0x0,
12 EDIT_OP_INS = 0x1,
13 EDIT_OP_DEL = 0x2,
14 EDIT_OP_REP = 0x3
15 };
16
17 #define GX_EDITOP_VAL(op) ((op) >> 2)
18 #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
19 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
20
21 #ifdef GDEBUG
22 enum {c_black=0,
23 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
24 };
25
26 void color_fg(int c, FILE* f=stderr);
27 void color_bg(int c, FILE* f=stderr);
28 void color_resetfg(FILE* f=stderr);
29 void color_resetbg(FILE* f=stderr);
30 void color_reset(FILE* f=stderr);
31 void color_normal(FILE* f=stderr);
32 #endif
33
34 struct GXEditScript{
35 uint32 *ops; // array of edit operations
36 uint32 opsize, opnum; // size of allocation, number in use
37 uint32 oplast; // most recent operation added
38 //methods
39
40 GXEditScript() {
41 init();
42 }
43 ~GXEditScript() {
44 GFREE(ops);
45 }
46 void init() {
47 ops = NULL;
48 opsize = 0;
49 opnum = 0;
50 oplast = 0;
51 getReady(8);
52 }
53
54 int getReady(uint32 n) {
55 uint32 m = n + n/2;
56 if (opsize <= n) {
57 GREALLOC(ops, m*sizeof(uint32));
58 opsize = m;
59 }
60 return 1;
61 }
62
63 int getReady2(uint32 n) {
64 if (opsize - opnum <= n)
65 return getReady(n + opnum);
66 return 1;
67 }
68
69 int Put(uint32 op, uint32 n) {
70 if (!getReady2(2))
71 return 0;
72 oplast = op;
73 ops[opnum] = GX_EDITOP_CONS(op, n);
74 opnum += 1;
75 ops[opnum] = 0; // sentinel
76 return 1;
77 }
78 uint32* First() {
79 return opnum > 0 ? & ops[0] : NULL;
80 }
81
82 uint32* Next(uint32 *op) {
83 // assumes flat address space !
84 if (&ops[0] <= op && op < &ops[opnum-1])
85 return op+1;
86 else
87 return 0;
88 }
89
90 int More(uint32 op, uint32 k) {
91 if (op == EDIT_OP_ERR) {
92 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
93 return -1;
94 }
95
96 if (GX_EDITOP_GET(oplast) == op) {
97 uint32 l=ops[opnum-1];
98 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
99 (GX_EDITOP_VAL(l) + k));
100 }
101 else {
102 Put(op, k);
103 }
104
105 return 0;
106 }
107
108 GXEditScript* Append(GXEditScript *et) {
109 uint32 *op;
110 for (op = et->First(); op; op = et->Next(op))
111 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
112 return this;
113 }
114
115 int opDel(uint32 k) {
116 return More(EDIT_OP_DEL, k);
117 }
118 int opIns(uint32 k) {
119 return More(EDIT_OP_INS, k);
120 }
121 int opRep(uint32 k) {
122 return More(EDIT_OP_REP, k);
123 }
124
125 GXEditScript *reverse() {
126 const uint32 mid = opnum/2;
127 const uint32 end = opnum-1;
128 for (uint32 i = 0; i < mid; ++i) {
129 const uint32 t = ops[i];
130 ops[i] = ops[end-i];
131 ops[end-i] = t;
132 }
133 return this;
134 }
135 };
136
137
138 /** Bookkeeping structure for greedy alignment. When aligning
139 two sequences, the members of this structure store the
140 largest offset into the second sequence that leads to a
141 high-scoring alignment for a given start point */
142 struct SGreedyOffset {
143 int insert_off; // Best offset for a path ending in an insertion
144 int match_off; // Best offset for a path ending in a match
145 int delete_off; // Best offset for a path ending in a deletion
146 };
147
148 // ----- pool allocator -----
149 // works as a linked list of allocated memory blocks
150 struct GXMemPool {
151 SGreedyOffset* memblock;
152 int used, size;
153 GXMemPool *next;
154 static int kMinSpace;
155 //methods
156 GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
157 num_offsp=GMAX(kMinSpace, num_offsp);
158 GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
159 if (memblock == NULL) {
160 GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
161 return;
162 }
163 used = 0;
164 size = num_offsp;
165 next = NULL;
166 }
167
168 void refresh() {
169 GXMemPool* sp=this;
170 while (sp) {
171 sp->used = 0;
172 sp = sp->next;
173 }
174 }
175 ~GXMemPool() {
176 GXMemPool* next_sp;
177 GXMemPool* sp=this->next;
178 while (sp) {
179 next_sp = sp->next;
180 GFREE(sp->memblock);
181 delete sp;
182 sp = next_sp;
183 }
184 GFREE(memblock);
185 }
186
187 SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
188 //can use the first found memory block with enough room,
189 // or allocate a new large block
190 SGreedyOffset* v;
191 if (num_alloc < 0) return NULL;
192 GXMemPool* S=this;
193 while (used+num_alloc > S->size) {
194 //no room in current block, get a new mem block
195 if (next == NULL) {
196 next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
197 }
198 S = S->next;
199 }
200 v = S->memblock+S->used;
201 S->used += num_alloc;
202 //align to first 8-byte boundary
203 int m8 = S->used & 7; //modulo 8
204 if (m8)
205 S->used += 8 - m8;
206 return v;
207 }
208
209 void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
210 return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
211 }
212
213 };
214
215 #define GREEDY_MAX_COST_FRACTION 5
216 /* (was 2) sequence_length / (this number) is a measure of how hard the
217 alignment code will work to find the optimal alignment; in fact
218 this gives a worst case bound on the number of loop iterations */
219
220 #define GREEDY_MAX_COST 1000
221 // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
222 // (should be increased for large sequences)
223
224 #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
225
226 // all auxiliary memory needed for the greedy extension algorithm
227 struct SGreedyAlignMem {
228 int** last_seq2_off; // 2-D array of distances
229 int* max_score; // array of maximum scores
230 GXMemPool* space; // local memory pool for SGreedyOffset structs
231 int d_diff;
232 int max_d;
233 // Allocate memory for the greedy gapped alignment algorithm
234 SGreedyAlignMem(int reward, int penalty, int Xdrop) {
235 //int max_d, diff_d;
236 if (penalty<0) penalty=-penalty;
237 if (reward % 2) {
238 //scale params
239 reward = reward << 1;
240 penalty = (penalty << 1);
241 Xdrop = Xdrop<<1;
242 }
243 //if (gap_open == 0 && gap_extend == 0)
244 // gap_extend = (reward >> 1) + penalty;
245 const int max_dbseq_length=255; //adjust this accordingly
246 max_d = GMIN(GREEDY_MAX_COST,
247 (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
248
249 last_seq2_off=NULL; // 2-D array of distances
250 max_score=NULL; // array of maximum scores
251 space=NULL; // local memory pool for SGreedyOffset structs
252 //if (score_params.gap_open==0 && score_params.gap_extend==0) {
253 //non-affine, simpler Greedy algorithm
254 d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
255 GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
256 if (!last_seq2_off)
257 GError(GX_GALLOC_ERROR);
258 GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
259 //allocates contiguous memory for 2 rows here
260 if (!last_seq2_off[0])
261 GError(GX_GALLOC_ERROR);
262 last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
263
264 GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
265 space = new GXMemPool();
266 if (!max_score || !space)
267 GError(GX_GALLOC_ERROR);
268 } //consructor
269
270 void reset() {
271 space->refresh();
272 if (last_seq2_off) {
273 GFREE((last_seq2_off[0]));
274 }
275 GFREE(max_score);
276 GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
277 if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR);
278 //allocates contiguous memory for 2 rows here
279 last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6;
280 GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
281 if (!max_score) GError(GX_GALLOC_ERROR);
282 }
283 ~SGreedyAlignMem() {
284 if (last_seq2_off) {
285 GFREE(last_seq2_off[0]);
286 GFREE(last_seq2_off);
287 }
288 GFREE(max_score);
289 delete space;
290 }
291
292 };
293
294
295 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
296 #define GAPALIGN_INS ((unsigned char)1)
297 #define GAPALIGN_DEL ((unsigned char)2)
298 #define GAPALIGN_DECLINE ((unsigned char)3)
299
300 struct GapXEditScript {
301 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
302 int num; // Number of operations
303 GapXEditScript* next;
304 GapXEditScript() {
305 op_type=0;
306 num=0;
307 next=NULL;
308 }
309 void print();
310 };
311
312 class CSeqGap { //
313 public:
314 int offset;
315 int len;
316 CSeqGap(int gofs=0,int glen=1) {
317 offset=gofs;
318 len=glen;
319 }
320 };
321
322 class CAlnGapInfo {
323 int a_ofs; //alignment start on seq a (0 based)
324 int b_ofs; //alignment start on seq b (0 based)
325 int a_len; //length of alignment on seq a
326 int b_len; //length of alignment on seq b
327 public:
328 GVec<CSeqGap> a_gaps;
329 GVec<CSeqGap> b_gaps;
330 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
331 a_ofs=astart;
332 b_ofs=bstart;
333 a_len=0;
334 b_len=0;
335 if (ed_script==NULL) return;
336 for (uint32 i=0; i<ed_script->opnum; i++) {
337 int num=((ed_script->ops[i]) >> 2);
338 char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
339 if (op_type == 3 || op_type < 0 )
340 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
341 CSeqGap gap;
342 switch (op_type) {
343 case GAPALIGN_SUB: a_len+=num;
344 b_len+=num;
345 break;
346 case GAPALIGN_INS: a_len+=num;
347 gap.offset=b_ofs+b_len;
348 gap.len=num;
349 b_gaps.Add(gap);
350 break;
351 case GAPALIGN_DEL: b_len+=num;
352 gap.offset=a_ofs+a_len;
353 gap.len=num;
354 a_gaps.Add(gap);
355 break;
356 }
357 }
358 }
359
360 #ifdef GDEBUG
361 void printAlignment(FILE* f, const char* sa, int sa_len,
362 const char* sb, int sb_len) {
363 //print seq A
364 char al[1024]; //display buffer for seq A
365 int ap=0; //index in al[] for current character printed
366 int g=0;
367 int aend=a_ofs+a_len;
368 if (a_ofs<b_ofs) {
369 for (int i=0;i<b_ofs-a_ofs;i++) {
370 fprintf(f, " ");
371 al[++ap]=' ';
372 }
373 }
374 for (int i=0;i<aend;i++) {
375 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
376 for (int j=0;j<a_gaps[g].len;j++) {
377 fprintf(f, "-");
378 al[++ap]='-';
379 }
380 g++;
381 }
382 if (i==a_ofs) color_bg(c_blue,f);
383 fprintf(f, "%c", sa[i]);
384 al[++ap]=sa[i];
385 }
386 color_reset(f);
387 if (aend<sa_len)
388 fprintf(f, &sa[aend]);
389 fprintf(f, "\n");
390 //print seq B
391 ap=0;
392 g=0;
393 int bend=b_ofs+b_len;
394 if (a_ofs>b_ofs) {
395 for (int i=0;i<a_ofs-b_ofs;i++) {
396 fprintf(f, " ");
397 ap++;
398 }
399 }
400 for (int i=0;i<b_ofs;i++) {
401 fprintf(f, "%c", sb[i]);
402 ap++;
403 }
404 for (int i=b_ofs;i<bend;i++) {
405 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
406 for (int j=0;j<b_gaps[g].len;j++) {
407 fprintf(f, "-");
408 ap++;
409 }
410 g++;
411 }
412 if (i==b_ofs) color_bg(c_blue,f);
413 ap++;
414 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
415 if (mismatch) color_bg(c_red,f);
416 fprintf(f, "%c", sb[i]);
417 if (mismatch) color_bg(c_blue,f);
418 }
419 color_reset(f);
420 if (bend<sb_len)
421 fprintf(f, &sb[bend]);
422 fprintf(f, "\n");
423 }
424 #endif
425 };
426
427 struct GXAlnInfo {
428 const char *qseq;
429 int ql,qr;
430 const char *sseq;
431 int sl,sr;
432 int score;
433 double pid;
434 GXEditScript* editscript;
435 CAlnGapInfo* gapinfo;
436 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
437 int sc=0, double percid=0) {
438 qseq=q;
439 sseq=s;
440 ql=q_l;
441 qr=q_r;
442 sl=s_l;
443 sr=s_r;
444 score=sc;
445 pid=percid;
446 editscript=NULL;
447 gapinfo=NULL;
448 }
449 ~GXAlnInfo() {
450 delete editscript;
451 delete gapinfo;
452 }
453 bool operator<(GXAlnInfo& d) {
454 return ((score==d.score)? pid>d.pid : score>d.score);
455 }
456 bool operator>(GXAlnInfo& d) {
457 return ((score==d.score)? pid<d.pid : score<d.score);
458 }
459 bool operator==(GXAlnInfo& d) {
460 return (score==d.score && pid==d.pid);
461 }
462
463 };
464
465
466
467 struct GXSeed {
468 int b_ofs; //0-based coordinate on seq b (x coordinate)
469 int a_ofs; //0-based coordinate on seq a (y coordinate)
470 int len; //length of exact match after extension
471 bool operator<(GXSeed& d){
472 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
473 }
474 bool operator>(GXSeed& d){
475 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
476 }
477 bool operator==(GXSeed& d){
478 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
479 }
480 GXSeed(int aofs=0, int bofs=0, int l=4) {
481 a_ofs=aofs;
482 b_ofs=bofs;
483 len=l;
484 }
485 };
486
487 int cmpSeedDiag(const pointer p1, const pointer p2);
488 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
489
490 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
491 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
492
493 struct GXBand {
494 //bundle of seed matches on 3 adjacent diagonals
495 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
496 //seeds for this, and diag+1 and diag+2 are stored here
497 int min_a, max_a; //maximal coordinates of the bundle
498 int min_b, max_b;
499 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
500 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
501 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
502 diag=start_diag;
503 min_a=MAX_INT;
504 min_b=MAX_INT;
505 max_a=0;
506 max_b=0;
507 score=0;
508 if (seed!=NULL) addSeed(seed);
509 }
510 void addSeed(GXSeed* seed) {
511 seeds.Add(seed);
512 score+=seed->len;
513 //if (diag<0) diag=seed->diag; //should NOT be done like this
514 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
515 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
516 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
517 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
518 }
519 void finalize() {
520 //!! to be called only AFTER all seeds have been added
521 // seeds are sorted by b_ofs
522 //penalize seed gaps and overlaps on b sequence
523 for (int i=1;i<seeds.Count();i++) {
524 GXSeed& sprev=*seeds[i-1];
525 GXSeed& scur=*seeds[i];
526 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
527 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
528 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
529 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
530 int max_gap=b_gap;
531 int min_gap=a_gap;
532 if (min_gap>max_gap) swap(max_gap, min_gap);
533 if (min_gap<0) { //overlap
534 if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
535 else score+=min_gap;
536 }
537 else { //gap
538 score-=max_gap;
539 }
540 }//for each seed
541 }
542
543 //bands will be sorted by decreasing score eventually, after all seeds are added
544 //more seeds better than one longer seed?
545 bool operator<(GXBand& d){
546 return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
547 }
548 bool operator>(GXBand& d){
549 return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
550 }
551 bool operator==(GXBand& d){
552 //return (score==d.score && seeds.Count()==d.seeds.Count());
553 return (score==d.score && seeds.Count()==d.seeds.Count());
554 }
555
556 };
557
558 class GXBandSet:public GList<GXBand> {
559 public:
560 int idxoffset; //global anti-diagonal->index offset (a_len-1)
561 //used to convert a diagonal to an index
562 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
563 //hence offset is a_len-1
564 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
565 return Get(diag+idxoffset);
566 }
567 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
568 return Get(b_ofs-a_ofs+idxoffset);
569 }
570 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
571 idxoffset=a_len-1;
572 //diag will range from -a_len+1 to b_len-1, so after adjustment
573 //by idxoffset we get a max of a_len+b_len-2
574 int bcount=a_len+b_len-1;
575 for (int i=0;i<bcount;i++)
576 this->Add(new GXBand(i-idxoffset));
577 //unsorted, this should set fList[i]
578 }
579 void addSeed(GXSeed* seed) {
580 //MUST be unsorted !!!
581 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
582 fList[idx]->addSeed(seed);
583 if (idx>0) fList[idx-1]->addSeed(seed);
584 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
585 }
586 };
587
588
589 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb
590
591 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb
592
593 void printEditScript(GXEditScript* ed_script);
594
595
596 int GXGreedyExtend(const char* seq1, int len1,
597 const char* seq2, int len2,
598 bool reverse, int xdrop_threshold,
599 int match_cost, int mismatch_cost,
600 int& seq1_align_len, int& seq2_align_len,
601 SGreedyAlignMem& aux_data,
602 GXEditScript *edit_block);
603
604
605 enum GAlnTrimType {
606 galn_NoTrim=0,
607 galn_TrimLeft,
608 galn_TrimRight
609 };
610
611 // reward MUST be >1, always
612 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
613 const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype=galn_NoTrim,
614 bool editscript=false, SGreedyAlignMem* gxmem=NULL, int reward=2, int penalty=3, int xdrop=8);
615 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
616 bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
617
618 #endif