ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 84
Committed: Fri Sep 30 03:19:32 2011 UTC (9 years, 7 months ago) by gpertea
File size: 15675 byte(s)
Log Message:
wip bad

Line User Rev File contents
1 gpertea 74 #ifndef _GXALIGN_H
2    
3     #include "GBase.h"
4     #include "GList.hh"
5    
6     #define GDEBUG 1
7    
8 gpertea 82 #define MAX_SPACE 2000000
9 gpertea 74
10     enum {
11     EDIT_OP_MASK = 0x3,
12     EDIT_OP_ERR = 0x0,
13     EDIT_OP_INS = 0x1,
14     EDIT_OP_DEL = 0x2,
15     EDIT_OP_REP = 0x3
16     };
17    
18     #define GX_EDITOP_VAL(op) ((op) >> 2)
19     #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
20     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
21    
22     #ifdef GDEBUG
23     enum {c_black=0,
24     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
25     };
26    
27     void color_fg(int c, FILE* f=stderr);
28     void color_bg(int c, FILE* f=stderr);
29     void color_resetfg(FILE* f=stderr);
30     void color_resetbg(FILE* f=stderr);
31     void color_reset(FILE* f=stderr);
32     void color_normal(FILE* f=stderr);
33     #endif
34    
35     struct GXEditScript{
36     uint32 *ops; // array of edit operations
37     uint32 opsize, opnum; // size of allocation, number in use
38     uint32 oplast; // most recent operation added
39     //methods
40    
41     GXEditScript() {
42     init();
43     }
44     ~GXEditScript() {
45     GFREE(ops);
46     }
47     void init() {
48     ops = NULL;
49     opsize = 0;
50     opnum = 0;
51     oplast = 0;
52     getReady(8);
53     }
54    
55     int getReady(uint32 n) {
56     uint32 m = n + n/2;
57     if (opsize <= n) {
58     GREALLOC(ops, m*sizeof(uint32));
59     opsize = m;
60     }
61     return 1;
62     }
63    
64     int getReady2(uint32 n) {
65     if (opsize - opnum <= n)
66     return getReady(n + opnum);
67     return 1;
68     }
69    
70     int Put(uint32 op, uint32 n) {
71     if (!getReady2(2))
72     return 0;
73     oplast = op;
74     ops[opnum] = GX_EDITOP_CONS(op, n);
75     opnum += 1;
76     ops[opnum] = 0; // sentinel
77     return 1;
78     }
79     uint32* First() {
80     return opnum > 0 ? & ops[0] : NULL;
81     }
82    
83     uint32* Next(uint32 *op) {
84     // assumes flat address space !
85     if (&ops[0] <= op && op < &ops[opnum-1])
86     return op+1;
87     else
88     return 0;
89     }
90    
91     int More(uint32 op, uint32 k) {
92     if (op == EDIT_OP_ERR) {
93     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
94     return -1;
95     }
96    
97     if (GX_EDITOP_GET(oplast) == op) {
98     uint32 l=ops[opnum-1];
99     ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
100     (GX_EDITOP_VAL(l) + k));
101     }
102     else {
103     Put(op, k);
104     }
105    
106     return 0;
107     }
108    
109     GXEditScript* Append(GXEditScript *et) {
110     uint32 *op;
111     for (op = et->First(); op; op = et->Next(op))
112     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
113     return this;
114     }
115    
116     int opDel(uint32 k) {
117     return More(EDIT_OP_DEL, k);
118     }
119     int opIns(uint32 k) {
120     return More(EDIT_OP_INS, k);
121     }
122     int opRep(uint32 k) {
123     return More(EDIT_OP_REP, k);
124     }
125    
126     GXEditScript *reverse() {
127     const uint32 mid = opnum/2;
128     const uint32 end = opnum-1;
129     for (uint32 i = 0; i < mid; ++i) {
130     const uint32 t = ops[i];
131     ops[i] = ops[end-i];
132     ops[end-i] = t;
133     }
134     return this;
135     }
136     };
137    
138    
139 gpertea 81 // ----- pool allocator -----
140 gpertea 82 // works as a linked list of allocated memory blocks
141     struct GXMemPool {
142     char* memblock;
143 gpertea 74 int used, size;
144 gpertea 82 GXMemPool *next;
145 gpertea 74 //methods
146 gpertea 82 GXMemPool() { //by default allocate a large block here (10M)
147     uint32 amount= MAX_SPACE<<3;
148     GCALLOC(memblock, amount);
149     if (memblock == NULL) {
150 gpertea 74 GError("Failed to allocated GXAlnSpace for greedy extension!\n");
151     return;
152     }
153     used = 0;
154     size = amount;
155     next = NULL;
156     }
157     void refresh() {
158 gpertea 82 GXMemPool* sp=this;
159 gpertea 74 while (sp) {
160     sp->used = 0;
161     sp = sp->next;
162     }
163     }
164 gpertea 82 ~GXMemPool() {
165     GXMemPool* next_sp;
166     GXMemPool* sp=this->next;
167 gpertea 74 while (sp) {
168     next_sp = sp->next;
169 gpertea 82 GFREE(sp->memblock);
170 gpertea 74 delete sp;
171     sp = next_sp;
172     }
173 gpertea 82 GFREE(memblock);
174 gpertea 74 }
175 gpertea 79
176 gpertea 82 void* getSpace(int amount) { //amount to use or allocate memory, in bytes
177     //can use the first found memory block with enough room,
178     // or allocate a new large block
179     char* v;
180 gpertea 74 if (amount < 0) return NULL;
181 gpertea 82 GXMemPool* S=this;
182 gpertea 74 while (used+amount > S->size) {
183 gpertea 82 //no room in current block, get a new mem block
184 gpertea 74 if (next == NULL) {
185 gpertea 82 next=new GXMemPool(); //allocates a large contiguous memory block
186 gpertea 74 }
187     S = S->next;
188     }
189 gpertea 82 v = S->memblock+S->used;
190 gpertea 74 S->used += amount;
191 gpertea 82 //align to next 8-byte boundary
192     int m8 = S->used & 7; //modulo 8
193     if (m8)
194     S->used += 8 - m8;
195     return (void*)v;
196 gpertea 74 }
197     };
198    
199 gpertea 84 #define ERROR_FRACTION 5 // 1/this : the higher it is, the smaller the diagonal band
200 gpertea 79
201 gpertea 74 struct GXAlnMem { //GreedyAlignMem
202 gpertea 81 int** flast_d; //flast_d[][]
203 gpertea 74 int* max_row_free;
204     int* uplow_free;
205 gpertea 82 GXMemPool* space;
206 gpertea 84 int MAX_D;
207 gpertea 74 GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
208     flast_d=NULL;
209     max_row_free=NULL;
210 gpertea 84 MAX_D=0;
211 gpertea 74 uplow_free=NULL;
212     space=NULL;
213 gpertea 84 MAX_D = (int)(max_len / ERROR_FRACTION + 1);
214     //MAX_D = 4;
215 gpertea 74 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
216 gpertea 84 GCALLOC(flast_d, (MAX_D+2)* sizeof(int*));
217 gpertea 74 if (flast_d == NULL) {
218     GError("Error: cannot allocate GXAlnMem structure!\n");
219     }
220 gpertea 84 flast_d[0] = (int*)malloc(((MAX_D<<1) + 6) * sizeof(int) * 2);
221 gpertea 82 //allocates two int rows - flast_d[1] too
222 gpertea 74 if (flast_d[0] == NULL) {
223     GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
224     }
225 gpertea 84 flast_d[1] = flast_d[0] + (MAX_D<<1) + 6; //int array for next row
226 gpertea 74 uplow_free = NULL;
227 gpertea 84 max_row_free = (int*) malloc(sizeof(int) * (MAX_D + 1 + d_diff));
228 gpertea 82 space = new GXMemPool();
229 gpertea 74 }
230     ~GXAlnMem() {
231     delete space;
232     GFREE(max_row_free);
233 gpertea 79 GFREE(flast_d[0]);
234     GFREE(flast_d);
235 gpertea 74 }
236     };
237    
238     int GXGreedyAlign(const char* s1, int len1,
239     const char* s2, int len2,
240     bool reverse, int xdrop_threshold,
241     int match_cost, int mismatch_cost,
242     int* e1, int* e2,
243     GXAlnMem* abmp, GXEditScript *S=NULL);
244    
245     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
246     #define GAPALIGN_INS ((unsigned char)1)
247     #define GAPALIGN_DEL ((unsigned char)2)
248     #define GAPALIGN_DECLINE ((unsigned char)3)
249    
250     struct GapXEditScript {
251     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
252     int num; // Number of operations
253     GapXEditScript* next;
254     GapXEditScript() {
255     op_type=0;
256     num=0;
257     next=NULL;
258     }
259     void print();
260     };
261    
262     class CSeqGap { //
263     public:
264     int offset;
265     int len;
266     CSeqGap(int gofs=0,int glen=1) {
267     offset=gofs;
268     len=glen;
269     }
270     };
271    
272     class CAlnGapInfo {
273     int a_ofs; //alignment start on seq a (0 based)
274     int b_ofs; //alignment start on seq b (0 based)
275     int a_len; //length of alignment on seq a
276     int b_len; //length of alignment on seq b
277     public:
278     GVec<CSeqGap> a_gaps;
279     GVec<CSeqGap> b_gaps;
280     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
281     a_ofs=astart;
282     b_ofs=bstart;
283     a_len=0;
284 gpertea 82 b_len=0;
285     if (ed_script==NULL) return;
286     for (uint32 i=0; i<ed_script->opnum; i++) {
287     int num=((ed_script->ops[i]) >> 2);
288     char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
289     if (op_type == 3 || op_type < 0 )
290     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
291     CSeqGap gap;
292     switch (op_type) {
293     case GAPALIGN_SUB: a_len+=num;
294     b_len+=num;
295     break;
296     case GAPALIGN_INS: a_len+=num;
297     gap.offset=b_ofs+b_len;
298     gap.len=num;
299     b_gaps.Add(gap);
300     break;
301     case GAPALIGN_DEL: b_len+=num;
302     gap.offset=a_ofs+a_len;
303     gap.len=num;
304     a_gaps.Add(gap);
305     break;
306     }
307     }
308     }
309    
310 gpertea 74 #ifdef GDEBUG
311 gpertea 83 void printAlignment(FILE* f, const char* sa, int sa_len,
312     const char* sb, int sb_len) {
313 gpertea 82 //print seq A
314     char al[1024]; //display buffer for seq A
315     int ap=0; //index in al[] for current character printed
316     int g=0;
317     int aend=a_ofs+a_len;
318     if (a_ofs<b_ofs) {
319     for (int i=0;i<b_ofs-a_ofs;i++) {
320 gpertea 74 fprintf(f, " ");
321     al[++ap]=' ';
322     }
323     }
324     for (int i=0;i<aend;i++) {
325     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
326     for (int j=0;j<a_gaps[g].len;j++) {
327     fprintf(f, "-");
328     al[++ap]='-';
329     }
330     g++;
331     }
332     if (i==a_ofs) color_bg(c_blue,f);
333     fprintf(f, "%c", sa[i]);
334     al[++ap]=sa[i];
335     }
336     color_reset(f);
337 gpertea 83 if (aend<sa_len)
338     fprintf(f, &sa[aend]);
339 gpertea 74 fprintf(f, "\n");
340     //print seq B
341     ap=0;
342     g=0;
343     int bend=b_ofs+b_len;
344     if (a_ofs>b_ofs) {
345     for (int i=0;i<a_ofs-b_ofs;i++) {
346     fprintf(f, " ");
347     ap++;
348     }
349     }
350     for (int i=0;i<b_ofs;i++) {
351     fprintf(f, "%c", sb[i]);
352     ap++;
353     }
354     for (int i=b_ofs;i<bend;i++) {
355     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
356     for (int j=0;j<b_gaps[g].len;j++) {
357     fprintf(f, "-");
358     ap++;
359     }
360     g++;
361     }
362     if (i==b_ofs) color_bg(c_blue,f);
363     ap++;
364     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
365     if (mismatch) color_bg(c_red,f);
366     fprintf(f, "%c", sb[i]);
367     if (mismatch) color_bg(c_blue,f);
368     }
369     color_reset(f);
370 gpertea 83 if (bend<sb_len)
371     fprintf(f, &sb[bend]);
372 gpertea 74 fprintf(f, "\n");
373     }
374     #endif
375     };
376    
377    
378     struct GXAlnInfo {
379     const char *qseq;
380     int ql,qr;
381     const char *sseq;
382     int sl,sr;
383     int score;
384     double pid;
385     GXEditScript* editscript;
386     GXAlnMem* abmp;
387     CAlnGapInfo* gapinfo;
388     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
389     int sc=0, double percid=0) {
390     qseq=q;
391     sseq=s;
392     ql=q_l;
393     qr=q_r;
394     sl=s_l;
395     sr=s_r;
396     score=sc;
397     pid=percid;
398     //editscript=NULL;
399     gapinfo=NULL;
400     }
401     ~GXAlnInfo() {
402     delete editscript;
403     delete abmp;
404     delete gapinfo;
405     }
406 gpertea 78 bool operator<(GXAlnInfo& d) {
407     return ((score==d.score)? pid>d.pid : score>d.score);
408     }
409     bool operator>(GXAlnInfo& d) {
410     return ((score==d.score)? pid<d.pid : score<d.score);
411     }
412     bool operator==(GXAlnInfo& d) {
413     return (score==d.score && pid==d.pid);
414     }
415    
416 gpertea 74 };
417    
418    
419    
420     struct GXSeed {
421 gpertea 77 int b_ofs; //0-based coordinate on seq b (x coordinate)
422     int a_ofs; //0-based coordinate on seq a (y coordinate)
423 gpertea 74 int len; //length of exact match after extension
424     bool operator<(GXSeed& d){
425 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
426 gpertea 74 }
427     bool operator>(GXSeed& d){
428 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
429 gpertea 74 }
430     bool operator==(GXSeed& d){
431 gpertea 77 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
432 gpertea 74 }
433     GXSeed(int aofs=0, int bofs=0, int l=4) {
434     a_ofs=aofs;
435     b_ofs=bofs;
436     len=l;
437     }
438     };
439    
440 gpertea 81 int cmpSeedDiag(const pointer p1, const pointer p2);
441     //seeds are "equal" if they're on the same diagonal (for selection purposes only)
442 gpertea 78
443 gpertea 81 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
444     //among seeds with same length, prefer those closer to the left end of the read (seq_b)
445    
446 gpertea 77 struct GXBand {
447     //bundle of seed matches on 3 adjacent diagonals
448     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
449     //seeds for this, and diag+1 and diag+2 are stored here
450     int min_a, max_a; //maximal coordinates of the bundle
451     int min_b, max_b;
452     GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
453     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
454 gpertea 78 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
455 gpertea 77 diag=start_diag;
456     min_a=MAX_INT;
457     min_b=MAX_INT;
458     max_a=0;
459     max_b=0;
460     score=0;
461 gpertea 78 if (seed!=NULL) addSeed(seed);
462     }
463 gpertea 77 void addSeed(GXSeed* seed) {
464     seeds.Add(seed);
465     score+=seed->len;
466     //if (diag<0) diag=seed->diag; //should NOT be done like this
467     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
468     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
469     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
470     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
471     }
472     void finalize() {
473 gpertea 78 //!! to be called only AFTER all seeds have been added
474     // seeds are sorted by b_ofs
475     //penalize seed gaps and overlaps on b sequence
476 gpertea 77 for (int i=1;i<seeds.Count();i++) {
477     GXSeed& sprev=*seeds[i-1];
478     GXSeed& scur=*seeds[i];
479     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
480     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
481     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
482     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
483     int max_gap=b_gap;
484     int min_gap=a_gap;
485     if (min_gap>max_gap) swap(max_gap, min_gap);
486 gpertea 78 if (min_gap<0) { //overlap
487     if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
488     else score+=min_gap;
489 gpertea 77 }
490 gpertea 78 else { //gap
491     score-=max_gap;
492 gpertea 77 }
493 gpertea 78 }//for each seed
494 gpertea 77 }
495 gpertea 78
496 gpertea 77 //bands will be sorted by decreasing score eventually, after all seeds are added
497     bool operator<(GXBand& d){
498     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
499     }
500     bool operator>(GXBand& d){
501     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
502     }
503     bool operator==(GXBand& d){
504     return (score==d.score && seeds.Count()==d.seeds.Count());
505     }
506    
507     };
508    
509     class GXBandSet:public GList<GXBand> {
510     public:
511     int idxoffset; //global anti-diagonal->index offset (a_len-1)
512     //used to convert a diagonal to an index
513     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
514     //hence offset is a_len-1
515     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
516     return Get(diag+idxoffset);
517     }
518     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
519     return Get(b_ofs-a_ofs+idxoffset);
520     }
521     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
522     idxoffset=a_len-1;
523     //diag will range from -a_len+1 to b_len-1, so after adjustment
524     //by idxoffset we get a max of a_len+b_len-2
525     int bcount=a_len+b_len-1;
526     for (int i=0;i<bcount;i++)
527 gpertea 78 this->Add(new GXBand(i-idxoffset));
528     //unsorted, this should set fList[i]
529 gpertea 77 }
530     void addSeed(GXSeed* seed) {
531     //MUST be unsorted !!!
532     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
533     fList[idx]->addSeed(seed);
534     if (idx>0) fList[idx-1]->addSeed(seed);
535     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
536     }
537     };
538    
539     GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
540 gpertea 74 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
541    
542     void printEditScript(GXEditScript* ed_script);
543    
544 gpertea 81
545     // reward MUST be >1, always
546 gpertea 74 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
547     const char* s_seq, int s_alnstart, int s_max,
548 gpertea 81 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
549 gpertea 74 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
550 gpertea 81 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
551 gpertea 74
552     #endif