ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 81
Committed: Thu Sep 29 19:02:24 2011 UTC (9 years, 7 months ago) by gpertea
File size: 15456 byte(s)
Log Message:
fixes, still a bug with flast_d[d] = flast_d[d] - flower + 2;

Line User Rev File contents
1 gpertea 74 #ifndef _GXALIGN_H
2    
3     #include "GBase.h"
4     #include "GList.hh"
5    
6     #define GDEBUG 1
7    
8     #define MAX_SPACE 1000000
9     #define LARGE 100000000
10    
11     enum {
12     EDIT_OP_MASK = 0x3,
13     EDIT_OP_ERR = 0x0,
14     EDIT_OP_INS = 0x1,
15     EDIT_OP_DEL = 0x2,
16     EDIT_OP_REP = 0x3
17     };
18    
19     #define GX_EDITOP_VAL(op) ((op) >> 2)
20     #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
21     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
22    
23     #ifdef GDEBUG
24     enum {c_black=0,
25     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
26     };
27    
28     void color_fg(int c, FILE* f=stderr);
29     void color_bg(int c, FILE* f=stderr);
30     void color_resetfg(FILE* f=stderr);
31     void color_resetbg(FILE* f=stderr);
32     void color_reset(FILE* f=stderr);
33     void color_normal(FILE* f=stderr);
34     #endif
35    
36     struct GXEditScript{
37     uint32 *ops; // array of edit operations
38     uint32 opsize, opnum; // size of allocation, number in use
39     uint32 oplast; // most recent operation added
40     //methods
41    
42     GXEditScript() {
43     init();
44     }
45     ~GXEditScript() {
46     GFREE(ops);
47     }
48     void init() {
49     ops = NULL;
50     opsize = 0;
51     opnum = 0;
52     oplast = 0;
53     getReady(8);
54     }
55    
56     int getReady(uint32 n) {
57     uint32 m = n + n/2;
58     if (opsize <= n) {
59     GREALLOC(ops, m*sizeof(uint32));
60     opsize = m;
61     }
62     return 1;
63     }
64    
65     int getReady2(uint32 n) {
66     if (opsize - opnum <= n)
67     return getReady(n + opnum);
68     return 1;
69     }
70    
71     int Put(uint32 op, uint32 n) {
72     if (!getReady2(2))
73     return 0;
74     oplast = op;
75     ops[opnum] = GX_EDITOP_CONS(op, n);
76     opnum += 1;
77     ops[opnum] = 0; // sentinel
78     return 1;
79     }
80     uint32* First() {
81     return opnum > 0 ? & ops[0] : NULL;
82     }
83    
84     uint32* Next(uint32 *op) {
85     // assumes flat address space !
86     if (&ops[0] <= op && op < &ops[opnum-1])
87     return op+1;
88     else
89     return 0;
90     }
91    
92     int More(uint32 op, uint32 k) {
93     if (op == EDIT_OP_ERR) {
94     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
95     return -1;
96     }
97    
98     if (GX_EDITOP_GET(oplast) == op) {
99     uint32 l=ops[opnum-1];
100     ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
101     (GX_EDITOP_VAL(l) + k));
102     }
103     else {
104     Put(op, k);
105     }
106    
107     return 0;
108     }
109    
110     GXEditScript* Append(GXEditScript *et) {
111     uint32 *op;
112     for (op = et->First(); op; op = et->Next(op))
113     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
114     return this;
115     }
116    
117     int opDel(uint32 k) {
118     return More(EDIT_OP_DEL, k);
119     }
120     int opIns(uint32 k) {
121     return More(EDIT_OP_INS, k);
122     }
123     int opRep(uint32 k) {
124     return More(EDIT_OP_REP, k);
125     }
126    
127     GXEditScript *reverse() {
128     const uint32 mid = opnum/2;
129     const uint32 end = opnum-1;
130     for (uint32 i = 0; i < mid; ++i) {
131     const uint32 t = ops[i];
132     ops[i] = ops[end-i];
133     ops[end-i] = t;
134     }
135     return this;
136     }
137     };
138    
139    
140     struct GX3Val {
141 gpertea 81 int I, C, D;
142 gpertea 74 };
143    
144 gpertea 81 // ----- pool allocator -----
145 gpertea 74 struct GXAlnSpace {
146     GX3Val* space_array;
147     int used, size;
148     GXAlnSpace *next;
149     //methods
150     GXAlnSpace() {
151     uint32 amount=MAX_SPACE;
152     GCALLOC(space_array, sizeof(GX3Val)*amount);
153     if (space_array == NULL) {
154     GError("Failed to allocated GXAlnSpace for greedy extension!\n");
155     return;
156     }
157     used = 0;
158     size = amount;
159     next = NULL;
160     }
161     void refresh() {
162     GXAlnSpace* sp=this;
163     while (sp) {
164     sp->used = 0;
165     sp = sp->next;
166     }
167     }
168     ~GXAlnSpace() {
169     GXAlnSpace* next_sp;
170     GXAlnSpace* sp=this->next;
171     while (sp) {
172     next_sp = sp->next;
173     GFREE(sp->space_array);
174     delete sp;
175     sp = next_sp;
176     }
177 gpertea 79 GFREE(space_array);
178 gpertea 74 }
179 gpertea 79
180 gpertea 74 GX3Val* getSpace(int amount) {
181     GX3Val* v;
182     if (amount < 0) return NULL;
183     GXAlnSpace* S=this;
184     while (used+amount > S->size) {
185     if (next == NULL) {
186     next=new GXAlnSpace();
187     }
188     S = S->next;
189     }
190     v = S->space_array+S->used;
191     S->used += amount;
192     return v;
193     }
194     };
195    
196 gpertea 79 #define ERROR_FRACTION 6 // 1/this : the higher it is, the smaller the diagonal band
197    
198 gpertea 74 struct GXAlnMem { //GreedyAlignMem
199 gpertea 81 int** flast_d; //flast_d[][]
200 gpertea 74 int* max_row_free;
201     int* uplow_free;
202     GXAlnSpace* space;
203     int max_d;
204     GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
205     flast_d=NULL;
206     max_row_free=NULL;
207     max_d=0;
208     uplow_free=NULL;
209     space=NULL;
210     max_d = (int)(max_len / ERROR_FRACTION + 1);
211     int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
212 gpertea 79 GCALLOC(flast_d, (max_d+2)* sizeof(int*));
213 gpertea 74 if (flast_d == NULL) {
214     GError("Error: cannot allocate GXAlnMem structure!\n");
215     }
216     flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
217     if (flast_d[0] == NULL) {
218     GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
219     }
220     flast_d[1] = flast_d[0] + max_d + max_d + 6;
221     uplow_free = NULL;
222     max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
223     space = new GXAlnSpace();
224     }
225     ~GXAlnMem() {
226     delete space;
227     GFREE(max_row_free);
228 gpertea 79 GFREE(flast_d[0]);
229     GFREE(flast_d);
230 gpertea 74 }
231     };
232    
233     int GXGreedyAlign(const char* s1, int len1,
234     const char* s2, int len2,
235     bool reverse, int xdrop_threshold,
236     int match_cost, int mismatch_cost,
237     int* e1, int* e2,
238     GXAlnMem* abmp, GXEditScript *S=NULL);
239    
240     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
241     #define GAPALIGN_INS ((unsigned char)1)
242     #define GAPALIGN_DEL ((unsigned char)2)
243     #define GAPALIGN_DECLINE ((unsigned char)3)
244    
245     struct GapXEditScript {
246     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
247     int num; // Number of operations
248     GapXEditScript* next;
249     GapXEditScript() {
250     op_type=0;
251     num=0;
252     next=NULL;
253     }
254     void print();
255     };
256    
257     class CSeqGap { //
258     public:
259     int offset;
260     int len;
261     CSeqGap(int gofs=0,int glen=1) {
262     offset=gofs;
263     len=glen;
264     }
265     };
266    
267     class CAlnGapInfo {
268     int a_ofs; //alignment start on seq a (0 based)
269     int b_ofs; //alignment start on seq b (0 based)
270     int a_len; //length of alignment on seq a
271     int b_len; //length of alignment on seq b
272     public:
273     GVec<CSeqGap> a_gaps;
274     GVec<CSeqGap> b_gaps;
275     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
276     a_ofs=astart;
277     b_ofs=bstart;
278     a_len=0;
279     b_len=0;
280     for (uint32 i=0; i<ed_script->opnum; i++) {
281     int num=((ed_script->ops[i]) >> 2);
282     char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
283     if (op_type == 3 || op_type < 0 )
284     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
285     CSeqGap gap;
286     switch (op_type) {
287     case GAPALIGN_SUB: a_len+=num;
288     b_len+=num;
289     break;
290     case GAPALIGN_INS: a_len+=num;
291     gap.offset=b_ofs+b_len;
292     gap.len=num;
293     b_gaps.Add(gap);
294     break;
295     case GAPALIGN_DEL: b_len+=num;
296     gap.offset=a_ofs+a_len;
297     gap.len=num;
298     a_gaps.Add(gap);
299     break;
300     }
301     }
302     }
303     #ifdef GDEBUG
304     void printAlignment(FILE* f, const char* sa, const char* sb) {
305     //print seq A
306     char al[1024]; //display buffer for seq A
307     int ap=0; //index in al[] for current character printed
308     int g=0;
309     int aend=a_ofs+a_len;
310     if (a_ofs<b_ofs) {
311     for (int i=0;i<b_ofs-a_ofs;i++) {
312     fprintf(f, " ");
313     al[++ap]=' ';
314     }
315     }
316     for (int i=0;i<aend;i++) {
317     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
318     for (int j=0;j<a_gaps[g].len;j++) {
319     fprintf(f, "-");
320     al[++ap]='-';
321     }
322     g++;
323     }
324     if (i==a_ofs) color_bg(c_blue,f);
325     fprintf(f, "%c", sa[i]);
326     al[++ap]=sa[i];
327     }
328     color_reset(f);
329     fprintf(f, &sa[aend]);
330     fprintf(f, "\n");
331     //print seq B
332     ap=0;
333     g=0;
334     int bend=b_ofs+b_len;
335     if (a_ofs>b_ofs) {
336     for (int i=0;i<a_ofs-b_ofs;i++) {
337     fprintf(f, " ");
338     ap++;
339     }
340     }
341     for (int i=0;i<b_ofs;i++) {
342     fprintf(f, "%c", sb[i]);
343     ap++;
344     }
345     for (int i=b_ofs;i<bend;i++) {
346     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
347     for (int j=0;j<b_gaps[g].len;j++) {
348     fprintf(f, "-");
349     ap++;
350     }
351     g++;
352     }
353     if (i==b_ofs) color_bg(c_blue,f);
354     ap++;
355     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
356     if (mismatch) color_bg(c_red,f);
357     fprintf(f, "%c", sb[i]);
358     if (mismatch) color_bg(c_blue,f);
359     }
360     color_reset(f);
361     fprintf(f, &sb[bend]);
362     fprintf(f, "\n");
363     }
364     #endif
365     };
366    
367    
368     struct GXAlnInfo {
369     const char *qseq;
370     int ql,qr;
371     const char *sseq;
372     int sl,sr;
373     int score;
374     double pid;
375     GXEditScript* editscript;
376     GXAlnMem* abmp;
377     CAlnGapInfo* gapinfo;
378     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
379     int sc=0, double percid=0) {
380     qseq=q;
381     sseq=s;
382     ql=q_l;
383     qr=q_r;
384     sl=s_l;
385     sr=s_r;
386     score=sc;
387     pid=percid;
388     //editscript=NULL;
389     gapinfo=NULL;
390     }
391     ~GXAlnInfo() {
392     delete editscript;
393     delete abmp;
394     delete gapinfo;
395     }
396 gpertea 78 bool operator<(GXAlnInfo& d) {
397     return ((score==d.score)? pid>d.pid : score>d.score);
398     }
399     bool operator>(GXAlnInfo& d) {
400     return ((score==d.score)? pid<d.pid : score<d.score);
401     }
402     bool operator==(GXAlnInfo& d) {
403     return (score==d.score && pid==d.pid);
404     }
405    
406 gpertea 74 };
407    
408    
409    
410     struct GXSeed {
411 gpertea 77 int b_ofs; //0-based coordinate on seq b (x coordinate)
412     int a_ofs; //0-based coordinate on seq a (y coordinate)
413 gpertea 74 int len; //length of exact match after extension
414     bool operator<(GXSeed& d){
415 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
416 gpertea 74 }
417     bool operator>(GXSeed& d){
418 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
419 gpertea 74 }
420     bool operator==(GXSeed& d){
421 gpertea 77 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
422 gpertea 74 }
423     GXSeed(int aofs=0, int bofs=0, int l=4) {
424     a_ofs=aofs;
425     b_ofs=bofs;
426     len=l;
427     }
428     };
429    
430 gpertea 81 int cmpSeedDiag(const pointer p1, const pointer p2);
431     //seeds are "equal" if they're on the same diagonal (for selection purposes only)
432 gpertea 78
433 gpertea 81 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
434     //among seeds with same length, prefer those closer to the left end of the read (seq_b)
435    
436 gpertea 77 struct GXBand {
437     //bundle of seed matches on 3 adjacent diagonals
438     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
439     //seeds for this, and diag+1 and diag+2 are stored here
440     int min_a, max_a; //maximal coordinates of the bundle
441     int min_b, max_b;
442     GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
443     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
444 gpertea 78 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
445 gpertea 77 diag=start_diag;
446     min_a=MAX_INT;
447     min_b=MAX_INT;
448     max_a=0;
449     max_b=0;
450     score=0;
451 gpertea 78 if (seed!=NULL) addSeed(seed);
452     }
453 gpertea 77 void addSeed(GXSeed* seed) {
454     seeds.Add(seed);
455     score+=seed->len;
456     //if (diag<0) diag=seed->diag; //should NOT be done like this
457     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
458     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
459     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
460     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
461     }
462     void finalize() {
463 gpertea 78 //!! to be called only AFTER all seeds have been added
464     // seeds are sorted by b_ofs
465     //penalize seed gaps and overlaps on b sequence
466 gpertea 77 for (int i=1;i<seeds.Count();i++) {
467     GXSeed& sprev=*seeds[i-1];
468     GXSeed& scur=*seeds[i];
469     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
470     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
471     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
472     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
473     int max_gap=b_gap;
474     int min_gap=a_gap;
475     if (min_gap>max_gap) swap(max_gap, min_gap);
476 gpertea 78 if (min_gap<0) { //overlap
477     if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
478     else score+=min_gap;
479 gpertea 77 }
480 gpertea 78 else { //gap
481     score-=max_gap;
482 gpertea 77 }
483 gpertea 78 }//for each seed
484 gpertea 77 }
485 gpertea 78
486 gpertea 77 //bands will be sorted by decreasing score eventually, after all seeds are added
487     bool operator<(GXBand& d){
488     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
489     }
490     bool operator>(GXBand& d){
491     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
492     }
493     bool operator==(GXBand& d){
494     return (score==d.score && seeds.Count()==d.seeds.Count());
495     }
496    
497     };
498    
499     class GXBandSet:public GList<GXBand> {
500     public:
501     int idxoffset; //global anti-diagonal->index offset (a_len-1)
502     //used to convert a diagonal to an index
503     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
504     //hence offset is a_len-1
505     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
506     return Get(diag+idxoffset);
507     }
508     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
509     return Get(b_ofs-a_ofs+idxoffset);
510     }
511     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
512     idxoffset=a_len-1;
513     //diag will range from -a_len+1 to b_len-1, so after adjustment
514     //by idxoffset we get a max of a_len+b_len-2
515     int bcount=a_len+b_len-1;
516     for (int i=0;i<bcount;i++)
517 gpertea 78 this->Add(new GXBand(i-idxoffset));
518     //unsorted, this should set fList[i]
519 gpertea 77 }
520     void addSeed(GXSeed* seed) {
521     //MUST be unsorted !!!
522     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
523     fList[idx]->addSeed(seed);
524     if (idx>0) fList[idx-1]->addSeed(seed);
525     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
526     }
527     };
528    
529     GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
530 gpertea 74 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
531    
532     void printEditScript(GXEditScript* ed_script);
533    
534 gpertea 81
535     // reward MUST be >1, always
536 gpertea 74 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
537     const char* s_seq, int s_alnstart, int s_max,
538 gpertea 81 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
539 gpertea 74 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
540 gpertea 81 bool editscript=false, int reward=2, int penalty=-2, int xdrop=14);
541 gpertea 74
542     #endif