ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 78
Committed: Wed Sep 28 03:20:10 2011 UTC (9 years, 7 months ago) by gpertea
File size: 15310 byte(s)
Log Message:
3' match seems to work, needs a few refinements

Line User Rev File contents
1 gpertea 74 #ifndef _GXALIGN_H
2    
3     #include "GBase.h"
4     #include "GList.hh"
5    
6     #define GDEBUG 1
7    
8     #define ERROR_FRACTION 6 // 1/this : the higher it is, the smaller the diagonal band
9     #define MAX_SPACE 1000000
10     #define LARGE 100000000
11    
12     enum {
13     EDIT_OP_MASK = 0x3,
14     EDIT_OP_ERR = 0x0,
15     EDIT_OP_INS = 0x1,
16     EDIT_OP_DEL = 0x2,
17     EDIT_OP_REP = 0x3
18     };
19    
20     #define GX_EDITOP_VAL(op) ((op) >> 2)
21     #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
22     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
23    
24     #ifdef GDEBUG
25     enum {c_black=0,
26     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
27     };
28    
29     void color_fg(int c, FILE* f=stderr);
30     void color_bg(int c, FILE* f=stderr);
31     void color_resetfg(FILE* f=stderr);
32     void color_resetbg(FILE* f=stderr);
33     void color_reset(FILE* f=stderr);
34     void color_normal(FILE* f=stderr);
35     #endif
36    
37     struct GXEditScript{
38     uint32 *ops; // array of edit operations
39     uint32 opsize, opnum; // size of allocation, number in use
40     uint32 oplast; // most recent operation added
41     //methods
42    
43     GXEditScript() {
44     init();
45     }
46     ~GXEditScript() {
47     GFREE(ops);
48     }
49     void init() {
50     ops = NULL;
51     opsize = 0;
52     opnum = 0;
53     oplast = 0;
54     getReady(8);
55     }
56    
57     int getReady(uint32 n) {
58     uint32 m = n + n/2;
59     if (opsize <= n) {
60     GREALLOC(ops, m*sizeof(uint32));
61     opsize = m;
62     }
63     return 1;
64     }
65    
66     int getReady2(uint32 n) {
67     if (opsize - opnum <= n)
68     return getReady(n + opnum);
69     return 1;
70     }
71    
72     int Put(uint32 op, uint32 n) {
73     if (!getReady2(2))
74     return 0;
75     oplast = op;
76     ops[opnum] = GX_EDITOP_CONS(op, n);
77     opnum += 1;
78     ops[opnum] = 0; // sentinel
79     return 1;
80     }
81     uint32* First() {
82     return opnum > 0 ? & ops[0] : NULL;
83     }
84    
85     uint32* Next(uint32 *op) {
86     // assumes flat address space !
87     if (&ops[0] <= op && op < &ops[opnum-1])
88     return op+1;
89     else
90     return 0;
91     }
92    
93     int More(uint32 op, uint32 k) {
94     if (op == EDIT_OP_ERR) {
95     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
96     return -1;
97     }
98    
99     if (GX_EDITOP_GET(oplast) == op) {
100     uint32 l=ops[opnum-1];
101     ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102     (GX_EDITOP_VAL(l) + k));
103     }
104     else {
105     Put(op, k);
106     }
107    
108     return 0;
109     }
110    
111     GXEditScript* Append(GXEditScript *et) {
112     uint32 *op;
113     for (op = et->First(); op; op = et->Next(op))
114     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
115     return this;
116     }
117    
118     int opDel(uint32 k) {
119     return More(EDIT_OP_DEL, k);
120     }
121     int opIns(uint32 k) {
122     return More(EDIT_OP_INS, k);
123     }
124     int opRep(uint32 k) {
125     return More(EDIT_OP_REP, k);
126     }
127    
128     GXEditScript *reverse() {
129     const uint32 mid = opnum/2;
130     const uint32 end = opnum-1;
131     for (uint32 i = 0; i < mid; ++i) {
132     const uint32 t = ops[i];
133     ops[i] = ops[end-i];
134     ops[end-i] = t;
135     }
136     return this;
137     }
138     };
139    
140    
141     /* ----- pool allocator ----- */
142     struct GX3Val {
143     int I, C, D;
144     };
145    
146     struct GXAlnSpace {
147     GX3Val* space_array;
148     int used, size;
149     GXAlnSpace *next;
150     //methods
151     GXAlnSpace() {
152     uint32 amount=MAX_SPACE;
153     GCALLOC(space_array, sizeof(GX3Val)*amount);
154     if (space_array == NULL) {
155     GError("Failed to allocated GXAlnSpace for greedy extension!\n");
156     return;
157     }
158     used = 0;
159     size = amount;
160     next = NULL;
161     }
162     void refresh() {
163     GXAlnSpace* sp=this;
164     while (sp) {
165     sp->used = 0;
166     sp = sp->next;
167     }
168     }
169     ~GXAlnSpace() {
170     GXAlnSpace* next_sp;
171     GXAlnSpace* sp=this->next;
172     while (sp) {
173     next_sp = sp->next;
174     GFREE(sp->space_array);
175     delete sp;
176     sp = next_sp;
177     }
178     }
179     GX3Val* getSpace(int amount) {
180     GX3Val* v;
181     if (amount < 0) return NULL;
182     GXAlnSpace* S=this;
183     while (used+amount > S->size) {
184     if (next == NULL) {
185     next=new GXAlnSpace();
186     }
187     S = S->next;
188     }
189     v = S->space_array+S->used;
190     S->used += amount;
191     return v;
192     }
193     };
194    
195     struct GXAlnMem { //GreedyAlignMem
196     int** flast_d;
197     int* max_row_free;
198     int* uplow_free;
199     GXAlnSpace* space;
200     int max_d;
201     GXAlnMem(int max_len, int reward, int penalty, int Xdrop) {
202     flast_d=NULL;
203     max_row_free=NULL;
204     max_d=0;
205     uplow_free=NULL;
206     space=NULL;
207     max_d = (int)(max_len / ERROR_FRACTION + 1);
208     int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
209     GCALLOC(flast_d, (max_d+2)* sizeof(int));
210     if (flast_d == NULL) {
211     GError("Error: cannot allocate GXAlnMem structure!\n");
212     }
213     flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
214     if (flast_d[0] == NULL) {
215     GError("Error: failed to allocate flast_d[0] in GXAlnMem!\n");
216     }
217     flast_d[1] = flast_d[0] + max_d + max_d + 6;
218     uplow_free = NULL;
219     max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
220     space = new GXAlnSpace();
221     }
222     ~GXAlnMem() {
223     delete space;
224     GFREE(max_row_free);
225     //GFREE(flast_d[0]); // FIXME
226     //TODO: check with valgrind
227     //for (int i=0;i<max_d+2;i++) {
228     // GFREE(flast_d[i]);
229     // }
230     //GFREE(flast_d);
231     }
232     };
233    
234     int GXGreedyAlign(const char* s1, int len1,
235     const char* s2, int len2,
236     bool reverse, int xdrop_threshold,
237     int match_cost, int mismatch_cost,
238     int* e1, int* e2,
239     GXAlnMem* abmp, GXEditScript *S=NULL);
240    
241     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
242     #define GAPALIGN_INS ((unsigned char)1)
243     #define GAPALIGN_DEL ((unsigned char)2)
244     #define GAPALIGN_DECLINE ((unsigned char)3)
245    
246     struct GapXEditScript {
247     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
248     int num; // Number of operations
249     GapXEditScript* next;
250     GapXEditScript() {
251     op_type=0;
252     num=0;
253     next=NULL;
254     }
255     void print();
256     };
257    
258     class CSeqGap { //
259     public:
260     int offset;
261     int len;
262     CSeqGap(int gofs=0,int glen=1) {
263     offset=gofs;
264     len=glen;
265     }
266     };
267    
268     class CAlnGapInfo {
269     int a_ofs; //alignment start on seq a (0 based)
270     int b_ofs; //alignment start on seq b (0 based)
271     int a_len; //length of alignment on seq a
272     int b_len; //length of alignment on seq b
273     public:
274     GVec<CSeqGap> a_gaps;
275     GVec<CSeqGap> b_gaps;
276     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
277     a_ofs=astart;
278     b_ofs=bstart;
279     a_len=0;
280     b_len=0;
281     for (uint32 i=0; i<ed_script->opnum; i++) {
282     int num=((ed_script->ops[i]) >> 2);
283     char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
284     if (op_type == 3 || op_type < 0 )
285     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
286     CSeqGap gap;
287     switch (op_type) {
288     case GAPALIGN_SUB: a_len+=num;
289     b_len+=num;
290     break;
291     case GAPALIGN_INS: a_len+=num;
292     gap.offset=b_ofs+b_len;
293     gap.len=num;
294     b_gaps.Add(gap);
295     break;
296     case GAPALIGN_DEL: b_len+=num;
297     gap.offset=a_ofs+a_len;
298     gap.len=num;
299     a_gaps.Add(gap);
300     break;
301     }
302     }
303     }
304     #ifdef GDEBUG
305     void printAlignment(FILE* f, const char* sa, const char* sb) {
306     //print seq A
307     char al[1024]; //display buffer for seq A
308     int ap=0; //index in al[] for current character printed
309     int g=0;
310     int aend=a_ofs+a_len;
311     if (a_ofs<b_ofs) {
312     for (int i=0;i<b_ofs-a_ofs;i++) {
313     fprintf(f, " ");
314     al[++ap]=' ';
315     }
316     }
317     for (int i=0;i<aend;i++) {
318     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
319     for (int j=0;j<a_gaps[g].len;j++) {
320     fprintf(f, "-");
321     al[++ap]='-';
322     }
323     g++;
324     }
325     if (i==a_ofs) color_bg(c_blue,f);
326     fprintf(f, "%c", sa[i]);
327     al[++ap]=sa[i];
328     }
329     color_reset(f);
330     fprintf(f, &sa[aend]);
331     fprintf(f, "\n");
332     //print seq B
333     ap=0;
334     g=0;
335     int bend=b_ofs+b_len;
336     if (a_ofs>b_ofs) {
337     for (int i=0;i<a_ofs-b_ofs;i++) {
338     fprintf(f, " ");
339     ap++;
340     }
341     }
342     for (int i=0;i<b_ofs;i++) {
343     fprintf(f, "%c", sb[i]);
344     ap++;
345     }
346     for (int i=b_ofs;i<bend;i++) {
347     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
348     for (int j=0;j<b_gaps[g].len;j++) {
349     fprintf(f, "-");
350     ap++;
351     }
352     g++;
353     }
354     if (i==b_ofs) color_bg(c_blue,f);
355     ap++;
356     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
357     if (mismatch) color_bg(c_red,f);
358     fprintf(f, "%c", sb[i]);
359     if (mismatch) color_bg(c_blue,f);
360     }
361     color_reset(f);
362     fprintf(f, &sb[bend]);
363     fprintf(f, "\n");
364     }
365     #endif
366     };
367    
368    
369     struct GXAlnInfo {
370     const char *qseq;
371     int ql,qr;
372     const char *sseq;
373     int sl,sr;
374     int score;
375     double pid;
376     GXEditScript* editscript;
377     GXAlnMem* abmp;
378     CAlnGapInfo* gapinfo;
379     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
380     int sc=0, double percid=0) {
381     qseq=q;
382     sseq=s;
383     ql=q_l;
384     qr=q_r;
385     sl=s_l;
386     sr=s_r;
387     score=sc;
388     pid=percid;
389     //editscript=NULL;
390     gapinfo=NULL;
391     }
392     ~GXAlnInfo() {
393     delete editscript;
394     delete abmp;
395     delete gapinfo;
396     }
397 gpertea 78 bool operator<(GXAlnInfo& d) {
398     return ((score==d.score)? pid>d.pid : score>d.score);
399     }
400     bool operator>(GXAlnInfo& d) {
401     return ((score==d.score)? pid<d.pid : score<d.score);
402     }
403     bool operator==(GXAlnInfo& d) {
404     return (score==d.score && pid==d.pid);
405     }
406    
407 gpertea 74 };
408    
409    
410    
411     struct GXSeed {
412 gpertea 77 int b_ofs; //0-based coordinate on seq b (x coordinate)
413     int a_ofs; //0-based coordinate on seq a (y coordinate)
414 gpertea 74 int len; //length of exact match after extension
415 gpertea 77 bool edited; //if it's an edited (cut) seed
416 gpertea 74 bool operator<(GXSeed& d){
417 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
418 gpertea 74 }
419     bool operator>(GXSeed& d){
420 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
421 gpertea 74 }
422     bool operator==(GXSeed& d){
423 gpertea 77 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
424 gpertea 74 }
425     GXSeed(int aofs=0, int bofs=0, int l=4) {
426     a_ofs=aofs;
427     b_ofs=bofs;
428 gpertea 77 edited=false;
429 gpertea 74 len=l;
430     }
431     };
432    
433 gpertea 78 int cmpSeedLen(const pointer s1, const pointer s2);
434    
435 gpertea 77 struct GXBand {
436     //bundle of seed matches on 3 adjacent diagonals
437     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
438     //seeds for this, and diag+1 and diag+2 are stored here
439     int min_a, max_a; //maximal coordinates of the bundle
440     int min_b, max_b;
441     GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
442     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
443 gpertea 78 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
444 gpertea 77 diag=start_diag;
445     min_a=MAX_INT;
446     min_b=MAX_INT;
447     max_a=0;
448     max_b=0;
449     score=0;
450 gpertea 78 if (seed!=NULL) addSeed(seed);
451     }
452 gpertea 77 void addSeed(GXSeed* seed) {
453     seeds.Add(seed);
454     score+=seed->len;
455     //if (diag<0) diag=seed->diag; //should NOT be done like this
456     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
457     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
458     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
459     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
460     }
461     void finalize() {
462 gpertea 78 //!! to be called only AFTER all seeds have been added
463     // seeds are sorted by b_ofs
464     //penalize seed gaps and overlaps on b sequence
465 gpertea 77 for (int i=1;i<seeds.Count();i++) {
466     GXSeed& sprev=*seeds[i-1];
467     GXSeed& scur=*seeds[i];
468     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
469     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
470     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
471     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
472     int max_gap=b_gap;
473     int min_gap=a_gap;
474     if (min_gap>max_gap) swap(max_gap, min_gap);
475 gpertea 78 if (min_gap<0) { //overlap
476     if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
477     else score+=min_gap;
478 gpertea 77 }
479 gpertea 78 else { //gap
480     score-=max_gap;
481 gpertea 77 }
482 gpertea 78 }//for each seed
483 gpertea 77 }
484 gpertea 78
485 gpertea 77 //bands will be sorted by decreasing score eventually, after all seeds are added
486     bool operator<(GXBand& d){
487     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
488     }
489     bool operator>(GXBand& d){
490     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
491     }
492     bool operator==(GXBand& d){
493     return (score==d.score && seeds.Count()==d.seeds.Count());
494     }
495    
496     };
497    
498     class GXBandSet:public GList<GXBand> {
499     public:
500     int idxoffset; //global anti-diagonal->index offset (a_len-1)
501     //used to convert a diagonal to an index
502     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
503     //hence offset is a_len-1
504     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
505     return Get(diag+idxoffset);
506     }
507     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
508     return Get(b_ofs-a_ofs+idxoffset);
509     }
510     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
511     idxoffset=a_len-1;
512     //diag will range from -a_len+1 to b_len-1, so after adjustment
513     //by idxoffset we get a max of a_len+b_len-2
514     int bcount=a_len+b_len-1;
515     for (int i=0;i<bcount;i++)
516 gpertea 78 this->Add(new GXBand(i-idxoffset));
517     //unsorted, this should set fList[i]
518 gpertea 77 }
519     void addSeed(GXSeed* seed) {
520     //MUST be unsorted !!!
521     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
522     fList[idx]->addSeed(seed);
523     if (idx>0) fList[idx-1]->addSeed(seed);
524     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
525     }
526     };
527    
528     GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
529 gpertea 74 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
530    
531     void printEditScript(GXEditScript* ed_script);
532    
533     GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
534     const char* s_seq, int s_alnstart, int s_max,
535     bool editscript=false, int reward=2, int penalty=-3, int xdrop=22);
536     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
537     bool editscript=false, int reward=2, int penalty=-3, int xdrop=22);
538    
539     #endif