ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.h
Revision: 86
Committed: Fri Sep 30 21:38:39 2011 UTC (9 years, 7 months ago) by gpertea
File size: 18013 byte(s)
Log Message:
fixed a few more subtle bugs

Line User Rev File contents
1 gpertea 74 #ifndef _GXALIGN_H
2    
3     #include "GBase.h"
4     #include "GList.hh"
5    
6     #define GDEBUG 1
7    
8     enum {
9     EDIT_OP_MASK = 0x3,
10     EDIT_OP_ERR = 0x0,
11     EDIT_OP_INS = 0x1,
12     EDIT_OP_DEL = 0x2,
13     EDIT_OP_REP = 0x3
14     };
15    
16     #define GX_EDITOP_VAL(op) ((op) >> 2)
17     #define GX_EDITOP_GET(op) ((op) & EDIT_OP_MASK)
18     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & EDIT_OP_MASK))
19    
20     #ifdef GDEBUG
21     enum {c_black=0,
22     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
23     };
24    
25     void color_fg(int c, FILE* f=stderr);
26     void color_bg(int c, FILE* f=stderr);
27     void color_resetfg(FILE* f=stderr);
28     void color_resetbg(FILE* f=stderr);
29     void color_reset(FILE* f=stderr);
30     void color_normal(FILE* f=stderr);
31     #endif
32    
33     struct GXEditScript{
34     uint32 *ops; // array of edit operations
35     uint32 opsize, opnum; // size of allocation, number in use
36     uint32 oplast; // most recent operation added
37     //methods
38    
39     GXEditScript() {
40     init();
41     }
42     ~GXEditScript() {
43     GFREE(ops);
44     }
45     void init() {
46     ops = NULL;
47     opsize = 0;
48     opnum = 0;
49     oplast = 0;
50     getReady(8);
51     }
52    
53     int getReady(uint32 n) {
54     uint32 m = n + n/2;
55     if (opsize <= n) {
56     GREALLOC(ops, m*sizeof(uint32));
57     opsize = m;
58     }
59     return 1;
60     }
61    
62     int getReady2(uint32 n) {
63     if (opsize - opnum <= n)
64     return getReady(n + opnum);
65     return 1;
66     }
67    
68     int Put(uint32 op, uint32 n) {
69     if (!getReady2(2))
70     return 0;
71     oplast = op;
72     ops[opnum] = GX_EDITOP_CONS(op, n);
73     opnum += 1;
74     ops[opnum] = 0; // sentinel
75     return 1;
76     }
77     uint32* First() {
78     return opnum > 0 ? & ops[0] : NULL;
79     }
80    
81     uint32* Next(uint32 *op) {
82     // assumes flat address space !
83     if (&ops[0] <= op && op < &ops[opnum-1])
84     return op+1;
85     else
86     return 0;
87     }
88    
89     int More(uint32 op, uint32 k) {
90     if (op == EDIT_OP_ERR) {
91     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
92     return -1;
93     }
94    
95     if (GX_EDITOP_GET(oplast) == op) {
96     uint32 l=ops[opnum-1];
97     ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
98     (GX_EDITOP_VAL(l) + k));
99     }
100     else {
101     Put(op, k);
102     }
103    
104     return 0;
105     }
106    
107     GXEditScript* Append(GXEditScript *et) {
108     uint32 *op;
109     for (op = et->First(); op; op = et->Next(op))
110     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
111     return this;
112     }
113    
114     int opDel(uint32 k) {
115     return More(EDIT_OP_DEL, k);
116     }
117     int opIns(uint32 k) {
118     return More(EDIT_OP_INS, k);
119     }
120     int opRep(uint32 k) {
121     return More(EDIT_OP_REP, k);
122     }
123    
124     GXEditScript *reverse() {
125     const uint32 mid = opnum/2;
126     const uint32 end = opnum-1;
127     for (uint32 i = 0; i < mid; ++i) {
128     const uint32 t = ops[i];
129     ops[i] = ops[end-i];
130     ops[end-i] = t;
131     }
132     return this;
133     }
134     };
135    
136    
137 gpertea 85 /** Bookkeeping structure for greedy alignment. When aligning
138     two sequences, the members of this structure store the
139     largest offset into the second sequence that leads to a
140     high-scoring alignment for a given start point */
141     struct SGreedyOffset {
142     int insert_off; // Best offset for a path ending in an insertion
143     int match_off; // Best offset for a path ending in a match
144     int delete_off; // Best offset for a path ending in a deletion
145     };
146    
147 gpertea 81 // ----- pool allocator -----
148 gpertea 82 // works as a linked list of allocated memory blocks
149     struct GXMemPool {
150 gpertea 85 SGreedyOffset* memblock;
151 gpertea 74 int used, size;
152 gpertea 82 GXMemPool *next;
153 gpertea 85 static int kMinSpace;
154 gpertea 74 //methods
155 gpertea 85 GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
156     num_offsp=GMAX(kMinSpace, num_offsp);
157     GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
158 gpertea 82 if (memblock == NULL) {
159 gpertea 85 GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
160 gpertea 74 return;
161     }
162 gpertea 85 used = 0;
163     size = num_offsp;
164 gpertea 74 next = NULL;
165     }
166 gpertea 85
167 gpertea 74 void refresh() {
168 gpertea 82 GXMemPool* sp=this;
169 gpertea 74 while (sp) {
170     sp->used = 0;
171     sp = sp->next;
172     }
173     }
174 gpertea 82 ~GXMemPool() {
175     GXMemPool* next_sp;
176     GXMemPool* sp=this->next;
177 gpertea 74 while (sp) {
178     next_sp = sp->next;
179 gpertea 82 GFREE(sp->memblock);
180 gpertea 74 delete sp;
181     sp = next_sp;
182     }
183 gpertea 82 GFREE(memblock);
184 gpertea 74 }
185 gpertea 79
186 gpertea 85 SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
187 gpertea 82 //can use the first found memory block with enough room,
188     // or allocate a new large block
189 gpertea 85 SGreedyOffset* v;
190     if (num_alloc < 0) return NULL;
191     GXMemPool* S=this;
192     while (used+num_alloc > S->size) {
193 gpertea 82 //no room in current block, get a new mem block
194 gpertea 74 if (next == NULL) {
195 gpertea 85 next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
196 gpertea 74 }
197     S = S->next;
198     }
199 gpertea 85 v = S->memblock+S->used;
200     S->used += num_alloc;
201     //align to first 8-byte boundary
202     int m8 = S->used & 7; //modulo 8
203     if (m8)
204 gpertea 82 S->used += 8 - m8;
205 gpertea 85 return v;
206     }
207    
208     void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
209     return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
210     }
211    
212 gpertea 74 };
213    
214 gpertea 85 #define GREEDY_MAX_COST_FRACTION 5
215     /* (was 2) sequence_length / (this number) is a measure of how hard the
216     alignment code will work to find the optimal alignment; in fact
217     this gives a worst case bound on the number of loop iterations */
218 gpertea 79
219 gpertea 85 #define GREEDY_MAX_COST 1000
220     // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
221     // (should be increased for large sequences)
222    
223     #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
224    
225     // all auxiliary memory needed for the greedy extension algorithm
226     struct SGreedyAlignMem {
227     int** last_seq2_off; // 2-D array of distances
228     int* max_score; // array of maximum scores
229     GXMemPool* space; // local memory pool for SGreedyOffset structs
230    
231     // Allocate memory for the greedy gapped alignment algorithm
232     SGreedyAlignMem(int reward, int penalty, int Xdrop) {
233     int max_d, d_diff;
234     if (penalty<0) penalty=-penalty;
235     if (reward % 2) {
236     //scale params
237     reward = reward << 1;
238     penalty = (penalty << 1);
239     Xdrop = Xdrop<<1;
240     }
241     //if (gap_open == 0 && gap_extend == 0)
242     // gap_extend = (reward >> 1) + penalty;
243     const int max_dbseq_length=255; //adjust this accordingly
244     max_d = GMIN(GREEDY_MAX_COST,
245     (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
246    
247     last_seq2_off=NULL; // 2-D array of distances
248     max_score=NULL; // array of maximum scores
249     space=NULL; // local memory pool for SGreedyOffset structs
250     //if (score_params.gap_open==0 && score_params.gap_extend==0) {
251     //non-affine, simpler Greedy algorithm
252     d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
253     GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
254     if (!last_seq2_off)
255     GError(GX_GALLOC_ERROR);
256     GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
257     //allocates contiguous memory for 2 rows here
258     if (!last_seq2_off[0])
259     GError(GX_GALLOC_ERROR);
260     last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
261    
262     GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
263     space = new GXMemPool();
264     if (!max_score || !space)
265     GError(GX_GALLOC_ERROR);
266     } //consructor
267    
268     ~SGreedyAlignMem() {
269     if (last_seq2_off) {
270     GFREE(last_seq2_off[0]);
271     GFREE(last_seq2_off);
272     }
273     GFREE(max_score);
274     delete space;
275     }
276    
277 gpertea 74 };
278    
279    
280     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
281     #define GAPALIGN_INS ((unsigned char)1)
282     #define GAPALIGN_DEL ((unsigned char)2)
283     #define GAPALIGN_DECLINE ((unsigned char)3)
284    
285     struct GapXEditScript {
286     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
287     int num; // Number of operations
288     GapXEditScript* next;
289     GapXEditScript() {
290     op_type=0;
291     num=0;
292     next=NULL;
293     }
294     void print();
295     };
296    
297     class CSeqGap { //
298     public:
299     int offset;
300     int len;
301     CSeqGap(int gofs=0,int glen=1) {
302     offset=gofs;
303     len=glen;
304     }
305     };
306    
307     class CAlnGapInfo {
308     int a_ofs; //alignment start on seq a (0 based)
309     int b_ofs; //alignment start on seq b (0 based)
310     int a_len; //length of alignment on seq a
311     int b_len; //length of alignment on seq b
312     public:
313     GVec<CSeqGap> a_gaps;
314     GVec<CSeqGap> b_gaps;
315     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
316     a_ofs=astart;
317     b_ofs=bstart;
318     a_len=0;
319 gpertea 82 b_len=0;
320     if (ed_script==NULL) return;
321     for (uint32 i=0; i<ed_script->opnum; i++) {
322     int num=((ed_script->ops[i]) >> 2);
323     char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
324     if (op_type == 3 || op_type < 0 )
325     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
326     CSeqGap gap;
327     switch (op_type) {
328     case GAPALIGN_SUB: a_len+=num;
329     b_len+=num;
330     break;
331     case GAPALIGN_INS: a_len+=num;
332     gap.offset=b_ofs+b_len;
333     gap.len=num;
334     b_gaps.Add(gap);
335     break;
336     case GAPALIGN_DEL: b_len+=num;
337     gap.offset=a_ofs+a_len;
338     gap.len=num;
339     a_gaps.Add(gap);
340     break;
341     }
342     }
343     }
344    
345 gpertea 74 #ifdef GDEBUG
346 gpertea 83 void printAlignment(FILE* f, const char* sa, int sa_len,
347     const char* sb, int sb_len) {
348 gpertea 82 //print seq A
349     char al[1024]; //display buffer for seq A
350     int ap=0; //index in al[] for current character printed
351     int g=0;
352     int aend=a_ofs+a_len;
353     if (a_ofs<b_ofs) {
354     for (int i=0;i<b_ofs-a_ofs;i++) {
355 gpertea 74 fprintf(f, " ");
356     al[++ap]=' ';
357     }
358     }
359     for (int i=0;i<aend;i++) {
360     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
361     for (int j=0;j<a_gaps[g].len;j++) {
362     fprintf(f, "-");
363     al[++ap]='-';
364     }
365     g++;
366     }
367     if (i==a_ofs) color_bg(c_blue,f);
368     fprintf(f, "%c", sa[i]);
369     al[++ap]=sa[i];
370     }
371     color_reset(f);
372 gpertea 83 if (aend<sa_len)
373     fprintf(f, &sa[aend]);
374 gpertea 74 fprintf(f, "\n");
375     //print seq B
376     ap=0;
377     g=0;
378     int bend=b_ofs+b_len;
379     if (a_ofs>b_ofs) {
380     for (int i=0;i<a_ofs-b_ofs;i++) {
381     fprintf(f, " ");
382     ap++;
383     }
384     }
385     for (int i=0;i<b_ofs;i++) {
386     fprintf(f, "%c", sb[i]);
387     ap++;
388     }
389     for (int i=b_ofs;i<bend;i++) {
390     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
391     for (int j=0;j<b_gaps[g].len;j++) {
392     fprintf(f, "-");
393     ap++;
394     }
395     g++;
396     }
397     if (i==b_ofs) color_bg(c_blue,f);
398     ap++;
399     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
400     if (mismatch) color_bg(c_red,f);
401     fprintf(f, "%c", sb[i]);
402     if (mismatch) color_bg(c_blue,f);
403     }
404     color_reset(f);
405 gpertea 83 if (bend<sb_len)
406     fprintf(f, &sb[bend]);
407 gpertea 74 fprintf(f, "\n");
408     }
409     #endif
410     };
411    
412    
413     struct GXAlnInfo {
414     const char *qseq;
415     int ql,qr;
416     const char *sseq;
417     int sl,sr;
418     int score;
419     double pid;
420     GXEditScript* editscript;
421 gpertea 85 //GXAlnMem* abmp;
422     SGreedyAlignMem* alnmem;
423 gpertea 74 CAlnGapInfo* gapinfo;
424     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
425     int sc=0, double percid=0) {
426     qseq=q;
427     sseq=s;
428     ql=q_l;
429     qr=q_r;
430     sl=s_l;
431     sr=s_r;
432     score=sc;
433     pid=percid;
434     //editscript=NULL;
435     gapinfo=NULL;
436     }
437     ~GXAlnInfo() {
438     delete editscript;
439 gpertea 85 delete alnmem;
440 gpertea 74 delete gapinfo;
441     }
442 gpertea 78 bool operator<(GXAlnInfo& d) {
443     return ((score==d.score)? pid>d.pid : score>d.score);
444     }
445     bool operator>(GXAlnInfo& d) {
446     return ((score==d.score)? pid<d.pid : score<d.score);
447     }
448     bool operator==(GXAlnInfo& d) {
449     return (score==d.score && pid==d.pid);
450     }
451    
452 gpertea 74 };
453    
454    
455    
456     struct GXSeed {
457 gpertea 77 int b_ofs; //0-based coordinate on seq b (x coordinate)
458     int a_ofs; //0-based coordinate on seq a (y coordinate)
459 gpertea 74 int len; //length of exact match after extension
460     bool operator<(GXSeed& d){
461 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
462 gpertea 74 }
463     bool operator>(GXSeed& d){
464 gpertea 77 return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
465 gpertea 74 }
466     bool operator==(GXSeed& d){
467 gpertea 77 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
468 gpertea 74 }
469     GXSeed(int aofs=0, int bofs=0, int l=4) {
470     a_ofs=aofs;
471     b_ofs=bofs;
472     len=l;
473     }
474     };
475    
476 gpertea 81 int cmpSeedDiag(const pointer p1, const pointer p2);
477     //seeds are "equal" if they're on the same diagonal (for selection purposes only)
478 gpertea 78
479 gpertea 81 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
480     //among seeds with same length, prefer those closer to the left end of the read (seq_b)
481    
482 gpertea 77 struct GXBand {
483     //bundle of seed matches on 3 adjacent diagonals
484     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
485     //seeds for this, and diag+1 and diag+2 are stored here
486     int min_a, max_a; //maximal coordinates of the bundle
487     int min_b, max_b;
488     GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
489     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
490 gpertea 78 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
491 gpertea 77 diag=start_diag;
492     min_a=MAX_INT;
493     min_b=MAX_INT;
494     max_a=0;
495     max_b=0;
496     score=0;
497 gpertea 78 if (seed!=NULL) addSeed(seed);
498     }
499 gpertea 77 void addSeed(GXSeed* seed) {
500     seeds.Add(seed);
501     score+=seed->len;
502     //if (diag<0) diag=seed->diag; //should NOT be done like this
503     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
504     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
505     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
506     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
507     }
508     void finalize() {
509 gpertea 78 //!! to be called only AFTER all seeds have been added
510     // seeds are sorted by b_ofs
511     //penalize seed gaps and overlaps on b sequence
512 gpertea 77 for (int i=1;i<seeds.Count();i++) {
513     GXSeed& sprev=*seeds[i-1];
514     GXSeed& scur=*seeds[i];
515     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
516     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
517     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
518     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
519     int max_gap=b_gap;
520     int min_gap=a_gap;
521     if (min_gap>max_gap) swap(max_gap, min_gap);
522 gpertea 78 if (min_gap<0) { //overlap
523     if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
524     else score+=min_gap;
525 gpertea 77 }
526 gpertea 78 else { //gap
527     score-=max_gap;
528 gpertea 77 }
529 gpertea 78 }//for each seed
530 gpertea 77 }
531 gpertea 78
532 gpertea 77 //bands will be sorted by decreasing score eventually, after all seeds are added
533 gpertea 85 //more seeds better than one longer seed?
534 gpertea 77 bool operator<(GXBand& d){
535     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
536     }
537     bool operator>(GXBand& d){
538     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
539     }
540     bool operator==(GXBand& d){
541 gpertea 85 //return (score==d.score && seeds.Count()==d.seeds.Count());
542     return (score==d.score && seeds.Count()==d.seeds.Count());
543 gpertea 77 }
544    
545     };
546    
547     class GXBandSet:public GList<GXBand> {
548     public:
549     int idxoffset; //global anti-diagonal->index offset (a_len-1)
550     //used to convert a diagonal to an index
551     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
552     //hence offset is a_len-1
553     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
554     return Get(diag+idxoffset);
555     }
556     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
557     return Get(b_ofs-a_ofs+idxoffset);
558     }
559     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
560     idxoffset=a_len-1;
561     //diag will range from -a_len+1 to b_len-1, so after adjustment
562     //by idxoffset we get a max of a_len+b_len-2
563     int bcount=a_len+b_len-1;
564     for (int i=0;i<bcount;i++)
565 gpertea 78 this->Add(new GXBand(i-idxoffset));
566     //unsorted, this should set fList[i]
567 gpertea 77 }
568     void addSeed(GXSeed* seed) {
569     //MUST be unsorted !!!
570     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
571     fList[idx]->addSeed(seed);
572     if (idx>0) fList[idx-1]->addSeed(seed);
573     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
574     }
575     };
576    
577 gpertea 85
578 gpertea 77 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //overlap of 3' end of seqb
579 gpertea 74 //void collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, const char* seqb); //overlap of 5' end of seqb
580    
581     void printEditScript(GXEditScript* ed_script);
582    
583 gpertea 81
584 gpertea 85 int GXGreedyExtend(const char* seq1, int len1,
585     const char* seq2, int len2,
586     bool reverse, int xdrop_threshold,
587     int match_cost, int mismatch_cost,
588     int& seq1_align_len, int& seq2_align_len,
589     SGreedyAlignMem& aux_data,
590     GXEditScript *edit_block);
591    
592 gpertea 81 // reward MUST be >1, always
593 gpertea 74 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
594     const char* s_seq, int s_alnstart, int s_max,
595 gpertea 86 bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
596 gpertea 74 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
597 gpertea 86 bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
598 gpertea 74
599     #endif