ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
Revision: 99
Committed: Thu Oct 6 20:49:17 2011 UTC (7 years, 7 months ago) by gpertea
File size: 19319 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 93 #ifndef _GALIGNEXTEND_H
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     #include "GBase.h"
7     #include "GList.hh"
8     #include <string.h>
9    
10     //#define GDEBUG 1
11    
12     enum {
13     gxEDIT_OP_MASK = 0x3,
14     gxEDIT_OP_ERR = 0x0,
15     gxEDIT_OP_INS = 0x1,
16     gxEDIT_OP_DEL = 0x2,
17     gxEDIT_OP_REP = 0x3
18     };
19    
20     #define GX_EDITOP_VAL(op) ((op) >> 2)
21     #define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK)
22     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK))
23    
24     #ifdef GDEBUG
25     enum {c_black=0,
26     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
27     };
28    
29     void color_fg(int c, FILE* f=stderr);
30     void color_bg(int c, FILE* f=stderr);
31     void color_resetfg(FILE* f=stderr);
32     void color_resetbg(FILE* f=stderr);
33     void color_reset(FILE* f=stderr);
34     void color_normal(FILE* f=stderr);
35     #endif
36    
37     struct GXEditScript{
38     uint32 *ops; // array of edit operations
39     uint32 opsize, opnum; // size of allocation, number in use
40     uint32 oplast; // most recent operation added
41     //methods
42    
43     GXEditScript() {
44     init();
45     }
46     ~GXEditScript() {
47     GFREE(ops);
48     }
49     void init() {
50     ops = NULL;
51     opsize = 0;
52     opnum = 0;
53     oplast = 0;
54     getReady(8);
55     }
56    
57     int getReady(uint32 n) {
58     uint32 m = n + n/2;
59     if (opsize <= n) {
60     GREALLOC(ops, m*sizeof(uint32));
61     opsize = m;
62     }
63     return 1;
64     }
65    
66     int getReady2(uint32 n) {
67     if (opsize - opnum <= n)
68     return getReady(n + opnum);
69     return 1;
70     }
71    
72     int Put(uint32 op, uint32 n) {
73     if (!getReady2(2))
74     return 0;
75     oplast = op;
76     ops[opnum] = GX_EDITOP_CONS(op, n);
77     opnum += 1;
78     ops[opnum] = 0; // sentinel
79     return 1;
80     }
81     uint32* First() {
82     return opnum > 0 ? & ops[0] : NULL;
83     }
84    
85     uint32* Next(uint32 *op) {
86     // assumes flat address space !
87     if (&ops[0] <= op && op < &ops[opnum-1])
88     return op+1;
89     else
90     return 0;
91     }
92    
93     int More(uint32 op, uint32 k) {
94     if (op == gxEDIT_OP_ERR) {
95     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
96     return -1;
97     }
98    
99     if (GX_EDITOP_GET(oplast) == op) {
100     uint32 l=ops[opnum-1];
101     ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102     (GX_EDITOP_VAL(l) + k));
103     }
104     else {
105     Put(op, k);
106     }
107    
108     return 0;
109     }
110    
111     GXEditScript* Append(GXEditScript *et) {
112     uint32 *op;
113     for (op = et->First(); op; op = et->Next(op))
114     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
115     return this;
116     }
117    
118     int opDel(uint32 k) {
119     return More(gxEDIT_OP_DEL, k);
120     }
121     int opIns(uint32 k) {
122     return More(gxEDIT_OP_INS, k);
123     }
124     int opRep(uint32 k) {
125     return More(gxEDIT_OP_REP, k);
126     }
127    
128     GXEditScript *reverse() {
129     const uint32 mid = opnum/2;
130     const uint32 end = opnum-1;
131     for (uint32 i = 0; i < mid; ++i) {
132     const uint32 t = ops[i];
133     ops[i] = ops[end-i];
134     ops[end-i] = t;
135     }
136     return this;
137     }
138     };
139    
140    
141     /** Bookkeeping structure for greedy alignment. When aligning
142     two sequences, the members of this structure store the
143     largest offset into the second sequence that leads to a
144     high-scoring alignment for a given start point */
145     struct SGreedyOffset {
146     int insert_off; // Best offset for a path ending in an insertion
147     int match_off; // Best offset for a path ending in a match
148     int delete_off; // Best offset for a path ending in a deletion
149     };
150    
151     // ----- pool allocator -----
152     // works as a linked list of allocated memory blocks
153     struct GXMemPool {
154     SGreedyOffset* memblock;
155     int used, size;
156     GXMemPool *next;
157     static int kMinSpace;
158     //methods
159     GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
160     num_offsp=GMAX(kMinSpace, num_offsp);
161     GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
162     if (memblock == NULL) {
163     GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
164     return;
165     }
166     used = 0;
167     size = num_offsp;
168     next = NULL;
169     }
170    
171     void refresh() {
172     GXMemPool* sp=this;
173     while (sp) {
174     sp->used = 0;
175     sp = sp->next;
176     }
177     }
178     ~GXMemPool() {
179     GXMemPool* next_sp;
180     GXMemPool* sp=this->next;
181     while (sp) {
182     next_sp = sp->next;
183     GFREE(sp->memblock);
184     delete sp;
185     sp = next_sp;
186     }
187     GFREE(memblock);
188     }
189    
190     SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
191     //can use the first found memory block with enough room,
192     // or allocate a new large block
193     SGreedyOffset* v;
194     if (num_alloc < 0) return NULL;
195     GXMemPool* S=this;
196     while (used+num_alloc > S->size) {
197     //no room in current block, get a new mem block
198     if (next == NULL) {
199     next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
200     }
201     S = S->next;
202     }
203     v = S->memblock+S->used;
204     S->used += num_alloc;
205     //align to first 8-byte boundary
206     int m8 = S->used & 7; //modulo 8
207     if (m8)
208     S->used += 8 - m8;
209     return v;
210     }
211    
212     void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
213     return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
214     }
215    
216     };
217    
218     #define GREEDY_MAX_COST_FRACTION 5
219     /* (was 2) sequence_length / (this number) is a measure of how hard the
220     alignment code will work to find the optimal alignment; in fact
221     this gives a worst case bound on the number of loop iterations */
222    
223     #define GREEDY_MAX_COST 1000
224     // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
225     // (should be increased for large sequences)
226    
227     #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
228    
229     // all auxiliary memory needed for the greedy extension algorithm
230     struct SGreedyAlignMem {
231     int** last_seq2_off; // 2-D array of distances
232     int* max_score; // array of maximum scores
233     GXMemPool* space; // local memory pool for SGreedyOffset structs
234     int d_diff;
235     int max_d;
236     // Allocate memory for the greedy gapped alignment algorithm
237     SGreedyAlignMem(int reward, int penalty, int Xdrop) {
238     //int max_d, diff_d;
239     if (penalty<0) penalty=-penalty;
240     if (reward % 2) {
241     //scale params
242     reward = reward << 1;
243     penalty = (penalty << 1);
244     Xdrop = Xdrop<<1;
245     }
246     //if (gap_open == 0 && gap_extend == 0)
247     // gap_extend = (reward >> 1) + penalty;
248     const int max_dbseq_length=255; //adjust this accordingly
249     max_d = GMIN(GREEDY_MAX_COST,
250     (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
251    
252     last_seq2_off=NULL; // 2-D array of distances
253     max_score=NULL; // array of maximum scores
254     space=NULL; // local memory pool for SGreedyOffset structs
255     //if (score_params.gap_open==0 && score_params.gap_extend==0) {
256     //non-affine, simpler Greedy algorithm
257     d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
258     GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
259     if (!last_seq2_off)
260     GError(GX_GALLOC_ERROR);
261     GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
262     //allocates contiguous memory for 2 rows here
263     if (!last_seq2_off[0])
264     GError(GX_GALLOC_ERROR);
265     last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
266    
267     GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
268     space = new GXMemPool();
269     if (!max_score || !space)
270     GError(GX_GALLOC_ERROR);
271     } //consructor
272    
273     void reset() {
274     space->refresh();
275     if (last_seq2_off) {
276     GFREE((last_seq2_off[0]));
277     }
278     GFREE(max_score);
279     GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
280     if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR);
281     //allocates contiguous memory for 2 rows here
282     last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6;
283     GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
284     if (!max_score) GError(GX_GALLOC_ERROR);
285     }
286     ~SGreedyAlignMem() {
287     if (last_seq2_off) {
288     GFREE(last_seq2_off[0]);
289     GFREE(last_seq2_off);
290     }
291     GFREE(max_score);
292     delete space;
293     }
294    
295     };
296    
297    
298     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
299     #define GAPALIGN_INS ((unsigned char)1)
300     #define GAPALIGN_DEL ((unsigned char)2)
301     #define GAPALIGN_DECLINE ((unsigned char)3)
302    
303     struct GapXEditScript {
304     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
305     int num; // Number of operations
306     GapXEditScript* next;
307     GapXEditScript() {
308     op_type=0;
309     num=0;
310     next=NULL;
311     }
312     void print();
313     };
314    
315     class CSeqGap { //
316     public:
317     int offset;
318     int len;
319     CSeqGap(int gofs=0,int glen=1) {
320     offset=gofs;
321     len=glen;
322     }
323     };
324    
325     class CAlnGapInfo {
326     int a_ofs; //alignment start on seq a (0 based)
327     int b_ofs; //alignment start on seq b (0 based)
328     int a_len; //length of alignment on seq a
329     int b_len; //length of alignment on seq b
330     public:
331     GVec<CSeqGap> a_gaps;
332     GVec<CSeqGap> b_gaps;
333     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
334     a_ofs=astart;
335     b_ofs=bstart;
336     a_len=0;
337     b_len=0;
338     if (ed_script==NULL) return;
339     for (uint32 i=0; i<ed_script->opnum; i++) {
340     int num=((ed_script->ops[i]) >> 2);
341     char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
342     if (op_type == 3 || op_type < 0 )
343     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
344     CSeqGap gap;
345     switch (op_type) {
346     case GAPALIGN_SUB: a_len+=num;
347     b_len+=num;
348     break;
349     case GAPALIGN_INS: a_len+=num;
350     gap.offset=b_ofs+b_len;
351     gap.len=num;
352     b_gaps.Add(gap);
353     break;
354     case GAPALIGN_DEL: b_len+=num;
355     gap.offset=a_ofs+a_len;
356     gap.len=num;
357     a_gaps.Add(gap);
358     break;
359     }
360     }
361     }
362    
363     #ifdef GDEBUG
364     void printAlignment(FILE* f, const char* sa, int sa_len,
365     const char* sb, int sb_len) {
366     //print seq A
367     char al[1024]; //display buffer for seq A
368     int ap=0; //index in al[] for current character printed
369     int g=0;
370     int aend=a_ofs+a_len;
371     if (a_ofs<b_ofs) {
372     for (int i=0;i<b_ofs-a_ofs;i++) {
373     fprintf(f, " ");
374     al[++ap]=' ';
375     }
376     }
377     for (int i=0;i<aend;i++) {
378     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
379     for (int j=0;j<a_gaps[g].len;j++) {
380     fprintf(f, "-");
381     al[++ap]='-';
382     }
383     g++;
384     }
385     if (i==a_ofs) color_bg(c_blue,f);
386     fprintf(f, "%c", sa[i]);
387     al[++ap]=sa[i];
388     }
389     color_reset(f);
390     if (aend<sa_len)
391     fprintf(f, &sa[aend]);
392     fprintf(f, "\n");
393     //print seq B
394     ap=0;
395     g=0;
396     int bend=b_ofs+b_len;
397     if (a_ofs>b_ofs) {
398     for (int i=0;i<a_ofs-b_ofs;i++) {
399     fprintf(f, " ");
400     ap++;
401     }
402     }
403     for (int i=0;i<b_ofs;i++) {
404     fprintf(f, "%c", sb[i]);
405     ap++;
406     }
407     for (int i=b_ofs;i<bend;i++) {
408     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
409     for (int j=0;j<b_gaps[g].len;j++) {
410     fprintf(f, "-");
411     ap++;
412     }
413     g++;
414     }
415     if (i==b_ofs) color_bg(c_blue,f);
416     ap++;
417     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
418     if (mismatch) color_bg(c_red,f);
419     fprintf(f, "%c", sb[i]);
420     if (mismatch) color_bg(c_blue,f);
421     }
422     color_reset(f);
423     if (bend<sb_len)
424     fprintf(f, &sb[bend]);
425     fprintf(f, "\n");
426     }
427     #endif
428     };
429    
430     struct GXAlnInfo {
431     const char *qseq;
432     int ql,qr;
433     const char *sseq;
434     int sl,sr;
435     int score;
436     double pid;
437     GXEditScript* editscript;
438     CAlnGapInfo* gapinfo;
439     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
440     int sc=0, double percid=0) {
441     qseq=q;
442     sseq=s;
443     ql=q_l;
444     qr=q_r;
445     sl=s_l;
446     sr=s_r;
447     score=sc;
448     pid=percid;
449     editscript=NULL;
450     gapinfo=NULL;
451     }
452     ~GXAlnInfo() {
453     delete editscript;
454     delete gapinfo;
455     }
456     bool operator<(GXAlnInfo& d) {
457     return ((score==d.score)? pid>d.pid : score>d.score);
458     }
459     bool operator>(GXAlnInfo& d) {
460     return ((score==d.score)? pid<d.pid : score<d.score);
461     }
462     bool operator==(GXAlnInfo& d) {
463     return (score==d.score && pid==d.pid);
464     }
465    
466     };
467    
468    
469    
470     struct GXSeed {
471     int b_ofs; //0-based coordinate on seq b (x coordinate)
472     int a_ofs; //0-based coordinate on seq a (y coordinate)
473     int len; //length of exact match after extension
474     bool operator<(GXSeed& d){
475     return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
476     }
477     bool operator>(GXSeed& d){
478     return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs);
479     }
480     bool operator==(GXSeed& d){
481     return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
482     }
483     GXSeed(int aofs=0, int bofs=0, int l=4) {
484     a_ofs=aofs;
485     b_ofs=bofs;
486     len=l;
487     }
488     };
489    
490     int cmpSeedDiag(const pointer p1, const pointer p2);
491     //seeds are "equal" if they're on the same diagonal (for selection purposes only)
492    
493     int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
494     //among seeds with same length, prefer those closer to the left end of the read (seq_b)
495    
496     struct GXBand {
497     //bundle of seed matches on 3 adjacent diagonals
498     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
499     //seeds for this, and diag+1 and diag+2 are stored here
500     int min_a, max_a; //maximal coordinates of the bundle
501     int min_b, max_b;
502     GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
503     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
504     GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
505     diag=start_diag;
506     min_a=MAX_INT;
507     min_b=MAX_INT;
508     max_a=0;
509     max_b=0;
510     score=0;
511     if (seed!=NULL) addSeed(seed);
512     }
513     void addSeed(GXSeed* seed) {
514     seeds.Add(seed);
515     score+=seed->len;
516     //if (diag<0) diag=seed->diag; //should NOT be done like this
517     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
518     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
519     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
520     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
521     }
522     void finalize() {
523     //!! to be called only AFTER all seeds have been added
524     // seeds are sorted by b_ofs
525     //penalize seed gaps and overlaps on b sequence
526     for (int i=1;i<seeds.Count();i++) {
527     GXSeed& sprev=*seeds[i-1];
528     GXSeed& scur=*seeds[i];
529     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
530     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
531     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
532     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
533     int max_gap=b_gap;
534     int min_gap=a_gap;
535     if (min_gap>max_gap) swap(max_gap, min_gap);
536     if (min_gap<0) { //overlap
537     if (max_gap>0) { score-=GMAX((-min_gap), max_gap); }
538     else score+=min_gap;
539     }
540     else { //gap
541     score-=max_gap;
542     }
543     }//for each seed
544     }
545    
546     //bands will be sorted by decreasing score eventually, after all seeds are added
547     //more seeds better than one longer seed?
548     bool operator<(GXBand& d){
549     return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
550     }
551     bool operator>(GXBand& d){
552     return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score);
553     }
554     bool operator==(GXBand& d){
555     //return (score==d.score && seeds.Count()==d.seeds.Count());
556     return (score==d.score && seeds.Count()==d.seeds.Count());
557     }
558    
559     };
560    
561     class GXBandSet:public GList<GXBand> {
562     public:
563 gpertea 99 GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
564 gpertea 93 int idxoffset; //global anti-diagonal->index offset (a_len-1)
565     //used to convert a diagonal to an index
566     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
567     //hence offset is a_len-1
568     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
569     return Get(diag+idxoffset);
570     }
571     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
572     return Get(b_ofs-a_ofs+idxoffset);
573     }
574     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
575     idxoffset=a_len-1;
576 gpertea 99 qmatch=NULL;
577 gpertea 93 //diag will range from -a_len+1 to b_len-1, so after adjustment
578     //by idxoffset we get a max of a_len+b_len-2
579     int bcount=a_len+b_len-1;
580     for (int i=0;i<bcount;i++)
581     this->Add(new GXBand(i-idxoffset));
582     //unsorted, this should set fList[i]
583     }
584 gpertea 99 ~GXBandSet() {
585     delete qmatch;
586     }
587 gpertea 93 void addSeed(GXSeed* seed) {
588     //MUST be unsorted !!!
589     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
590     fList[idx]->addSeed(seed);
591     if (idx>0) fList[idx-1]->addSeed(seed);
592     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
593     }
594     };
595    
596    
597     GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb
598    
599     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb
600    
601     void printEditScript(GXEditScript* ed_script);
602    
603    
604     int GXGreedyExtend(const char* seq1, int len1,
605     const char* seq2, int len2,
606     bool reverse, int xdrop_threshold,
607     int match_cost, int mismatch_cost,
608     int& seq1_align_len, int& seq2_align_len,
609     SGreedyAlignMem& aux_data,
610     GXEditScript *edit_block);
611    
612    
613     enum GAlnTrimType {
614     galn_NoTrim=0,
615     galn_TrimLeft,
616     galn_TrimRight
617     };
618    
619     // reward MUST be >1, always
620     GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
621     const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype=galn_NoTrim,
622     bool editscript=false, SGreedyAlignMem* gxmem=NULL, int reward=2, int penalty=3, int xdrop=8);
623     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
624     bool editscript=false, int reward=2, int penalty=3, int xdrop=8);
625    
626 gpertea 96 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
627     SGreedyAlignMem* gxmem, int match_reward=2, int mismatch_penalty=3, int Xdrop=8);
628     GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
629     SGreedyAlignMem* gxmem, int match_reward=2, int mismatch_penalty=3, int Xdrop=8);
630 gpertea 93 #endif