ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
Revision: 194
Committed: Tue Feb 21 21:19:11 2012 UTC (7 years, 8 months ago) by gpertea
File size: 25051 byte(s)
Log Message:
increased "safe length" for an alignment from 16 to 22

Line User Rev File contents
1 gpertea 93 #ifndef _GALIGNEXTEND_H
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     #include "GBase.h"
7     #include "GList.hh"
8 gpertea 171 #include "gdna.h"
9 gpertea 93
10 gpertea 106 //#define GDEBUG 1
11 gpertea 93
12     enum {
13     gxEDIT_OP_MASK = 0x3,
14     gxEDIT_OP_ERR = 0x0,
15     gxEDIT_OP_INS = 0x1,
16     gxEDIT_OP_DEL = 0x2,
17     gxEDIT_OP_REP = 0x3
18     };
19    
20     #define GX_EDITOP_VAL(op) ((op) >> 2)
21     #define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK)
22     #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK))
23    
24     #ifdef GDEBUG
25     enum {c_black=0,
26     c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
27     };
28    
29     void color_fg(int c, FILE* f=stderr);
30     void color_bg(int c, FILE* f=stderr);
31     void color_resetfg(FILE* f=stderr);
32     void color_resetbg(FILE* f=stderr);
33     void color_reset(FILE* f=stderr);
34     void color_normal(FILE* f=stderr);
35     #endif
36    
37     struct GXEditScript{
38     uint32 *ops; // array of edit operations
39     uint32 opsize, opnum; // size of allocation, number in use
40     uint32 oplast; // most recent operation added
41     //methods
42    
43     GXEditScript() {
44     init();
45     }
46     ~GXEditScript() {
47     GFREE(ops);
48     }
49     void init() {
50     ops = NULL;
51     opsize = 0;
52     opnum = 0;
53     oplast = 0;
54     getReady(8);
55     }
56    
57     int getReady(uint32 n) {
58     uint32 m = n + n/2;
59     if (opsize <= n) {
60     GREALLOC(ops, m*sizeof(uint32));
61     opsize = m;
62     }
63     return 1;
64     }
65    
66     int getReady2(uint32 n) {
67     if (opsize - opnum <= n)
68     return getReady(n + opnum);
69     return 1;
70     }
71    
72     int Put(uint32 op, uint32 n) {
73     if (!getReady2(2))
74     return 0;
75     oplast = op;
76     ops[opnum] = GX_EDITOP_CONS(op, n);
77     opnum += 1;
78     ops[opnum] = 0; // sentinel
79     return 1;
80     }
81     uint32* First() {
82     return opnum > 0 ? & ops[0] : NULL;
83     }
84    
85     uint32* Next(uint32 *op) {
86     // assumes flat address space !
87     if (&ops[0] <= op && op < &ops[opnum-1])
88     return op+1;
89     else
90     return 0;
91     }
92    
93     int More(uint32 op, uint32 k) {
94     if (op == gxEDIT_OP_ERR) {
95     GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
96     return -1;
97     }
98    
99     if (GX_EDITOP_GET(oplast) == op) {
100     uint32 l=ops[opnum-1];
101 gpertea 171 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102 gpertea 93 (GX_EDITOP_VAL(l) + k));
103     }
104     else {
105     Put(op, k);
106     }
107    
108     return 0;
109     }
110    
111     GXEditScript* Append(GXEditScript *et) {
112     uint32 *op;
113     for (op = et->First(); op; op = et->Next(op))
114     More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
115     return this;
116     }
117    
118     int opDel(uint32 k) {
119     return More(gxEDIT_OP_DEL, k);
120     }
121     int opIns(uint32 k) {
122     return More(gxEDIT_OP_INS, k);
123     }
124     int opRep(uint32 k) {
125     return More(gxEDIT_OP_REP, k);
126     }
127    
128     GXEditScript *reverse() {
129     const uint32 mid = opnum/2;
130     const uint32 end = opnum-1;
131     for (uint32 i = 0; i < mid; ++i) {
132     const uint32 t = ops[i];
133     ops[i] = ops[end-i];
134     ops[end-i] = t;
135     }
136     return this;
137     }
138     };
139    
140    
141     /** Bookkeeping structure for greedy alignment. When aligning
142     two sequences, the members of this structure store the
143     largest offset into the second sequence that leads to a
144     high-scoring alignment for a given start point */
145     struct SGreedyOffset {
146     int insert_off; // Best offset for a path ending in an insertion
147     int match_off; // Best offset for a path ending in a match
148     int delete_off; // Best offset for a path ending in a deletion
149     };
150    
151     // ----- pool allocator -----
152     // works as a linked list of allocated memory blocks
153     struct GXMemPool {
154     SGreedyOffset* memblock;
155     int used, size;
156     GXMemPool *next;
157     static int kMinSpace;
158     //methods
159     GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
160     num_offsp=GMAX(kMinSpace, num_offsp);
161     GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
162     if (memblock == NULL) {
163     GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
164     return;
165     }
166     used = 0;
167     size = num_offsp;
168     next = NULL;
169     }
170    
171     void refresh() {
172     GXMemPool* sp=this;
173     while (sp) {
174     sp->used = 0;
175     sp = sp->next;
176     }
177     }
178     ~GXMemPool() {
179     GXMemPool* next_sp;
180     GXMemPool* sp=this->next;
181     while (sp) {
182     next_sp = sp->next;
183     GFREE(sp->memblock);
184     delete sp;
185     sp = next_sp;
186     }
187     GFREE(memblock);
188     }
189    
190     SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
191     //can use the first found memory block with enough room,
192     // or allocate a new large block
193     SGreedyOffset* v;
194     if (num_alloc < 0) return NULL;
195     GXMemPool* S=this;
196     while (used+num_alloc > S->size) {
197     //no room in current block, get a new mem block
198     if (next == NULL) {
199     next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
200     }
201     S = S->next;
202     }
203     v = S->memblock+S->used;
204     S->used += num_alloc;
205     //align to first 8-byte boundary
206     int m8 = S->used & 7; //modulo 8
207     if (m8)
208     S->used += 8 - m8;
209     return v;
210     }
211    
212     void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
213     return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
214     }
215    
216     };
217    
218 gpertea 109 #define GREEDY_MAX_COST_FRACTION 8
219 gpertea 93 /* (was 2) sequence_length / (this number) is a measure of how hard the
220     alignment code will work to find the optimal alignment; in fact
221     this gives a worst case bound on the number of loop iterations */
222    
223     #define GREEDY_MAX_COST 1000
224     // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
225     // (should be increased for large sequences)
226    
227     #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
228    
229     // all auxiliary memory needed for the greedy extension algorithm
230 gpertea 101 class CGreedyAlignData {
231     int d_diff;
232     int max_d;
233     public:
234 gpertea 93 int** last_seq2_off; // 2-D array of distances
235     int* max_score; // array of maximum scores
236     GXMemPool* space; // local memory pool for SGreedyOffset structs
237 gpertea 101 //
238 gpertea 191 bool scaled; //scores are all x2
239 gpertea 101 int match_reward;
240     int mismatch_penalty;
241     int x_drop;
242 gpertea 191 int xdrop_ofs;
243 gpertea 93 // Allocate memory for the greedy gapped alignment algorithm
244 gpertea 101 CGreedyAlignData(int reward, int penalty, int xdrop) {
245 gpertea 191 scaled=false;
246     xdrop_ofs = 0;
247 gpertea 93 //int max_d, diff_d;
248     if (penalty<0) penalty=-penalty;
249     if (reward % 2) {
250 gpertea 191 //scale params up
251     scaled=true;
252 gpertea 101 match_reward = reward << 1;
253     mismatch_penalty = (penalty << 1);
254     x_drop = xdrop<<1;
255 gpertea 93 }
256 gpertea 101 else {
257     match_reward=reward;
258     mismatch_penalty = penalty;
259     x_drop=xdrop;
260     }
261 gpertea 191 xdrop_ofs=(x_drop + (match_reward>>1)) /
262     (match_reward + mismatch_penalty) + 1;
263 gpertea 93 //if (gap_open == 0 && gap_extend == 0)
264     // gap_extend = (reward >> 1) + penalty;
265     const int max_dbseq_length=255; //adjust this accordingly
266     max_d = GMIN(GREEDY_MAX_COST,
267     (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
268    
269     last_seq2_off=NULL; // 2-D array of distances
270     max_score=NULL; // array of maximum scores
271     space=NULL; // local memory pool for SGreedyOffset structs
272     //if (score_params.gap_open==0 && score_params.gap_extend==0) {
273     //non-affine, simpler Greedy algorithm
274 gpertea 101 d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1;
275 gpertea 93 GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
276     if (!last_seq2_off)
277     GError(GX_GALLOC_ERROR);
278     GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
279     //allocates contiguous memory for 2 rows here
280     if (!last_seq2_off[0])
281     GError(GX_GALLOC_ERROR);
282     last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
283    
284     GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
285     space = new GXMemPool();
286     if (!max_score || !space)
287     GError(GX_GALLOC_ERROR);
288     } //consructor
289    
290     void reset() {
291     space->refresh();
292     if (last_seq2_off) {
293     GFREE((last_seq2_off[0]));
294     }
295     GFREE(max_score);
296     GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
297     if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR);
298     //allocates contiguous memory for 2 rows here
299     last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6;
300     GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
301     if (!max_score) GError(GX_GALLOC_ERROR);
302     }
303 gpertea 191
304 gpertea 101 ~CGreedyAlignData() {
305 gpertea 93 if (last_seq2_off) {
306     GFREE(last_seq2_off[0]);
307     GFREE(last_seq2_off);
308     }
309     GFREE(max_score);
310     delete space;
311     }
312    
313     };
314    
315    
316     #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
317     #define GAPALIGN_INS ((unsigned char)1)
318     #define GAPALIGN_DEL ((unsigned char)2)
319     #define GAPALIGN_DECLINE ((unsigned char)3)
320    
321     struct GapXEditScript {
322     unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
323     int num; // Number of operations
324     GapXEditScript* next;
325     GapXEditScript() {
326     op_type=0;
327     num=0;
328     next=NULL;
329     }
330     void print();
331     };
332    
333     class CSeqGap { //
334     public:
335     int offset;
336     int len;
337     CSeqGap(int gofs=0,int glen=1) {
338     offset=gofs;
339     len=glen;
340     }
341     };
342    
343     class CAlnGapInfo {
344     int a_ofs; //alignment start on seq a (0 based)
345     int b_ofs; //alignment start on seq b (0 based)
346     int a_len; //length of alignment on seq a
347     int b_len; //length of alignment on seq b
348     public:
349     GVec<CSeqGap> a_gaps;
350     GVec<CSeqGap> b_gaps;
351     CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
352     a_ofs=astart;
353     b_ofs=bstart;
354     a_len=0;
355     b_len=0;
356     if (ed_script==NULL) return;
357 gpertea 173 for (uint32 i=0; i<ed_script->opnum; i++) {
358 gpertea 93 int num=((ed_script->ops[i]) >> 2);
359     char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
360     if (op_type == 3 || op_type < 0 )
361     GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
362     CSeqGap gap;
363     switch (op_type) {
364     case GAPALIGN_SUB: a_len+=num;
365     b_len+=num;
366     break;
367     case GAPALIGN_INS: a_len+=num;
368     gap.offset=b_ofs+b_len;
369     gap.len=num;
370     b_gaps.Add(gap);
371     break;
372     case GAPALIGN_DEL: b_len+=num;
373     gap.offset=a_ofs+a_len;
374     gap.len=num;
375     a_gaps.Add(gap);
376     break;
377     }
378     }
379     }
380    
381     #ifdef GDEBUG
382 gpertea 171 void printAlignment(FILE* f, const char* sa, int sa_len,
383 gpertea 93 const char* sb, int sb_len) {
384     //print seq A
385     char al[1024]; //display buffer for seq A
386     int ap=0; //index in al[] for current character printed
387     int g=0;
388     int aend=a_ofs+a_len;
389     if (a_ofs<b_ofs) {
390     for (int i=0;i<b_ofs-a_ofs;i++) {
391     fprintf(f, " ");
392     al[++ap]=' ';
393     }
394     }
395     for (int i=0;i<aend;i++) {
396     if (g<a_gaps.Count() && a_gaps[g].offset==i) {
397     for (int j=0;j<a_gaps[g].len;j++) {
398     fprintf(f, "-");
399     al[++ap]='-';
400     }
401     g++;
402     }
403     if (i==a_ofs) color_bg(c_blue,f);
404     fprintf(f, "%c", sa[i]);
405     al[++ap]=sa[i];
406     }
407     color_reset(f);
408     if (aend<sa_len)
409     fprintf(f, &sa[aend]);
410     fprintf(f, "\n");
411     //print seq B
412     ap=0;
413     g=0;
414     int bend=b_ofs+b_len;
415     if (a_ofs>b_ofs) {
416     for (int i=0;i<a_ofs-b_ofs;i++) {
417     fprintf(f, " ");
418     ap++;
419     }
420     }
421     for (int i=0;i<b_ofs;i++) {
422     fprintf(f, "%c", sb[i]);
423     ap++;
424     }
425     for (int i=b_ofs;i<bend;i++) {
426     if (g<b_gaps.Count() && b_gaps[g].offset==i) {
427     for (int j=0;j<b_gaps[g].len;j++) {
428     fprintf(f, "-");
429     ap++;
430     }
431     g++;
432     }
433     if (i==b_ofs) color_bg(c_blue,f);
434     ap++;
435     bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
436     if (mismatch) color_bg(c_red,f);
437     fprintf(f, "%c", sb[i]);
438     if (mismatch) color_bg(c_blue,f);
439     }
440     color_reset(f);
441     if (bend<sb_len)
442     fprintf(f, &sb[bend]);
443     fprintf(f, "\n");
444     }
445     #endif
446     };
447 gpertea 171
448 gpertea 93 struct GXAlnInfo {
449     const char *qseq;
450     int ql,qr;
451     const char *sseq;
452     int sl,sr;
453     int score;
454     double pid;
455 gpertea 184 bool strong;
456 gpertea 93 GXEditScript* editscript;
457     CAlnGapInfo* gapinfo;
458     GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
459     int sc=0, double percid=0) {
460     qseq=q;
461     sseq=s;
462     ql=q_l;
463     qr=q_r;
464     sl=s_l;
465     sr=s_r;
466     score=sc;
467     pid=percid;
468 gpertea 184 strong=false;
469 gpertea 93 editscript=NULL;
470     gapinfo=NULL;
471     }
472     ~GXAlnInfo() {
473     delete editscript;
474     delete gapinfo;
475     }
476     bool operator<(GXAlnInfo& d) {
477     return ((score==d.score)? pid>d.pid : score>d.score);
478     }
479     bool operator==(GXAlnInfo& d) {
480     return (score==d.score && pid==d.pid);
481     }
482    
483     };
484    
485    
486    
487     struct GXSeed {
488     int b_ofs; //0-based coordinate on seq b (x coordinate)
489     int a_ofs; //0-based coordinate on seq a (y coordinate)
490     int len; //length of exact match after extension
491     bool operator<(GXSeed& d){
492     return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
493     }
494     bool operator==(GXSeed& d){
495     return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
496     }
497     GXSeed(int aofs=0, int bofs=0, int l=4) {
498     a_ofs=aofs;
499     b_ofs=bofs;
500     len=l;
501     }
502     };
503    
504     int cmpSeedDiag(const pointer p1, const pointer p2);
505     //seeds are "equal" if they're on the same diagonal (for selection purposes only)
506    
507     int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
508     //among seeds with same length, prefer those closer to the left end of the read (seq_b)
509    
510     struct GXBand {
511     //bundle of seed matches on 3 adjacent diagonals
512     int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
513     //seeds for this, and diag+1 and diag+2 are stored here
514     int min_a, max_a; //maximal coordinates of the bundle
515     int min_b, max_b;
516 gpertea 101 int w_min_b; //weighted average of left start coordinate
517     int avg_len;
518 gpertea 93 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
519     int score; //sum of seed scores (- overlapping_bases/2 - gaps)
520 gpertea 101 bool tested;
521 gpertea 93 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
522     diag=start_diag;
523     min_a=MAX_INT;
524     min_b=MAX_INT;
525     max_a=0;
526     max_b=0;
527     score=0;
528 gpertea 101 avg_len=0;
529     w_min_b=0;
530     tested=false;
531 gpertea 93 if (seed!=NULL) addSeed(seed);
532     }
533     void addSeed(GXSeed* seed) {
534     seeds.Add(seed);
535     score+=seed->len;
536 gpertea 101 avg_len+=seed->len;
537     w_min_b+=seed->b_ofs * seed->len;
538 gpertea 93 //if (diag<0) diag=seed->diag; //should NOT be done like this
539     if (seed->a_ofs < min_a) min_a=seed->a_ofs;
540     if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
541     if (seed->b_ofs < min_b) min_b=seed->b_ofs;
542     if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
543     }
544 gpertea 101
545 gpertea 93 void finalize() {
546     //!! to be called only AFTER all seeds have been added
547     // seeds are sorted by b_ofs
548     //penalize seed gaps and overlaps on b sequence
549 gpertea 101 if (avg_len==0) return;
550     w_min_b/=avg_len;
551     avg_len>>=1;
552 gpertea 93 for (int i=1;i<seeds.Count();i++) {
553     GXSeed& sprev=*seeds[i-1];
554     GXSeed& scur=*seeds[i];
555     if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
556     scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
557     int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
558     int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
559     int max_gap=b_gap;
560     int min_gap=a_gap;
561 gpertea 144 if (min_gap>max_gap) Gswap(max_gap, min_gap);
562 gpertea 101 int _penalty=0;
563 gpertea 93 if (min_gap<0) { //overlap
564 gpertea 101 if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); }
565     else _penalty=-min_gap;
566 gpertea 93 }
567     else { //gap
568 gpertea 101 _penalty=max_gap;
569 gpertea 93 }
570 gpertea 101 score-=(_penalty>>1);
571     //score-=_penalty;
572     }//for each seed
573 gpertea 93 }
574    
575     //bands will be sorted by decreasing score eventually, after all seeds are added
576     //more seeds better than one longer seed?
577     bool operator<(GXBand& d){
578 gpertea 101 //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
579     return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score);
580 gpertea 93 }
581     bool operator==(GXBand& d){
582 gpertea 101 //return (score==d.score && seeds.Count()==d.seeds.Count());
583     return (score==d.score && w_min_b==d.w_min_b);
584 gpertea 93 }
585    
586     };
587    
588     class GXBandSet:public GList<GXBand> {
589     public:
590 gpertea 99 GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
591 gpertea 178 GXSeed* tmatch_r; //terminal match to be used if there is no better alignment
592     GXSeed* tmatch_l; //terminal match to be used if there is no better alignment
593 gpertea 93 int idxoffset; //global anti-diagonal->index offset (a_len-1)
594     //used to convert a diagonal to an index
595     //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
596     //hence offset is a_len-1
597     GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
598     return Get(diag+idxoffset);
599     }
600     GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
601     return Get(b_ofs-a_ofs+idxoffset);
602     }
603     GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
604     idxoffset=a_len-1;
605 gpertea 99 qmatch=NULL;
606 gpertea 178 tmatch_l=NULL; //terminal match to be used if everything else fails
607     tmatch_r=NULL;
608 gpertea 93 //diag will range from -a_len+1 to b_len-1, so after adjustment
609     //by idxoffset we get a max of a_len+b_len-2
610     int bcount=a_len+b_len-1;
611     for (int i=0;i<bcount;i++)
612 gpertea 174 this->Add(new GXBand(i-idxoffset));
613 gpertea 93 //unsorted, this should set fList[i]
614     }
615 gpertea 99 ~GXBandSet() {
616     delete qmatch;
617     }
618 gpertea 93 void addSeed(GXSeed* seed) {
619     //MUST be unsorted !!!
620     int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
621     fList[idx]->addSeed(seed);
622     if (idx>0) fList[idx-1]->addSeed(seed);
623     if (idx<fCount-1) fList[idx+1]->addSeed(seed);
624     }
625     };
626    
627 gpertea 191 inline int calc_safelen(int alen) {
628     int r=iround(double(alen*0.6));
629 gpertea 194 return (r<22)? 22 : r;
630 gpertea 191 }
631    
632 gpertea 176 struct GXSeqData {
633     const char* aseq;
634     int alen;
635     const char* bseq;
636     int blen;
637     GVec<uint16>** amers;
638     int amlen; //minimum alignment length that's sufficient to
639     //trigger the quick extension heuristics
640     GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0,
641     GVec<uint16>* mers[]=NULL):aseq(sa), alen(la),
642     bseq(sb), blen(lb), amers(mers), amlen(0) {
643     calc_amlen();
644     calc_bmlen();
645     }
646 gpertea 191
647 gpertea 176 void calc_amlen() {
648     if (alen) {
649 gpertea 191 int ah=calc_safelen(alen);
650 gpertea 176 if (amlen>ah) amlen=ah;
651     }
652     }
653     void calc_bmlen() {
654     if (blen) {
655 gpertea 191 int bh = iround(double(blen)*0.6);
656 gpertea 194 if (bh<22) bh=22;
657 gpertea 176 if (amlen>bh) amlen=bh;
658     }
659     }
660     void update(const char* sa, int la, GVec<uint16>** mers,
661     const char* sb, int lb, int mlen=0) {
662     aseq=sa;
663     alen=la;
664     amers=mers;
665     if (mlen) {
666     amlen=mlen;
667     }
668     else calc_amlen();
669     if (sb==bseq && blen==lb) return;
670     bseq=sb;
671     blen=lb;
672     calc_bmlen();
673     }
674     /*
675     void update_b(const char* sb, int lb) {
676     if (sb==bseq && blen==lb) return;
677     bseq=sb;
678     blen=lb;
679     calc_bmlen();
680     }*/
681     };
682 gpertea 93
683 gpertea 171 uint16 get6mer(char* p);
684 gpertea 173 void table6mers(const char* s, int slen, GVec<uint16>* amers[]);
685 gpertea 93
686     void printEditScript(GXEditScript* ed_script);
687    
688    
689     int GXGreedyExtend(const char* seq1, int len1,
690     const char* seq2, int len2,
691 gpertea 191 bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
692 gpertea 93 int& seq1_align_len, int& seq2_align_len,
693 gpertea 101 CGreedyAlignData& aux_data,
694 gpertea 93 GXEditScript *edit_block);
695    
696    
697     enum GAlnTrimType {
698 gpertea 181 //Describes trimming intent
699     galn_None=0, //no trimming, just alignment
700 gpertea 93 galn_TrimLeft,
701 gpertea 181 galn_TrimRight,
702     galn_TrimEither //adaptor should be trimmed from either end
703 gpertea 93 };
704    
705 gpertea 101 struct CAlnTrim {
706     GAlnTrimType type;
707 gpertea 191 int minMatch; //minimum terminal exact match that will be removed from ends
708 gpertea 181 int l_boundary; //base index (either left or right) excluding terminal poly-A stretches
709     int r_boundary; //base index (either left or right) excluding terminal poly-A stretches
710     int alen; //query/adaptor seq length (for validate())
711 gpertea 176 int safelen; //alignment length > amlen should be automatically validated
712     int seedlen;
713     void prepare(const char* s, int s_len) {
714     //type=trim_type;
715     //amlen=smlen;
716 gpertea 181 l_boundary=0;
717     r_boundary=0;
718     //if (type==galn_TrimLeft) {
719 gpertea 101 int s_lbound=0;
720     if (s[0]=='A' && s[1]=='A' && s[2]=='A') {
721     s_lbound=3;
722     while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
723     }
724     else if (s[1]=='A' && s[2]=='A' && s[3]=='A') {
725     s_lbound=4;
726     while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
727     }
728 gpertea 181 l_boundary=s_lbound+3;
729     // return;
730     // }
731     //if (type==galn_TrimRight) {
732 gpertea 101 int r=s_len-1;
733     if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') {
734     r-=3;
735     while (r>0 && s[r]=='A') r--;
736     }
737     else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') {
738     r-=4;
739     while (r>0 && s[r]=='A') r--;
740     }
741 gpertea 181 r_boundary=r-3;
742     // }
743 gpertea 101 }
744    
745 gpertea 191 CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int minEndTrim, int smlen):
746     type(trim_type), minMatch(minEndTrim), l_boundary(0), r_boundary(0),
747     alen(a_len), safelen(smlen) {
748 gpertea 176 prepare(s, s_len);
749 gpertea 101 }
750    
751 gpertea 181 bool validate_R(int sr, int admax, int badj, int adist) {
752     if (adist>admax) return false;
753     return (sr>=r_boundary+badj);
754     }
755 gpertea 184
756 gpertea 181 bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) {
757     if (adist>admax) return false;
758     //left match should be more stringent (5')
759     if (alnpid<93) {
760 gpertea 191 if (alnlen<13 || alnlen<minMatch) return false;
761 gpertea 181 admax=0;
762     badj++;
763     }
764     return (sl<=l_boundary-badj);
765     }
766    
767     bool validate(GXAlnInfo* alninfo) {
768     int alnlen=alninfo->sr - alninfo->sl + 1;
769 gpertea 184 if (alninfo->pid>90.0 && alnlen>safelen) {
770 gpertea 181 //special case: heavy match, could be in the middle
771 gpertea 186 if (alninfo->pid>94)
772     alninfo->strong=true;
773 gpertea 181 return true;
774 gpertea 184 }
775 gpertea 181 int sl=alninfo->sl;
776     int sr=alninfo->sr;
777 gpertea 101 sl--;sr--; //boundary is 0-based
778 gpertea 109 int badj=0; //default boundary is 3 bases distance to end
779     int admax=1;
780     if (alnlen<13) {
781 gpertea 101 //stricter boundary check
782 gpertea 181 if (alninfo->pid<90) return false;
783 gpertea 101 badj=2;
784 gpertea 109 if (alnlen<=7) { badj++; admax=0; }
785 gpertea 101 }
786 gpertea 181 if (type==galn_TrimLeft) {
787     return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr);
788     }
789     else if (type==galn_TrimRight) {
790     return validate_R(sr, admax, badj, alninfo->ql-1);
791     }
792     else if (type==galn_TrimEither) {
793     return (validate_R(sr, admax, badj, alninfo->ql-1) ||
794     validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr));
795     }
796     return true;
797     /*
798 gpertea 101 if (type==galn_TrimRight) {
799     return (sr>=boundary+badj);
800     }
801     else {
802 gpertea 109 //left match should be more stringent (5')
803     if (alnpid<93) {
804     if (alnlen<13) return false;
805     admax=0;
806 gpertea 101 badj++;
807     }
808     return (sl<=boundary-badj);
809     }
810 gpertea 181 */
811 gpertea 101 }
812     };
813    
814    
815 gpertea 191
816    
817     //GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 3' end of seqb
818    
819     GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent); //for overlap at 5' end of seqb
820     //=galn_None
821     // reward MUST be >1 for this function
822 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
823 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
824     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL,
825     CAlnTrim* trim=NULL, bool editscript=false);
826     GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
827     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
828     CAlnTrim* trim=NULL, bool editscript=false);
829    
830 gpertea 93 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
831 gpertea 191 bool editscript=false, int reward=2, int penalty=10, int xdrop=32);
832 gpertea 93
833 gpertea 191 GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
834 gpertea 181 CGreedyAlignData* gxmem=NULL, int min_pid=90);
835     //GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90);
836 gpertea 93 #endif