ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.h
Revision: 194
Committed: Tue Feb 21 21:19:11 2012 UTC (7 years, 4 months ago) by gpertea
File size: 25051 byte(s)
Log Message:
increased "safe length" for an alignment from 16 to 22

Line File contents
1 #ifndef _GALIGNEXTEND_H
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 #include "GBase.h"
7 #include "GList.hh"
8 #include "gdna.h"
9
10 //#define GDEBUG 1
11
12 enum {
13 gxEDIT_OP_MASK = 0x3,
14 gxEDIT_OP_ERR = 0x0,
15 gxEDIT_OP_INS = 0x1,
16 gxEDIT_OP_DEL = 0x2,
17 gxEDIT_OP_REP = 0x3
18 };
19
20 #define GX_EDITOP_VAL(op) ((op) >> 2)
21 #define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK)
22 #define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK))
23
24 #ifdef GDEBUG
25 enum {c_black=0,
26 c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white
27 };
28
29 void color_fg(int c, FILE* f=stderr);
30 void color_bg(int c, FILE* f=stderr);
31 void color_resetfg(FILE* f=stderr);
32 void color_resetbg(FILE* f=stderr);
33 void color_reset(FILE* f=stderr);
34 void color_normal(FILE* f=stderr);
35 #endif
36
37 struct GXEditScript{
38 uint32 *ops; // array of edit operations
39 uint32 opsize, opnum; // size of allocation, number in use
40 uint32 oplast; // most recent operation added
41 //methods
42
43 GXEditScript() {
44 init();
45 }
46 ~GXEditScript() {
47 GFREE(ops);
48 }
49 void init() {
50 ops = NULL;
51 opsize = 0;
52 opnum = 0;
53 oplast = 0;
54 getReady(8);
55 }
56
57 int getReady(uint32 n) {
58 uint32 m = n + n/2;
59 if (opsize <= n) {
60 GREALLOC(ops, m*sizeof(uint32));
61 opsize = m;
62 }
63 return 1;
64 }
65
66 int getReady2(uint32 n) {
67 if (opsize - opnum <= n)
68 return getReady(n + opnum);
69 return 1;
70 }
71
72 int Put(uint32 op, uint32 n) {
73 if (!getReady2(2))
74 return 0;
75 oplast = op;
76 ops[opnum] = GX_EDITOP_CONS(op, n);
77 opnum += 1;
78 ops[opnum] = 0; // sentinel
79 return 1;
80 }
81 uint32* First() {
82 return opnum > 0 ? & ops[0] : NULL;
83 }
84
85 uint32* Next(uint32 *op) {
86 // assumes flat address space !
87 if (&ops[0] <= op && op < &ops[opnum-1])
88 return op+1;
89 else
90 return 0;
91 }
92
93 int More(uint32 op, uint32 k) {
94 if (op == gxEDIT_OP_ERR) {
95 GError("GXEditScript::opMore: bad opcode %d:%d", op, k);
96 return -1;
97 }
98
99 if (GX_EDITOP_GET(oplast) == op) {
100 uint32 l=ops[opnum-1];
101 ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)),
102 (GX_EDITOP_VAL(l) + k));
103 }
104 else {
105 Put(op, k);
106 }
107
108 return 0;
109 }
110
111 GXEditScript* Append(GXEditScript *et) {
112 uint32 *op;
113 for (op = et->First(); op; op = et->Next(op))
114 More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op));
115 return this;
116 }
117
118 int opDel(uint32 k) {
119 return More(gxEDIT_OP_DEL, k);
120 }
121 int opIns(uint32 k) {
122 return More(gxEDIT_OP_INS, k);
123 }
124 int opRep(uint32 k) {
125 return More(gxEDIT_OP_REP, k);
126 }
127
128 GXEditScript *reverse() {
129 const uint32 mid = opnum/2;
130 const uint32 end = opnum-1;
131 for (uint32 i = 0; i < mid; ++i) {
132 const uint32 t = ops[i];
133 ops[i] = ops[end-i];
134 ops[end-i] = t;
135 }
136 return this;
137 }
138 };
139
140
141 /** Bookkeeping structure for greedy alignment. When aligning
142 two sequences, the members of this structure store the
143 largest offset into the second sequence that leads to a
144 high-scoring alignment for a given start point */
145 struct SGreedyOffset {
146 int insert_off; // Best offset for a path ending in an insertion
147 int match_off; // Best offset for a path ending in a match
148 int delete_off; // Best offset for a path ending in a deletion
149 };
150
151 // ----- pool allocator -----
152 // works as a linked list of allocated memory blocks
153 struct GXMemPool {
154 SGreedyOffset* memblock;
155 int used, size;
156 GXMemPool *next;
157 static int kMinSpace;
158 //methods
159 GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M)
160 num_offsp=GMAX(kMinSpace, num_offsp);
161 GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset));
162 if (memblock == NULL) {
163 GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp);
164 return;
165 }
166 used = 0;
167 size = num_offsp;
168 next = NULL;
169 }
170
171 void refresh() {
172 GXMemPool* sp=this;
173 while (sp) {
174 sp->used = 0;
175 sp = sp->next;
176 }
177 }
178 ~GXMemPool() {
179 GXMemPool* next_sp;
180 GXMemPool* sp=this->next;
181 while (sp) {
182 next_sp = sp->next;
183 GFREE(sp->memblock);
184 delete sp;
185 sp = next_sp;
186 }
187 GFREE(memblock);
188 }
189
190 SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array
191 //can use the first found memory block with enough room,
192 // or allocate a new large block
193 SGreedyOffset* v;
194 if (num_alloc < 0) return NULL;
195 GXMemPool* S=this;
196 while (used+num_alloc > S->size) {
197 //no room in current block, get a new mem block
198 if (next == NULL) {
199 next=new GXMemPool(num_alloc); //allocates a large contiguous memory block
200 }
201 S = S->next;
202 }
203 v = S->memblock+S->used;
204 S->used += num_alloc;
205 //align to first 8-byte boundary
206 int m8 = S->used & 7; //modulo 8
207 if (m8)
208 S->used += 8 - m8;
209 return v;
210 }
211
212 void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes
213 return (void*)getSpace(byte_size/sizeof(SGreedyOffset));
214 }
215
216 };
217
218 #define GREEDY_MAX_COST_FRACTION 8
219 /* (was 2) sequence_length / (this number) is a measure of how hard the
220 alignment code will work to find the optimal alignment; in fact
221 this gives a worst case bound on the number of loop iterations */
222
223 #define GREEDY_MAX_COST 1000
224 // The largest diff distance (max indels+mismatches) to be examined for an optimal alignment
225 // (should be increased for large sequences)
226
227 #define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n"
228
229 // all auxiliary memory needed for the greedy extension algorithm
230 class CGreedyAlignData {
231 int d_diff;
232 int max_d;
233 public:
234 int** last_seq2_off; // 2-D array of distances
235 int* max_score; // array of maximum scores
236 GXMemPool* space; // local memory pool for SGreedyOffset structs
237 //
238 bool scaled; //scores are all x2
239 int match_reward;
240 int mismatch_penalty;
241 int x_drop;
242 int xdrop_ofs;
243 // Allocate memory for the greedy gapped alignment algorithm
244 CGreedyAlignData(int reward, int penalty, int xdrop) {
245 scaled=false;
246 xdrop_ofs = 0;
247 //int max_d, diff_d;
248 if (penalty<0) penalty=-penalty;
249 if (reward % 2) {
250 //scale params up
251 scaled=true;
252 match_reward = reward << 1;
253 mismatch_penalty = (penalty << 1);
254 x_drop = xdrop<<1;
255 }
256 else {
257 match_reward=reward;
258 mismatch_penalty = penalty;
259 x_drop=xdrop;
260 }
261 xdrop_ofs=(x_drop + (match_reward>>1)) /
262 (match_reward + mismatch_penalty) + 1;
263 //if (gap_open == 0 && gap_extend == 0)
264 // gap_extend = (reward >> 1) + penalty;
265 const int max_dbseq_length=255; //adjust this accordingly
266 max_d = GMIN(GREEDY_MAX_COST,
267 (max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1));
268
269 last_seq2_off=NULL; // 2-D array of distances
270 max_score=NULL; // array of maximum scores
271 space=NULL; // local memory pool for SGreedyOffset structs
272 //if (score_params.gap_open==0 && score_params.gap_extend==0) {
273 //non-affine, simpler Greedy algorithm
274 d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1;
275 GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*)));
276 if (!last_seq2_off)
277 GError(GX_GALLOC_ERROR);
278 GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
279 //allocates contiguous memory for 2 rows here
280 if (!last_seq2_off[0])
281 GError(GX_GALLOC_ERROR);
282 last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row
283
284 GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
285 space = new GXMemPool();
286 if (!max_score || !space)
287 GError(GX_GALLOC_ERROR);
288 } //consructor
289
290 void reset() {
291 space->refresh();
292 if (last_seq2_off) {
293 GFREE((last_seq2_off[0]));
294 }
295 GFREE(max_score);
296 GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2));
297 if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR);
298 //allocates contiguous memory for 2 rows here
299 last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6;
300 GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff)));
301 if (!max_score) GError(GX_GALLOC_ERROR);
302 }
303
304 ~CGreedyAlignData() {
305 if (last_seq2_off) {
306 GFREE(last_seq2_off[0]);
307 GFREE(last_seq2_off);
308 }
309 GFREE(max_score);
310 delete space;
311 }
312
313 };
314
315
316 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
317 #define GAPALIGN_INS ((unsigned char)1)
318 #define GAPALIGN_DEL ((unsigned char)2)
319 #define GAPALIGN_DECLINE ((unsigned char)3)
320
321 struct GapXEditScript {
322 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
323 int num; // Number of operations
324 GapXEditScript* next;
325 GapXEditScript() {
326 op_type=0;
327 num=0;
328 next=NULL;
329 }
330 void print();
331 };
332
333 class CSeqGap { //
334 public:
335 int offset;
336 int len;
337 CSeqGap(int gofs=0,int glen=1) {
338 offset=gofs;
339 len=glen;
340 }
341 };
342
343 class CAlnGapInfo {
344 int a_ofs; //alignment start on seq a (0 based)
345 int b_ofs; //alignment start on seq b (0 based)
346 int a_len; //length of alignment on seq a
347 int b_len; //length of alignment on seq b
348 public:
349 GVec<CSeqGap> a_gaps;
350 GVec<CSeqGap> b_gaps;
351 CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
352 a_ofs=astart;
353 b_ofs=bstart;
354 a_len=0;
355 b_len=0;
356 if (ed_script==NULL) return;
357 for (uint32 i=0; i<ed_script->opnum; i++) {
358 int num=((ed_script->ops[i]) >> 2);
359 char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
360 if (op_type == 3 || op_type < 0 )
361 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
362 CSeqGap gap;
363 switch (op_type) {
364 case GAPALIGN_SUB: a_len+=num;
365 b_len+=num;
366 break;
367 case GAPALIGN_INS: a_len+=num;
368 gap.offset=b_ofs+b_len;
369 gap.len=num;
370 b_gaps.Add(gap);
371 break;
372 case GAPALIGN_DEL: b_len+=num;
373 gap.offset=a_ofs+a_len;
374 gap.len=num;
375 a_gaps.Add(gap);
376 break;
377 }
378 }
379 }
380
381 #ifdef GDEBUG
382 void printAlignment(FILE* f, const char* sa, int sa_len,
383 const char* sb, int sb_len) {
384 //print seq A
385 char al[1024]; //display buffer for seq A
386 int ap=0; //index in al[] for current character printed
387 int g=0;
388 int aend=a_ofs+a_len;
389 if (a_ofs<b_ofs) {
390 for (int i=0;i<b_ofs-a_ofs;i++) {
391 fprintf(f, " ");
392 al[++ap]=' ';
393 }
394 }
395 for (int i=0;i<aend;i++) {
396 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
397 for (int j=0;j<a_gaps[g].len;j++) {
398 fprintf(f, "-");
399 al[++ap]='-';
400 }
401 g++;
402 }
403 if (i==a_ofs) color_bg(c_blue,f);
404 fprintf(f, "%c", sa[i]);
405 al[++ap]=sa[i];
406 }
407 color_reset(f);
408 if (aend<sa_len)
409 fprintf(f, &sa[aend]);
410 fprintf(f, "\n");
411 //print seq B
412 ap=0;
413 g=0;
414 int bend=b_ofs+b_len;
415 if (a_ofs>b_ofs) {
416 for (int i=0;i<a_ofs-b_ofs;i++) {
417 fprintf(f, " ");
418 ap++;
419 }
420 }
421 for (int i=0;i<b_ofs;i++) {
422 fprintf(f, "%c", sb[i]);
423 ap++;
424 }
425 for (int i=b_ofs;i<bend;i++) {
426 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
427 for (int j=0;j<b_gaps[g].len;j++) {
428 fprintf(f, "-");
429 ap++;
430 }
431 g++;
432 }
433 if (i==b_ofs) color_bg(c_blue,f);
434 ap++;
435 bool mismatch=(sb[i]!=al[ap] && al[ap]!='-');
436 if (mismatch) color_bg(c_red,f);
437 fprintf(f, "%c", sb[i]);
438 if (mismatch) color_bg(c_blue,f);
439 }
440 color_reset(f);
441 if (bend<sb_len)
442 fprintf(f, &sb[bend]);
443 fprintf(f, "\n");
444 }
445 #endif
446 };
447
448 struct GXAlnInfo {
449 const char *qseq;
450 int ql,qr;
451 const char *sseq;
452 int sl,sr;
453 int score;
454 double pid;
455 bool strong;
456 GXEditScript* editscript;
457 CAlnGapInfo* gapinfo;
458 GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
459 int sc=0, double percid=0) {
460 qseq=q;
461 sseq=s;
462 ql=q_l;
463 qr=q_r;
464 sl=s_l;
465 sr=s_r;
466 score=sc;
467 pid=percid;
468 strong=false;
469 editscript=NULL;
470 gapinfo=NULL;
471 }
472 ~GXAlnInfo() {
473 delete editscript;
474 delete gapinfo;
475 }
476 bool operator<(GXAlnInfo& d) {
477 return ((score==d.score)? pid>d.pid : score>d.score);
478 }
479 bool operator==(GXAlnInfo& d) {
480 return (score==d.score && pid==d.pid);
481 }
482
483 };
484
485
486
487 struct GXSeed {
488 int b_ofs; //0-based coordinate on seq b (x coordinate)
489 int a_ofs; //0-based coordinate on seq a (y coordinate)
490 int len; //length of exact match after extension
491 bool operator<(GXSeed& d){
492 return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs);
493 }
494 bool operator==(GXSeed& d){
495 return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed
496 }
497 GXSeed(int aofs=0, int bofs=0, int l=4) {
498 a_ofs=aofs;
499 b_ofs=bofs;
500 len=l;
501 }
502 };
503
504 int cmpSeedDiag(const pointer p1, const pointer p2);
505 //seeds are "equal" if they're on the same diagonal (for selection purposes only)
506
507 int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account
508 //among seeds with same length, prefer those closer to the left end of the read (seq_b)
509
510 struct GXBand {
511 //bundle of seed matches on 3 adjacent diagonals
512 int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3
513 //seeds for this, and diag+1 and diag+2 are stored here
514 int min_a, max_a; //maximal coordinates of the bundle
515 int min_b, max_b;
516 int w_min_b; //weighted average of left start coordinate
517 int avg_len;
518 GList<GXSeed> seeds; //sorted by x coordinate (b_ofs)
519 int score; //sum of seed scores (- overlapping_bases/2 - gaps)
520 bool tested;
521 GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) {
522 diag=start_diag;
523 min_a=MAX_INT;
524 min_b=MAX_INT;
525 max_a=0;
526 max_b=0;
527 score=0;
528 avg_len=0;
529 w_min_b=0;
530 tested=false;
531 if (seed!=NULL) addSeed(seed);
532 }
533 void addSeed(GXSeed* seed) {
534 seeds.Add(seed);
535 score+=seed->len;
536 avg_len+=seed->len;
537 w_min_b+=seed->b_ofs * seed->len;
538 //if (diag<0) diag=seed->diag; //should NOT be done like this
539 if (seed->a_ofs < min_a) min_a=seed->a_ofs;
540 if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len;
541 if (seed->b_ofs < min_b) min_b=seed->b_ofs;
542 if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len;
543 }
544
545 void finalize() {
546 //!! to be called only AFTER all seeds have been added
547 // seeds are sorted by b_ofs
548 //penalize seed gaps and overlaps on b sequence
549 if (avg_len==0) return;
550 w_min_b/=avg_len;
551 avg_len>>=1;
552 for (int i=1;i<seeds.Count();i++) {
553 GXSeed& sprev=*seeds[i-1];
554 GXSeed& scur=*seeds[i];
555 if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n",
556 scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len);
557 int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len;
558 int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len;
559 int max_gap=b_gap;
560 int min_gap=a_gap;
561 if (min_gap>max_gap) Gswap(max_gap, min_gap);
562 int _penalty=0;
563 if (min_gap<0) { //overlap
564 if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); }
565 else _penalty=-min_gap;
566 }
567 else { //gap
568 _penalty=max_gap;
569 }
570 score-=(_penalty>>1);
571 //score-=_penalty;
572 }//for each seed
573 }
574
575 //bands will be sorted by decreasing score eventually, after all seeds are added
576 //more seeds better than one longer seed?
577 bool operator<(GXBand& d){
578 //return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score);
579 return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score);
580 }
581 bool operator==(GXBand& d){
582 //return (score==d.score && seeds.Count()==d.seeds.Count());
583 return (score==d.score && w_min_b==d.w_min_b);
584 }
585
586 };
587
588 class GXBandSet:public GList<GXBand> {
589 public:
590 GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well
591 GXSeed* tmatch_r; //terminal match to be used if there is no better alignment
592 GXSeed* tmatch_l; //terminal match to be used if there is no better alignment
593 int idxoffset; //global anti-diagonal->index offset (a_len-1)
594 //used to convert a diagonal to an index
595 //diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1
596 //hence offset is a_len-1
597 GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
598 return Get(diag+idxoffset);
599 }
600 GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs)
601 return Get(b_ofs-a_ofs+idxoffset);
602 }
603 GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) {
604 idxoffset=a_len-1;
605 qmatch=NULL;
606 tmatch_l=NULL; //terminal match to be used if everything else fails
607 tmatch_r=NULL;
608 //diag will range from -a_len+1 to b_len-1, so after adjustment
609 //by idxoffset we get a max of a_len+b_len-2
610 int bcount=a_len+b_len-1;
611 for (int i=0;i<bcount;i++)
612 this->Add(new GXBand(i-idxoffset));
613 //unsorted, this should set fList[i]
614 }
615 ~GXBandSet() {
616 delete qmatch;
617 }
618 void addSeed(GXSeed* seed) {
619 //MUST be unsorted !!!
620 int idx=(seed->b_ofs-seed->a_ofs)+idxoffset;
621 fList[idx]->addSeed(seed);
622 if (idx>0) fList[idx-1]->addSeed(seed);
623 if (idx<fCount-1) fList[idx+1]->addSeed(seed);
624 }
625 };
626
627 inline int calc_safelen(int alen) {
628 int r=iround(double(alen*0.6));
629 return (r<22)? 22 : r;
630 }
631
632 struct GXSeqData {
633 const char* aseq;
634 int alen;
635 const char* bseq;
636 int blen;
637 GVec<uint16>** amers;
638 int amlen; //minimum alignment length that's sufficient to
639 //trigger the quick extension heuristics
640 GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0,
641 GVec<uint16>* mers[]=NULL):aseq(sa), alen(la),
642 bseq(sb), blen(lb), amers(mers), amlen(0) {
643 calc_amlen();
644 calc_bmlen();
645 }
646
647 void calc_amlen() {
648 if (alen) {
649 int ah=calc_safelen(alen);
650 if (amlen>ah) amlen=ah;
651 }
652 }
653 void calc_bmlen() {
654 if (blen) {
655 int bh = iround(double(blen)*0.6);
656 if (bh<22) bh=22;
657 if (amlen>bh) amlen=bh;
658 }
659 }
660 void update(const char* sa, int la, GVec<uint16>** mers,
661 const char* sb, int lb, int mlen=0) {
662 aseq=sa;
663 alen=la;
664 amers=mers;
665 if (mlen) {
666 amlen=mlen;
667 }
668 else calc_amlen();
669 if (sb==bseq && blen==lb) return;
670 bseq=sb;
671 blen=lb;
672 calc_bmlen();
673 }
674 /*
675 void update_b(const char* sb, int lb) {
676 if (sb==bseq && blen==lb) return;
677 bseq=sb;
678 blen=lb;
679 calc_bmlen();
680 }*/
681 };
682
683 uint16 get6mer(char* p);
684 void table6mers(const char* s, int slen, GVec<uint16>* amers[]);
685
686 void printEditScript(GXEditScript* ed_script);
687
688
689 int GXGreedyExtend(const char* seq1, int len1,
690 const char* seq2, int len2,
691 bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
692 int& seq1_align_len, int& seq2_align_len,
693 CGreedyAlignData& aux_data,
694 GXEditScript *edit_block);
695
696
697 enum GAlnTrimType {
698 //Describes trimming intent
699 galn_None=0, //no trimming, just alignment
700 galn_TrimLeft,
701 galn_TrimRight,
702 galn_TrimEither //adaptor should be trimmed from either end
703 };
704
705 struct CAlnTrim {
706 GAlnTrimType type;
707 int minMatch; //minimum terminal exact match that will be removed from ends
708 int l_boundary; //base index (either left or right) excluding terminal poly-A stretches
709 int r_boundary; //base index (either left or right) excluding terminal poly-A stretches
710 int alen; //query/adaptor seq length (for validate())
711 int safelen; //alignment length > amlen should be automatically validated
712 int seedlen;
713 void prepare(const char* s, int s_len) {
714 //type=trim_type;
715 //amlen=smlen;
716 l_boundary=0;
717 r_boundary=0;
718 //if (type==galn_TrimLeft) {
719 int s_lbound=0;
720 if (s[0]=='A' && s[1]=='A' && s[2]=='A') {
721 s_lbound=3;
722 while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
723 }
724 else if (s[1]=='A' && s[2]=='A' && s[3]=='A') {
725 s_lbound=4;
726 while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++;
727 }
728 l_boundary=s_lbound+3;
729 // return;
730 // }
731 //if (type==galn_TrimRight) {
732 int r=s_len-1;
733 if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') {
734 r-=3;
735 while (r>0 && s[r]=='A') r--;
736 }
737 else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') {
738 r-=4;
739 while (r>0 && s[r]=='A') r--;
740 }
741 r_boundary=r-3;
742 // }
743 }
744
745 CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int minEndTrim, int smlen):
746 type(trim_type), minMatch(minEndTrim), l_boundary(0), r_boundary(0),
747 alen(a_len), safelen(smlen) {
748 prepare(s, s_len);
749 }
750
751 bool validate_R(int sr, int admax, int badj, int adist) {
752 if (adist>admax) return false;
753 return (sr>=r_boundary+badj);
754 }
755
756 bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) {
757 if (adist>admax) return false;
758 //left match should be more stringent (5')
759 if (alnpid<93) {
760 if (alnlen<13 || alnlen<minMatch) return false;
761 admax=0;
762 badj++;
763 }
764 return (sl<=l_boundary-badj);
765 }
766
767 bool validate(GXAlnInfo* alninfo) {
768 int alnlen=alninfo->sr - alninfo->sl + 1;
769 if (alninfo->pid>90.0 && alnlen>safelen) {
770 //special case: heavy match, could be in the middle
771 if (alninfo->pid>94)
772 alninfo->strong=true;
773 return true;
774 }
775 int sl=alninfo->sl;
776 int sr=alninfo->sr;
777 sl--;sr--; //boundary is 0-based
778 int badj=0; //default boundary is 3 bases distance to end
779 int admax=1;
780 if (alnlen<13) {
781 //stricter boundary check
782 if (alninfo->pid<90) return false;
783 badj=2;
784 if (alnlen<=7) { badj++; admax=0; }
785 }
786 if (type==galn_TrimLeft) {
787 return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr);
788 }
789 else if (type==galn_TrimRight) {
790 return validate_R(sr, admax, badj, alninfo->ql-1);
791 }
792 else if (type==galn_TrimEither) {
793 return (validate_R(sr, admax, badj, alninfo->ql-1) ||
794 validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr));
795 }
796 return true;
797 /*
798 if (type==galn_TrimRight) {
799 return (sr>=boundary+badj);
800 }
801 else {
802 //left match should be more stringent (5')
803 if (alnpid<93) {
804 if (alnlen<13) return false;
805 admax=0;
806 badj++;
807 }
808 return (sl<=boundary-badj);
809 }
810 */
811 }
812 };
813
814
815
816
817 //GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 3' end of seqb
818
819 GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent); //for overlap at 5' end of seqb
820 //=galn_None
821 // reward MUST be >1 for this function
822 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
823 const char* s_seq, int s_alnstart, int s_max,
824 int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL,
825 CAlnTrim* trim=NULL, bool editscript=false);
826 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
827 const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
828 CAlnTrim* trim=NULL, bool editscript=false);
829
830 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
831 bool editscript=false, int reward=2, int penalty=10, int xdrop=32);
832
833 GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
834 CGreedyAlignData* gxmem=NULL, int min_pid=90);
835 //GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90);
836 #endif