1 |
#ifndef _GALIGNEXTEND_H |
2 |
|
3 |
//greedy gapped alignment extension |
4 |
//(mostly lifted from NCBI's megablast gapped extension code) |
5 |
|
6 |
#include "GBase.h" |
7 |
#include "GList.hh" |
8 |
#include "gdna.h" |
9 |
|
10 |
//#define GDEBUG 1 |
11 |
|
12 |
enum { |
13 |
gxEDIT_OP_MASK = 0x3, |
14 |
gxEDIT_OP_ERR = 0x0, |
15 |
gxEDIT_OP_INS = 0x1, |
16 |
gxEDIT_OP_DEL = 0x2, |
17 |
gxEDIT_OP_REP = 0x3 |
18 |
}; |
19 |
|
20 |
#define GX_EDITOP_VAL(op) ((op) >> 2) |
21 |
#define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK) |
22 |
#define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK)) |
23 |
|
24 |
#ifdef GDEBUG |
25 |
enum {c_black=0, |
26 |
c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white |
27 |
}; |
28 |
|
29 |
void color_fg(int c, FILE* f=stderr); |
30 |
void color_bg(int c, FILE* f=stderr); |
31 |
void color_resetfg(FILE* f=stderr); |
32 |
void color_resetbg(FILE* f=stderr); |
33 |
void color_reset(FILE* f=stderr); |
34 |
void color_normal(FILE* f=stderr); |
35 |
#endif |
36 |
|
37 |
struct GXEditScript{ |
38 |
uint32 *ops; // array of edit operations |
39 |
uint32 opsize, opnum; // size of allocation, number in use |
40 |
uint32 oplast; // most recent operation added |
41 |
//methods |
42 |
|
43 |
GXEditScript() { |
44 |
init(); |
45 |
} |
46 |
~GXEditScript() { |
47 |
GFREE(ops); |
48 |
} |
49 |
void init() { |
50 |
ops = NULL; |
51 |
opsize = 0; |
52 |
opnum = 0; |
53 |
oplast = 0; |
54 |
getReady(8); |
55 |
} |
56 |
|
57 |
int getReady(uint32 n) { |
58 |
uint32 m = n + n/2; |
59 |
if (opsize <= n) { |
60 |
GREALLOC(ops, m*sizeof(uint32)); |
61 |
opsize = m; |
62 |
} |
63 |
return 1; |
64 |
} |
65 |
|
66 |
int getReady2(uint32 n) { |
67 |
if (opsize - opnum <= n) |
68 |
return getReady(n + opnum); |
69 |
return 1; |
70 |
} |
71 |
|
72 |
int Put(uint32 op, uint32 n) { |
73 |
if (!getReady2(2)) |
74 |
return 0; |
75 |
oplast = op; |
76 |
ops[opnum] = GX_EDITOP_CONS(op, n); |
77 |
opnum += 1; |
78 |
ops[opnum] = 0; // sentinel |
79 |
return 1; |
80 |
} |
81 |
uint32* First() { |
82 |
return opnum > 0 ? & ops[0] : NULL; |
83 |
} |
84 |
|
85 |
uint32* Next(uint32 *op) { |
86 |
// assumes flat address space ! |
87 |
if (&ops[0] <= op && op < &ops[opnum-1]) |
88 |
return op+1; |
89 |
else |
90 |
return 0; |
91 |
} |
92 |
|
93 |
int More(uint32 op, uint32 k) { |
94 |
if (op == gxEDIT_OP_ERR) { |
95 |
GError("GXEditScript::opMore: bad opcode %d:%d", op, k); |
96 |
return -1; |
97 |
} |
98 |
|
99 |
if (GX_EDITOP_GET(oplast) == op) { |
100 |
uint32 l=ops[opnum-1]; |
101 |
ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)), |
102 |
(GX_EDITOP_VAL(l) + k)); |
103 |
} |
104 |
else { |
105 |
Put(op, k); |
106 |
} |
107 |
|
108 |
return 0; |
109 |
} |
110 |
|
111 |
GXEditScript* Append(GXEditScript *et) { |
112 |
uint32 *op; |
113 |
for (op = et->First(); op; op = et->Next(op)) |
114 |
More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op)); |
115 |
return this; |
116 |
} |
117 |
|
118 |
int opDel(uint32 k) { |
119 |
return More(gxEDIT_OP_DEL, k); |
120 |
} |
121 |
int opIns(uint32 k) { |
122 |
return More(gxEDIT_OP_INS, k); |
123 |
} |
124 |
int opRep(uint32 k) { |
125 |
return More(gxEDIT_OP_REP, k); |
126 |
} |
127 |
|
128 |
GXEditScript *reverse() { |
129 |
const uint32 mid = opnum/2; |
130 |
const uint32 end = opnum-1; |
131 |
for (uint32 i = 0; i < mid; ++i) { |
132 |
const uint32 t = ops[i]; |
133 |
ops[i] = ops[end-i]; |
134 |
ops[end-i] = t; |
135 |
} |
136 |
return this; |
137 |
} |
138 |
}; |
139 |
|
140 |
|
141 |
/** Bookkeeping structure for greedy alignment. When aligning |
142 |
two sequences, the members of this structure store the |
143 |
largest offset into the second sequence that leads to a |
144 |
high-scoring alignment for a given start point */ |
145 |
struct SGreedyOffset { |
146 |
int insert_off; // Best offset for a path ending in an insertion |
147 |
int match_off; // Best offset for a path ending in a match |
148 |
int delete_off; // Best offset for a path ending in a deletion |
149 |
}; |
150 |
|
151 |
// ----- pool allocator ----- |
152 |
// works as a linked list of allocated memory blocks |
153 |
struct GXMemPool { |
154 |
SGreedyOffset* memblock; |
155 |
int used, size; |
156 |
GXMemPool *next; |
157 |
static int kMinSpace; |
158 |
//methods |
159 |
GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M) |
160 |
num_offsp=GMAX(kMinSpace, num_offsp); |
161 |
GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset)); |
162 |
if (memblock == NULL) { |
163 |
GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp); |
164 |
return; |
165 |
} |
166 |
used = 0; |
167 |
size = num_offsp; |
168 |
next = NULL; |
169 |
} |
170 |
|
171 |
void refresh() { |
172 |
GXMemPool* sp=this; |
173 |
while (sp) { |
174 |
sp->used = 0; |
175 |
sp = sp->next; |
176 |
} |
177 |
} |
178 |
~GXMemPool() { |
179 |
GXMemPool* next_sp; |
180 |
GXMemPool* sp=this->next; |
181 |
while (sp) { |
182 |
next_sp = sp->next; |
183 |
GFREE(sp->memblock); |
184 |
delete sp; |
185 |
sp = next_sp; |
186 |
} |
187 |
GFREE(memblock); |
188 |
} |
189 |
|
190 |
SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array |
191 |
//can use the first found memory block with enough room, |
192 |
// or allocate a new large block |
193 |
SGreedyOffset* v; |
194 |
if (num_alloc < 0) return NULL; |
195 |
GXMemPool* S=this; |
196 |
while (used+num_alloc > S->size) { |
197 |
//no room in current block, get a new mem block |
198 |
if (next == NULL) { |
199 |
next=new GXMemPool(num_alloc); //allocates a large contiguous memory block |
200 |
} |
201 |
S = S->next; |
202 |
} |
203 |
v = S->memblock+S->used; |
204 |
S->used += num_alloc; |
205 |
//align to first 8-byte boundary |
206 |
int m8 = S->used & 7; //modulo 8 |
207 |
if (m8) |
208 |
S->used += 8 - m8; |
209 |
return v; |
210 |
} |
211 |
|
212 |
void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes |
213 |
return (void*)getSpace(byte_size/sizeof(SGreedyOffset)); |
214 |
} |
215 |
|
216 |
}; |
217 |
|
218 |
#define GREEDY_MAX_COST_FRACTION 8 |
219 |
/* (was 2) sequence_length / (this number) is a measure of how hard the |
220 |
alignment code will work to find the optimal alignment; in fact |
221 |
this gives a worst case bound on the number of loop iterations */ |
222 |
|
223 |
#define GREEDY_MAX_COST 1000 |
224 |
// The largest diff distance (max indels+mismatches) to be examined for an optimal alignment |
225 |
// (should be increased for large sequences) |
226 |
|
227 |
#define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n" |
228 |
|
229 |
// all auxiliary memory needed for the greedy extension algorithm |
230 |
class CGreedyAlignData { |
231 |
int d_diff; |
232 |
int max_d; |
233 |
public: |
234 |
int** last_seq2_off; // 2-D array of distances |
235 |
int* max_score; // array of maximum scores |
236 |
GXMemPool* space; // local memory pool for SGreedyOffset structs |
237 |
// |
238 |
bool scaled; //scores are all x2 |
239 |
int match_reward; |
240 |
int mismatch_penalty; |
241 |
int x_drop; |
242 |
int xdrop_ofs; |
243 |
// Allocate memory for the greedy gapped alignment algorithm |
244 |
CGreedyAlignData(int reward, int penalty, int xdrop) { |
245 |
scaled=false; |
246 |
xdrop_ofs = 0; |
247 |
//int max_d, diff_d; |
248 |
if (penalty<0) penalty=-penalty; |
249 |
if (reward % 2) { |
250 |
//scale params up |
251 |
scaled=true; |
252 |
match_reward = reward << 1; |
253 |
mismatch_penalty = (penalty << 1); |
254 |
x_drop = xdrop<<1; |
255 |
} |
256 |
else { |
257 |
match_reward=reward; |
258 |
mismatch_penalty = penalty; |
259 |
x_drop=xdrop; |
260 |
} |
261 |
xdrop_ofs=(x_drop + (match_reward>>1)) / |
262 |
(match_reward + mismatch_penalty) + 1; |
263 |
//if (gap_open == 0 && gap_extend == 0) |
264 |
// gap_extend = (reward >> 1) + penalty; |
265 |
const int max_dbseq_length=255; //adjust this accordingly |
266 |
max_d = GMIN(GREEDY_MAX_COST, |
267 |
(max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1)); |
268 |
|
269 |
last_seq2_off=NULL; // 2-D array of distances |
270 |
max_score=NULL; // array of maximum scores |
271 |
space=NULL; // local memory pool for SGreedyOffset structs |
272 |
//if (score_params.gap_open==0 && score_params.gap_extend==0) { |
273 |
//non-affine, simpler Greedy algorithm |
274 |
d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1; |
275 |
GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*))); |
276 |
if (!last_seq2_off) |
277 |
GError(GX_GALLOC_ERROR); |
278 |
GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); |
279 |
//allocates contiguous memory for 2 rows here |
280 |
if (!last_seq2_off[0]) |
281 |
GError(GX_GALLOC_ERROR); |
282 |
last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row |
283 |
|
284 |
GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); |
285 |
space = new GXMemPool(); |
286 |
if (!max_score || !space) |
287 |
GError(GX_GALLOC_ERROR); |
288 |
} //consructor |
289 |
|
290 |
void reset() { |
291 |
space->refresh(); |
292 |
if (last_seq2_off) { |
293 |
GFREE((last_seq2_off[0])); |
294 |
} |
295 |
GFREE(max_score); |
296 |
GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); |
297 |
if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR); |
298 |
//allocates contiguous memory for 2 rows here |
299 |
last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; |
300 |
GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); |
301 |
if (!max_score) GError(GX_GALLOC_ERROR); |
302 |
} |
303 |
|
304 |
~CGreedyAlignData() { |
305 |
if (last_seq2_off) { |
306 |
GFREE(last_seq2_off[0]); |
307 |
GFREE(last_seq2_off); |
308 |
} |
309 |
GFREE(max_score); |
310 |
delete space; |
311 |
} |
312 |
|
313 |
}; |
314 |
|
315 |
|
316 |
#define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/ |
317 |
#define GAPALIGN_INS ((unsigned char)1) |
318 |
#define GAPALIGN_DEL ((unsigned char)2) |
319 |
#define GAPALIGN_DECLINE ((unsigned char)3) |
320 |
|
321 |
struct GapXEditScript { |
322 |
unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL |
323 |
int num; // Number of operations |
324 |
GapXEditScript* next; |
325 |
GapXEditScript() { |
326 |
op_type=0; |
327 |
num=0; |
328 |
next=NULL; |
329 |
} |
330 |
void print(); |
331 |
}; |
332 |
|
333 |
class CSeqGap { // |
334 |
public: |
335 |
int offset; |
336 |
int len; |
337 |
CSeqGap(int gofs=0,int glen=1) { |
338 |
offset=gofs; |
339 |
len=glen; |
340 |
} |
341 |
}; |
342 |
|
343 |
class CAlnGapInfo { |
344 |
int a_ofs; //alignment start on seq a (0 based) |
345 |
int b_ofs; //alignment start on seq b (0 based) |
346 |
int a_len; //length of alignment on seq a |
347 |
int b_len; //length of alignment on seq b |
348 |
public: |
349 |
GVec<CSeqGap> a_gaps; |
350 |
GVec<CSeqGap> b_gaps; |
351 |
CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() { |
352 |
a_ofs=astart; |
353 |
b_ofs=bstart; |
354 |
a_len=0; |
355 |
b_len=0; |
356 |
if (ed_script==NULL) return; |
357 |
for (uint32 i=0; i<ed_script->opnum; i++) { |
358 |
int num=((ed_script->ops[i]) >> 2); |
359 |
char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK ); |
360 |
if (op_type == 3 || op_type < 0 ) |
361 |
GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type); |
362 |
CSeqGap gap; |
363 |
switch (op_type) { |
364 |
case GAPALIGN_SUB: a_len+=num; |
365 |
b_len+=num; |
366 |
break; |
367 |
case GAPALIGN_INS: a_len+=num; |
368 |
gap.offset=b_ofs+b_len; |
369 |
gap.len=num; |
370 |
b_gaps.Add(gap); |
371 |
break; |
372 |
case GAPALIGN_DEL: b_len+=num; |
373 |
gap.offset=a_ofs+a_len; |
374 |
gap.len=num; |
375 |
a_gaps.Add(gap); |
376 |
break; |
377 |
} |
378 |
} |
379 |
} |
380 |
|
381 |
#ifdef GDEBUG |
382 |
void printAlignment(FILE* f, const char* sa, int sa_len, |
383 |
const char* sb, int sb_len) { |
384 |
//print seq A |
385 |
char al[1024]; //display buffer for seq A |
386 |
int ap=0; //index in al[] for current character printed |
387 |
int g=0; |
388 |
int aend=a_ofs+a_len; |
389 |
if (a_ofs<b_ofs) { |
390 |
for (int i=0;i<b_ofs-a_ofs;i++) { |
391 |
fprintf(f, " "); |
392 |
al[++ap]=' '; |
393 |
} |
394 |
} |
395 |
for (int i=0;i<aend;i++) { |
396 |
if (g<a_gaps.Count() && a_gaps[g].offset==i) { |
397 |
for (int j=0;j<a_gaps[g].len;j++) { |
398 |
fprintf(f, "-"); |
399 |
al[++ap]='-'; |
400 |
} |
401 |
g++; |
402 |
} |
403 |
if (i==a_ofs) color_bg(c_blue,f); |
404 |
fprintf(f, "%c", sa[i]); |
405 |
al[++ap]=sa[i]; |
406 |
} |
407 |
color_reset(f); |
408 |
if (aend<sa_len) |
409 |
fprintf(f, &sa[aend]); |
410 |
fprintf(f, "\n"); |
411 |
//print seq B |
412 |
ap=0; |
413 |
g=0; |
414 |
int bend=b_ofs+b_len; |
415 |
if (a_ofs>b_ofs) { |
416 |
for (int i=0;i<a_ofs-b_ofs;i++) { |
417 |
fprintf(f, " "); |
418 |
ap++; |
419 |
} |
420 |
} |
421 |
for (int i=0;i<b_ofs;i++) { |
422 |
fprintf(f, "%c", sb[i]); |
423 |
ap++; |
424 |
} |
425 |
for (int i=b_ofs;i<bend;i++) { |
426 |
if (g<b_gaps.Count() && b_gaps[g].offset==i) { |
427 |
for (int j=0;j<b_gaps[g].len;j++) { |
428 |
fprintf(f, "-"); |
429 |
ap++; |
430 |
} |
431 |
g++; |
432 |
} |
433 |
if (i==b_ofs) color_bg(c_blue,f); |
434 |
ap++; |
435 |
bool mismatch=(sb[i]!=al[ap] && al[ap]!='-'); |
436 |
if (mismatch) color_bg(c_red,f); |
437 |
fprintf(f, "%c", sb[i]); |
438 |
if (mismatch) color_bg(c_blue,f); |
439 |
} |
440 |
color_reset(f); |
441 |
if (bend<sb_len) |
442 |
fprintf(f, &sb[bend]); |
443 |
fprintf(f, "\n"); |
444 |
} |
445 |
#endif |
446 |
}; |
447 |
|
448 |
struct GXAlnInfo { |
449 |
const char *qseq; |
450 |
int ql,qr; |
451 |
const char *sseq; |
452 |
int sl,sr; |
453 |
int score; |
454 |
double pid; |
455 |
bool strong; |
456 |
GXEditScript* editscript; |
457 |
CAlnGapInfo* gapinfo; |
458 |
GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r, |
459 |
int sc=0, double percid=0) { |
460 |
qseq=q; |
461 |
sseq=s; |
462 |
ql=q_l; |
463 |
qr=q_r; |
464 |
sl=s_l; |
465 |
sr=s_r; |
466 |
score=sc; |
467 |
pid=percid; |
468 |
strong=false; |
469 |
editscript=NULL; |
470 |
gapinfo=NULL; |
471 |
} |
472 |
~GXAlnInfo() { |
473 |
delete editscript; |
474 |
delete gapinfo; |
475 |
} |
476 |
bool operator<(GXAlnInfo& d) { |
477 |
return ((score==d.score)? pid>d.pid : score>d.score); |
478 |
} |
479 |
bool operator==(GXAlnInfo& d) { |
480 |
return (score==d.score && pid==d.pid); |
481 |
} |
482 |
|
483 |
}; |
484 |
|
485 |
|
486 |
|
487 |
struct GXSeed { |
488 |
int b_ofs; //0-based coordinate on seq b (x coordinate) |
489 |
int a_ofs; //0-based coordinate on seq a (y coordinate) |
490 |
int len; //length of exact match after extension |
491 |
bool operator<(GXSeed& d){ |
492 |
return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs); |
493 |
} |
494 |
bool operator==(GXSeed& d){ |
495 |
return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed |
496 |
} |
497 |
GXSeed(int aofs=0, int bofs=0, int l=4) { |
498 |
a_ofs=aofs; |
499 |
b_ofs=bofs; |
500 |
len=l; |
501 |
} |
502 |
}; |
503 |
|
504 |
int cmpSeedDiag(const pointer p1, const pointer p2); |
505 |
//seeds are "equal" if they're on the same diagonal (for selection purposes only) |
506 |
|
507 |
int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account |
508 |
//among seeds with same length, prefer those closer to the left end of the read (seq_b) |
509 |
|
510 |
struct GXBand { |
511 |
//bundle of seed matches on 3 adjacent diagonals |
512 |
int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3 |
513 |
//seeds for this, and diag+1 and diag+2 are stored here |
514 |
int min_a, max_a; //maximal coordinates of the bundle |
515 |
int min_b, max_b; |
516 |
int w_min_b; //weighted average of left start coordinate |
517 |
int avg_len; |
518 |
GList<GXSeed> seeds; //sorted by x coordinate (b_ofs) |
519 |
int score; //sum of seed scores (- overlapping_bases/2 - gaps) |
520 |
bool tested; |
521 |
GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) { |
522 |
diag=start_diag; |
523 |
min_a=MAX_INT; |
524 |
min_b=MAX_INT; |
525 |
max_a=0; |
526 |
max_b=0; |
527 |
score=0; |
528 |
avg_len=0; |
529 |
w_min_b=0; |
530 |
tested=false; |
531 |
if (seed!=NULL) addSeed(seed); |
532 |
} |
533 |
void addSeed(GXSeed* seed) { |
534 |
seeds.Add(seed); |
535 |
score+=seed->len; |
536 |
avg_len+=seed->len; |
537 |
w_min_b+=seed->b_ofs * seed->len; |
538 |
//if (diag<0) diag=seed->diag; //should NOT be done like this |
539 |
if (seed->a_ofs < min_a) min_a=seed->a_ofs; |
540 |
if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len; |
541 |
if (seed->b_ofs < min_b) min_b=seed->b_ofs; |
542 |
if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len; |
543 |
} |
544 |
|
545 |
void finalize() { |
546 |
//!! to be called only AFTER all seeds have been added |
547 |
// seeds are sorted by b_ofs |
548 |
//penalize seed gaps and overlaps on b sequence |
549 |
if (avg_len==0) return; |
550 |
w_min_b/=avg_len; |
551 |
avg_len>>=1; |
552 |
for (int i=1;i<seeds.Count();i++) { |
553 |
GXSeed& sprev=*seeds[i-1]; |
554 |
GXSeed& scur=*seeds[i]; |
555 |
if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n", |
556 |
scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len); |
557 |
int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len; |
558 |
int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len; |
559 |
int max_gap=b_gap; |
560 |
int min_gap=a_gap; |
561 |
if (min_gap>max_gap) Gswap(max_gap, min_gap); |
562 |
int _penalty=0; |
563 |
if (min_gap<0) { //overlap |
564 |
if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); } |
565 |
else _penalty=-min_gap; |
566 |
} |
567 |
else { //gap |
568 |
_penalty=max_gap; |
569 |
} |
570 |
score-=(_penalty>>1); |
571 |
//score-=_penalty; |
572 |
}//for each seed |
573 |
} |
574 |
|
575 |
//bands will be sorted by decreasing score eventually, after all seeds are added |
576 |
//more seeds better than one longer seed? |
577 |
bool operator<(GXBand& d){ |
578 |
//return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score); |
579 |
return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score); |
580 |
} |
581 |
bool operator==(GXBand& d){ |
582 |
//return (score==d.score && seeds.Count()==d.seeds.Count()); |
583 |
return (score==d.score && w_min_b==d.w_min_b); |
584 |
} |
585 |
|
586 |
}; |
587 |
|
588 |
class GXBandSet:public GList<GXBand> { |
589 |
public: |
590 |
GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well |
591 |
GXSeed* tmatch_r; //terminal match to be used if there is no better alignment |
592 |
GXSeed* tmatch_l; //terminal match to be used if there is no better alignment |
593 |
int idxoffset; //global anti-diagonal->index offset (a_len-1) |
594 |
//used to convert a diagonal to an index |
595 |
//diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1 |
596 |
//hence offset is a_len-1 |
597 |
GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) |
598 |
return Get(diag+idxoffset); |
599 |
} |
600 |
GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) |
601 |
return Get(b_ofs-a_ofs+idxoffset); |
602 |
} |
603 |
GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) { |
604 |
idxoffset=a_len-1; |
605 |
qmatch=NULL; |
606 |
tmatch_l=NULL; //terminal match to be used if everything else fails |
607 |
tmatch_r=NULL; |
608 |
//diag will range from -a_len+1 to b_len-1, so after adjustment |
609 |
//by idxoffset we get a max of a_len+b_len-2 |
610 |
int bcount=a_len+b_len-1; |
611 |
for (int i=0;i<bcount;i++) |
612 |
this->Add(new GXBand(i-idxoffset)); |
613 |
//unsorted, this should set fList[i] |
614 |
} |
615 |
~GXBandSet() { |
616 |
delete qmatch; |
617 |
} |
618 |
void addSeed(GXSeed* seed) { |
619 |
//MUST be unsorted !!! |
620 |
int idx=(seed->b_ofs-seed->a_ofs)+idxoffset; |
621 |
fList[idx]->addSeed(seed); |
622 |
if (idx>0) fList[idx-1]->addSeed(seed); |
623 |
if (idx<fCount-1) fList[idx+1]->addSeed(seed); |
624 |
} |
625 |
}; |
626 |
|
627 |
inline int calc_safelen(int alen) { |
628 |
int r=iround(double(alen*0.6)); |
629 |
return (r<22)? 22 : r; |
630 |
} |
631 |
|
632 |
struct GXSeqData { |
633 |
const char* aseq; |
634 |
int alen; |
635 |
const char* bseq; |
636 |
int blen; |
637 |
GVec<uint16>** amers; |
638 |
int amlen; //minimum alignment length that's sufficient to |
639 |
//trigger the quick extension heuristics |
640 |
GXSeqData(const char* sa=NULL, int la=0, const char* sb=NULL, int lb=0, |
641 |
GVec<uint16>* mers[]=NULL):aseq(sa), alen(la), |
642 |
bseq(sb), blen(lb), amers(mers), amlen(0) { |
643 |
calc_amlen(); |
644 |
calc_bmlen(); |
645 |
} |
646 |
|
647 |
void calc_amlen() { |
648 |
if (alen) { |
649 |
int ah=calc_safelen(alen); |
650 |
if (amlen>ah) amlen=ah; |
651 |
} |
652 |
} |
653 |
void calc_bmlen() { |
654 |
if (blen) { |
655 |
int bh = iround(double(blen)*0.6); |
656 |
if (bh<22) bh=22; |
657 |
if (amlen>bh) amlen=bh; |
658 |
} |
659 |
} |
660 |
void update(const char* sa, int la, GVec<uint16>** mers, |
661 |
const char* sb, int lb, int mlen=0) { |
662 |
aseq=sa; |
663 |
alen=la; |
664 |
amers=mers; |
665 |
if (mlen) { |
666 |
amlen=mlen; |
667 |
} |
668 |
else calc_amlen(); |
669 |
if (sb==bseq && blen==lb) return; |
670 |
bseq=sb; |
671 |
blen=lb; |
672 |
calc_bmlen(); |
673 |
} |
674 |
/* |
675 |
void update_b(const char* sb, int lb) { |
676 |
if (sb==bseq && blen==lb) return; |
677 |
bseq=sb; |
678 |
blen=lb; |
679 |
calc_bmlen(); |
680 |
}*/ |
681 |
}; |
682 |
|
683 |
uint16 get6mer(char* p); |
684 |
void table6mers(const char* s, int slen, GVec<uint16>* amers[]); |
685 |
|
686 |
void printEditScript(GXEditScript* ed_script); |
687 |
|
688 |
|
689 |
int GXGreedyExtend(const char* seq1, int len1, |
690 |
const char* seq2, int len2, |
691 |
bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost, |
692 |
int& seq1_align_len, int& seq2_align_len, |
693 |
CGreedyAlignData& aux_data, |
694 |
GXEditScript *edit_block); |
695 |
|
696 |
|
697 |
enum GAlnTrimType { |
698 |
//Describes trimming intent |
699 |
galn_None=0, //no trimming, just alignment |
700 |
galn_TrimLeft, |
701 |
galn_TrimRight, |
702 |
galn_TrimEither //adaptor should be trimmed from either end |
703 |
}; |
704 |
|
705 |
struct CAlnTrim { |
706 |
GAlnTrimType type; |
707 |
int minMatch; //minimum terminal exact match that will be removed from ends |
708 |
int l_boundary; //base index (either left or right) excluding terminal poly-A stretches |
709 |
int r_boundary; //base index (either left or right) excluding terminal poly-A stretches |
710 |
int alen; //query/adaptor seq length (for validate()) |
711 |
int safelen; //alignment length > amlen should be automatically validated |
712 |
int seedlen; |
713 |
void prepare(const char* s, int s_len) { |
714 |
//type=trim_type; |
715 |
//amlen=smlen; |
716 |
l_boundary=0; |
717 |
r_boundary=0; |
718 |
//if (type==galn_TrimLeft) { |
719 |
int s_lbound=0; |
720 |
if (s[0]=='A' && s[1]=='A' && s[2]=='A') { |
721 |
s_lbound=3; |
722 |
while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++; |
723 |
} |
724 |
else if (s[1]=='A' && s[2]=='A' && s[3]=='A') { |
725 |
s_lbound=4; |
726 |
while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++; |
727 |
} |
728 |
l_boundary=s_lbound+3; |
729 |
// return; |
730 |
// } |
731 |
//if (type==galn_TrimRight) { |
732 |
int r=s_len-1; |
733 |
if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') { |
734 |
r-=3; |
735 |
while (r>0 && s[r]=='A') r--; |
736 |
} |
737 |
else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') { |
738 |
r-=4; |
739 |
while (r>0 && s[r]=='A') r--; |
740 |
} |
741 |
r_boundary=r-3; |
742 |
// } |
743 |
} |
744 |
|
745 |
CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len, int a_len, int minEndTrim, int smlen): |
746 |
type(trim_type), minMatch(minEndTrim), l_boundary(0), r_boundary(0), |
747 |
alen(a_len), safelen(smlen) { |
748 |
prepare(s, s_len); |
749 |
} |
750 |
|
751 |
bool validate_R(int sr, int admax, int badj, int adist) { |
752 |
if (adist>admax) return false; |
753 |
return (sr>=r_boundary+badj); |
754 |
} |
755 |
|
756 |
bool validate_L(int sl, int alnlen, int admax, int badj, int alnpid, int adist) { |
757 |
if (adist>admax) return false; |
758 |
//left match should be more stringent (5') |
759 |
if (alnpid<93) { |
760 |
if (alnlen<13 || alnlen<minMatch) return false; |
761 |
admax=0; |
762 |
badj++; |
763 |
} |
764 |
return (sl<=l_boundary-badj); |
765 |
} |
766 |
|
767 |
bool validate(GXAlnInfo* alninfo) { |
768 |
int alnlen=alninfo->sr - alninfo->sl + 1; |
769 |
if (alninfo->pid>90.0 && alnlen>safelen) { |
770 |
//special case: heavy match, could be in the middle |
771 |
if (alninfo->pid>94) |
772 |
alninfo->strong=true; |
773 |
return true; |
774 |
} |
775 |
int sl=alninfo->sl; |
776 |
int sr=alninfo->sr; |
777 |
sl--;sr--; //boundary is 0-based |
778 |
int badj=0; //default boundary is 3 bases distance to end |
779 |
int admax=1; |
780 |
if (alnlen<13) { |
781 |
//stricter boundary check |
782 |
if (alninfo->pid<90) return false; |
783 |
badj=2; |
784 |
if (alnlen<=7) { badj++; admax=0; } |
785 |
} |
786 |
if (type==galn_TrimLeft) { |
787 |
return validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr); |
788 |
} |
789 |
else if (type==galn_TrimRight) { |
790 |
return validate_R(sr, admax, badj, alninfo->ql-1); |
791 |
} |
792 |
else if (type==galn_TrimEither) { |
793 |
return (validate_R(sr, admax, badj, alninfo->ql-1) || |
794 |
validate_L(sl, alnlen, admax, badj, alninfo->pid, alen-alninfo->qr)); |
795 |
} |
796 |
return true; |
797 |
/* |
798 |
if (type==galn_TrimRight) { |
799 |
return (sr>=boundary+badj); |
800 |
} |
801 |
else { |
802 |
//left match should be more stringent (5') |
803 |
if (alnpid<93) { |
804 |
if (alnlen<13) return false; |
805 |
admax=0; |
806 |
badj++; |
807 |
} |
808 |
return (sl<=boundary-badj); |
809 |
} |
810 |
*/ |
811 |
} |
812 |
}; |
813 |
|
814 |
|
815 |
|
816 |
|
817 |
//GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd); //for overlap at 3' end of seqb |
818 |
|
819 |
GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent); //for overlap at 5' end of seqb |
820 |
//=galn_None |
821 |
// reward MUST be >1 for this function |
822 |
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
823 |
const char* s_seq, int s_alnstart, int s_max, |
824 |
int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL, |
825 |
CAlnTrim* trim=NULL, bool editscript=false); |
826 |
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
827 |
const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem, |
828 |
CAlnTrim* trim=NULL, bool editscript=false); |
829 |
|
830 |
GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart, |
831 |
bool editscript=false, int reward=2, int penalty=10, int xdrop=32); |
832 |
|
833 |
GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch, |
834 |
CGreedyAlignData* gxmem=NULL, int min_pid=90); |
835 |
//GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem=NULL, int min_pid=90); |
836 |
#endif |