1 |
gpertea |
93 |
#ifndef _GALIGNEXTEND_H |
2 |
|
|
|
3 |
|
|
//greedy gapped alignment extension |
4 |
|
|
//(mostly lifted from NCBI's megablast gapped extension code) |
5 |
|
|
|
6 |
|
|
#include "GBase.h" |
7 |
|
|
#include "GList.hh" |
8 |
|
|
#include <string.h> |
9 |
|
|
|
10 |
gpertea |
106 |
//#define GDEBUG 1 |
11 |
gpertea |
93 |
|
12 |
|
|
enum { |
13 |
|
|
gxEDIT_OP_MASK = 0x3, |
14 |
|
|
gxEDIT_OP_ERR = 0x0, |
15 |
|
|
gxEDIT_OP_INS = 0x1, |
16 |
|
|
gxEDIT_OP_DEL = 0x2, |
17 |
|
|
gxEDIT_OP_REP = 0x3 |
18 |
|
|
}; |
19 |
|
|
|
20 |
|
|
#define GX_EDITOP_VAL(op) ((op) >> 2) |
21 |
|
|
#define GX_EDITOP_GET(op) ((op) & gxEDIT_OP_MASK) |
22 |
|
|
#define GX_EDITOP_CONS(op, val) (((val) << 2) | ((op) & gxEDIT_OP_MASK)) |
23 |
|
|
|
24 |
|
|
#ifdef GDEBUG |
25 |
|
|
enum {c_black=0, |
26 |
|
|
c_red, c_green,c_brown,c_blue,c_magenta,c_cyan,c_white |
27 |
|
|
}; |
28 |
|
|
|
29 |
|
|
void color_fg(int c, FILE* f=stderr); |
30 |
|
|
void color_bg(int c, FILE* f=stderr); |
31 |
|
|
void color_resetfg(FILE* f=stderr); |
32 |
|
|
void color_resetbg(FILE* f=stderr); |
33 |
|
|
void color_reset(FILE* f=stderr); |
34 |
|
|
void color_normal(FILE* f=stderr); |
35 |
|
|
#endif |
36 |
|
|
|
37 |
|
|
struct GXEditScript{ |
38 |
|
|
uint32 *ops; // array of edit operations |
39 |
|
|
uint32 opsize, opnum; // size of allocation, number in use |
40 |
|
|
uint32 oplast; // most recent operation added |
41 |
|
|
//methods |
42 |
|
|
|
43 |
|
|
GXEditScript() { |
44 |
|
|
init(); |
45 |
|
|
} |
46 |
|
|
~GXEditScript() { |
47 |
|
|
GFREE(ops); |
48 |
|
|
} |
49 |
|
|
void init() { |
50 |
|
|
ops = NULL; |
51 |
|
|
opsize = 0; |
52 |
|
|
opnum = 0; |
53 |
|
|
oplast = 0; |
54 |
|
|
getReady(8); |
55 |
|
|
} |
56 |
|
|
|
57 |
|
|
int getReady(uint32 n) { |
58 |
|
|
uint32 m = n + n/2; |
59 |
|
|
if (opsize <= n) { |
60 |
|
|
GREALLOC(ops, m*sizeof(uint32)); |
61 |
|
|
opsize = m; |
62 |
|
|
} |
63 |
|
|
return 1; |
64 |
|
|
} |
65 |
|
|
|
66 |
|
|
int getReady2(uint32 n) { |
67 |
|
|
if (opsize - opnum <= n) |
68 |
|
|
return getReady(n + opnum); |
69 |
|
|
return 1; |
70 |
|
|
} |
71 |
|
|
|
72 |
|
|
int Put(uint32 op, uint32 n) { |
73 |
|
|
if (!getReady2(2)) |
74 |
|
|
return 0; |
75 |
|
|
oplast = op; |
76 |
|
|
ops[opnum] = GX_EDITOP_CONS(op, n); |
77 |
|
|
opnum += 1; |
78 |
|
|
ops[opnum] = 0; // sentinel |
79 |
|
|
return 1; |
80 |
|
|
} |
81 |
|
|
uint32* First() { |
82 |
|
|
return opnum > 0 ? & ops[0] : NULL; |
83 |
|
|
} |
84 |
|
|
|
85 |
|
|
uint32* Next(uint32 *op) { |
86 |
|
|
// assumes flat address space ! |
87 |
|
|
if (&ops[0] <= op && op < &ops[opnum-1]) |
88 |
|
|
return op+1; |
89 |
|
|
else |
90 |
|
|
return 0; |
91 |
|
|
} |
92 |
|
|
|
93 |
|
|
int More(uint32 op, uint32 k) { |
94 |
|
|
if (op == gxEDIT_OP_ERR) { |
95 |
|
|
GError("GXEditScript::opMore: bad opcode %d:%d", op, k); |
96 |
|
|
return -1; |
97 |
|
|
} |
98 |
|
|
|
99 |
|
|
if (GX_EDITOP_GET(oplast) == op) { |
100 |
|
|
uint32 l=ops[opnum-1]; |
101 |
|
|
ops[opnum-1]=GX_EDITOP_CONS((GX_EDITOP_GET(l)), |
102 |
|
|
(GX_EDITOP_VAL(l) + k)); |
103 |
|
|
} |
104 |
|
|
else { |
105 |
|
|
Put(op, k); |
106 |
|
|
} |
107 |
|
|
|
108 |
|
|
return 0; |
109 |
|
|
} |
110 |
|
|
|
111 |
|
|
GXEditScript* Append(GXEditScript *et) { |
112 |
|
|
uint32 *op; |
113 |
|
|
for (op = et->First(); op; op = et->Next(op)) |
114 |
|
|
More(GX_EDITOP_GET(*op), GX_EDITOP_VAL(*op)); |
115 |
|
|
return this; |
116 |
|
|
} |
117 |
|
|
|
118 |
|
|
int opDel(uint32 k) { |
119 |
|
|
return More(gxEDIT_OP_DEL, k); |
120 |
|
|
} |
121 |
|
|
int opIns(uint32 k) { |
122 |
|
|
return More(gxEDIT_OP_INS, k); |
123 |
|
|
} |
124 |
|
|
int opRep(uint32 k) { |
125 |
|
|
return More(gxEDIT_OP_REP, k); |
126 |
|
|
} |
127 |
|
|
|
128 |
|
|
GXEditScript *reverse() { |
129 |
|
|
const uint32 mid = opnum/2; |
130 |
|
|
const uint32 end = opnum-1; |
131 |
|
|
for (uint32 i = 0; i < mid; ++i) { |
132 |
|
|
const uint32 t = ops[i]; |
133 |
|
|
ops[i] = ops[end-i]; |
134 |
|
|
ops[end-i] = t; |
135 |
|
|
} |
136 |
|
|
return this; |
137 |
|
|
} |
138 |
|
|
}; |
139 |
|
|
|
140 |
|
|
|
141 |
|
|
/** Bookkeeping structure for greedy alignment. When aligning |
142 |
|
|
two sequences, the members of this structure store the |
143 |
|
|
largest offset into the second sequence that leads to a |
144 |
|
|
high-scoring alignment for a given start point */ |
145 |
|
|
struct SGreedyOffset { |
146 |
|
|
int insert_off; // Best offset for a path ending in an insertion |
147 |
|
|
int match_off; // Best offset for a path ending in a match |
148 |
|
|
int delete_off; // Best offset for a path ending in a deletion |
149 |
|
|
}; |
150 |
|
|
|
151 |
|
|
// ----- pool allocator ----- |
152 |
|
|
// works as a linked list of allocated memory blocks |
153 |
|
|
struct GXMemPool { |
154 |
|
|
SGreedyOffset* memblock; |
155 |
|
|
int used, size; |
156 |
|
|
GXMemPool *next; |
157 |
|
|
static int kMinSpace; |
158 |
|
|
//methods |
159 |
|
|
GXMemPool(int num_offsp=0) { //by default allocate a large block here (10M) |
160 |
|
|
num_offsp=GMAX(kMinSpace, num_offsp); |
161 |
|
|
GMALLOC(memblock, num_offsp*sizeof(SGreedyOffset)); |
162 |
|
|
if (memblock == NULL) { |
163 |
|
|
GError("Failed to allocated GXMemPool(%d) for greedy extension!\n",num_offsp); |
164 |
|
|
return; |
165 |
|
|
} |
166 |
|
|
used = 0; |
167 |
|
|
size = num_offsp; |
168 |
|
|
next = NULL; |
169 |
|
|
} |
170 |
|
|
|
171 |
|
|
void refresh() { |
172 |
|
|
GXMemPool* sp=this; |
173 |
|
|
while (sp) { |
174 |
|
|
sp->used = 0; |
175 |
|
|
sp = sp->next; |
176 |
|
|
} |
177 |
|
|
} |
178 |
|
|
~GXMemPool() { |
179 |
|
|
GXMemPool* next_sp; |
180 |
|
|
GXMemPool* sp=this->next; |
181 |
|
|
while (sp) { |
182 |
|
|
next_sp = sp->next; |
183 |
|
|
GFREE(sp->memblock); |
184 |
|
|
delete sp; |
185 |
|
|
sp = next_sp; |
186 |
|
|
} |
187 |
|
|
GFREE(memblock); |
188 |
|
|
} |
189 |
|
|
|
190 |
|
|
SGreedyOffset* getSpace(int num_alloc) { // SGreedyOffset[num_alloc] array |
191 |
|
|
//can use the first found memory block with enough room, |
192 |
|
|
// or allocate a new large block |
193 |
|
|
SGreedyOffset* v; |
194 |
|
|
if (num_alloc < 0) return NULL; |
195 |
|
|
GXMemPool* S=this; |
196 |
|
|
while (used+num_alloc > S->size) { |
197 |
|
|
//no room in current block, get a new mem block |
198 |
|
|
if (next == NULL) { |
199 |
|
|
next=new GXMemPool(num_alloc); //allocates a large contiguous memory block |
200 |
|
|
} |
201 |
|
|
S = S->next; |
202 |
|
|
} |
203 |
|
|
v = S->memblock+S->used; |
204 |
|
|
S->used += num_alloc; |
205 |
|
|
//align to first 8-byte boundary |
206 |
|
|
int m8 = S->used & 7; //modulo 8 |
207 |
|
|
if (m8) |
208 |
|
|
S->used += 8 - m8; |
209 |
|
|
return v; |
210 |
|
|
} |
211 |
|
|
|
212 |
|
|
void* getByteSpace(int byte_size) { //amount to use or allocate memory, in bytes |
213 |
|
|
return (void*)getSpace(byte_size/sizeof(SGreedyOffset)); |
214 |
|
|
} |
215 |
|
|
|
216 |
|
|
}; |
217 |
|
|
|
218 |
|
|
#define GREEDY_MAX_COST_FRACTION 5 |
219 |
|
|
/* (was 2) sequence_length / (this number) is a measure of how hard the |
220 |
|
|
alignment code will work to find the optimal alignment; in fact |
221 |
|
|
this gives a worst case bound on the number of loop iterations */ |
222 |
|
|
|
223 |
|
|
#define GREEDY_MAX_COST 1000 |
224 |
|
|
// The largest diff distance (max indels+mismatches) to be examined for an optimal alignment |
225 |
|
|
// (should be increased for large sequences) |
226 |
|
|
|
227 |
|
|
#define GX_GALLOC_ERROR "Error: failed to allocate memory for greedy alignment!\n" |
228 |
|
|
|
229 |
|
|
// all auxiliary memory needed for the greedy extension algorithm |
230 |
gpertea |
101 |
class CGreedyAlignData { |
231 |
|
|
int d_diff; |
232 |
|
|
int max_d; |
233 |
|
|
public: |
234 |
gpertea |
93 |
int** last_seq2_off; // 2-D array of distances |
235 |
|
|
int* max_score; // array of maximum scores |
236 |
|
|
GXMemPool* space; // local memory pool for SGreedyOffset structs |
237 |
gpertea |
101 |
// |
238 |
|
|
int match_reward; |
239 |
|
|
int mismatch_penalty; |
240 |
|
|
int x_drop; |
241 |
gpertea |
93 |
// Allocate memory for the greedy gapped alignment algorithm |
242 |
gpertea |
101 |
CGreedyAlignData(int reward, int penalty, int xdrop) { |
243 |
gpertea |
93 |
//int max_d, diff_d; |
244 |
|
|
if (penalty<0) penalty=-penalty; |
245 |
|
|
if (reward % 2) { |
246 |
|
|
//scale params |
247 |
gpertea |
101 |
match_reward = reward << 1; |
248 |
|
|
mismatch_penalty = (penalty << 1); |
249 |
|
|
x_drop = xdrop<<1; |
250 |
gpertea |
93 |
} |
251 |
gpertea |
101 |
else { |
252 |
|
|
match_reward=reward; |
253 |
|
|
mismatch_penalty = penalty; |
254 |
|
|
x_drop=xdrop; |
255 |
|
|
} |
256 |
gpertea |
93 |
//if (gap_open == 0 && gap_extend == 0) |
257 |
|
|
// gap_extend = (reward >> 1) + penalty; |
258 |
|
|
const int max_dbseq_length=255; //adjust this accordingly |
259 |
|
|
max_d = GMIN(GREEDY_MAX_COST, |
260 |
|
|
(max_dbseq_length/GREEDY_MAX_COST_FRACTION + 1)); |
261 |
|
|
|
262 |
|
|
last_seq2_off=NULL; // 2-D array of distances |
263 |
|
|
max_score=NULL; // array of maximum scores |
264 |
|
|
space=NULL; // local memory pool for SGreedyOffset structs |
265 |
|
|
//if (score_params.gap_open==0 && score_params.gap_extend==0) { |
266 |
|
|
//non-affine, simpler Greedy algorithm |
267 |
gpertea |
101 |
d_diff = (x_drop+match_reward/2)/(mismatch_penalty+match_reward)+1; |
268 |
gpertea |
93 |
GMALLOC(last_seq2_off, ((max_d + 2) * sizeof(int*))); |
269 |
|
|
if (!last_seq2_off) |
270 |
|
|
GError(GX_GALLOC_ERROR); |
271 |
|
|
GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); |
272 |
|
|
//allocates contiguous memory for 2 rows here |
273 |
|
|
if (!last_seq2_off[0]) |
274 |
|
|
GError(GX_GALLOC_ERROR); |
275 |
|
|
last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; //memory allocated already for this row |
276 |
|
|
|
277 |
|
|
GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); |
278 |
|
|
space = new GXMemPool(); |
279 |
|
|
if (!max_score || !space) |
280 |
|
|
GError(GX_GALLOC_ERROR); |
281 |
|
|
} //consructor |
282 |
|
|
|
283 |
|
|
void reset() { |
284 |
|
|
space->refresh(); |
285 |
|
|
if (last_seq2_off) { |
286 |
|
|
GFREE((last_seq2_off[0])); |
287 |
|
|
} |
288 |
|
|
GFREE(max_score); |
289 |
|
|
GCALLOC(last_seq2_off[0], ((max_d + max_d + 6) * sizeof(int) * 2)); |
290 |
|
|
if (!last_seq2_off[0]) GError(GX_GALLOC_ERROR); |
291 |
|
|
//allocates contiguous memory for 2 rows here |
292 |
|
|
last_seq2_off[1] = last_seq2_off[0] + max_d + max_d + 6; |
293 |
|
|
GCALLOC(max_score, (sizeof(int) * (max_d + 1 + d_diff))); |
294 |
|
|
if (!max_score) GError(GX_GALLOC_ERROR); |
295 |
|
|
} |
296 |
gpertea |
101 |
~CGreedyAlignData() { |
297 |
gpertea |
93 |
if (last_seq2_off) { |
298 |
|
|
GFREE(last_seq2_off[0]); |
299 |
|
|
GFREE(last_seq2_off); |
300 |
|
|
} |
301 |
|
|
GFREE(max_score); |
302 |
|
|
delete space; |
303 |
|
|
} |
304 |
|
|
|
305 |
|
|
}; |
306 |
|
|
|
307 |
|
|
|
308 |
|
|
#define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/ |
309 |
|
|
#define GAPALIGN_INS ((unsigned char)1) |
310 |
|
|
#define GAPALIGN_DEL ((unsigned char)2) |
311 |
|
|
#define GAPALIGN_DECLINE ((unsigned char)3) |
312 |
|
|
|
313 |
|
|
struct GapXEditScript { |
314 |
|
|
unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL |
315 |
|
|
int num; // Number of operations |
316 |
|
|
GapXEditScript* next; |
317 |
|
|
GapXEditScript() { |
318 |
|
|
op_type=0; |
319 |
|
|
num=0; |
320 |
|
|
next=NULL; |
321 |
|
|
} |
322 |
|
|
void print(); |
323 |
|
|
}; |
324 |
|
|
|
325 |
|
|
class CSeqGap { // |
326 |
|
|
public: |
327 |
|
|
int offset; |
328 |
|
|
int len; |
329 |
|
|
CSeqGap(int gofs=0,int glen=1) { |
330 |
|
|
offset=gofs; |
331 |
|
|
len=glen; |
332 |
|
|
} |
333 |
|
|
}; |
334 |
|
|
|
335 |
|
|
class CAlnGapInfo { |
336 |
|
|
int a_ofs; //alignment start on seq a (0 based) |
337 |
|
|
int b_ofs; //alignment start on seq b (0 based) |
338 |
|
|
int a_len; //length of alignment on seq a |
339 |
|
|
int b_len; //length of alignment on seq b |
340 |
|
|
public: |
341 |
|
|
GVec<CSeqGap> a_gaps; |
342 |
|
|
GVec<CSeqGap> b_gaps; |
343 |
|
|
CAlnGapInfo(GXEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() { |
344 |
|
|
a_ofs=astart; |
345 |
|
|
b_ofs=bstart; |
346 |
|
|
a_len=0; |
347 |
|
|
b_len=0; |
348 |
|
|
if (ed_script==NULL) return; |
349 |
|
|
for (uint32 i=0; i<ed_script->opnum; i++) {
|
350 |
|
|
int num=((ed_script->ops[i]) >> 2); |
351 |
|
|
char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK ); |
352 |
|
|
if (op_type == 3 || op_type < 0 ) |
353 |
|
|
GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type); |
354 |
|
|
CSeqGap gap; |
355 |
|
|
switch (op_type) { |
356 |
|
|
case GAPALIGN_SUB: a_len+=num; |
357 |
|
|
b_len+=num; |
358 |
|
|
break; |
359 |
|
|
case GAPALIGN_INS: a_len+=num; |
360 |
|
|
gap.offset=b_ofs+b_len; |
361 |
|
|
gap.len=num; |
362 |
|
|
b_gaps.Add(gap); |
363 |
|
|
break; |
364 |
|
|
case GAPALIGN_DEL: b_len+=num; |
365 |
|
|
gap.offset=a_ofs+a_len; |
366 |
|
|
gap.len=num; |
367 |
|
|
a_gaps.Add(gap); |
368 |
|
|
break; |
369 |
|
|
} |
370 |
|
|
} |
371 |
|
|
} |
372 |
|
|
|
373 |
|
|
#ifdef GDEBUG |
374 |
|
|
void printAlignment(FILE* f, const char* sa, int sa_len, |
375 |
|
|
const char* sb, int sb_len) { |
376 |
|
|
//print seq A |
377 |
|
|
char al[1024]; //display buffer for seq A |
378 |
|
|
int ap=0; //index in al[] for current character printed |
379 |
|
|
int g=0; |
380 |
|
|
int aend=a_ofs+a_len; |
381 |
|
|
if (a_ofs<b_ofs) { |
382 |
|
|
for (int i=0;i<b_ofs-a_ofs;i++) { |
383 |
|
|
fprintf(f, " "); |
384 |
|
|
al[++ap]=' '; |
385 |
|
|
} |
386 |
|
|
} |
387 |
|
|
for (int i=0;i<aend;i++) { |
388 |
|
|
if (g<a_gaps.Count() && a_gaps[g].offset==i) { |
389 |
|
|
for (int j=0;j<a_gaps[g].len;j++) { |
390 |
|
|
fprintf(f, "-"); |
391 |
|
|
al[++ap]='-'; |
392 |
|
|
} |
393 |
|
|
g++; |
394 |
|
|
} |
395 |
|
|
if (i==a_ofs) color_bg(c_blue,f); |
396 |
|
|
fprintf(f, "%c", sa[i]); |
397 |
|
|
al[++ap]=sa[i]; |
398 |
|
|
} |
399 |
|
|
color_reset(f); |
400 |
|
|
if (aend<sa_len) |
401 |
|
|
fprintf(f, &sa[aend]); |
402 |
|
|
fprintf(f, "\n"); |
403 |
|
|
//print seq B |
404 |
|
|
ap=0; |
405 |
|
|
g=0; |
406 |
|
|
int bend=b_ofs+b_len; |
407 |
|
|
if (a_ofs>b_ofs) { |
408 |
|
|
for (int i=0;i<a_ofs-b_ofs;i++) { |
409 |
|
|
fprintf(f, " "); |
410 |
|
|
ap++; |
411 |
|
|
} |
412 |
|
|
} |
413 |
|
|
for (int i=0;i<b_ofs;i++) { |
414 |
|
|
fprintf(f, "%c", sb[i]); |
415 |
|
|
ap++; |
416 |
|
|
} |
417 |
|
|
for (int i=b_ofs;i<bend;i++) { |
418 |
|
|
if (g<b_gaps.Count() && b_gaps[g].offset==i) { |
419 |
|
|
for (int j=0;j<b_gaps[g].len;j++) { |
420 |
|
|
fprintf(f, "-"); |
421 |
|
|
ap++; |
422 |
|
|
} |
423 |
|
|
g++; |
424 |
|
|
} |
425 |
|
|
if (i==b_ofs) color_bg(c_blue,f); |
426 |
|
|
ap++; |
427 |
|
|
bool mismatch=(sb[i]!=al[ap] && al[ap]!='-'); |
428 |
|
|
if (mismatch) color_bg(c_red,f); |
429 |
|
|
fprintf(f, "%c", sb[i]); |
430 |
|
|
if (mismatch) color_bg(c_blue,f); |
431 |
|
|
} |
432 |
|
|
color_reset(f); |
433 |
|
|
if (bend<sb_len) |
434 |
|
|
fprintf(f, &sb[bend]); |
435 |
|
|
fprintf(f, "\n"); |
436 |
|
|
} |
437 |
|
|
#endif |
438 |
|
|
}; |
439 |
|
|
|
440 |
|
|
struct GXAlnInfo { |
441 |
|
|
const char *qseq; |
442 |
|
|
int ql,qr; |
443 |
|
|
const char *sseq; |
444 |
|
|
int sl,sr; |
445 |
|
|
int score; |
446 |
|
|
double pid; |
447 |
|
|
GXEditScript* editscript; |
448 |
|
|
CAlnGapInfo* gapinfo; |
449 |
|
|
GXAlnInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r, |
450 |
|
|
int sc=0, double percid=0) { |
451 |
|
|
qseq=q; |
452 |
|
|
sseq=s; |
453 |
|
|
ql=q_l; |
454 |
|
|
qr=q_r; |
455 |
|
|
sl=s_l; |
456 |
|
|
sr=s_r; |
457 |
|
|
score=sc; |
458 |
|
|
pid=percid; |
459 |
|
|
editscript=NULL; |
460 |
|
|
gapinfo=NULL; |
461 |
|
|
} |
462 |
|
|
~GXAlnInfo() { |
463 |
|
|
delete editscript; |
464 |
|
|
delete gapinfo; |
465 |
|
|
} |
466 |
|
|
bool operator<(GXAlnInfo& d) { |
467 |
|
|
return ((score==d.score)? pid>d.pid : score>d.score); |
468 |
|
|
} |
469 |
|
|
bool operator>(GXAlnInfo& d) { |
470 |
|
|
return ((score==d.score)? pid<d.pid : score<d.score); |
471 |
|
|
} |
472 |
|
|
bool operator==(GXAlnInfo& d) { |
473 |
|
|
return (score==d.score && pid==d.pid); |
474 |
|
|
} |
475 |
|
|
|
476 |
|
|
}; |
477 |
|
|
|
478 |
|
|
|
479 |
|
|
|
480 |
|
|
struct GXSeed { |
481 |
|
|
int b_ofs; //0-based coordinate on seq b (x coordinate) |
482 |
|
|
int a_ofs; //0-based coordinate on seq a (y coordinate) |
483 |
|
|
int len; //length of exact match after extension |
484 |
|
|
bool operator<(GXSeed& d){ |
485 |
|
|
return ((b_ofs==d.b_ofs) ? a_ofs<d.a_ofs : b_ofs<d.b_ofs); |
486 |
|
|
} |
487 |
|
|
bool operator>(GXSeed& d){ |
488 |
|
|
return ((b_ofs==d.b_ofs) ? a_ofs>d.a_ofs : b_ofs>d.b_ofs); |
489 |
|
|
} |
490 |
|
|
bool operator==(GXSeed& d){ |
491 |
|
|
return (b_ofs==d.b_ofs && a_ofs==d.a_ofs); //should never be the case, seeds are uniquely constructed |
492 |
|
|
} |
493 |
|
|
GXSeed(int aofs=0, int bofs=0, int l=4) { |
494 |
|
|
a_ofs=aofs; |
495 |
|
|
b_ofs=bofs; |
496 |
|
|
len=l; |
497 |
|
|
} |
498 |
|
|
}; |
499 |
|
|
|
500 |
|
|
int cmpSeedDiag(const pointer p1, const pointer p2); |
501 |
|
|
//seeds are "equal" if they're on the same diagonal (for selection purposes only) |
502 |
|
|
|
503 |
|
|
int cmpSeedScore(const pointer p1, const pointer p2); //also takes position into account |
504 |
|
|
//among seeds with same length, prefer those closer to the left end of the read (seq_b) |
505 |
|
|
|
506 |
|
|
struct GXBand { |
507 |
|
|
//bundle of seed matches on 3 adjacent diagonals |
508 |
|
|
int diag; //first anti-diagonal (b_ofs-a_ofs) in this group of 3 |
509 |
|
|
//seeds for this, and diag+1 and diag+2 are stored here |
510 |
|
|
int min_a, max_a; //maximal coordinates of the bundle |
511 |
|
|
int min_b, max_b; |
512 |
gpertea |
101 |
int w_min_b; //weighted average of left start coordinate |
513 |
|
|
int avg_len; |
514 |
gpertea |
93 |
GList<GXSeed> seeds; //sorted by x coordinate (b_ofs) |
515 |
|
|
int score; //sum of seed scores (- overlapping_bases/2 - gaps) |
516 |
gpertea |
101 |
bool tested; |
517 |
gpertea |
93 |
GXBand(int start_diag=-1, GXSeed* seed=NULL):seeds(true, false, false) { |
518 |
|
|
diag=start_diag; |
519 |
|
|
min_a=MAX_INT; |
520 |
|
|
min_b=MAX_INT; |
521 |
|
|
max_a=0; |
522 |
|
|
max_b=0; |
523 |
|
|
score=0; |
524 |
gpertea |
101 |
avg_len=0; |
525 |
|
|
w_min_b=0; |
526 |
|
|
tested=false; |
527 |
gpertea |
93 |
if (seed!=NULL) addSeed(seed); |
528 |
|
|
} |
529 |
|
|
void addSeed(GXSeed* seed) { |
530 |
|
|
seeds.Add(seed); |
531 |
|
|
score+=seed->len; |
532 |
gpertea |
101 |
avg_len+=seed->len; |
533 |
|
|
w_min_b+=seed->b_ofs * seed->len; |
534 |
gpertea |
93 |
//if (diag<0) diag=seed->diag; //should NOT be done like this |
535 |
|
|
if (seed->a_ofs < min_a) min_a=seed->a_ofs; |
536 |
|
|
if (seed->a_ofs+ seed->len > max_a) max_a=seed->a_ofs+seed->len; |
537 |
|
|
if (seed->b_ofs < min_b) min_b=seed->b_ofs; |
538 |
|
|
if (seed->b_ofs+seed->len > max_b) max_b=seed->b_ofs+seed->len; |
539 |
|
|
} |
540 |
gpertea |
101 |
|
541 |
gpertea |
93 |
void finalize() { |
542 |
|
|
//!! to be called only AFTER all seeds have been added |
543 |
|
|
// seeds are sorted by b_ofs |
544 |
|
|
//penalize seed gaps and overlaps on b sequence |
545 |
gpertea |
101 |
if (avg_len==0) return; |
546 |
|
|
w_min_b/=avg_len; |
547 |
|
|
avg_len>>=1; |
548 |
gpertea |
93 |
for (int i=1;i<seeds.Count();i++) { |
549 |
|
|
GXSeed& sprev=*seeds[i-1]; |
550 |
|
|
GXSeed& scur=*seeds[i]; |
551 |
|
|
if (scur==sprev) GError("Error: duplicate seeds found (%d-%d:%d-%d)!\n", |
552 |
|
|
scur.a_ofs+1, scur.a_ofs+scur.len, scur.b_ofs+1, scur.b_ofs+scur.len); |
553 |
|
|
int b_gap=scur.b_ofs-sprev.b_ofs-sprev.len; |
554 |
|
|
int a_gap=scur.a_ofs-sprev.a_ofs-sprev.len; |
555 |
|
|
int max_gap=b_gap; |
556 |
|
|
int min_gap=a_gap; |
557 |
|
|
if (min_gap>max_gap) swap(max_gap, min_gap); |
558 |
gpertea |
101 |
int _penalty=0; |
559 |
gpertea |
93 |
if (min_gap<0) { //overlap |
560 |
gpertea |
101 |
if (max_gap>0) { _penalty=GMAX((-min_gap), max_gap); } |
561 |
|
|
else _penalty=-min_gap; |
562 |
gpertea |
93 |
} |
563 |
|
|
else { //gap |
564 |
gpertea |
101 |
_penalty=max_gap; |
565 |
gpertea |
93 |
} |
566 |
gpertea |
101 |
score-=(_penalty>>1); |
567 |
|
|
//score-=_penalty; |
568 |
|
|
}//for each seed |
569 |
gpertea |
93 |
} |
570 |
|
|
|
571 |
|
|
//bands will be sorted by decreasing score eventually, after all seeds are added |
572 |
|
|
//more seeds better than one longer seed? |
573 |
|
|
bool operator<(GXBand& d){ |
574 |
gpertea |
101 |
//return ((score==d.score) ? seeds.Count()>d.seeds.Count() : score>d.score); |
575 |
|
|
return ((score==d.score) ? w_min_b<d.w_min_b : score>d.score); |
576 |
gpertea |
93 |
} |
577 |
|
|
bool operator>(GXBand& d){ |
578 |
gpertea |
101 |
//return ((score==d.score) ? seeds.Count()<d.seeds.Count() : score<d.score); |
579 |
|
|
return ((score==d.score) ? w_min_b>d.w_min_b : score<d.score); |
580 |
gpertea |
93 |
} |
581 |
|
|
bool operator==(GXBand& d){ |
582 |
gpertea |
101 |
//return (score==d.score && seeds.Count()==d.seeds.Count()); |
583 |
|
|
return (score==d.score && w_min_b==d.w_min_b); |
584 |
gpertea |
93 |
} |
585 |
|
|
|
586 |
|
|
}; |
587 |
|
|
|
588 |
|
|
class GXBandSet:public GList<GXBand> { |
589 |
|
|
public: |
590 |
gpertea |
99 |
GXSeed* qmatch; //long match (mismatches allowed) if a very good match was extended well |
591 |
gpertea |
104 |
GXSeed* tmatch; //terminal match to be used if there is no better alignment |
592 |
gpertea |
93 |
int idxoffset; //global anti-diagonal->index offset (a_len-1) |
593 |
|
|
//used to convert a diagonal to an index |
594 |
|
|
//diagonal is always b_ofs-a_ofs, so the minimum value is -a_len+1 |
595 |
|
|
//hence offset is a_len-1 |
596 |
|
|
GXBand* band(int diag) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) |
597 |
|
|
return Get(diag+idxoffset); |
598 |
|
|
} |
599 |
|
|
GXBand* band(int a_ofs, int b_ofs) { //retrieve the band for given anti-diagonal (b_ofs-a_ofs) |
600 |
|
|
return Get(b_ofs-a_ofs+idxoffset); |
601 |
|
|
} |
602 |
|
|
GXBandSet(int a_len, int b_len):GList<GXBand>(a_len+b_len-1, false, true, false) { |
603 |
|
|
idxoffset=a_len-1; |
604 |
gpertea |
99 |
qmatch=NULL; |
605 |
gpertea |
104 |
tmatch=NULL; //terminal match to be used if everything else fails |
606 |
gpertea |
93 |
//diag will range from -a_len+1 to b_len-1, so after adjustment |
607 |
|
|
//by idxoffset we get a max of a_len+b_len-2 |
608 |
|
|
int bcount=a_len+b_len-1; |
609 |
|
|
for (int i=0;i<bcount;i++) |
610 |
|
|
this->Add(new GXBand(i-idxoffset)); |
611 |
|
|
//unsorted, this should set fList[i] |
612 |
|
|
} |
613 |
gpertea |
99 |
~GXBandSet() { |
614 |
|
|
delete qmatch; |
615 |
|
|
} |
616 |
gpertea |
93 |
void addSeed(GXSeed* seed) { |
617 |
|
|
//MUST be unsorted !!! |
618 |
|
|
int idx=(seed->b_ofs-seed->a_ofs)+idxoffset; |
619 |
|
|
fList[idx]->addSeed(seed); |
620 |
|
|
if (idx>0) fList[idx-1]->addSeed(seed); |
621 |
|
|
if (idx<fCount-1) fList[idx+1]->addSeed(seed); |
622 |
|
|
} |
623 |
|
|
}; |
624 |
|
|
|
625 |
|
|
|
626 |
|
|
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 3' end of seqb |
627 |
|
|
|
628 |
|
|
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len); //for overlap at 5' end of seqb |
629 |
|
|
|
630 |
|
|
void printEditScript(GXEditScript* ed_script); |
631 |
|
|
|
632 |
|
|
|
633 |
|
|
int GXGreedyExtend(const char* seq1, int len1, |
634 |
|
|
const char* seq2, int len2, |
635 |
|
|
bool reverse, int xdrop_threshold, |
636 |
|
|
int match_cost, int mismatch_cost, |
637 |
|
|
int& seq1_align_len, int& seq2_align_len, |
638 |
gpertea |
101 |
CGreedyAlignData& aux_data, |
639 |
gpertea |
93 |
GXEditScript *edit_block); |
640 |
|
|
|
641 |
|
|
|
642 |
|
|
enum GAlnTrimType { |
643 |
|
|
galn_NoTrim=0, |
644 |
|
|
galn_TrimLeft, |
645 |
|
|
galn_TrimRight |
646 |
|
|
}; |
647 |
|
|
|
648 |
gpertea |
101 |
struct CAlnTrim { |
649 |
|
|
GAlnTrimType type; |
650 |
|
|
int boundary; //base index (either left or right) excluding terminal poly-A stretches |
651 |
|
|
void prepare(GAlnTrimType trim_type, const char* s, int s_len) { |
652 |
|
|
type=trim_type; |
653 |
|
|
boundary=0; |
654 |
|
|
if (type==galn_TrimLeft) { |
655 |
|
|
int s_lbound=0; |
656 |
|
|
if (s[0]=='A' && s[1]=='A' && s[2]=='A') { |
657 |
|
|
s_lbound=3; |
658 |
|
|
while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++; |
659 |
|
|
} |
660 |
|
|
else if (s[1]=='A' && s[2]=='A' && s[3]=='A') { |
661 |
|
|
s_lbound=4; |
662 |
|
|
while (s_lbound<s_len-1 && s[s_lbound]=='A') s_lbound++; |
663 |
|
|
} |
664 |
|
|
boundary=s_lbound+3; |
665 |
|
|
return; |
666 |
|
|
} |
667 |
|
|
if (type==galn_TrimRight) { |
668 |
|
|
int r=s_len-1; |
669 |
|
|
if (s[r]=='A' && s[r-1]=='A' && s[r-2]=='A') { |
670 |
|
|
r-=3; |
671 |
|
|
while (r>0 && s[r]=='A') r--; |
672 |
|
|
} |
673 |
|
|
else if (s[r-1]=='A' && s[r-2]=='A' && s[r-3]=='A') { |
674 |
|
|
r-=4; |
675 |
|
|
while (r>0 && s[r]=='A') r--; |
676 |
|
|
} |
677 |
|
|
boundary=r-3; |
678 |
|
|
} |
679 |
|
|
} |
680 |
|
|
|
681 |
|
|
CAlnTrim(GAlnTrimType trim_type, const char* s, int s_len) { |
682 |
|
|
prepare(trim_type, s, s_len); |
683 |
|
|
} |
684 |
|
|
|
685 |
|
|
bool validate(int sl, int sr, int alnpid, int adist) { |
686 |
|
|
int alnlen=sr-sl+1; |
687 |
|
|
sl--;sr--; //boundary is 0-based |
688 |
|
|
int badj=0; |
689 |
|
|
int admax=3; |
690 |
|
|
if (alnlen<11) { |
691 |
|
|
//stricter boundary check |
692 |
|
|
badj=2; |
693 |
|
|
admax=1; |
694 |
gpertea |
106 |
if (alnlen<=6) { badj++; admax=0; } |
695 |
gpertea |
101 |
} |
696 |
|
|
if (adist>admax) return false; |
697 |
|
|
if (type==galn_TrimRight) { |
698 |
|
|
return (sr>=boundary+badj); |
699 |
|
|
} |
700 |
|
|
else { |
701 |
|
|
//left side should be more stringent |
702 |
|
|
if (alnpid<91) { |
703 |
|
|
if (alnlen<11) return false; |
704 |
|
|
badj++; |
705 |
|
|
} |
706 |
|
|
return (sl<=boundary-badj); |
707 |
|
|
} |
708 |
|
|
} |
709 |
|
|
|
710 |
|
|
}; |
711 |
|
|
|
712 |
|
|
|
713 |
gpertea |
93 |
// reward MUST be >1, always |
714 |
|
|
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
715 |
gpertea |
101 |
const char* s_seq, int s_alnstart, int s_max, |
716 |
|
|
int reward, int penalty, int xdrop, CGreedyAlignData* gxmem=NULL, |
717 |
|
|
CAlnTrim* trim=NULL, bool editscript=false); |
718 |
|
|
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
719 |
|
|
const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem, |
720 |
|
|
CAlnTrim* trim=NULL, bool editscript=false); |
721 |
|
|
|
722 |
gpertea |
93 |
GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart, |
723 |
|
|
bool editscript=false, int reward=2, int penalty=3, int xdrop=8); |
724 |
|
|
|
725 |
gpertea |
96 |
GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
726 |
gpertea |
101 |
CGreedyAlignData* gxmem=NULL, int min_pid=83); |
727 |
gpertea |
96 |
GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
728 |
gpertea |
101 |
CGreedyAlignData* gxmem=NULL, int min_pid=73); |
729 |
gpertea |
93 |
#endif |