ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 104
Committed: Mon Oct 10 19:34:20 2011 UTC (7 years, 9 months ago) by gpertea
File size: 39728 byte(s)
Log Message:
fixed GVec, GArray to use new[] instead of malloc

Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104
105 //signal that a diagonal is invalid
106 static const int kInvalidOffset = -2;
107
108 int s_FindFirstMismatch(const char *seq1, int len1,
109 const char *seq2, int len2,
110 int seq1_index, int seq2_index,
111 //bool &fence_hit,
112 bool reverse) {
113 int start_index = seq1_index;
114 /* Sentry detection here should be relatively inexpensive: The
115 sentry value cannot appear in the query, so detection only
116 needs to be done at exit from the subject-query matching loop.
117 For uncompressed sequences, ambiguities in the query (i.e. seq1)
118 always count as mismatches */
119 if (reverse) {
120 while (seq1_index < len1 && seq2_index < len2 &&
121 //seq1[len1-1 - seq1_index] < 4 &&
122 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123 ++seq1_index;
124 ++seq2_index;
125 }
126 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127 //if len2-1-seq2_index<=0) {
128 // fence_hit = true;
129 // }
130 }
131 else { //forward lookup
132 while (seq1_index < len1 && seq2_index < len2 &&
133 //seq1[seq1_index] < 4 &&
134 seq1[seq1_index] == seq2[seq2_index]) {
135 ++seq1_index;
136 ++seq2_index;
137 }
138 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139 //if (seq2_index==len2) {
140 // fence_hit = true;
141 //}
142 }
143 return seq1_index - start_index;
144 }
145
146
147
148 /** During the traceback for a non-affine greedy alignment,
149 compute the diagonal that will result from the next
150 traceback operation
151
152 @param last_seq2_off Array of offsets into the second sequence;
153 last_seq2_off[d][k] gives the largest offset into
154 the second sequence that lies on diagonal k and
155 has distance d [in]
156 @param d Starting distance [in]
157 @param diag Index of diagonal that produced the starting distance [in]
158 @param seq2_index The offset into the second sequence after the traceback
159 operation has completed [out]
160 @return The diagonal resulting from the next traceback operation
161 being applied
162 */
163 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164 int diag, int *seq2_index) {
165 // choose the traceback operation that results in the
166 // largest seq2 offset at this point, then compute the
167 // new diagonal that is implied by the operation
168 if (last_seq2_off[d-1][diag-1] >
169 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170 *seq2_index = last_seq2_off[d-1][diag-1];
171 return diag - 1; // gap in seq2
172 }
173 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174 *seq2_index = last_seq2_off[d-1][diag];
175 return diag; // match
176 }
177 *seq2_index = last_seq2_off[d-1][diag+1];
178 return diag + 1; // gap in seq1
179 }
180
181
182 int GXGreedyExtend(const char* seq1, int len1,
183 const char* seq2, int len2,
184 bool reverse, int xdrop_threshold,
185 int match_cost, int mismatch_cost,
186 int& seq1_align_len, int& seq2_align_len,
187 CGreedyAlignData& aux_data,
188 GXEditScript *edit_block) {
189 //GapPrelimEditBlock *edit_block,
190 //bool& fence_hit, SGreedySeed *seed) {
191 int seq1_index;
192 int seq2_index;
193 int index;
194 int d;
195 int k;
196 int diag_lower, diag_upper;
197 int max_dist;
198 int diag_origin;
199 int best_dist;
200 int best_diag;
201 int** last_seq2_off;
202 int* max_score;
203 int xdrop_offset;
204 int longest_match_run;
205 bool end1_reached, end2_reached;
206 GXMemPool* mem_pool;
207
208 /* ordinary dynamic programming alignment, for each offset
209 in seq1, walks through offsets in seq2 until an X-dropoff
210 test fails, saving the best score encountered along
211 the way. Instead of score, this code tracks the 'distance'
212 (number of mismatches plus number of gaps) between seq1
213 and seq2. Instead of walking through sequence offsets, it
214 walks through diagonals that can achieve a given distance.
215
216 Note that in what follows, the numbering of diagonals implies
217 a dot matrix where increasing seq1 offsets go to the right on
218 the x axis, and increasing seq2 offsets go up the y axis.
219 The gapped alignment thus proceeds up and to the right in
220 the graph, and diagonals are numbered increasing to the right */
221
222 best_dist = 0;
223 best_diag = 0;
224
225 /* set the number of distinct distances the algorithm will
226 examine in the search for an optimal alignment. The
227 heuristic worst-case running time of the algorithm is
228 O(max_dist**2 + (len1+len2)); for sequences which are
229 very similar, the average running time will be sig-
230 nificantly better than this */
231
232 max_dist = GMIN(GREEDY_MAX_COST,
233 len2 / GREEDY_MAX_COST_FRACTION + 1);
234
235 /* the main loop assumes that the index of all diagonals is
236 biased to lie in the middle of allocated bookkeeping
237 structures */
238
239 diag_origin = max_dist + 2;
240
241 // last_seq2_off[d][k] is the largest offset into seq2 that
242 // lies on diagonal k and has distance d
243
244 last_seq2_off = aux_data.last_seq2_off;
245
246 /* Instead of tracking the best alignment score and using
247 xdrop_theshold directly, track the best score for each
248 unique distance and use the best score for some previously
249 computed distance to implement the X-dropoff test.
250
251 xdrop_offset gives the distance backwards in the score
252 array to look */
253
254 xdrop_offset = (xdrop_threshold + match_cost / 2) /
255 (match_cost + mismatch_cost) + 1;
256
257 // find the offset of the first mismatch between seq1 and seq2
258
259 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260 // fence_hit, reverse, rem);
261
262 // update the extents of the alignment, and bail out
263 // early if no further work is needed
264
265 seq1_align_len = index;
266 seq2_align_len = index;
267 seq1_index = index;
268 /*
269 seed->start_q = 0;
270 seed->start_s = 0;
271 seed->match_length = index;
272 */
273 longest_match_run = index;
274
275 if (index == len1 || index == len2) {
276 /* Return the number of differences, which is zero here */
277 if (edit_block != NULL)
278 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279 edit_block->opRep(index);
280 return 0;
281 }
282
283 // set up the memory pool
284 mem_pool = aux_data.space;
285 if (edit_block == NULL) {
286 mem_pool = NULL;
287 }
288 else if (mem_pool == NULL) {
289 aux_data.space = mem_pool = new GXMemPool();
290 }
291 else {
292 mem_pool->refresh();
293 }
294
295 /* set up the array of per-distance maximum scores. There
296 are max_diags + xdrop_offset distances to track, the first
297 xdrop_offset of which are 0 */
298
299 max_score = aux_data.max_score + xdrop_offset;
300 for (index = 0; index < xdrop_offset; index++)
301 aux_data.max_score[index] = 0;
302
303 // fill in the initial offsets of the distance matrix
304
305 last_seq2_off[0][diag_origin] = seq1_index;
306 max_score[0] = seq1_index * match_cost;
307 diag_lower = diag_origin - 1;
308 diag_upper = diag_origin + 1;
309 end1_reached = end2_reached = false;
310
311 // for each distance
312 for (d = 1; d <= max_dist; d++) {
313 int xdrop_score;
314 int curr_score;
315 int curr_extent = 0;
316 int curr_seq2_index = 0;
317 int curr_diag = 0;
318 int tmp_diag_lower = diag_lower;
319 int tmp_diag_upper = diag_upper;
320
321 // Assign impossible seq2 offsets to any diagonals that
322 // are not in the range (diag_lower,diag_upper).
323 // These will serve as sentinel values for the inner loop
324 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328
329 // compute the score for distance d corresponding to the X-dropoff criterion
330
331 xdrop_score = max_score[d - xdrop_offset] +
332 (match_cost + mismatch_cost) * d - xdrop_threshold;
333 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334
335 // for each diagonal of interest
336 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337 /* find the largest offset into seq2 that increases
338 the distance from d-1 to d (i.e. keeps the alignment
339 from getting worse for as long as possible), then
340 choose the offset into seq1 that will keep the
341 resulting diagonal fixed at k
342
343 Note that this requires kInvalidOffset+1 to be smaller
344 than any valid offset into seq2, i.e. to be negative */
345
346 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347 last_seq2_off[d - 1][k ]) + 1;
348 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349 seq1_index = seq2_index + k - diag_origin;
350
351 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352
353 // if no valid diagonal can reach distance d, or the
354 // X-dropoff test fails, narrow the range of diagonals
355 // to test and skip to the next diagonal
356 if (k == diag_lower)
357 diag_lower++;
358 else
359 last_seq2_off[d][k] = kInvalidOffset;
360 continue;
361 }
362 diag_upper = k;
363
364 /* slide down diagonal k until a mismatch
365 occurs. As long as only matches are encountered,
366 the current distance d will not change */
367
368 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369 seq1_index, seq2_index, reverse);
370 //fence_hit, reverse, rem);
371 if (index > longest_match_run) {
372 //seed->start_q = seq1_index;
373 //seed->start_s = seq2_index;
374 //seed->match_length = index;
375 longest_match_run = index;
376 }
377 seq1_index += index;
378 seq2_index += index;
379
380 // set the new largest seq2 offset that achieves
381 // distance d on diagonal k
382
383 last_seq2_off[d][k] = seq2_index;
384
385 // since all values of k are constrained to have the
386 // same distance d, the value of k which maximizes the
387 // alignment score is the one that covers the most of seq1 and seq2
388 if (seq1_index + seq2_index > curr_extent) {
389 curr_extent = seq1_index + seq2_index;
390 curr_seq2_index = seq2_index;
391 curr_diag = k;
392 }
393
394 /* clamp the bounds on diagonals to avoid walking off
395 either sequence. Because the bounds increase by at
396 most one for each distance, diag_lower and diag_upper
397 can each be of size at most max_diags+2 */
398
399 if (seq2_index == len2) {
400 diag_lower = k + 1;
401 end2_reached = true;
402 }
403 if (seq1_index == len1) {
404 diag_upper = k - 1;
405 end1_reached = true;
406 }
407 } // end loop over diagonals
408
409 // compute the maximum score possible for distance d
410 curr_score = curr_extent * (match_cost / 2) -
411 d * (match_cost + mismatch_cost);
412 // if this is the best score seen so far, update the
413 // statistics of the best alignment
414 if (curr_score > max_score[d - 1]) {
415 max_score[d] = curr_score;
416 best_dist = d;
417 best_diag = curr_diag;
418 seq2_align_len = curr_seq2_index;
419 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420 }
421 else {
422 max_score[d] = max_score[d - 1];
423 }
424
425 // alignment has finished if the lower and upper bounds
426 // on diagonals to check have converged to each other
427
428 if (diag_lower > diag_upper)
429 break;
430
431 /* set up for the next distance to examine. Because the
432 bounds increase by at most one for each distance,
433 diag_lower and diag_upper can each be of size at
434 most max_diags+2 */
435
436 if (!end2_reached)
437 diag_lower--;
438 if (!end1_reached)
439 diag_upper++;
440
441 if (edit_block == NULL) {
442 // if no traceback is specified, the next row of
443 // last_seq2_off can reuse previously allocated memory
444 //TODO FIXME The following assumes two arrays of
445 // at least max_dist+4 int's have already been allocated
446 last_seq2_off[d + 1] = last_seq2_off[d - 1];
447 }
448 else {
449 // traceback requires all rows of last_seq2_off to be saved,
450 // so a new row must be allocated
451 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452 // move the origin for this row backwards
453 //TODO FIXME: dubious pointer arithmetic ?!
454 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455 }
456 } // end loop over distinct distances
457
458
459 if (edit_block == NULL)
460 return best_dist;
461
462 //---- perform traceback
463 d = best_dist;
464 seq1_index = seq1_align_len;
465 seq2_index = seq2_align_len;
466 // for all positive distances
467
468 //if (fence_hit && *fence_hit)
469 // goto done;
470 if (index==len1 || index==len2) d=0;
471 while (d > 0) {
472 int new_diag;
473 int new_seq2_index;
474
475 /* retrieve the value of the diagonal after the next
476 traceback operation. best_diag starts off with the
477 value computed during the alignment process */
478
479 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480 best_diag, &new_seq2_index);
481
482 if (new_diag == best_diag) {
483 // same diagonal: issue a group of substitutions
484 if (seq2_index - new_seq2_index > 0) {
485 edit_block->opRep(seq2_index - new_seq2_index);
486 }
487 }
488 else if (new_diag < best_diag) {
489 // smaller diagonal: issue a group of substitutions
490 // and then a gap in seq2 */
491 if (seq2_index - new_seq2_index > 0) {
492 edit_block->opRep(seq2_index - new_seq2_index);
493 }
494 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495 edit_block->opIns(1);
496 }
497 else {
498 // larger diagonal: issue a group of substitutions
499 // and then a gap in seq1
500 if (seq2_index - new_seq2_index - 1 > 0) {
501 edit_block->opRep(seq2_index - new_seq2_index - 1);
502 }
503 edit_block->opDel(1);
504 }
505 d--;
506 best_diag = new_diag;
507 seq2_index = new_seq2_index;
508 }
509 //done:
510 // handle the final group of substitutions back to distance zero,
511 // i.e. back to offset zero of seq1 and seq2
512 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513 // last_seq2_off[0][diag_origin]);
514 edit_block->opRep(last_seq2_off[0][diag_origin]);
515 if (!reverse)
516 edit_block->reverse();
517 return best_dist;
518 }
519
520 void printEditScript(GXEditScript* ed_script) {
521 uint i;
522 if (ed_script==NULL || ed_script->opnum == 0)
523 return;
524 for (i=0; i<ed_script->opnum; i++) {
525 int num=((ed_script->ops[i]) >> 2);
526 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527 if (op_type == 3)
528 GError("Error: printEditScript encountered op_type 3 ?!\n");
529 GMessage("%d%c ", num, xgapcodes[op_type]);
530 }
531 GMessage("\n");
532 }
533
534 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535 bool editscript, int reward, int penalty, int xdrop) {
536 int q_max=strlen(q_seq); //query
537 int s_max=strlen(s_seq); //subj
538 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540 }
541
542 struct GXSeedTable {
543 int a_num, b_num;
544 int a_cap, b_cap;
545 char* xc;
546 GXSeedTable(int a=12, int b=255) {
547 a_cap=0;
548 b_cap=0;
549 a_num=0;
550 b_num=0;
551 xc=NULL;
552 init(a,b);
553 }
554 ~GXSeedTable() {
555 GFREE(xc);
556 }
557 void init(int a, int b) {
558 a_num=a;
559 b_num=b;
560 bool resize=false;
561 if (b_num>b_cap) { resize=true; b_cap=b_num;}
562 if (a_num>a_cap) { resize=true; a_cap=a_num;}
563 if (resize) {
564 GFREE(xc);
565 GCALLOC(xc, (a_num*b_num));
566 }
567 else {
568 //just clear up to a_max, b_max
569 memset((void*)xc, 0, (a_num*b_num));
570 }
571 }
572 char& x(int ax, int by) {
573 return xc[by*a_num+ax];
574 }
575
576 };
577
578 const int a_m_score=2; //match score
579 const int a_mis_score=-3; //mismatch
580 const int a_dropoff_score=7;
581 const int a_min_score=12; //at least 6 bases full match
582
583 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585 //check minimum score and
586 //for 3' adapter trimming:
587 // require that the right end of the alignment for either the adaptor OR the read must be
588 // < 3 distance from its right end
589 // for 5' adapter trimming:
590 // require that the left end of the alignment for either the adaptor OR the read must
591 // be at coordinate < 3 from start
592
593 bool extendUngapped(const char* a, int alen, int ai,
594 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596 //if (debug) {
597 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598 // }
599 int a_l=ai; //alignment coordinates on a
600 int a_r=ai+mlen-1;
601 int b_l=bi; //alignment coordinates on b
602 int b_r=bi+mlen-1;
603 int ai_maxscore=ai;
604 int bi_maxscore=bi;
605 int score=mlen*a_m_score;
606 int maxscore=score;
607 int mism5score=a_mis_score;
608 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609 //try to extend to the left first, if possible
610 while (ai>0 && bi>0) {
611 ai--;
612 bi--;
613 score+= (a[ai]==b[bi])? a_m_score : mism5score;
614 if (score>maxscore) {
615 ai_maxscore=ai;
616 bi_maxscore=bi;
617 maxscore=score;
618 }
619 else if (maxscore-score>a_dropoff_score) break;
620 }
621 a_l=ai_maxscore;
622 b_l=bi_maxscore;
623 //now extend to the right
624 ai_maxscore=a_r;
625 bi_maxscore=b_r;
626 ai=a_r;
627 bi=b_r;
628 score=maxscore;
629 //sometimes there are extra As at the end of the read, ignore those
630 if (a[alen-2]=='A' && a[alen-1]=='A') {
631 alen-=2;
632 while (a[alen-1]=='A' && alen>ai) alen--;
633 }
634 while (ai<alen-1 && bi<blen-1) {
635 ai++;
636 bi++;
637 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638 if (a[ai]==b[bi]) { //match
639 score+=a_m_score;
640 if (ai>=alen-2) {
641 score+=a_m_score-(alen-ai-1);
642 }
643 }
644 else { //mismatch
645 score+=a_mis_score;
646 }
647 if (score>maxscore) {
648 ai_maxscore=ai;
649 bi_maxscore=bi;
650 maxscore=score;
651 }
652 else if (maxscore-score>a_dropoff_score) break;
653 }
654 a_r=ai_maxscore;
655 b_r=bi_maxscore;
656 int a_ovh3=alen-a_r-1;
657 int b_ovh3=blen-b_r-1;
658 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659 int mmovh5=(a_l<b_l)? a_l : b_l;
660 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661 if (a_l<a_ovh3) {
662 //adapter closer to the left end (typical for 5' adapter)
663 l5=a_r+1;
664 l3=alen-1;
665 }
666 else {
667 //adapter matching at the right end (typical for 3' adapter)
668 l5=0;
669 l3=a_l-1;
670 }
671 return true;
672 }
673 //do not trim:
674 l5=0;
675 l3=alen-1;
676 return false;
677 }
678
679
680 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681 //overlap of right (3') end of seqb
682 //hash the first 12 bases of seqa:
683 int aimin=0;
684 int aimax=GMIN(9,(a_len-4));
685 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686 int bimax=b_len-4;
687 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688 int b_maxlen=b_len; //number of cols in the diagonal matrix
689 GXSeedTable gx(a_maxlen, b_maxlen);
690 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691 for (int ai=aimin;ai<=aimax;ai++) {
692 int32* av=(int32 *)(&seqa[ai]);
693 //for (int bi=b_len-4;bi>=bimin;bi--) {
694 for (int bi=bimin;bi<=bimax;bi++) {
695 if (gx.x(ai,bi))
696 continue; //already have a previous seed covering this region of this diagonal
697 int32* bv=(int32 *)(&seqb[bi]);
698 if (*av == *bv) {
699 //word match
700 //see if we can extend to the right
701 gx.x(ai+1,bi+1)=1;
702 gx.x(ai+2,bi+2)=1;
703 gx.x(ai+3,bi+3)=1;
704 int aix=ai+4;
705 int bix=bi+4;
706 int len=4;
707 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708 {
709 gx.x(aix,bix)=1;
710 aix++;bix++;
711 len++;
712 }
713 if (len>7) {
714 //heuristics: very likely the best we can get
715 int qlen=len;
716 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717 aix++;
718 bix++;
719 qlen++;
720 }
721 if (qlen>10) { //found a quick match shortcut
722 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723 return diagstrips;
724 }
725 }
726 GXSeed* newseed=new GXSeed(ai,bi,len);
727 seeds.Add(newseed);
728 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 //special last resort terminal match to be used if no better alignment is there
730 if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
731 }
732 }
733 }//for each 4-mer at the beginning of seqa
734 for (int i=0;i<diagstrips->Count();i++) {
735 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
736 }
737 diagstrips->setSorted(true); //sort by score
738 return diagstrips;
739 }
740
741 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
742 //overlap of left (5') end of seqb
743 //hash the last 12 bases of seqa:
744 int aimin=GMAX(0,(a_len-16));
745 int aimax=a_len-4;
746 int bimin=0;
747 int bimax=GMIN((a_len-2), (b_len-4));
748 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
749 int b_maxlen=b_len; //number of cols in the diagonal matrix
750 //gx.init(a_maxlen, b_maxlen);
751 GXSeedTable gx(a_maxlen, b_maxlen);
752 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
753 for (int ai=aimin;ai<=aimax;ai++) {
754 int32* av=(int32 *)(&seqa[ai]);
755 for (int bi=bimin;bi<=bimax;bi++) {
756 if (gx.x(ai,bi))
757 continue; //already have a previous seed covering this region of this diagonal
758 int32* bv=(int32 *)(&seqb[bi]);
759 if (*av == *bv) {
760 //word match
761 //see if we can extend to the right
762 gx.x(ai+1,bi+1)=1;
763 gx.x(ai+2,bi+2)=1;
764 gx.x(ai+3,bi+3)=1;
765 int aix=ai+4;
766 int bix=bi+4;
767 int len=4;
768 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
769 {
770 gx.x(aix,bix)=1;
771 aix++;bix++;
772 len++;
773 }
774 if (len>7) {
775 //heuristics: very likely the best we can get
776 int qlen=len;
777 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
778 aix++;
779 bix++;
780 qlen++;
781 }
782 if (qlen>10) { //found a quick match shortcut
783 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
784 return diagstrips;
785 }
786 }
787 GXSeed* newseed=new GXSeed(ai,bi,len);
788 seeds.Add(newseed);
789 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
790 //special last resort terminal match to be used if no better alignment is there
791 if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
792 }
793 }
794 }//for each 4-mer at the end of seqa
795 for (int i=0;i<diagstrips->Count();i++) {
796 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
797 }
798 diagstrips->setSorted(true); //sort by score
799 return diagstrips;
800 }
801
802 int cmpSeedScore(const pointer p1, const pointer p2) {
803 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
804 GXSeed* s1=(GXSeed*)p1;
805 GXSeed* s2=(GXSeed*)p2;
806 if (s1->len==s2->len) {
807 return (s1->b_ofs-s2->b_ofs);
808 }
809 else return (s2->len-s1->len);
810 }
811
812 int cmpSeedDiag(const pointer p1, const pointer p2) {
813 GXSeed* s1=(GXSeed*)p1;
814 GXSeed* s2=(GXSeed*)p2;
815 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
816 }
817
818
819 int cmpDiagBands_R(const pointer p1, const pointer p2) {
820 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
821 GXBand* b1=(GXBand*)p1;
822 GXBand* b2=(GXBand*)p2;
823 if (b1->score==b2->score) {
824 return (b2->w_min_b-b1->w_min_b);
825 }
826 else return (b2->score-b1->score);
827 }
828
829
830
831 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
832 const char* s_seq, int s_alnstart, int s_max,
833 int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
834 CAlnTrim* trim, bool editscript) {
835 GXEditScript* ed_script_fwd = NULL;
836 GXEditScript* ed_script_rev = NULL;
837 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
838 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
839 q_alnstart--;
840 s_alnstart--;
841 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
842 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
843 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
844 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
845 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
846 const char* q=q_seq+q_alnstart;
847 int q_avail=q_max-q_alnstart;
848 const char* s=s_seq+s_alnstart;
849 int s_avail=s_max-s_alnstart;
850 if (penalty<0) penalty=-penalty;
851 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
852 GXAlnInfo* alninfo=NULL;
853 bool freeAlnMem=(gxmem==NULL);
854 if (freeAlnMem) {
855 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
856 }
857 else gxmem->reset();
858 int retscore = 0;
859 int numdiffs = 0;
860 if (trim!=NULL && trim->type==galn_TrimLeft) {
861 if (editscript)
862 ed_script_rev=new GXEditScript();
863
864 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
865 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
866 //check this extension here and bail out if it's not a good extension
867 if (s_alnstart+1-s_ext_l>trim->boundary) {
868 delete ed_script_rev;
869 if (freeAlnMem) delete gxmem;
870 return NULL;
871 }
872
873 if (editscript)
874 ed_script_fwd=new GXEditScript();
875 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
876 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
877 numdiffs=numdiffs_r+numdiffs_l;
878 //convert num diffs to actual score
879 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
880 if (editscript)
881 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882 }
883 else {
884 if (editscript) {
885 ed_script_fwd=new GXEditScript();
886 }
887 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
888 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
889 //check extension here and bail out if not a good right extension
890 //assuming s_max is really at the right end of s_seq
891 if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
892 delete ed_script_fwd;
893 if (freeAlnMem) delete gxmem;
894 return NULL;
895 }
896 if (editscript)
897 ed_script_rev=new GXEditScript();
898 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
899 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
900 //convert num diffs to actual score
901 numdiffs=numdiffs_r+numdiffs_l;
902 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
903 if (editscript)
904 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
905 }
906
907 if (retscore>=MIN_GREEDY_SCORE) {
908 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
909 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
910 alninfo->score=retscore;
911 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
912 #ifdef GDEBUG
913 //if (ed_script_rev) {
914 // GMessage("Final Edit script ::: ");
915 // printEditScript(ed_script_rev);
916 // }
917 #endif
918 alninfo->editscript=ed_script_rev;
919 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
920 }
921 else {
922 if (freeAlnMem) delete gxmem;
923 delete ed_script_rev;
924 delete alninfo;
925 alninfo=NULL;
926 }
927 delete ed_script_fwd;
928 return alninfo;
929 }
930
931 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
932 const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
933 CAlnTrim* trim, bool editscript) {
934 int reward=2;
935 int penalty=3;
936 int xdrop=8;
937 if (gxmem) {
938 reward=gxmem->match_reward;
939 penalty=gxmem->mismatch_penalty;
940 xdrop=gxmem->x_drop;
941 }
942 return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
943 reward, penalty, xdrop, gxmem, trim, editscript);
944 }
945
946 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
947 CGreedyAlignData* gxmem, int min_pid) {
948 bool editscript=false;
949 #ifdef GDEBUG
950 editscript=true;
951 // GMessage("==========> matching Right (3') end ========\n");
952 #endif
953
954 CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
955 GList<GXSeed> rseeds(true,true,false);
956 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
957 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
958 //did we find a shortcut?
959 if (alnbands->qmatch) {
960 //#ifdef GDEBUG
961 // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
962 //#endif
963 anchor_seeds.Add(alnbands->qmatch);
964 }
965 else {
966 int max_top_bands=5;
967 int top_band_count=0;
968 for (int b=0;b<alnbands->Count();b++) {
969 if (alnbands->Get(b)->score<4) break;
970 //#ifdef GDEBUG
971 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
972 //#endif
973 top_band_count++;
974 GXBand& band=*(alnbands->Get(b));
975 band.seeds.setSorted(cmpSeedScore);
976 anchor_seeds.Add(band.seeds.First());
977 band.tested=true;
978 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
979 }
980 //#ifdef GDEBUG
981 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
982 //#endif
983 }
984
985 GList<GXAlnInfo> galns(true,true,false);
986 for (int i=0;i<anchor_seeds.Count();i++) {
987 GXSeed& aseed=*anchor_seeds[i];
988 int a1=aseed.a_ofs+(aseed.len>>1)+1;
989 int a2=aseed.b_ofs+(aseed.len>>1)+1;
990 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
991 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
992 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
993 galns.AddIfNew(alninfo, true);
994 else delete alninfo;
995 }
996 //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
997 //we should also try some seeds closer to the 3' end
998 if (galns.Count()==0) {
999 anchor_seeds.Clear();
1000 alnbands->setSorted(cmpDiagBands_R);
1001 int max_top_bands=4;
1002 int top_band_count=0;
1003 //#ifdef GDEBUG
1004 //GMessage(":::>> Retrying adjusting sort order.\n");
1005 //#endif
1006 if (alnbands->tmatch) {
1007 //anchor_seeds.setSorted(false);
1008 anchor_seeds.Add(alnbands->tmatch);
1009 }
1010 for (int b=0;b<alnbands->Count();b++) {
1011 if (alnbands->Get(b)->score<4) break;
1012 //#ifdef GDEBUG
1013 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1014 //#endif
1015 if (alnbands->Get(b)->tested) continue;
1016 top_band_count++;
1017 GXBand& band=*(alnbands->Get(b));
1018 band.seeds.setSorted(cmpSeedScore);
1019 anchor_seeds.Add(band.seeds.First());
1020 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1021 }
1022 //#ifdef GDEBUG
1023 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1024 //#endif
1025 for (int i=0;i<anchor_seeds.Count();i++) {
1026 GXSeed& aseed=*anchor_seeds[i];
1027 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1028 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1029 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1030 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1031 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1032 galns.AddIfNew(alninfo, true);
1033 else delete alninfo;
1034 }
1035 }
1036 //---- done
1037 delete alnbands;
1038 if (galns.Count()) {
1039 GXAlnInfo* bestaln=galns.Shift();
1040 #ifdef GDEBUG
1041 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1042 bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1043 if (bestaln->gapinfo!=NULL) {
1044 bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1045 }
1046 #endif
1047
1048 return bestaln;
1049 }
1050 else return NULL;
1051 }
1052
1053 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1054 CGreedyAlignData* gxmem, int min_pid) {
1055 bool editscript=false;
1056 #ifdef GDEBUG
1057 editscript=true;
1058 //GMessage("==========> matching Left (5') end ========\n");
1059 #endif
1060 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1061 GList<GXSeed> rseeds(true,true,false);
1062 GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1063 GList<GXAlnInfo> galns(true,true,false);
1064 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1065
1066 if (alnbands->qmatch) {
1067 //#ifdef GDEBUG
1068 // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1069 //#endif
1070 anchor_seeds.Add(alnbands->qmatch);
1071 }
1072 else {
1073 int max_top_bands=5;
1074 int top_band_count=0;
1075 for (int b=0;b<alnbands->Count();b++) {
1076 if (alnbands->Get(b)->score<4) break;
1077 //#ifdef GDEBUG
1078 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1079 //#endif
1080 top_band_count++;
1081 GXBand& band=*(alnbands->Get(b));
1082 band.seeds.setSorted(cmpSeedScore);
1083 anchor_seeds.Add(band.seeds.First());
1084 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1085 }
1086 //#ifdef GDEBUG
1087 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1088 //#endif
1089 }
1090 for (int i=0;i<anchor_seeds.Count();i++) {
1091 GXSeed& aseed=*anchor_seeds[i];
1092 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1093 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1094 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1095 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1096 if (alninfo && alninfo->pid>=min_pid
1097 && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1098 galns.AddIfNew(alninfo, true);
1099 else delete alninfo;
1100 }
1101 if (galns.Count()==0 && alnbands->tmatch) {
1102 GXSeed& aseed=*alnbands->tmatch;
1103 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107 if (alninfo) galns.Add(alninfo);
1108 }
1109 #ifdef GDEBUG
1110 /*
1111 for (int i=0;i<galns.Count();i++) {
1112 GXAlnInfo* alninfo=galns[i];
1113 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1114 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1115 if (alninfo->gapinfo!=NULL) {
1116 GMessage("Alignment:\n");
1117 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1118 }
1119 }
1120 */
1121 #endif
1122 //---- done
1123 delete alnbands;
1124 if (galns.Count()) {
1125 GXAlnInfo* bestaln=galns.Shift();
1126 #ifdef GDEBUG
1127 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1128 bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1129 if (bestaln->gapinfo!=NULL) {
1130 bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1131 }
1132 #endif
1133 return bestaln;
1134 }
1135 else return NULL;
1136 }