ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 99
Committed: Thu Oct 6 20:49:17 2011 UTC (7 years, 9 months ago) by gpertea
File size: 35461 byte(s)
Log Message:
wip

Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104
105 //signal that a diagonal is invalid
106 static const int kInvalidOffset = -2;
107
108 int s_FindFirstMismatch(const char *seq1, int len1,
109 const char *seq2, int len2,
110 int seq1_index, int seq2_index,
111 //bool &fence_hit,
112 bool reverse) {
113 int start_index = seq1_index;
114 /* Sentry detection here should be relatively inexpensive: The
115 sentry value cannot appear in the query, so detection only
116 needs to be done at exit from the subject-query matching loop.
117 For uncompressed sequences, ambiguities in the query (i.e. seq1)
118 always count as mismatches */
119 if (reverse) {
120 while (seq1_index < len1 && seq2_index < len2 &&
121 //seq1[len1-1 - seq1_index] < 4 &&
122 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123 ++seq1_index;
124 ++seq2_index;
125 }
126 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127 //if len2-1-seq2_index<=0) {
128 // fence_hit = true;
129 // }
130 }
131 else { //forward lookup
132 while (seq1_index < len1 && seq2_index < len2 &&
133 //seq1[seq1_index] < 4 &&
134 seq1[seq1_index] == seq2[seq2_index]) {
135 ++seq1_index;
136 ++seq2_index;
137 }
138 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139 //if (seq2_index==len2) {
140 // fence_hit = true;
141 //}
142 }
143 return seq1_index - start_index;
144 }
145
146
147
148 /** During the traceback for a non-affine greedy alignment,
149 compute the diagonal that will result from the next
150 traceback operation
151
152 @param last_seq2_off Array of offsets into the second sequence;
153 last_seq2_off[d][k] gives the largest offset into
154 the second sequence that lies on diagonal k and
155 has distance d [in]
156 @param d Starting distance [in]
157 @param diag Index of diagonal that produced the starting distance [in]
158 @param seq2_index The offset into the second sequence after the traceback
159 operation has completed [out]
160 @return The diagonal resulting from the next traceback operation
161 being applied
162 */
163 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164 int diag, int *seq2_index) {
165 // choose the traceback operation that results in the
166 // largest seq2 offset at this point, then compute the
167 // new diagonal that is implied by the operation
168 if (last_seq2_off[d-1][diag-1] >
169 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170 *seq2_index = last_seq2_off[d-1][diag-1];
171 return diag - 1; // gap in seq2
172 }
173 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174 *seq2_index = last_seq2_off[d-1][diag];
175 return diag; // match
176 }
177 *seq2_index = last_seq2_off[d-1][diag+1];
178 return diag + 1; // gap in seq1
179 }
180
181
182 int GXGreedyExtend(const char* seq1, int len1,
183 const char* seq2, int len2,
184 bool reverse, int xdrop_threshold,
185 int match_cost, int mismatch_cost,
186 int& seq1_align_len, int& seq2_align_len,
187 SGreedyAlignMem& aux_data,
188 GXEditScript *edit_block) {
189 //GapPrelimEditBlock *edit_block,
190 //bool& fence_hit, SGreedySeed *seed) {
191 int seq1_index;
192 int seq2_index;
193 int index;
194 int d;
195 int k;
196 int diag_lower, diag_upper;
197 int max_dist;
198 int diag_origin;
199 int best_dist;
200 int best_diag;
201 int** last_seq2_off;
202 int* max_score;
203 int xdrop_offset;
204 int longest_match_run;
205 bool end1_reached, end2_reached;
206 GXMemPool* mem_pool;
207
208 /* ordinary dynamic programming alignment, for each offset
209 in seq1, walks through offsets in seq2 until an X-dropoff
210 test fails, saving the best score encountered along
211 the way. Instead of score, this code tracks the 'distance'
212 (number of mismatches plus number of gaps) between seq1
213 and seq2. Instead of walking through sequence offsets, it
214 walks through diagonals that can achieve a given distance.
215
216 Note that in what follows, the numbering of diagonals implies
217 a dot matrix where increasing seq1 offsets go to the right on
218 the x axis, and increasing seq2 offsets go up the y axis.
219 The gapped alignment thus proceeds up and to the right in
220 the graph, and diagonals are numbered increasing to the right */
221
222 best_dist = 0;
223 best_diag = 0;
224
225 /* set the number of distinct distances the algorithm will
226 examine in the search for an optimal alignment. The
227 heuristic worst-case running time of the algorithm is
228 O(max_dist**2 + (len1+len2)); for sequences which are
229 very similar, the average running time will be sig-
230 nificantly better than this */
231
232 max_dist = GMIN(GREEDY_MAX_COST,
233 len2 / GREEDY_MAX_COST_FRACTION + 1);
234
235 /* the main loop assumes that the index of all diagonals is
236 biased to lie in the middle of allocated bookkeeping
237 structures */
238
239 diag_origin = max_dist + 2;
240
241 // last_seq2_off[d][k] is the largest offset into seq2 that
242 // lies on diagonal k and has distance d
243
244 last_seq2_off = aux_data.last_seq2_off;
245
246 /* Instead of tracking the best alignment score and using
247 xdrop_theshold directly, track the best score for each
248 unique distance and use the best score for some previously
249 computed distance to implement the X-dropoff test.
250
251 xdrop_offset gives the distance backwards in the score
252 array to look */
253
254 xdrop_offset = (xdrop_threshold + match_cost / 2) /
255 (match_cost + mismatch_cost) + 1;
256
257 // find the offset of the first mismatch between seq1 and seq2
258
259 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260 // fence_hit, reverse, rem);
261
262 // update the extents of the alignment, and bail out
263 // early if no further work is needed
264
265 seq1_align_len = index;
266 seq2_align_len = index;
267 seq1_index = index;
268 /*
269 seed->start_q = 0;
270 seed->start_s = 0;
271 seed->match_length = index;
272 */
273 longest_match_run = index;
274
275 if (index == len1 || index == len2) {
276 /* Return the number of differences, which is zero here */
277 if (edit_block != NULL)
278 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279 edit_block->opRep(index);
280 return 0;
281 }
282
283 // set up the memory pool
284 mem_pool = aux_data.space;
285 if (edit_block == NULL) {
286 mem_pool = NULL;
287 }
288 else if (mem_pool == NULL) {
289 aux_data.space = mem_pool = new GXMemPool();
290 }
291 else {
292 mem_pool->refresh();
293 }
294
295 /* set up the array of per-distance maximum scores. There
296 are max_diags + xdrop_offset distances to track, the first
297 xdrop_offset of which are 0 */
298
299 max_score = aux_data.max_score + xdrop_offset;
300 for (index = 0; index < xdrop_offset; index++)
301 aux_data.max_score[index] = 0;
302
303 // fill in the initial offsets of the distance matrix
304
305 last_seq2_off[0][diag_origin] = seq1_index;
306 max_score[0] = seq1_index * match_cost;
307 diag_lower = diag_origin - 1;
308 diag_upper = diag_origin + 1;
309 end1_reached = end2_reached = false;
310
311 // for each distance
312 for (d = 1; d <= max_dist; d++) {
313 int xdrop_score;
314 int curr_score;
315 int curr_extent = 0;
316 int curr_seq2_index = 0;
317 int curr_diag = 0;
318 int tmp_diag_lower = diag_lower;
319 int tmp_diag_upper = diag_upper;
320
321 // Assign impossible seq2 offsets to any diagonals that
322 // are not in the range (diag_lower,diag_upper).
323 // These will serve as sentinel values for the inner loop
324 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328
329 // compute the score for distance d corresponding to the X-dropoff criterion
330
331 xdrop_score = max_score[d - xdrop_offset] +
332 (match_cost + mismatch_cost) * d - xdrop_threshold;
333 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334
335 // for each diagonal of interest
336 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337 /* find the largest offset into seq2 that increases
338 the distance from d-1 to d (i.e. keeps the alignment
339 from getting worse for as long as possible), then
340 choose the offset into seq1 that will keep the
341 resulting diagonal fixed at k
342
343 Note that this requires kInvalidOffset+1 to be smaller
344 than any valid offset into seq2, i.e. to be negative */
345
346 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347 last_seq2_off[d - 1][k ]) + 1;
348 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349 seq1_index = seq2_index + k - diag_origin;
350
351 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352
353 // if no valid diagonal can reach distance d, or the
354 // X-dropoff test fails, narrow the range of diagonals
355 // to test and skip to the next diagonal
356 if (k == diag_lower)
357 diag_lower++;
358 else
359 last_seq2_off[d][k] = kInvalidOffset;
360 continue;
361 }
362 diag_upper = k;
363
364 /* slide down diagonal k until a mismatch
365 occurs. As long as only matches are encountered,
366 the current distance d will not change */
367
368 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369 seq1_index, seq2_index, reverse);
370 //fence_hit, reverse, rem);
371 if (index > longest_match_run) {
372 //seed->start_q = seq1_index;
373 //seed->start_s = seq2_index;
374 //seed->match_length = index;
375 longest_match_run = index;
376 }
377 seq1_index += index;
378 seq2_index += index;
379
380 // set the new largest seq2 offset that achieves
381 // distance d on diagonal k
382
383 last_seq2_off[d][k] = seq2_index;
384
385 // since all values of k are constrained to have the
386 // same distance d, the value of k which maximizes the
387 // alignment score is the one that covers the most of seq1 and seq2
388 if (seq1_index + seq2_index > curr_extent) {
389 curr_extent = seq1_index + seq2_index;
390 curr_seq2_index = seq2_index;
391 curr_diag = k;
392 }
393
394 /* clamp the bounds on diagonals to avoid walking off
395 either sequence. Because the bounds increase by at
396 most one for each distance, diag_lower and diag_upper
397 can each be of size at most max_diags+2 */
398
399 if (seq2_index == len2) {
400 diag_lower = k + 1;
401 end2_reached = true;
402 }
403 if (seq1_index == len1) {
404 diag_upper = k - 1;
405 end1_reached = true;
406 }
407 } // end loop over diagonals
408
409 // compute the maximum score possible for distance d
410 curr_score = curr_extent * (match_cost / 2) -
411 d * (match_cost + mismatch_cost);
412 // if this is the best score seen so far, update the
413 // statistics of the best alignment
414 if (curr_score > max_score[d - 1]) {
415 max_score[d] = curr_score;
416 best_dist = d;
417 best_diag = curr_diag;
418 seq2_align_len = curr_seq2_index;
419 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420 }
421 else {
422 max_score[d] = max_score[d - 1];
423 }
424
425 // alignment has finished if the lower and upper bounds
426 // on diagonals to check have converged to each other
427
428 if (diag_lower > diag_upper)
429 break;
430
431 /* set up for the next distance to examine. Because the
432 bounds increase by at most one for each distance,
433 diag_lower and diag_upper can each be of size at
434 most max_diags+2 */
435
436 if (!end2_reached)
437 diag_lower--;
438 if (!end1_reached)
439 diag_upper++;
440
441 if (edit_block == NULL) {
442 // if no traceback is specified, the next row of
443 // last_seq2_off can reuse previously allocated memory
444 //TODO FIXME The following assumes two arrays of
445 // at least max_dist+4 int's have already been allocated
446 last_seq2_off[d + 1] = last_seq2_off[d - 1];
447 }
448 else {
449 // traceback requires all rows of last_seq2_off to be saved,
450 // so a new row must be allocated
451 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452 // move the origin for this row backwards
453 //TODO FIXME: dubious pointer arithmetic ?!
454 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455 }
456 } // end loop over distinct distances
457
458
459 if (edit_block == NULL)
460 return best_dist;
461
462 //---- perform traceback
463 d = best_dist;
464 seq1_index = seq1_align_len;
465 seq2_index = seq2_align_len;
466 // for all positive distances
467
468 //if (fence_hit && *fence_hit)
469 // goto done;
470 if (index==len1 || index==len2) d=0;
471 while (d > 0) {
472 int new_diag;
473 int new_seq2_index;
474
475 /* retrieve the value of the diagonal after the next
476 traceback operation. best_diag starts off with the
477 value computed during the alignment process */
478
479 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480 best_diag, &new_seq2_index);
481
482 if (new_diag == best_diag) {
483 // same diagonal: issue a group of substitutions
484 if (seq2_index - new_seq2_index > 0) {
485 edit_block->opRep(seq2_index - new_seq2_index);
486 }
487 }
488 else if (new_diag < best_diag) {
489 // smaller diagonal: issue a group of substitutions
490 // and then a gap in seq2 */
491 if (seq2_index - new_seq2_index > 0) {
492 edit_block->opRep(seq2_index - new_seq2_index);
493 }
494 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495 edit_block->opIns(1);
496 }
497 else {
498 // larger diagonal: issue a group of substitutions
499 // and then a gap in seq1
500 if (seq2_index - new_seq2_index - 1 > 0) {
501 edit_block->opRep(seq2_index - new_seq2_index - 1);
502 }
503 edit_block->opDel(1);
504 }
505 d--;
506 best_diag = new_diag;
507 seq2_index = new_seq2_index;
508 }
509 //done:
510 // handle the final group of substitutions back to distance zero,
511 // i.e. back to offset zero of seq1 and seq2
512 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513 // last_seq2_off[0][diag_origin]);
514 edit_block->opRep(last_seq2_off[0][diag_origin]);
515 if (!reverse)
516 edit_block->reverse();
517 return best_dist;
518 }
519
520 void printEditScript(GXEditScript* ed_script) {
521 uint i;
522 if (ed_script==NULL || ed_script->opnum == 0)
523 return;
524 for (i=0; i<ed_script->opnum; i++) {
525 int num=((ed_script->ops[i]) >> 2);
526 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527 if (op_type == 3)
528 GError("Error: printEditScript encountered op_type 3 ?!\n");
529 GMessage("%d%c ", num, xgapcodes[op_type]);
530 }
531 GMessage("\n");
532 }
533
534 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535 bool editscript, int reward, int penalty, int xdrop) {
536 int q_max=strlen(q_seq); //query
537 int s_max=strlen(s_seq); //subj
538 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
540 }
541
542 struct GXSeedTable {
543 int a_num, b_num;
544 int a_cap, b_cap;
545 char* xc;
546 GXSeedTable(int a=12, int b=255) {
547 a_cap=0;
548 b_cap=0;
549 a_num=0;
550 b_num=0;
551 xc=NULL;
552 init(a,b);
553 }
554 ~GXSeedTable() {
555 GFREE(xc);
556 }
557 void init(int a, int b) {
558 a_num=a;
559 b_num=b;
560 bool resize=false;
561 if (b_num>b_cap) { resize=true; b_cap=b_num;}
562 if (a_num>a_cap) { resize=true; a_cap=a_num;}
563 if (resize) {
564 GFREE(xc);
565 GCALLOC(xc, (a_num*b_num));
566 }
567 else {
568 //just clear up to a_max, b_max
569 memset((void*)xc, 0, (a_num*b_num));
570 }
571 }
572 char& x(int ax, int by) {
573 return xc[by*a_num+ax];
574 }
575
576 };
577
578 const int a_m_score=2; //match score
579 const int a_mis_score=-3; //mismatch
580 const int a_dropoff_score=7;
581
582 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
583 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
584 //check minimum score and
585 //for 3' adapter trimming:
586 // require that the right end of the alignment for either the adaptor OR the read must be
587 // < 3 distance from its right end
588 // for 5' adapter trimming:
589 // require that the left end of the alignment for either the adaptor OR the read must
590 // be at coordinate < 3 from start
591
592 bool extendUngapped(const char* a, int alen, int ai,
593 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
594 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
595 //if (debug) {
596 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
597 // }
598 int a_l=ai; //alignment coordinates on a
599 int a_r=ai+mlen-1;
600 int b_l=bi; //alignment coordinates on b
601 int b_r=bi+mlen-1;
602 int ai_maxscore=ai;
603 int bi_maxscore=bi;
604 int score=mlen*a_m_score;
605 int maxscore=score;
606 int mism5score=a_mis_score;
607 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
608 //try to extend to the left first, if possible
609 while (ai>0 && bi>0) {
610 ai--;
611 bi--;
612 score+= (a[ai]==b[bi])? a_m_score : mism5score;
613 if (score>maxscore) {
614 ai_maxscore=ai;
615 bi_maxscore=bi;
616 maxscore=score;
617 }
618 else if (maxscore-score>a_dropoff_score) break;
619 }
620 a_l=ai_maxscore;
621 b_l=bi_maxscore;
622 //now extend to the right
623 ai_maxscore=a_r;
624 bi_maxscore=b_r;
625 ai=a_r;
626 bi=b_r;
627 score=maxscore;
628 //sometimes there are extra As at the end of the read, ignore those
629 if (a[alen-2]=='A' && a[alen-1]=='A') {
630 alen-=2;
631 while (a[alen-1]=='A' && alen>ai) alen--;
632 }
633 while (ai<alen-1 && bi<blen-1) {
634 ai++;
635 bi++;
636 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
637 if (a[ai]==b[bi]) { //match
638 score+=a_m_score;
639 if (ai>=alen-2) {
640 score+=a_m_score-(alen-ai-1);
641 }
642 }
643 else { //mismatch
644 score+=a_mis_score;
645 }
646 if (score>maxscore) {
647 ai_maxscore=ai;
648 bi_maxscore=bi;
649 maxscore=score;
650 }
651 else if (maxscore-score>a_dropoff_score) break;
652 }
653 a_r=ai_maxscore;
654 b_r=bi_maxscore;
655 int a_ovh3=alen-a_r-1;
656 int b_ovh3=blen-b_r-1;
657 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
658 int mmovh5=(a_l<b_l)? a_l : b_l;
659 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
660 if (a_l<a_ovh3) {
661 //adapter closer to the left end (typical for 5' adapter)
662 l5=a_r+1;
663 l3=alen-1;
664 }
665 else {
666 //adapter matching at the right end (typical for 3' adapter)
667 l5=0;
668 l3=a_l-1;
669 }
670 return true;
671 }
672 //do not trim:
673 l5=0;
674 l3=alen-1;
675 return false;
676 }
677
678
679 bool extendUngapped_R(const char* seqa, int a_len, int &aix,
680 const char* seqb, int b_len, int &bix, int &len) {
681 //must start with a match
682 if (seqa[aix]!=seqb[bix]) GError("Error at extendUngapped_R: no initial match!\n");
683
684 }
685
686 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
687 //overlap of right (3') end of seqb
688 //hash the first 12 bases of seqa:
689 int aimin=0;
690 int aimax=GMIN(9,(a_len-4));
691 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
692 int bimax=b_len-4;
693 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
694 int b_maxlen=b_len; //number of cols in the diagonal matrix
695 GXSeedTable gx(a_maxlen, b_maxlen);
696 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
697 for (int ai=aimin;ai<=aimax;ai++) {
698 int32* av=(int32 *)(&seqa[ai]);
699 //for (int bi=b_len-4;bi>=bimin;bi--) {
700 for (int bi=bimin;bi<=bimax;bi++) {
701 if (gx.x(ai,bi))
702 continue; //already have a previous seed covering this region of this diagonal
703 int32* bv=(int32 *)(&seqb[bi]);
704 if (*av == *bv) {
705 //word match
706 //see if we can extend to the right
707 gx.x(ai+1,bi+1)=1;
708 gx.x(ai+2,bi+2)=1;
709 gx.x(ai+3,bi+3)=1;
710 int aix=ai+4;
711 int bix=bi+4;
712 int len=4;
713 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
714 {
715 gx.x(aix,bix)=1;
716 aix++;bix++;
717 len++;
718 }
719 if (len>8) {
720 //heuristics: very likely the best we can get
721 int qlen=len;
722 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
723 aix++;
724 bix++;
725 qlen++;
726 }
727 if (qlen>10) {
728 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
729 return diagstrips;
730 }
731 }
732 GXSeed* newseed=new GXSeed(ai,bi,len);
733 seeds.Add(newseed);
734 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
735 }
736 }
737 }//for each 4-mer at the beginning of seqa
738 for (int i=0;i<diagstrips->Count();i++) {
739 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
740 }
741 diagstrips->setSorted(true); //sort by score
742 return diagstrips;
743 }
744
745 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
746 //overlap of left (5') end of seqb
747 //hash the last 12 bases of seqa:
748 int aimin=GMAX(0,(a_len-16));
749 int aimax=a_len-4;
750 int bimin=0;
751 int bimax=GMIN((a_len-2), (b_len-4));
752 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
753 int b_maxlen=b_len; //number of cols in the diagonal matrix
754 //gx.init(a_maxlen, b_maxlen);
755 GXSeedTable gx(a_maxlen, b_maxlen);
756 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
757 for (int ai=aimin;ai<=aimax;ai++) {
758 int32* av=(int32 *)(&seqa[ai]);
759 for (int bi=bimin;bi<=bimax;bi++) {
760 if (gx.x(ai,bi))
761 continue; //already have a previous seed covering this region of this diagonal
762 int32* bv=(int32 *)(&seqb[bi]);
763 if (*av == *bv) {
764 //word match
765 //see if we can extend to the right
766 gx.x(ai+1,bi+1)=1;
767 gx.x(ai+2,bi+2)=1;
768 gx.x(ai+3,bi+3)=1;
769 int aix=ai+4;
770 int bix=bi+4;
771 int len=4;
772 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
773 {
774 gx.x(aix,bix)=1;
775 aix++;bix++;
776 len++;
777 }
778 GXSeed* newseed=new GXSeed(ai,bi,len);
779 seeds.Add(newseed);
780 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
781 }
782 }
783 }//for each 4-mer at the end of seqa
784 for (int i=0;i<diagstrips->Count();i++) {
785 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
786 }
787 diagstrips->setSorted(true); //sort by score
788 return diagstrips;
789 }
790
791
792
793 int cmpSeedScore(const pointer p1, const pointer p2) {
794 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
795 GXSeed* s1=(GXSeed*)p1;
796 GXSeed* s2=(GXSeed*)p2;
797 if (s1->len==s2->len) {
798 return (s1->b_ofs-s2->b_ofs);
799 }
800 else return (s2->len-s1->len);
801 }
802
803 int cmpSeedDiag(const pointer p1, const pointer p2) {
804 GXSeed* s1=(GXSeed*)p1;
805 GXSeed* s2=(GXSeed*)p2;
806 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
807 }
808
809 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
810 const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
811 bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
812 GXEditScript* ed_script_fwd = NULL;
813 GXEditScript* ed_script_rev = NULL;
814 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
815 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
816 q_alnstart--;
817 s_alnstart--;
818 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
822 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
823 const char* q=q_seq+q_alnstart;
824 int q_avail=q_max-q_alnstart;
825 const char* s=s_seq+s_alnstart;
826 int s_avail=s_max-s_alnstart;
827 if (penalty<0) penalty=-penalty;
828 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
829 GXAlnInfo* alninfo=NULL;
830 bool freeAlnMem=(gxmem==NULL);
831 if (freeAlnMem) {
832 gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
833 }
834 else gxmem->reset();
835 int retscore = 0;
836 int numdiffs = 0;
837 if (trimtype==galn_TrimLeft) {
838 if (editscript)
839 ed_script_rev=new GXEditScript();
840
841 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
842 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
843 //check this extension here and bail out if it's not a good extension
844 if (s_alnstart+1-s_ext_l>3) {
845 delete ed_script_rev;
846 if (freeAlnMem) delete gxmem;
847 return NULL;
848 }
849
850 if (editscript)
851 ed_script_fwd=new GXEditScript();
852 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
853 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
854 numdiffs=numdiffs_r+numdiffs_l;
855 //convert num diffs to actual score
856 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
857 if (editscript)
858 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
859 }
860 else {
861 if (editscript) {
862 ed_script_fwd=new GXEditScript();
863 }
864 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
865 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
866 //check extension here and bail out if not a good right extension
867 //assuming s_max is really at the right end of s_seq
868 if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
869 delete ed_script_fwd;
870 if (freeAlnMem) delete gxmem;
871 return NULL;
872 }
873 if (editscript)
874 ed_script_rev=new GXEditScript();
875 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
876 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
877 //convert num diffs to actual score
878 numdiffs=numdiffs_r+numdiffs_l;
879 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
880 if (editscript)
881 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882 }
883
884 if (retscore>=MIN_GREEDY_SCORE) {
885 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
886 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
887 alninfo->score=retscore;
888 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
889 #ifdef GDEBUG
890 if (ed_script_rev) {
891 GMessage("Final Edit script ::: ");
892 printEditScript(ed_script_rev);
893 }
894 #endif
895 alninfo->editscript=ed_script_rev;
896 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
897 }
898 else {
899 if (freeAlnMem) delete gxmem;
900 delete ed_script_rev;
901 delete alninfo;
902 alninfo=NULL;
903 }
904 delete ed_script_fwd;
905 return alninfo;
906 }
907
908 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
909 SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
910 bool editscript=false;
911 #ifdef GDEBUG
912 editscript=true;
913 GMessage("==========> matching Right (3') end ========\n");
914 #endif
915 GList<GXSeed> rseeds(true,true,false);
916 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
917 //did we find a shortcut?
918 if (alnbands->qmatch) {
919
920 }
921 GList<GXAlnInfo> galns(true,true,false);
922 int max_top_bands=5;
923 int top_band_count=0;
924 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
925 for (int b=0;b<alnbands->Count();b++) {
926 if (alnbands->Get(b)->score<4) break;
927 top_band_count++;
928 GXBand& band=*(alnbands->Get(b));
929 band.seeds.setSorted(cmpSeedScore);
930 anchor_seeds.Add(band.seeds.First());
931 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
932 }
933 #ifdef GDEBUG
934 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
935 #endif
936
937 for (int i=0;i<anchor_seeds.Count();i++) {
938 GXSeed& aseed=*anchor_seeds[i];
939 int a1=aseed.a_ofs+(aseed.len>>1)+1;
940 int a2=aseed.b_ofs+(aseed.len>>1)+1;
941 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
942 seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
943 match_reward, mismatch_penalty, Xdrop);
944 if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
945 galns.AddIfNew(alninfo, true);
946 else delete alninfo;
947 }
948
949 #ifdef GDEBUG
950 for (int i=0;i<galns.Count();i++) {
951 GXAlnInfo* alninfo=galns[i];
952 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
953 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
954 if (alninfo->gapinfo!=NULL) {
955 GMessage("Alignment:\n");
956 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
957 }
958 }
959 #endif
960 //---- done
961 delete alnbands;
962 if (galns.Count()) {
963 GXAlnInfo* bestaln=galns.Shift();
964 return bestaln;
965 }
966 else return NULL;
967 }
968
969 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
970 SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
971 bool editscript=false;
972 #ifdef GDEBUG
973 editscript=true;
974 GMessage("==========> matching Left (5') end ========\n");
975 #endif
976 GList<GXSeed> rseeds(true,true,false);
977 GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
978 GList<GXAlnInfo> galns(true,true,false);
979 int max_top_bands=5;
980 int top_band_count=0;
981 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
982 for (int b=0;b<alnbands->Count();b++) {
983 if (alnbands->Get(b)->score<4) break;
984 top_band_count++;
985 GXBand& band=*(alnbands->Get(b));
986 band.seeds.setSorted(cmpSeedScore);
987 anchor_seeds.Add(band.seeds.First());
988 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
989 }
990 #ifdef GDEBUG
991 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
992 #endif
993
994 for (int i=0;i<anchor_seeds.Count();i++) {
995 GXSeed& aseed=*anchor_seeds[i];
996 int a1=aseed.a_ofs+(aseed.len>>1)+1;
997 int a2=aseed.b_ofs+(aseed.len>>1)+1;
998 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
999 seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
1000 match_reward, mismatch_penalty, Xdrop-2);
1001 if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
1002 //TODO: at 5' end we should use a higher pid threshold
1003 // we could also decrease the X-drop value!
1004 galns.AddIfNew(alninfo, true);
1005 else delete alninfo;
1006 }
1007
1008 #ifdef GDEBUG
1009 for (int i=0;i<galns.Count();i++) {
1010 GXAlnInfo* alninfo=galns[i];
1011 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1012 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1013 if (alninfo->gapinfo!=NULL) {
1014 GMessage("Alignment:\n");
1015 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1016 }
1017 }
1018 #endif
1019 //---- done
1020 delete alnbands;
1021 if (galns.Count()) {
1022 GXAlnInfo* bestaln=galns.Shift();
1023 return bestaln;
1024 }
1025 else return NULL;
1026 }