ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 101
Committed: Fri Oct 7 21:13:55 2011 UTC (7 years, 7 months ago) by gpertea
File size: 38519 byte(s)
Log Message:
adjusting params

Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104
105 //signal that a diagonal is invalid
106 static const int kInvalidOffset = -2;
107
108 int s_FindFirstMismatch(const char *seq1, int len1,
109 const char *seq2, int len2,
110 int seq1_index, int seq2_index,
111 //bool &fence_hit,
112 bool reverse) {
113 int start_index = seq1_index;
114 /* Sentry detection here should be relatively inexpensive: The
115 sentry value cannot appear in the query, so detection only
116 needs to be done at exit from the subject-query matching loop.
117 For uncompressed sequences, ambiguities in the query (i.e. seq1)
118 always count as mismatches */
119 if (reverse) {
120 while (seq1_index < len1 && seq2_index < len2 &&
121 //seq1[len1-1 - seq1_index] < 4 &&
122 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123 ++seq1_index;
124 ++seq2_index;
125 }
126 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127 //if len2-1-seq2_index<=0) {
128 // fence_hit = true;
129 // }
130 }
131 else { //forward lookup
132 while (seq1_index < len1 && seq2_index < len2 &&
133 //seq1[seq1_index] < 4 &&
134 seq1[seq1_index] == seq2[seq2_index]) {
135 ++seq1_index;
136 ++seq2_index;
137 }
138 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139 //if (seq2_index==len2) {
140 // fence_hit = true;
141 //}
142 }
143 return seq1_index - start_index;
144 }
145
146
147
148 /** During the traceback for a non-affine greedy alignment,
149 compute the diagonal that will result from the next
150 traceback operation
151
152 @param last_seq2_off Array of offsets into the second sequence;
153 last_seq2_off[d][k] gives the largest offset into
154 the second sequence that lies on diagonal k and
155 has distance d [in]
156 @param d Starting distance [in]
157 @param diag Index of diagonal that produced the starting distance [in]
158 @param seq2_index The offset into the second sequence after the traceback
159 operation has completed [out]
160 @return The diagonal resulting from the next traceback operation
161 being applied
162 */
163 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164 int diag, int *seq2_index) {
165 // choose the traceback operation that results in the
166 // largest seq2 offset at this point, then compute the
167 // new diagonal that is implied by the operation
168 if (last_seq2_off[d-1][diag-1] >
169 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170 *seq2_index = last_seq2_off[d-1][diag-1];
171 return diag - 1; // gap in seq2
172 }
173 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174 *seq2_index = last_seq2_off[d-1][diag];
175 return diag; // match
176 }
177 *seq2_index = last_seq2_off[d-1][diag+1];
178 return diag + 1; // gap in seq1
179 }
180
181
182 int GXGreedyExtend(const char* seq1, int len1,
183 const char* seq2, int len2,
184 bool reverse, int xdrop_threshold,
185 int match_cost, int mismatch_cost,
186 int& seq1_align_len, int& seq2_align_len,
187 CGreedyAlignData& aux_data,
188 GXEditScript *edit_block) {
189 //GapPrelimEditBlock *edit_block,
190 //bool& fence_hit, SGreedySeed *seed) {
191 int seq1_index;
192 int seq2_index;
193 int index;
194 int d;
195 int k;
196 int diag_lower, diag_upper;
197 int max_dist;
198 int diag_origin;
199 int best_dist;
200 int best_diag;
201 int** last_seq2_off;
202 int* max_score;
203 int xdrop_offset;
204 int longest_match_run;
205 bool end1_reached, end2_reached;
206 GXMemPool* mem_pool;
207
208 /* ordinary dynamic programming alignment, for each offset
209 in seq1, walks through offsets in seq2 until an X-dropoff
210 test fails, saving the best score encountered along
211 the way. Instead of score, this code tracks the 'distance'
212 (number of mismatches plus number of gaps) between seq1
213 and seq2. Instead of walking through sequence offsets, it
214 walks through diagonals that can achieve a given distance.
215
216 Note that in what follows, the numbering of diagonals implies
217 a dot matrix where increasing seq1 offsets go to the right on
218 the x axis, and increasing seq2 offsets go up the y axis.
219 The gapped alignment thus proceeds up and to the right in
220 the graph, and diagonals are numbered increasing to the right */
221
222 best_dist = 0;
223 best_diag = 0;
224
225 /* set the number of distinct distances the algorithm will
226 examine in the search for an optimal alignment. The
227 heuristic worst-case running time of the algorithm is
228 O(max_dist**2 + (len1+len2)); for sequences which are
229 very similar, the average running time will be sig-
230 nificantly better than this */
231
232 max_dist = GMIN(GREEDY_MAX_COST,
233 len2 / GREEDY_MAX_COST_FRACTION + 1);
234
235 /* the main loop assumes that the index of all diagonals is
236 biased to lie in the middle of allocated bookkeeping
237 structures */
238
239 diag_origin = max_dist + 2;
240
241 // last_seq2_off[d][k] is the largest offset into seq2 that
242 // lies on diagonal k and has distance d
243
244 last_seq2_off = aux_data.last_seq2_off;
245
246 /* Instead of tracking the best alignment score and using
247 xdrop_theshold directly, track the best score for each
248 unique distance and use the best score for some previously
249 computed distance to implement the X-dropoff test.
250
251 xdrop_offset gives the distance backwards in the score
252 array to look */
253
254 xdrop_offset = (xdrop_threshold + match_cost / 2) /
255 (match_cost + mismatch_cost) + 1;
256
257 // find the offset of the first mismatch between seq1 and seq2
258
259 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260 // fence_hit, reverse, rem);
261
262 // update the extents of the alignment, and bail out
263 // early if no further work is needed
264
265 seq1_align_len = index;
266 seq2_align_len = index;
267 seq1_index = index;
268 /*
269 seed->start_q = 0;
270 seed->start_s = 0;
271 seed->match_length = index;
272 */
273 longest_match_run = index;
274
275 if (index == len1 || index == len2) {
276 /* Return the number of differences, which is zero here */
277 if (edit_block != NULL)
278 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279 edit_block->opRep(index);
280 return 0;
281 }
282
283 // set up the memory pool
284 mem_pool = aux_data.space;
285 if (edit_block == NULL) {
286 mem_pool = NULL;
287 }
288 else if (mem_pool == NULL) {
289 aux_data.space = mem_pool = new GXMemPool();
290 }
291 else {
292 mem_pool->refresh();
293 }
294
295 /* set up the array of per-distance maximum scores. There
296 are max_diags + xdrop_offset distances to track, the first
297 xdrop_offset of which are 0 */
298
299 max_score = aux_data.max_score + xdrop_offset;
300 for (index = 0; index < xdrop_offset; index++)
301 aux_data.max_score[index] = 0;
302
303 // fill in the initial offsets of the distance matrix
304
305 last_seq2_off[0][diag_origin] = seq1_index;
306 max_score[0] = seq1_index * match_cost;
307 diag_lower = diag_origin - 1;
308 diag_upper = diag_origin + 1;
309 end1_reached = end2_reached = false;
310
311 // for each distance
312 for (d = 1; d <= max_dist; d++) {
313 int xdrop_score;
314 int curr_score;
315 int curr_extent = 0;
316 int curr_seq2_index = 0;
317 int curr_diag = 0;
318 int tmp_diag_lower = diag_lower;
319 int tmp_diag_upper = diag_upper;
320
321 // Assign impossible seq2 offsets to any diagonals that
322 // are not in the range (diag_lower,diag_upper).
323 // These will serve as sentinel values for the inner loop
324 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328
329 // compute the score for distance d corresponding to the X-dropoff criterion
330
331 xdrop_score = max_score[d - xdrop_offset] +
332 (match_cost + mismatch_cost) * d - xdrop_threshold;
333 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334
335 // for each diagonal of interest
336 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337 /* find the largest offset into seq2 that increases
338 the distance from d-1 to d (i.e. keeps the alignment
339 from getting worse for as long as possible), then
340 choose the offset into seq1 that will keep the
341 resulting diagonal fixed at k
342
343 Note that this requires kInvalidOffset+1 to be smaller
344 than any valid offset into seq2, i.e. to be negative */
345
346 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347 last_seq2_off[d - 1][k ]) + 1;
348 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349 seq1_index = seq2_index + k - diag_origin;
350
351 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352
353 // if no valid diagonal can reach distance d, or the
354 // X-dropoff test fails, narrow the range of diagonals
355 // to test and skip to the next diagonal
356 if (k == diag_lower)
357 diag_lower++;
358 else
359 last_seq2_off[d][k] = kInvalidOffset;
360 continue;
361 }
362 diag_upper = k;
363
364 /* slide down diagonal k until a mismatch
365 occurs. As long as only matches are encountered,
366 the current distance d will not change */
367
368 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369 seq1_index, seq2_index, reverse);
370 //fence_hit, reverse, rem);
371 if (index > longest_match_run) {
372 //seed->start_q = seq1_index;
373 //seed->start_s = seq2_index;
374 //seed->match_length = index;
375 longest_match_run = index;
376 }
377 seq1_index += index;
378 seq2_index += index;
379
380 // set the new largest seq2 offset that achieves
381 // distance d on diagonal k
382
383 last_seq2_off[d][k] = seq2_index;
384
385 // since all values of k are constrained to have the
386 // same distance d, the value of k which maximizes the
387 // alignment score is the one that covers the most of seq1 and seq2
388 if (seq1_index + seq2_index > curr_extent) {
389 curr_extent = seq1_index + seq2_index;
390 curr_seq2_index = seq2_index;
391 curr_diag = k;
392 }
393
394 /* clamp the bounds on diagonals to avoid walking off
395 either sequence. Because the bounds increase by at
396 most one for each distance, diag_lower and diag_upper
397 can each be of size at most max_diags+2 */
398
399 if (seq2_index == len2) {
400 diag_lower = k + 1;
401 end2_reached = true;
402 }
403 if (seq1_index == len1) {
404 diag_upper = k - 1;
405 end1_reached = true;
406 }
407 } // end loop over diagonals
408
409 // compute the maximum score possible for distance d
410 curr_score = curr_extent * (match_cost / 2) -
411 d * (match_cost + mismatch_cost);
412 // if this is the best score seen so far, update the
413 // statistics of the best alignment
414 if (curr_score > max_score[d - 1]) {
415 max_score[d] = curr_score;
416 best_dist = d;
417 best_diag = curr_diag;
418 seq2_align_len = curr_seq2_index;
419 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420 }
421 else {
422 max_score[d] = max_score[d - 1];
423 }
424
425 // alignment has finished if the lower and upper bounds
426 // on diagonals to check have converged to each other
427
428 if (diag_lower > diag_upper)
429 break;
430
431 /* set up for the next distance to examine. Because the
432 bounds increase by at most one for each distance,
433 diag_lower and diag_upper can each be of size at
434 most max_diags+2 */
435
436 if (!end2_reached)
437 diag_lower--;
438 if (!end1_reached)
439 diag_upper++;
440
441 if (edit_block == NULL) {
442 // if no traceback is specified, the next row of
443 // last_seq2_off can reuse previously allocated memory
444 //TODO FIXME The following assumes two arrays of
445 // at least max_dist+4 int's have already been allocated
446 last_seq2_off[d + 1] = last_seq2_off[d - 1];
447 }
448 else {
449 // traceback requires all rows of last_seq2_off to be saved,
450 // so a new row must be allocated
451 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452 // move the origin for this row backwards
453 //TODO FIXME: dubious pointer arithmetic ?!
454 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455 }
456 } // end loop over distinct distances
457
458
459 if (edit_block == NULL)
460 return best_dist;
461
462 //---- perform traceback
463 d = best_dist;
464 seq1_index = seq1_align_len;
465 seq2_index = seq2_align_len;
466 // for all positive distances
467
468 //if (fence_hit && *fence_hit)
469 // goto done;
470 if (index==len1 || index==len2) d=0;
471 while (d > 0) {
472 int new_diag;
473 int new_seq2_index;
474
475 /* retrieve the value of the diagonal after the next
476 traceback operation. best_diag starts off with the
477 value computed during the alignment process */
478
479 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480 best_diag, &new_seq2_index);
481
482 if (new_diag == best_diag) {
483 // same diagonal: issue a group of substitutions
484 if (seq2_index - new_seq2_index > 0) {
485 edit_block->opRep(seq2_index - new_seq2_index);
486 }
487 }
488 else if (new_diag < best_diag) {
489 // smaller diagonal: issue a group of substitutions
490 // and then a gap in seq2 */
491 if (seq2_index - new_seq2_index > 0) {
492 edit_block->opRep(seq2_index - new_seq2_index);
493 }
494 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495 edit_block->opIns(1);
496 }
497 else {
498 // larger diagonal: issue a group of substitutions
499 // and then a gap in seq1
500 if (seq2_index - new_seq2_index - 1 > 0) {
501 edit_block->opRep(seq2_index - new_seq2_index - 1);
502 }
503 edit_block->opDel(1);
504 }
505 d--;
506 best_diag = new_diag;
507 seq2_index = new_seq2_index;
508 }
509 //done:
510 // handle the final group of substitutions back to distance zero,
511 // i.e. back to offset zero of seq1 and seq2
512 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513 // last_seq2_off[0][diag_origin]);
514 edit_block->opRep(last_seq2_off[0][diag_origin]);
515 if (!reverse)
516 edit_block->reverse();
517 return best_dist;
518 }
519
520 void printEditScript(GXEditScript* ed_script) {
521 uint i;
522 if (ed_script==NULL || ed_script->opnum == 0)
523 return;
524 for (i=0; i<ed_script->opnum; i++) {
525 int num=((ed_script->ops[i]) >> 2);
526 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527 if (op_type == 3)
528 GError("Error: printEditScript encountered op_type 3 ?!\n");
529 GMessage("%d%c ", num, xgapcodes[op_type]);
530 }
531 GMessage("\n");
532 }
533
534 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535 bool editscript, int reward, int penalty, int xdrop) {
536 int q_max=strlen(q_seq); //query
537 int s_max=strlen(s_seq); //subj
538 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540 }
541
542 struct GXSeedTable {
543 int a_num, b_num;
544 int a_cap, b_cap;
545 char* xc;
546 GXSeedTable(int a=12, int b=255) {
547 a_cap=0;
548 b_cap=0;
549 a_num=0;
550 b_num=0;
551 xc=NULL;
552 init(a,b);
553 }
554 ~GXSeedTable() {
555 GFREE(xc);
556 }
557 void init(int a, int b) {
558 a_num=a;
559 b_num=b;
560 bool resize=false;
561 if (b_num>b_cap) { resize=true; b_cap=b_num;}
562 if (a_num>a_cap) { resize=true; a_cap=a_num;}
563 if (resize) {
564 GFREE(xc);
565 GCALLOC(xc, (a_num*b_num));
566 }
567 else {
568 //just clear up to a_max, b_max
569 memset((void*)xc, 0, (a_num*b_num));
570 }
571 }
572 char& x(int ax, int by) {
573 return xc[by*a_num+ax];
574 }
575
576 };
577
578 const int a_m_score=2; //match score
579 const int a_mis_score=-3; //mismatch
580 const int a_dropoff_score=7;
581 const int a_min_score=12; //at least 6 bases full match
582
583 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585 //check minimum score and
586 //for 3' adapter trimming:
587 // require that the right end of the alignment for either the adaptor OR the read must be
588 // < 3 distance from its right end
589 // for 5' adapter trimming:
590 // require that the left end of the alignment for either the adaptor OR the read must
591 // be at coordinate < 3 from start
592
593 bool extendUngapped(const char* a, int alen, int ai,
594 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596 //if (debug) {
597 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598 // }
599 int a_l=ai; //alignment coordinates on a
600 int a_r=ai+mlen-1;
601 int b_l=bi; //alignment coordinates on b
602 int b_r=bi+mlen-1;
603 int ai_maxscore=ai;
604 int bi_maxscore=bi;
605 int score=mlen*a_m_score;
606 int maxscore=score;
607 int mism5score=a_mis_score;
608 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609 //try to extend to the left first, if possible
610 while (ai>0 && bi>0) {
611 ai--;
612 bi--;
613 score+= (a[ai]==b[bi])? a_m_score : mism5score;
614 if (score>maxscore) {
615 ai_maxscore=ai;
616 bi_maxscore=bi;
617 maxscore=score;
618 }
619 else if (maxscore-score>a_dropoff_score) break;
620 }
621 a_l=ai_maxscore;
622 b_l=bi_maxscore;
623 //now extend to the right
624 ai_maxscore=a_r;
625 bi_maxscore=b_r;
626 ai=a_r;
627 bi=b_r;
628 score=maxscore;
629 //sometimes there are extra As at the end of the read, ignore those
630 if (a[alen-2]=='A' && a[alen-1]=='A') {
631 alen-=2;
632 while (a[alen-1]=='A' && alen>ai) alen--;
633 }
634 while (ai<alen-1 && bi<blen-1) {
635 ai++;
636 bi++;
637 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638 if (a[ai]==b[bi]) { //match
639 score+=a_m_score;
640 if (ai>=alen-2) {
641 score+=a_m_score-(alen-ai-1);
642 }
643 }
644 else { //mismatch
645 score+=a_mis_score;
646 }
647 if (score>maxscore) {
648 ai_maxscore=ai;
649 bi_maxscore=bi;
650 maxscore=score;
651 }
652 else if (maxscore-score>a_dropoff_score) break;
653 }
654 a_r=ai_maxscore;
655 b_r=bi_maxscore;
656 int a_ovh3=alen-a_r-1;
657 int b_ovh3=blen-b_r-1;
658 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659 int mmovh5=(a_l<b_l)? a_l : b_l;
660 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661 if (a_l<a_ovh3) {
662 //adapter closer to the left end (typical for 5' adapter)
663 l5=a_r+1;
664 l3=alen-1;
665 }
666 else {
667 //adapter matching at the right end (typical for 3' adapter)
668 l5=0;
669 l3=a_l-1;
670 }
671 return true;
672 }
673 //do not trim:
674 l5=0;
675 l3=alen-1;
676 return false;
677 }
678
679
680 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681 //overlap of right (3') end of seqb
682 //hash the first 12 bases of seqa:
683 int aimin=0;
684 int aimax=GMIN(9,(a_len-4));
685 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686 int bimax=b_len-4;
687 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688 int b_maxlen=b_len; //number of cols in the diagonal matrix
689 GXSeedTable gx(a_maxlen, b_maxlen);
690 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691 for (int ai=aimin;ai<=aimax;ai++) {
692 int32* av=(int32 *)(&seqa[ai]);
693 //for (int bi=b_len-4;bi>=bimin;bi--) {
694 for (int bi=bimin;bi<=bimax;bi++) {
695 if (gx.x(ai,bi))
696 continue; //already have a previous seed covering this region of this diagonal
697 int32* bv=(int32 *)(&seqb[bi]);
698 if (*av == *bv) {
699 //word match
700 //see if we can extend to the right
701 gx.x(ai+1,bi+1)=1;
702 gx.x(ai+2,bi+2)=1;
703 gx.x(ai+3,bi+3)=1;
704 int aix=ai+4;
705 int bix=bi+4;
706 int len=4;
707 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708 {
709 gx.x(aix,bix)=1;
710 aix++;bix++;
711 len++;
712 }
713 if (len>7) {
714 //heuristics: very likely the best we can get
715 int qlen=len;
716 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717 aix++;
718 bix++;
719 qlen++;
720 }
721 if (qlen>10) { //found a quick match shortcut
722 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723 return diagstrips;
724 }
725 }
726 GXSeed* newseed=new GXSeed(ai,bi,len);
727 seeds.Add(newseed);
728 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 }
730 }
731 }//for each 4-mer at the beginning of seqa
732 for (int i=0;i<diagstrips->Count();i++) {
733 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
734 }
735 diagstrips->setSorted(true); //sort by score
736 return diagstrips;
737 }
738
739 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
740 //overlap of left (5') end of seqb
741 //hash the last 12 bases of seqa:
742 int aimin=GMAX(0,(a_len-16));
743 int aimax=a_len-4;
744 int bimin=0;
745 int bimax=GMIN((a_len-2), (b_len-4));
746 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
747 int b_maxlen=b_len; //number of cols in the diagonal matrix
748 //gx.init(a_maxlen, b_maxlen);
749 GXSeedTable gx(a_maxlen, b_maxlen);
750 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
751 for (int ai=aimin;ai<=aimax;ai++) {
752 int32* av=(int32 *)(&seqa[ai]);
753 for (int bi=bimin;bi<=bimax;bi++) {
754 if (gx.x(ai,bi))
755 continue; //already have a previous seed covering this region of this diagonal
756 int32* bv=(int32 *)(&seqb[bi]);
757 if (*av == *bv) {
758 //word match
759 //see if we can extend to the right
760 gx.x(ai+1,bi+1)=1;
761 gx.x(ai+2,bi+2)=1;
762 gx.x(ai+3,bi+3)=1;
763 int aix=ai+4;
764 int bix=bi+4;
765 int len=4;
766 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
767 {
768 gx.x(aix,bix)=1;
769 aix++;bix++;
770 len++;
771 }
772 if (len>7) {
773 //heuristics: very likely the best we can get
774 int qlen=len;
775 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
776 aix++;
777 bix++;
778 qlen++;
779 }
780 if (qlen>10) { //found a quick match shortcut
781 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
782 return diagstrips;
783 }
784 }
785 GXSeed* newseed=new GXSeed(ai,bi,len);
786 seeds.Add(newseed);
787 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
788 }
789 }
790 }//for each 4-mer at the end of seqa
791 for (int i=0;i<diagstrips->Count();i++) {
792 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
793 }
794 diagstrips->setSorted(true); //sort by score
795 return diagstrips;
796 }
797
798 int cmpSeedScore(const pointer p1, const pointer p2) {
799 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
800 GXSeed* s1=(GXSeed*)p1;
801 GXSeed* s2=(GXSeed*)p2;
802 if (s1->len==s2->len) {
803 return (s1->b_ofs-s2->b_ofs);
804 }
805 else return (s2->len-s1->len);
806 }
807
808 int cmpSeedDiag(const pointer p1, const pointer p2) {
809 GXSeed* s1=(GXSeed*)p1;
810 GXSeed* s2=(GXSeed*)p2;
811 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
812 }
813
814
815 int cmpDiagBands_R(const pointer p1, const pointer p2) {
816 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
817 GXBand* b1=(GXBand*)p1;
818 GXBand* b2=(GXBand*)p2;
819 if (b1->score==b2->score) {
820 return (b2->w_min_b-b1->w_min_b);
821 }
822 else return (b2->score-b1->score);
823 }
824
825
826
827 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
828 const char* s_seq, int s_alnstart, int s_max,
829 int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
830 CAlnTrim* trim, bool editscript) {
831 GXEditScript* ed_script_fwd = NULL;
832 GXEditScript* ed_script_rev = NULL;
833 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
834 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
835 q_alnstart--;
836 s_alnstart--;
837 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
838 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
839 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
840 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
841 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
842 const char* q=q_seq+q_alnstart;
843 int q_avail=q_max-q_alnstart;
844 const char* s=s_seq+s_alnstart;
845 int s_avail=s_max-s_alnstart;
846 if (penalty<0) penalty=-penalty;
847 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
848 GXAlnInfo* alninfo=NULL;
849 bool freeAlnMem=(gxmem==NULL);
850 if (freeAlnMem) {
851 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
852 }
853 else gxmem->reset();
854 int retscore = 0;
855 int numdiffs = 0;
856 if (trim!=NULL && trim->type==galn_TrimLeft) {
857 if (editscript)
858 ed_script_rev=new GXEditScript();
859
860 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
861 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
862 //check this extension here and bail out if it's not a good extension
863 if (s_alnstart+1-s_ext_l>trim->boundary) {
864 delete ed_script_rev;
865 if (freeAlnMem) delete gxmem;
866 return NULL;
867 }
868
869 if (editscript)
870 ed_script_fwd=new GXEditScript();
871 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
872 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
873 numdiffs=numdiffs_r+numdiffs_l;
874 //convert num diffs to actual score
875 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
876 if (editscript)
877 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
878 }
879 else {
880 if (editscript) {
881 ed_script_fwd=new GXEditScript();
882 }
883 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
884 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
885 //check extension here and bail out if not a good right extension
886 //assuming s_max is really at the right end of s_seq
887 if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
888 delete ed_script_fwd;
889 if (freeAlnMem) delete gxmem;
890 return NULL;
891 }
892 if (editscript)
893 ed_script_rev=new GXEditScript();
894 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
895 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
896 //convert num diffs to actual score
897 numdiffs=numdiffs_r+numdiffs_l;
898 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
899 if (editscript)
900 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
901 }
902
903 if (retscore>=MIN_GREEDY_SCORE) {
904 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
905 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
906 alninfo->score=retscore;
907 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
908 #ifdef GDEBUG
909 if (ed_script_rev) {
910 GMessage("Final Edit script ::: ");
911 printEditScript(ed_script_rev);
912 }
913 #endif
914 alninfo->editscript=ed_script_rev;
915 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
916 }
917 else {
918 if (freeAlnMem) delete gxmem;
919 delete ed_script_rev;
920 delete alninfo;
921 alninfo=NULL;
922 }
923 delete ed_script_fwd;
924 return alninfo;
925 }
926
927 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
928 const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
929 CAlnTrim* trim, bool editscript) {
930 int reward=2;
931 int penalty=3;
932 int xdrop=8;
933 if (gxmem) {
934 reward=gxmem->match_reward;
935 penalty=gxmem->mismatch_penalty;
936 xdrop=gxmem->x_drop;
937 }
938 return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
939 reward, penalty, xdrop, gxmem, trim, editscript);
940 }
941
942 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
943 CGreedyAlignData* gxmem, int min_pid) {
944 bool editscript=false;
945 #ifdef GDEBUG
946 editscript=true;
947 GMessage("==========> matching Right (3') end ========\n");
948 #endif
949
950 CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
951 GList<GXSeed> rseeds(true,true,false);
952 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
953 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
954 //did we find a shortcut?
955 if (alnbands->qmatch) {
956 #ifdef GDEBUG
957 GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
958 #endif
959 anchor_seeds.Add(alnbands->qmatch);
960 }
961 else {
962 int max_top_bands=5;
963 int top_band_count=0;
964 for (int b=0;b<alnbands->Count();b++) {
965 if (alnbands->Get(b)->score<4) break;
966 #ifdef GDEBUG
967 GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
968 #endif
969 top_band_count++;
970 GXBand& band=*(alnbands->Get(b));
971 band.seeds.setSorted(cmpSeedScore);
972 anchor_seeds.Add(band.seeds.First());
973 band.tested=true;
974 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
975 }
976 #ifdef GDEBUG
977 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
978 #endif
979 }
980
981 GList<GXAlnInfo> galns(true,true,false);
982 for (int i=0;i<anchor_seeds.Count();i++) {
983 GXSeed& aseed=*anchor_seeds[i];
984 int a1=aseed.a_ofs+(aseed.len>>1)+1;
985 int a2=aseed.b_ofs+(aseed.len>>1)+1;
986 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
987 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
988 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
989 galns.AddIfNew(alninfo, true);
990 else delete alninfo;
991 }
992 //special case: due to the scoring scheme bias towards the 5' end of the read,
993 //also try some seeds at the 3' end
994 if (galns.Count()==0) {
995 alnbands->setSorted(cmpDiagBands_R);
996 int max_top_bands=4;
997 int top_band_count=0;
998 #ifdef GDEBUG
999 GMessage(":::>> Retrying adjusting sort order.\n");
1000 #endif
1001 for (int b=0;b<alnbands->Count();b++) {
1002 if (alnbands->Get(b)->score<4) break;
1003 #ifdef GDEBUG
1004 GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1005 #endif
1006 if (alnbands->Get(b)->tested) continue;
1007 top_band_count++;
1008 GXBand& band=*(alnbands->Get(b));
1009 band.seeds.setSorted(cmpSeedScore);
1010 anchor_seeds.Add(band.seeds.First());
1011 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1012 }
1013 #ifdef GDEBUG
1014 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1015 #endif
1016 for (int i=0;i<anchor_seeds.Count();i++) {
1017 GXSeed& aseed=*anchor_seeds[i];
1018 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1019 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1020 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1023 galns.AddIfNew(alninfo, true);
1024 else delete alninfo;
1025 }
1026 }
1027 #ifdef GDEBUG
1028 for (int i=0;i<galns.Count();i++) {
1029 GXAlnInfo* alninfo=galns[i];
1030 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1031 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1032 if (alninfo->gapinfo!=NULL) {
1033 GMessage("Alignment:\n");
1034 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1035 }
1036 }
1037 #endif
1038 //---- done
1039 delete alnbands;
1040 if (galns.Count()) {
1041 GXAlnInfo* bestaln=galns.Shift();
1042 return bestaln;
1043 }
1044 else return NULL;
1045 }
1046
1047 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1048 CGreedyAlignData* gxmem, int min_pid) {
1049 bool editscript=false;
1050 #ifdef GDEBUG
1051 editscript=true;
1052 GMessage("==========> matching Left (5') end ========\n");
1053 #endif
1054 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1055 GList<GXSeed> rseeds(true,true,false);
1056 GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1057 GList<GXAlnInfo> galns(true,true,false);
1058 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1059
1060 if (alnbands->qmatch) {
1061 #ifdef GDEBUG
1062 GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1063 #endif
1064 anchor_seeds.Add(alnbands->qmatch);
1065 }
1066 else {
1067 int max_top_bands=5;
1068 int top_band_count=0;
1069 for (int b=0;b<alnbands->Count();b++) {
1070 if (alnbands->Get(b)->score<4) break;
1071 #ifdef GDEBUG
1072 GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1073 #endif
1074 top_band_count++;
1075 GXBand& band=*(alnbands->Get(b));
1076 band.seeds.setSorted(cmpSeedScore);
1077 anchor_seeds.Add(band.seeds.First());
1078 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1079 }
1080 #ifdef GDEBUG
1081 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1082 #endif
1083 }
1084 for (int i=0;i<anchor_seeds.Count();i++) {
1085 GXSeed& aseed=*anchor_seeds[i];
1086 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1087 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1088 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1089 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1090 if (alninfo && alninfo->pid>=min_pid
1091 && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1092 galns.AddIfNew(alninfo, true);
1093 else delete alninfo;
1094 }
1095
1096 #ifdef GDEBUG
1097 for (int i=0;i<galns.Count();i++) {
1098 GXAlnInfo* alninfo=galns[i];
1099 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1100 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1101 if (alninfo->gapinfo!=NULL) {
1102 GMessage("Alignment:\n");
1103 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1104 }
1105 }
1106 #endif
1107 //---- done
1108 delete alnbands;
1109 if (galns.Count()) {
1110 GXAlnInfo* bestaln=galns.Shift();
1111 return bestaln;
1112 }
1113 else return NULL;
1114 }