ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 171
Committed: Tue Feb 14 22:36:26 2012 UTC (7 years, 4 months ago) by gpertea
File size: 39886 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104 uint16 get6mer(char* p) {
105 uint16 r=gdna2bit(p,3);
106 if (*p) {
107 uint16 v=gdna2bit(p,3);
108 r |= (v<<8);
109 }
110 return r;
111 }
112
113 //signal that a diagonal is invalid
114 static const int kInvalidOffset = -2;
115
116 int s_FindFirstMismatch(const char *seq1, int len1,
117 const char *seq2, int len2,
118 int seq1_index, int seq2_index,
119 //bool &fence_hit,
120 bool reverse) {
121 int start_index = seq1_index;
122 /* Sentry detection here should be relatively inexpensive: The
123 sentry value cannot appear in the query, so detection only
124 needs to be done at exit from the subject-query matching loop.
125 For uncompressed sequences, ambiguities in the query (i.e. seq1)
126 always count as mismatches */
127 if (reverse) {
128 while (seq1_index < len1 && seq2_index < len2 &&
129 //seq1[len1-1 - seq1_index] < 4 &&
130 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
131 ++seq1_index;
132 ++seq2_index;
133 }
134 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
135 //if len2-1-seq2_index<=0) {
136 // fence_hit = true;
137 // }
138 }
139 else { //forward lookup
140 while (seq1_index < len1 && seq2_index < len2 &&
141 //seq1[seq1_index] < 4 &&
142 seq1[seq1_index] == seq2[seq2_index]) {
143 ++seq1_index;
144 ++seq2_index;
145 }
146 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
147 //if (seq2_index==len2) {
148 // fence_hit = true;
149 //}
150 }
151 return seq1_index - start_index;
152 }
153
154
155
156 /** During the traceback for a non-affine greedy alignment,
157 compute the diagonal that will result from the next
158 traceback operation
159
160 @param last_seq2_off Array of offsets into the second sequence;
161 last_seq2_off[d][k] gives the largest offset into
162 the second sequence that lies on diagonal k and
163 has distance d [in]
164 @param d Starting distance [in]
165 @param diag Index of diagonal that produced the starting distance [in]
166 @param seq2_index The offset into the second sequence after the traceback
167 operation has completed [out]
168 @return The diagonal resulting from the next traceback operation
169 being applied
170 */
171 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
172 int diag, int *seq2_index) {
173 // choose the traceback operation that results in the
174 // largest seq2 offset at this point, then compute the
175 // new diagonal that is implied by the operation
176 if (last_seq2_off[d-1][diag-1] >
177 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
178 *seq2_index = last_seq2_off[d-1][diag-1];
179 return diag - 1; // gap in seq2
180 }
181 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
182 *seq2_index = last_seq2_off[d-1][diag];
183 return diag; // match
184 }
185 *seq2_index = last_seq2_off[d-1][diag+1];
186 return diag + 1; // gap in seq1
187 }
188
189
190 int GXGreedyExtend(const char* seq1, int len1,
191 const char* seq2, int len2,
192 bool reverse, int xdrop_threshold,
193 int match_cost, int mismatch_cost,
194 int& seq1_align_len, int& seq2_align_len,
195 CGreedyAlignData& aux_data,
196 GXEditScript *edit_block) {
197 //GapPrelimEditBlock *edit_block,
198 //bool& fence_hit, SGreedySeed *seed) {
199 int seq1_index;
200 int seq2_index;
201 int index;
202 int d;
203 int k;
204 int diag_lower, diag_upper;
205 int max_dist;
206 int diag_origin;
207 int best_dist;
208 int best_diag;
209 int** last_seq2_off;
210 int* max_score;
211 int xdrop_offset;
212 int longest_match_run;
213 bool end1_reached, end2_reached;
214 GXMemPool* mem_pool;
215
216 /* ordinary dynamic programming alignment, for each offset
217 in seq1, walks through offsets in seq2 until an X-dropoff
218 test fails, saving the best score encountered along
219 the way. Instead of score, this code tracks the 'distance'
220 (number of mismatches plus number of gaps) between seq1
221 and seq2. Instead of walking through sequence offsets, it
222 walks through diagonals that can achieve a given distance.
223
224 Note that in what follows, the numbering of diagonals implies
225 a dot matrix where increasing seq1 offsets go to the right on
226 the x axis, and increasing seq2 offsets go up the y axis.
227 The gapped alignment thus proceeds up and to the right in
228 the graph, and diagonals are numbered increasing to the right */
229
230 best_dist = 0;
231 best_diag = 0;
232
233 /* set the number of distinct distances the algorithm will
234 examine in the search for an optimal alignment. The
235 heuristic worst-case running time of the algorithm is
236 O(max_dist**2 + (len1+len2)); for sequences which are
237 very similar, the average running time will be sig-
238 nificantly better than this */
239
240 max_dist = GMIN(GREEDY_MAX_COST,
241 (len2/GREEDY_MAX_COST_FRACTION + 1));
242
243 /* the main loop assumes that the index of all diagonals is
244 biased to lie in the middle of allocated bookkeeping
245 structures */
246
247 diag_origin = max_dist + 2;
248
249 // last_seq2_off[d][k] is the largest offset into seq2 that
250 // lies on diagonal k and has distance d
251
252 last_seq2_off = aux_data.last_seq2_off;
253
254 /* Instead of tracking the best alignment score and using
255 xdrop_theshold directly, track the best score for each
256 unique distance and use the best score for some previously
257 computed distance to implement the X-dropoff test.
258
259 xdrop_offset gives the distance backwards in the score
260 array to look */
261
262 xdrop_offset = (xdrop_threshold + match_cost / 2) /
263 (match_cost + mismatch_cost) + 1;
264
265 // find the offset of the first mismatch between seq1 and seq2
266
267 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
268 // fence_hit, reverse, rem);
269
270 // update the extents of the alignment, and bail out
271 // early if no further work is needed
272
273 seq1_align_len = index;
274 seq2_align_len = index;
275 seq1_index = index;
276 /*
277 seed->start_q = 0;
278 seed->start_s = 0;
279 seed->match_length = index;
280 */
281 longest_match_run = index;
282
283 if (index == len1 || index == len2) {
284 /* Return the number of differences, which is zero here */
285 if (edit_block != NULL)
286 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
287 edit_block->opRep(index);
288 return 0;
289 }
290
291 // set up the memory pool
292 mem_pool = aux_data.space;
293 if (edit_block == NULL) {
294 mem_pool = NULL;
295 }
296 else if (mem_pool == NULL) {
297 aux_data.space = mem_pool = new GXMemPool();
298 }
299 else {
300 mem_pool->refresh();
301 }
302
303 /* set up the array of per-distance maximum scores. There
304 are max_diags + xdrop_offset distances to track, the first
305 xdrop_offset of which are 0 */
306
307 max_score = aux_data.max_score + xdrop_offset;
308 for (index = 0; index < xdrop_offset; index++)
309 aux_data.max_score[index] = 0;
310
311 // fill in the initial offsets of the distance matrix
312
313 last_seq2_off[0][diag_origin] = seq1_index;
314 max_score[0] = seq1_index * match_cost;
315 diag_lower = diag_origin - 1;
316 diag_upper = diag_origin + 1;
317 end1_reached = end2_reached = false;
318
319 // for each distance
320 for (d = 1; d <= max_dist; d++) {
321 int xdrop_score;
322 int curr_score;
323 int curr_extent = 0;
324 int curr_seq2_index = 0;
325 int curr_diag = 0;
326 int tmp_diag_lower = diag_lower;
327 int tmp_diag_upper = diag_upper;
328
329 // Assign impossible seq2 offsets to any diagonals that
330 // are not in the range (diag_lower,diag_upper).
331 // These will serve as sentinel values for the inner loop
332 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
333 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
334 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
335 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
336
337 // compute the score for distance d corresponding to the X-dropoff criterion
338
339 xdrop_score = max_score[d - xdrop_offset] +
340 (match_cost + mismatch_cost) * d - xdrop_threshold;
341 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
342
343 // for each diagonal of interest
344 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
345 /* find the largest offset into seq2 that increases
346 the distance from d-1 to d (i.e. keeps the alignment
347 from getting worse for as long as possible), then
348 choose the offset into seq1 that will keep the
349 resulting diagonal fixed at k
350
351 Note that this requires kInvalidOffset+1 to be smaller
352 than any valid offset into seq2, i.e. to be negative */
353
354 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
355 last_seq2_off[d - 1][k ]) + 1;
356 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
357 seq1_index = seq2_index + k - diag_origin;
358
359 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
360
361 // if no valid diagonal can reach distance d, or the
362 // X-dropoff test fails, narrow the range of diagonals
363 // to test and skip to the next diagonal
364 if (k == diag_lower)
365 diag_lower++;
366 else
367 last_seq2_off[d][k] = kInvalidOffset;
368 continue;
369 }
370 diag_upper = k;
371
372 /* slide down diagonal k until a mismatch
373 occurs. As long as only matches are encountered,
374 the current distance d will not change */
375
376 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
377 seq1_index, seq2_index, reverse);
378 //fence_hit, reverse, rem);
379 if (index > longest_match_run) {
380 //seed->start_q = seq1_index;
381 //seed->start_s = seq2_index;
382 //seed->match_length = index;
383 longest_match_run = index;
384 }
385 seq1_index += index;
386 seq2_index += index;
387
388 // set the new largest seq2 offset that achieves
389 // distance d on diagonal k
390
391 last_seq2_off[d][k] = seq2_index;
392
393 // since all values of k are constrained to have the
394 // same distance d, the value of k which maximizes the
395 // alignment score is the one that covers the most of seq1 and seq2
396 if (seq1_index + seq2_index > curr_extent) {
397 curr_extent = seq1_index + seq2_index;
398 curr_seq2_index = seq2_index;
399 curr_diag = k;
400 }
401
402 /* clamp the bounds on diagonals to avoid walking off
403 either sequence. Because the bounds increase by at
404 most one for each distance, diag_lower and diag_upper
405 can each be of size at most max_diags+2 */
406
407 if (seq2_index == len2) {
408 diag_lower = k + 1;
409 end2_reached = true;
410 }
411 if (seq1_index == len1) {
412 diag_upper = k - 1;
413 end1_reached = true;
414 }
415 } // end loop over diagonals
416
417 // compute the maximum score possible for distance d
418 curr_score = curr_extent * (match_cost / 2) -
419 d * (match_cost + mismatch_cost);
420 // if this is the best score seen so far, update the
421 // statistics of the best alignment
422 if (curr_score > max_score[d - 1]) {
423 max_score[d] = curr_score;
424 best_dist = d;
425 best_diag = curr_diag;
426 seq2_align_len = curr_seq2_index;
427 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
428 }
429 else {
430 max_score[d] = max_score[d - 1];
431 }
432
433 // alignment has finished if the lower and upper bounds
434 // on diagonals to check have converged to each other
435
436 if (diag_lower > diag_upper)
437 break;
438
439 /* set up for the next distance to examine. Because the
440 bounds increase by at most one for each distance,
441 diag_lower and diag_upper can each be of size at
442 most max_diags+2 */
443
444 if (!end2_reached)
445 diag_lower--;
446 if (!end1_reached)
447 diag_upper++;
448
449 if (edit_block == NULL) {
450 // if no traceback is specified, the next row of
451 // last_seq2_off can reuse previously allocated memory
452 //TODO FIXME The following assumes two arrays of
453 // at least max_dist+4 int's have already been allocated
454 last_seq2_off[d + 1] = last_seq2_off[d - 1];
455 }
456 else {
457 // traceback requires all rows of last_seq2_off to be saved,
458 // so a new row must be allocated
459 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
460 // move the origin for this row backwards
461 //TODO FIXME: dubious pointer arithmetic ?!
462 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
463 }
464 } // end loop over distinct distances
465
466
467 if (edit_block == NULL)
468 return best_dist;
469
470 //---- perform traceback
471 d = best_dist;
472 seq1_index = seq1_align_len;
473 seq2_index = seq2_align_len;
474 // for all positive distances
475
476 //if (fence_hit && *fence_hit)
477 // goto done;
478 if (index==len1 || index==len2) d=0;
479 while (d > 0) {
480 int new_diag;
481 int new_seq2_index;
482
483 /* retrieve the value of the diagonal after the next
484 traceback operation. best_diag starts off with the
485 value computed during the alignment process */
486
487 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
488 best_diag, &new_seq2_index);
489
490 if (new_diag == best_diag) {
491 // same diagonal: issue a group of substitutions
492 if (seq2_index - new_seq2_index > 0) {
493 edit_block->opRep(seq2_index - new_seq2_index);
494 }
495 }
496 else if (new_diag < best_diag) {
497 // smaller diagonal: issue a group of substitutions
498 // and then a gap in seq2 */
499 if (seq2_index - new_seq2_index > 0) {
500 edit_block->opRep(seq2_index - new_seq2_index);
501 }
502 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
503 edit_block->opIns(1);
504 }
505 else {
506 // larger diagonal: issue a group of substitutions
507 // and then a gap in seq1
508 if (seq2_index - new_seq2_index - 1 > 0) {
509 edit_block->opRep(seq2_index - new_seq2_index - 1);
510 }
511 edit_block->opDel(1);
512 }
513 d--;
514 best_diag = new_diag;
515 seq2_index = new_seq2_index;
516 }
517 //done:
518 // handle the final group of substitutions back to distance zero,
519 // i.e. back to offset zero of seq1 and seq2
520 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
521 // last_seq2_off[0][diag_origin]);
522 edit_block->opRep(last_seq2_off[0][diag_origin]);
523 if (!reverse)
524 edit_block->reverse();
525 return best_dist;
526 }
527
528 void printEditScript(GXEditScript* ed_script) {
529 uint i;
530 if (ed_script==NULL || ed_script->opnum == 0)
531 return;
532 for (i=0; i<ed_script->opnum; i++) {
533 int num=((ed_script->ops[i]) >> 2);
534 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
535 if (op_type == 3)
536 GError("Error: printEditScript encountered op_type 3 ?!\n");
537 GMessage("%d%c ", num, xgapcodes[op_type]);
538 }
539 GMessage("\n");
540 }
541
542 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
543 bool editscript, int reward, int penalty, int xdrop) {
544 int q_max=strlen(q_seq); //query
545 int s_max=strlen(s_seq); //subj
546 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
547 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
548 }
549
550 struct GXSeedTable {
551 int a_num, b_num;
552 int a_cap, b_cap;
553 char* xc;
554 GXSeedTable(int a=12, int b=255) {
555 a_cap=0;
556 b_cap=0;
557 a_num=0;
558 b_num=0;
559 xc=NULL;
560 init(a,b);
561 }
562 ~GXSeedTable() {
563 GFREE(xc);
564 }
565 void init(int a, int b) {
566 a_num=a;
567 b_num=b;
568 bool resize=false;
569 if (b_num>b_cap) { resize=true; b_cap=b_num;}
570 if (a_num>a_cap) { resize=true; a_cap=a_num;}
571 if (resize) {
572 GFREE(xc);
573 GCALLOC(xc, (a_num*b_num));
574 }
575 else {
576 //just clear up to a_max, b_max
577 memset((void*)xc, 0, (a_num*b_num));
578 }
579 }
580 char& x(int ax, int by) {
581 return xc[by*a_num+ax];
582 }
583
584 };
585
586 const int a_m_score=2; //match score
587 const int a_mis_score=-3; //mismatch
588 const int a_dropoff_score=7;
589 const int a_min_score=12; //at least 6 bases full match
590
591 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
592 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
593 //check minimum score and
594 //for 3' adapter trimming:
595 // require that the right end of the alignment for either the adaptor OR the read must be
596 // < 3 distance from its right end
597 // for 5' adapter trimming:
598 // require that the left end of the alignment for either the adaptor OR the read must
599 // be at coordinate < 3 from start
600
601 bool extendUngapped(const char* a, int alen, int ai,
602 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
603 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
604 //if (debug) {
605 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
606 // }
607 int a_l=ai; //alignment coordinates on a
608 int a_r=ai+mlen-1;
609 int b_l=bi; //alignment coordinates on b
610 int b_r=bi+mlen-1;
611 int ai_maxscore=ai;
612 int bi_maxscore=bi;
613 int score=mlen*a_m_score;
614 int maxscore=score;
615 int mism5score=a_mis_score;
616 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
617 //try to extend to the left first, if possible
618 while (ai>0 && bi>0) {
619 ai--;
620 bi--;
621 score+= (a[ai]==b[bi])? a_m_score : mism5score;
622 if (score>maxscore) {
623 ai_maxscore=ai;
624 bi_maxscore=bi;
625 maxscore=score;
626 }
627 else if (maxscore-score>a_dropoff_score) break;
628 }
629 a_l=ai_maxscore;
630 b_l=bi_maxscore;
631 //now extend to the right
632 ai_maxscore=a_r;
633 bi_maxscore=b_r;
634 ai=a_r;
635 bi=b_r;
636 score=maxscore;
637 //sometimes there are extra As at the end of the read, ignore those
638 if (a[alen-2]=='A' && a[alen-1]=='A') {
639 alen-=2;
640 while (a[alen-1]=='A' && alen>ai) alen--;
641 }
642 while (ai<alen-1 && bi<blen-1) {
643 ai++;
644 bi++;
645 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
646 if (a[ai]==b[bi]) { //match
647 score+=a_m_score;
648 if (ai>=alen-2) {
649 score+=a_m_score-(alen-ai-1);
650 }
651 }
652 else { //mismatch
653 score+=a_mis_score;
654 }
655 if (score>maxscore) {
656 ai_maxscore=ai;
657 bi_maxscore=bi;
658 maxscore=score;
659 }
660 else if (maxscore-score>a_dropoff_score) break;
661 }
662 a_r=ai_maxscore;
663 b_r=bi_maxscore;
664 int a_ovh3=alen-a_r-1;
665 int b_ovh3=blen-b_r-1;
666 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
667 int mmovh5=(a_l<b_l)? a_l : b_l;
668 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
669 if (a_l<a_ovh3) {
670 //adapter closer to the left end (typical for 5' adapter)
671 l5=a_r+1;
672 l3=alen-1;
673 }
674 else {
675 //adapter matching at the right end (typical for 3' adapter)
676 l5=0;
677 l3=a_l-1;
678 }
679 return true;
680 }
681 //do not trim:
682 l5=0;
683 l3=alen-1;
684 return false;
685 }
686
687
688 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
689 GVec<uint16> amers[], const char* seqb, int b_len) {
690 //overlap of right (3') end of seqb
691 //hash the first 12 bases of seqa:
692 int aimin=0;
693 int aimax=GMIN(9,(a_len-4));
694 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
695 int bimax=b_len-4;
696 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
697 int b_maxlen=b_len; //number of cols in the diagonal matrix
698 GXSeedTable gx(a_maxlen, b_maxlen);
699 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
700 for (int ai=aimin;ai<=aimax;ai++) {
701 int32* av=(int32 *)(&seqa[ai]);
702 //for (int bi=b_len-4;bi>=bimin;bi--) {
703 for (int bi=bimin;bi<=bimax;bi++) {
704 if (gx.x(ai,bi))
705 continue; //already have a previous seed covering this region of this diagonal
706 int32* bv=(int32 *)(&seqb[bi]);
707 if (*av == *bv) {
708 //word match
709 //see if we can extend to the right
710 gx.x(ai+1,bi+1)=1;
711 gx.x(ai+2,bi+2)=1;
712 gx.x(ai+3,bi+3)=1;
713 int aix=ai+4;
714 int bix=bi+4;
715 int len=4;
716 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
717 {
718 gx.x(aix,bix)=1;
719 aix++;bix++;
720 len++;
721 }
722 if (len>7) {
723 //heuristics: very likely the best we can get
724 int qlen=len;
725 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
726 aix++;
727 bix++;
728 qlen++;
729 }
730 if (qlen>10) { //found a quick match shortcut
731 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
732 return diagstrips;
733 }
734 }
735 GXSeed* newseed=new GXSeed(ai,bi,len);
736 seeds.Add(newseed);
737 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
738 //special last resort terminal match to be used if no better alignment is there
739 if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
740 }
741 }
742 }//for each 4-mer at the beginning of seqa
743 for (int i=0;i<diagstrips->Count();i++) {
744 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
745 }
746 diagstrips->setSorted(true); //sort by score
747 return diagstrips;
748 }
749
750 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
751 GVec<uint16> amers[], const char* seqb, int b_len) {
752 //overlap of left (5') end of seqb
753 //hash the last 12 bases of seqa:
754 int aimin=GMAX(0,(a_len-16));
755 int aimax=a_len-4;
756 int bimin=0;
757 int bimax=GMIN((a_len-2), (b_len-4));
758 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
759 int b_maxlen=b_len; //number of cols in the diagonal matrix
760 //gx.init(a_maxlen, b_maxlen);
761 GXSeedTable gx(a_maxlen, b_maxlen);
762 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
763 for (int ai=aimin;ai<=aimax;ai++) {
764 int32* av=(int32 *)(&seqa[ai]);
765 for (int bi=bimin;bi<=bimax;bi++) {
766 if (gx.x(ai,bi))
767 continue; //already have a previous seed covering this region of this diagonal
768 int32* bv=(int32 *)(&seqb[bi]);
769 if (*av == *bv) {
770 //word match
771 //see if we can extend to the right
772 gx.x(ai+1,bi+1)=1;
773 gx.x(ai+2,bi+2)=1;
774 gx.x(ai+3,bi+3)=1;
775 int aix=ai+4;
776 int bix=bi+4;
777 int len=4;
778 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
779 {
780 gx.x(aix,bix)=1;
781 aix++;bix++;
782 len++;
783 }
784 if (len>7) {
785 //heuristics: very likely the best we can get
786 int qlen=len;
787 while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
788 aix++;
789 bix++;
790 qlen++;
791 }
792 if (qlen>10) { //found a quick match shortcut
793 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
794 return diagstrips;
795 }
796 }
797 GXSeed* newseed=new GXSeed(ai,bi,len);
798 seeds.Add(newseed);
799 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
800 //special last resort terminal match to be used if no better alignment is there
801 if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
802 }
803 }
804 }//for each 4-mer at the end of seqa
805 for (int i=0;i<diagstrips->Count();i++) {
806 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
807 }
808 diagstrips->setSorted(true); //sort by score
809 return diagstrips;
810 }
811
812 int cmpSeedScore(const pointer p1, const pointer p2) {
813 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
814 GXSeed* s1=(GXSeed*)p1;
815 GXSeed* s2=(GXSeed*)p2;
816 if (s1->len==s2->len) {
817 return (s1->b_ofs-s2->b_ofs);
818 }
819 else return (s2->len-s1->len);
820 }
821
822 int cmpSeedDiag(const pointer p1, const pointer p2) {
823 GXSeed* s1=(GXSeed*)p1;
824 GXSeed* s2=(GXSeed*)p2;
825 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
826 }
827
828
829 int cmpDiagBands_R(const pointer p1, const pointer p2) {
830 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
831 GXBand* b1=(GXBand*)p1;
832 GXBand* b2=(GXBand*)p2;
833 if (b1->score==b2->score) {
834 return (b2->w_min_b-b1->w_min_b);
835 }
836 else return (b2->score-b1->score);
837 }
838
839
840
841 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
842 const char* s_seq, int s_alnstart, int s_max,
843 int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
844 CAlnTrim* trim, bool editscript) {
845 GXEditScript* ed_script_fwd = NULL;
846 GXEditScript* ed_script_rev = NULL;
847 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
848 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
849 q_alnstart--;
850 s_alnstart--;
851 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
852 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
853 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
854 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
855 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
856 const char* q=q_seq+q_alnstart;
857 int q_avail=q_max-q_alnstart;
858 const char* s=s_seq+s_alnstart;
859 int s_avail=s_max-s_alnstart;
860 if (penalty<0) penalty=-penalty;
861 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
862 GXAlnInfo* alninfo=NULL;
863 bool freeAlnMem=(gxmem==NULL);
864 if (freeAlnMem) {
865 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
866 }
867 else gxmem->reset();
868 int retscore = 0;
869 int numdiffs = 0;
870 if (trim!=NULL && trim->type==galn_TrimLeft) {
871 if (editscript)
872 ed_script_rev=new GXEditScript();
873
874 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
875 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
876 //check this extension here and bail out if it's not a good extension
877 if (s_alnstart+1-s_ext_l>trim->boundary) {
878 delete ed_script_rev;
879 if (freeAlnMem) delete gxmem;
880 return NULL;
881 }
882
883 if (editscript)
884 ed_script_fwd=new GXEditScript();
885 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
886 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
887 numdiffs=numdiffs_r+numdiffs_l;
888 //convert num diffs to actual score
889 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
890 if (editscript)
891 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
892 }
893 else {
894 if (editscript) {
895 ed_script_fwd=new GXEditScript();
896 }
897 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
898 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
899 //check extension here and bail out if not a good right extension
900 //assuming s_max is really at the right end of s_seq
901 if (trim!=NULL && trim->type==galn_TrimRight &&
902 s_alnstart+s_ext_r<trim->boundary) {
903 delete ed_script_fwd;
904 if (freeAlnMem) delete gxmem;
905 return NULL;
906 }
907 if (editscript)
908 ed_script_rev=new GXEditScript();
909 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
910 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
911 //convert num diffs to actual score
912 numdiffs=numdiffs_r+numdiffs_l;
913 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
914 if (editscript)
915 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
916 }
917
918 if (retscore>=MIN_GREEDY_SCORE) {
919 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
920 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
921 alninfo->score=retscore;
922 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
923 #ifdef GDEBUG
924 //if (ed_script_rev) {
925 // GMessage("Final Edit script ::: ");
926 // printEditScript(ed_script_rev);
927 // }
928 #endif
929 alninfo->editscript=ed_script_rev;
930 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
931 }
932 else {
933 if (freeAlnMem) delete gxmem;
934 delete ed_script_rev;
935 delete alninfo;
936 alninfo=NULL;
937 }
938 delete ed_script_fwd;
939 return alninfo;
940 }
941
942 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
943 const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
944 CAlnTrim* trim, bool editscript) {
945 int reward=2;
946 int penalty=3;
947 int xdrop=8;
948 if (gxmem) {
949 reward=gxmem->match_reward;
950 penalty=gxmem->mismatch_penalty;
951 xdrop=gxmem->x_drop;
952 }
953 return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
954 reward, penalty, xdrop, gxmem, trim, editscript);
955 }
956
957 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
958 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
959 bool editscript=false;
960 #ifdef GDEBUG
961 editscript=true;
962 // GMessage("==========> matching Right (3') end ========\n");
963 #endif
964
965 CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
966 GList<GXSeed> rseeds(true,true,false);
967 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
968 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
969 //did we find a shortcut?
970 if (alnbands->qmatch) {
971 //#ifdef GDEBUG
972 // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
973 //#endif
974 anchor_seeds.Add(alnbands->qmatch);
975 }
976 else {
977 int max_top_bands=5;
978 int top_band_count=0;
979 for (int b=0;b<alnbands->Count();b++) {
980 if (alnbands->Get(b)->score<4) break;
981 //#ifdef GDEBUG
982 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
983 //#endif
984 top_band_count++;
985 GXBand& band=*(alnbands->Get(b));
986 band.seeds.setSorted(cmpSeedScore);
987 anchor_seeds.Add(band.seeds.First());
988 band.tested=true;
989 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
990 }
991 //#ifdef GDEBUG
992 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
993 //#endif
994 }
995
996 GList<GXAlnInfo> galns(true,true,false);
997 for (int i=0;i<anchor_seeds.Count();i++) {
998 GXSeed& aseed=*anchor_seeds[i];
999 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1000 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1001 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1002 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1003 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1004 galns.AddIfNew(alninfo, true);
1005 else delete alninfo;
1006 }
1007 //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1008 //we should also try some seeds closer to the 3' end
1009 if (galns.Count()==0) {
1010 anchor_seeds.Clear();
1011 alnbands->setSorted(cmpDiagBands_R);
1012 int max_top_bands=4;
1013 int top_band_count=0;
1014 //#ifdef GDEBUG
1015 //GMessage(":::>> Retrying adjusting sort order.\n");
1016 //#endif
1017 if (alnbands->tmatch) {
1018 //anchor_seeds.setSorted(false);
1019 anchor_seeds.Add(alnbands->tmatch);
1020 }
1021 for (int b=0;b<alnbands->Count();b++) {
1022 if (alnbands->Get(b)->score<4) break;
1023 //#ifdef GDEBUG
1024 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1025 //#endif
1026 if (alnbands->Get(b)->tested) continue;
1027 top_band_count++;
1028 GXBand& band=*(alnbands->Get(b));
1029 band.seeds.setSorted(cmpSeedScore);
1030 anchor_seeds.Add(band.seeds.First());
1031 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1032 }
1033 //#ifdef GDEBUG
1034 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1035 //#endif
1036 for (int i=0;i<anchor_seeds.Count();i++) {
1037 GXSeed& aseed=*anchor_seeds[i];
1038 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1039 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1040 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1041 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1042 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1043 galns.AddIfNew(alninfo, true);
1044 else delete alninfo;
1045 }
1046 }
1047 //---- done
1048 delete alnbands;
1049 if (galns.Count()) {
1050 GXAlnInfo* bestaln=galns.Shift();
1051 #ifdef GDEBUG
1052 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1053 bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1054 if (bestaln->gapinfo!=NULL) {
1055 bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1056 }
1057 #endif
1058
1059 return bestaln;
1060 }
1061 else return NULL;
1062 }
1063
1064 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16> amers[],
1065 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1066 bool editscript=false;
1067 #ifdef GDEBUG
1068 editscript=true;
1069 //GMessage("==========> matching Left (5') end ========\n");
1070 #endif
1071 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1072 GList<GXSeed> rseeds(true,true,false);
1073 GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1074 GList<GXAlnInfo> galns(true,true,false);
1075 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1076
1077 if (alnbands->qmatch) {
1078 //#ifdef GDEBUG
1079 // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1080 //#endif
1081 anchor_seeds.Add(alnbands->qmatch);
1082 }
1083 else {
1084 int max_top_bands=5;
1085 int top_band_count=0;
1086 for (int b=0;b<alnbands->Count();b++) {
1087 if (alnbands->Get(b)->score<4) break;
1088 //#ifdef GDEBUG
1089 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1090 //#endif
1091 top_band_count++;
1092 GXBand& band=*(alnbands->Get(b));
1093 band.seeds.setSorted(cmpSeedScore);
1094 anchor_seeds.Add(band.seeds.First());
1095 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1096 }
1097 //#ifdef GDEBUG
1098 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1099 //#endif
1100 }
1101 for (int i=0;i<anchor_seeds.Count();i++) {
1102 GXSeed& aseed=*anchor_seeds[i];
1103 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107 if (alninfo && alninfo->pid>=min_pid
1108 && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1109 galns.AddIfNew(alninfo, true);
1110 else delete alninfo;
1111 }
1112 if (galns.Count()==0 && alnbands->tmatch) {
1113 GXSeed& aseed=*alnbands->tmatch;
1114 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1115 int a2=aseed.b_ofs+(aseed.len>>1)+1;
1116 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1117 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1118 if (alninfo) galns.Add(alninfo);
1119 }
1120 #ifdef GDEBUG
1121 /*
1122 for (int i=0;i<galns.Count();i++) {
1123 GXAlnInfo* alninfo=galns[i];
1124 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1125 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1126 if (alninfo->gapinfo!=NULL) {
1127 GMessage("Alignment:\n");
1128 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1129 }
1130 }
1131 */
1132 #endif
1133 //---- done
1134 delete alnbands;
1135 if (galns.Count()) {
1136 GXAlnInfo* bestaln=galns.Shift();
1137 #ifdef GDEBUG
1138 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1139 bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1140 if (bestaln->gapinfo!=NULL) {
1141 bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1142 }
1143 #endif
1144 return bestaln;
1145 }
1146 else return NULL;
1147 }