ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 301
Committed: Fri Oct 26 12:44:32 2012 UTC (7 years ago) by gpertea
File size: 35870 byte(s)
Log Message:
minor refactoring


Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104 uint16 get6mer(char* p) {
105 uint16 r=gdna2bit(p,3);
106 r <<= 6;
107 r |= gdna2bit(p,3);
108 return r;
109 }
110
111
112 void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 for (uint16 i=0; i <= slen-6; i++) {
114 char* p = (char*)(s+i);
115 uint16 v=get6mer(p);
116 if (amers[v]==NULL) {
117 amers[v]=new GVec<uint16>(1);
118 }
119 amers[v]->Add(i);
120 }
121 }
122
123 GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124 //careful: this is broken if start+5 falls beyond the end of the string!
125 uint16 r=get6mer(start);
126 return amers[r];
127 }
128
129 //signal that a diagonal is invalid
130 static const int kInvalidOffset = -2;
131
132 int s_FindFirstMismatch(const char *seq1, int len1,
133 const char *seq2, int len2,
134 int seq1_index, int seq2_index,
135 //bool &fence_hit,
136 bool reverse) {
137 int start_index = seq1_index;
138 /* Sentry detection here should be relatively inexpensive: The
139 sentry value cannot appear in the query, so detection only
140 needs to be done at exit from the subject-query matching loop.
141 For uncompressed sequences, ambiguities in the query (i.e. seq1)
142 always count as mismatches */
143 if (reverse) {
144 while (seq1_index < len1 && seq2_index < len2 &&
145 //seq1[len1-1 - seq1_index] < 4 &&
146 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
147 ++seq1_index;
148 ++seq2_index;
149 }
150 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
151 //if len2-1-seq2_index<=0) {
152 // fence_hit = true;
153 // }
154 }
155 else { //forward lookup
156 while (seq1_index < len1 && seq2_index < len2 &&
157 //seq1[seq1_index] < 4 &&
158 seq1[seq1_index] == seq2[seq2_index]) {
159 ++seq1_index;
160 ++seq2_index;
161 }
162 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
163 //if (seq2_index==len2) {
164 // fence_hit = true;
165 //}
166 }
167 return seq1_index - start_index;
168 }
169
170
171
172 /** During the traceback for a non-affine greedy alignment,
173 compute the diagonal that will result from the next
174 traceback operation
175
176 @param last_seq2_off Array of offsets into the second sequence;
177 last_seq2_off[d][k] gives the largest offset into
178 the second sequence that lies on diagonal k and
179 has distance d [in]
180 @param d Starting distance [in]
181 @param diag Index of diagonal that produced the starting distance [in]
182 @param seq2_index The offset into the second sequence after the traceback
183 operation has completed [out]
184 @return The diagonal resulting from the next traceback operation
185 being applied
186 */
187 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
188 int diag, int *seq2_index) {
189 // choose the traceback operation that results in the
190 // largest seq2 offset at this point, then compute the
191 // new diagonal that is implied by the operation
192 if (last_seq2_off[d-1][diag-1] >
193 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
194 *seq2_index = last_seq2_off[d-1][diag-1];
195 return diag - 1; // gap in seq2
196 }
197 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
198 *seq2_index = last_seq2_off[d-1][diag];
199 return diag; // match
200 }
201 *seq2_index = last_seq2_off[d-1][diag+1];
202 return diag + 1; // gap in seq1
203 }
204
205
206 int GXGreedyExtend(const char* seq1, int len1,
207 const char* seq2, int len2,
208 bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209 int& seq1_align_len, int& seq2_align_len,
210 CGreedyAlignData& aux_data,
211 GXEditScript *edit_block) {
212 //GapPrelimEditBlock *edit_block,
213 //bool& fence_hit, SGreedySeed *seed) {
214 int seq1_index;
215 int seq2_index;
216 int index;
217 int d;
218 int k;
219 int diag_lower, diag_upper;
220 int max_dist;
221 int diag_origin;
222 int best_dist;
223 int best_diag;
224 int** last_seq2_off;
225 int* max_score;
226 int xdrop_offset;
227 int longest_match_run;
228 bool end1_reached, end2_reached;
229 GXMemPool* mem_pool;
230
231 /* ordinary dynamic programming alignment, for each offset
232 in seq1, walks through offsets in seq2 until an X-dropoff
233 test fails, saving the best score encountered along
234 the way. Instead of score, this code tracks the 'distance'
235 (number of mismatches plus number of gaps) between seq1
236 and seq2. Instead of walking through sequence offsets, it
237 walks through diagonals that can achieve a given distance.
238
239 Note that in what follows, the numbering of diagonals implies
240 a dot matrix where increasing seq1 offsets go to the right on
241 the x axis, and increasing seq2 offsets go up the y axis.
242 The gapped alignment thus proceeds up and to the right in
243 the graph, and diagonals are numbered increasing to the right */
244
245 best_dist = 0;
246 best_diag = 0;
247
248 /* set the number of distinct distances the algorithm will
249 examine in the search for an optimal alignment. The
250 heuristic worst-case running time of the algorithm is
251 O(max_dist**2 + (len1+len2)); for sequences which are
252 very similar, the average running time will be sig-
253 nificantly better than this */
254
255 max_dist = GMIN(GREEDY_MAX_COST,
256 (len2/GREEDY_MAX_COST_FRACTION + 1));
257
258 /* the main loop assumes that the index of all diagonals is
259 biased to lie in the middle of allocated bookkeeping
260 structures */
261
262 diag_origin = max_dist + 2;
263
264 // last_seq2_off[d][k] is the largest offset into seq2 that
265 // lies on diagonal k and has distance d
266
267 last_seq2_off = aux_data.last_seq2_off;
268
269 /* Instead of tracking the best alignment score and using
270 xdrop_theshold directly, track the best score for each
271 unique distance and use the best score for some previously
272 computed distance to implement the X-dropoff test.
273
274 xdrop_offset gives the distance backwards in the score
275 array to look */
276
277 xdrop_offset = aux_data.xdrop_ofs;
278
279 // find the offset of the first mismatch between seq1 and seq2
280
281 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
282 // fence_hit, reverse, rem);
283
284 // update the extents of the alignment, and bail out
285 // early if no further work is needed
286
287 seq1_align_len = index;
288 seq2_align_len = index;
289 seq1_index = index;
290 /*
291 seed->start_q = 0;
292 seed->start_s = 0;
293 seed->match_length = index;
294 */
295 longest_match_run = index;
296
297 if (index == len1 || index == len2) {
298 /* Return the number of differences, which is zero here */
299 if (edit_block != NULL)
300 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
301 edit_block->opRep(index);
302 return 0;
303 }
304
305 // set up the memory pool
306 mem_pool = aux_data.space;
307 if (edit_block == NULL) {
308 mem_pool = NULL;
309 }
310 else if (mem_pool == NULL) {
311 aux_data.space = mem_pool = new GXMemPool();
312 }
313 else {
314 mem_pool->refresh();
315 }
316
317 /* set up the array of per-distance maximum scores. There
318 are max_diags + xdrop_offset distances to track, the first
319 xdrop_offset of which are 0 */
320
321 max_score = aux_data.max_score + xdrop_offset;
322 for (index = 0; index < xdrop_offset; index++)
323 aux_data.max_score[index] = 0;
324
325 // fill in the initial offsets of the distance matrix
326
327 last_seq2_off[0][diag_origin] = seq1_index;
328 max_score[0] = seq1_index * aux_data.match_reward;
329 diag_lower = diag_origin - 1;
330 diag_upper = diag_origin + 1;
331 end1_reached = end2_reached = false;
332
333 // for each distance
334 for (d = 1; d <= max_dist; d++) {
335 int xdrop_score;
336 int curr_score;
337 int curr_extent = 0;
338 int curr_seq2_index = 0;
339 int curr_diag = 0;
340 int tmp_diag_lower = diag_lower;
341 int tmp_diag_upper = diag_upper;
342
343 // Assign impossible seq2 offsets to any diagonals that
344 // are not in the range (diag_lower,diag_upper).
345 // These will serve as sentinel values for the inner loop
346 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
347 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
348 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
349 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
350
351 // compute the score for distance d corresponding to the X-dropoff criterion
352
353 xdrop_score = max_score[d - xdrop_offset] +
354 (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355 xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356
357 // for each diagonal of interest
358 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
359 /* find the largest offset into seq2 that increases
360 the distance from d-1 to d (i.e. keeps the alignment
361 from getting worse for as long as possible), then
362 choose the offset into seq1 that will keep the
363 resulting diagonal fixed at k
364
365 Note that this requires kInvalidOffset+1 to be smaller
366 than any valid offset into seq2, i.e. to be negative */
367
368 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
369 last_seq2_off[d - 1][k ]) + 1;
370 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
371 seq1_index = seq2_index + k - diag_origin;
372
373 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
374
375 // if no valid diagonal can reach distance d, or the
376 // X-dropoff test fails, narrow the range of diagonals
377 // to test and skip to the next diagonal
378 if (k == diag_lower)
379 diag_lower++;
380 else
381 last_seq2_off[d][k] = kInvalidOffset;
382 continue;
383 }
384 diag_upper = k;
385
386 /* slide down diagonal k until a mismatch
387 occurs. As long as only matches are encountered,
388 the current distance d will not change */
389
390 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
391 seq1_index, seq2_index, reverse);
392 //fence_hit, reverse, rem);
393 if (index > longest_match_run) {
394 //seed->start_q = seq1_index;
395 //seed->start_s = seq2_index;
396 //seed->match_length = index;
397 longest_match_run = index;
398 }
399 seq1_index += index;
400 seq2_index += index;
401
402 // set the new largest seq2 offset that achieves
403 // distance d on diagonal k
404
405 last_seq2_off[d][k] = seq2_index;
406
407 // since all values of k are constrained to have the
408 // same distance d, the value of k which maximizes the
409 // alignment score is the one that covers the most of seq1 and seq2
410 if (seq1_index + seq2_index > curr_extent) {
411 curr_extent = seq1_index + seq2_index;
412 curr_seq2_index = seq2_index;
413 curr_diag = k;
414 }
415
416 /* clamp the bounds on diagonals to avoid walking off
417 either sequence. Because the bounds increase by at
418 most one for each distance, diag_lower and diag_upper
419 can each be of size at most max_diags+2 */
420
421 if (seq2_index == len2) {
422 diag_lower = k + 1;
423 end2_reached = true;
424 }
425 if (seq1_index == len1) {
426 diag_upper = k - 1;
427 end1_reached = true;
428 }
429 } // end loop over diagonals
430
431 // compute the maximum score possible for distance d
432 curr_score = curr_extent * (aux_data.match_reward / 2) -
433 d * (aux_data.match_reward + aux_data.mismatch_penalty);
434 // if this is the best score seen so far, update the
435 // statistics of the best alignment
436 if (curr_score > max_score[d - 1]) {
437 max_score[d] = curr_score;
438 best_dist = d;
439 best_diag = curr_diag;
440 seq2_align_len = curr_seq2_index;
441 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
442 }
443 else {
444 max_score[d] = max_score[d - 1];
445 }
446
447 // alignment has finished if the lower and upper bounds
448 // on diagonals to check have converged to each other
449
450 if (diag_lower > diag_upper)
451 break;
452
453 /* set up for the next distance to examine. Because the
454 bounds increase by at most one for each distance,
455 diag_lower and diag_upper can each be of size at
456 most max_diags+2 */
457
458 if (!end2_reached)
459 diag_lower--;
460 if (!end1_reached)
461 diag_upper++;
462
463 if (edit_block == NULL) {
464 // if no traceback is specified, the next row of
465 // last_seq2_off can reuse previously allocated memory
466 //WARNING The following assumes two arrays of
467 // at least max_dist+4 int's have already been allocated
468 last_seq2_off[d + 1] = last_seq2_off[d - 1];
469 }
470 else {
471 // traceback requires all rows of last_seq2_off to be saved,
472 // so a new row must be allocated
473 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
474 // move the origin for this row backwards
475 // dubious pointer arithmetic ?!
476 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
477 }
478 } // end loop over distinct distances
479
480
481 if (edit_block == NULL)
482 return best_dist;
483
484 //---- perform traceback
485 d = best_dist;
486 seq1_index = seq1_align_len;
487 seq2_index = seq2_align_len;
488 // for all positive distances
489
490 //if (fence_hit && *fence_hit)
491 // goto done;
492 if (index==len1 || index==len2) d=0;
493 while (d > 0) {
494 int new_diag;
495 int new_seq2_index;
496
497 /* retrieve the value of the diagonal after the next
498 traceback operation. best_diag starts off with the
499 value computed during the alignment process */
500
501 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
502 best_diag, &new_seq2_index);
503
504 if (new_diag == best_diag) {
505 // same diagonal: issue a group of substitutions
506 if (seq2_index - new_seq2_index > 0) {
507 edit_block->opRep(seq2_index - new_seq2_index);
508 }
509 }
510 else if (new_diag < best_diag) {
511 // smaller diagonal: issue a group of substitutions
512 // and then a gap in seq2 */
513 if (seq2_index - new_seq2_index > 0) {
514 edit_block->opRep(seq2_index - new_seq2_index);
515 }
516 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
517 edit_block->opIns(1);
518 }
519 else {
520 // larger diagonal: issue a group of substitutions
521 // and then a gap in seq1
522 if (seq2_index - new_seq2_index - 1 > 0) {
523 edit_block->opRep(seq2_index - new_seq2_index - 1);
524 }
525 edit_block->opDel(1);
526 }
527 d--;
528 best_diag = new_diag;
529 seq2_index = new_seq2_index;
530 }
531 //done:
532 // handle the final group of substitutions back to distance zero,
533 // i.e. back to offset zero of seq1 and seq2
534 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
535 // last_seq2_off[0][diag_origin]);
536 edit_block->opRep(last_seq2_off[0][diag_origin]);
537 if (!reverse)
538 edit_block->reverse();
539 return best_dist;
540 }
541
542 void printEditScript(GXEditScript* ed_script) {
543 uint i;
544 if (ed_script==NULL || ed_script->opnum == 0)
545 return;
546 for (i=0; i<ed_script->opnum; i++) {
547 int num=((ed_script->ops[i]) >> 2);
548 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
549 if (op_type == 3)
550 GError("Error: printEditScript encountered op_type 3 ?!\n");
551 GMessage("%d%c ", num, xgapcodes[op_type]);
552 }
553 GMessage("\n");
554 }
555
556 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
557 bool editscript, int reward, int penalty, int xdrop) {
558 int q_max=strlen(q_seq); //query
559 int s_max=strlen(s_seq); //subj
560 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
561 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
562 }
563
564 struct GXSeedTable {
565 int a_num, b_num;
566 int a_cap, b_cap;
567 char* xc;
568 GXSeedTable(int a=12, int b=255) {
569 a_cap=0;
570 b_cap=0;
571 a_num=0;
572 b_num=0;
573 xc=NULL;
574 init(a,b);
575 }
576 ~GXSeedTable() {
577 GFREE(xc);
578 }
579 void init(int a, int b) {
580 a_num=a;
581 b_num=b;
582 bool resize=false;
583 if (b_num>b_cap) { resize=true; b_cap=b_num;}
584 if (a_num>a_cap) { resize=true; a_cap=a_num;}
585 if (resize) {
586 GFREE(xc);
587 GCALLOC(xc, (a_num*b_num));
588 }
589 else {
590 //just clear up to a_max, b_max
591 memset((void*)xc, 0, (a_num*b_num));
592 }
593 }
594 char& x(int ax, int by) {
595 return xc[by*a_num+ax];
596 }
597
598 };
599
600 const int a_m_score=2; //match score
601 const int a_mis_score=-3; //mismatch
602 const int a_dropoff_score=7;
603 const int a_min_score=12; //at least 6 bases full match
604
605 // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607 //check minimum score and
608 //for 3' adaptor trimming:
609 // require that the right end of the alignment for either the adaptor OR the read must be
610 // < 3 distance from its right end
611 // for 5' adaptor trimming:
612 // require that the left end of the alignment for either the adaptor OR the read must
613 // be at coordinate < 3 from start
614
615 bool extendUngapped(const char* a, int alen, int ai,
616 const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
617 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
618 //if (debug) {
619 // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
620 // }
621 int a_l=ai; //alignment coordinates on a
622 int a_r=ai+mlen-1;
623 int b_l=bi; //alignment coordinates on b
624 int b_r=bi+mlen-1;
625 int ai_maxscore=ai;
626 int bi_maxscore=bi;
627 int score=mlen*a_m_score;
628 int maxscore=score;
629 int mism5score=a_mis_score;
630 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
631 //try to extend to the left first, if possible
632 while (ai>0 && bi>0) {
633 ai--;
634 bi--;
635 score+= (a[ai]==b[bi])? a_m_score : mism5score;
636 if (score>maxscore) {
637 ai_maxscore=ai;
638 bi_maxscore=bi;
639 maxscore=score;
640 }
641 else if (maxscore-score>a_dropoff_score) break;
642 }
643 a_l=ai_maxscore;
644 b_l=bi_maxscore;
645 //now extend to the right
646 ai_maxscore=a_r;
647 bi_maxscore=b_r;
648 ai=a_r;
649 bi=b_r;
650 score=maxscore;
651 //sometimes there are extra As at the end of the read, ignore those
652 if (a[alen-2]=='A' && a[alen-1]=='A') {
653 alen-=2;
654 while (a[alen-1]=='A' && alen>ai) alen--;
655 }
656 while (ai<alen-1 && bi<blen-1) {
657 ai++;
658 bi++;
659 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
660 if (a[ai]==b[bi]) { //match
661 score+=a_m_score;
662 if (ai>=alen-2) {
663 score+=a_m_score-(alen-ai-1);
664 }
665 }
666 else { //mismatch
667 score+=a_mis_score;
668 }
669 if (score>maxscore) {
670 ai_maxscore=ai;
671 bi_maxscore=bi;
672 maxscore=score;
673 }
674 else if (maxscore-score>a_dropoff_score) break;
675 }
676 a_r=ai_maxscore;
677 b_r=bi_maxscore;
678 int a_ovh3=alen-a_r-1;
679 int b_ovh3=blen-b_r-1;
680 int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
681 int mmovh5=(a_l<b_l)? a_l : b_l;
682 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683 if (a_l<a_ovh3) {
684 //adaptor closer to the left end (typical for 5' adaptor)
685 l5=a_r+1;
686 l3=alen-1;
687 }
688 else {
689 //adaptor matching at the right end (typical for 3' adaptor)
690 l5=0;
691 l3=a_l-1;
692 }
693 return true;
694 }
695 //do not trim:
696 l5=0;
697 l3=alen-1;
698 return false;
699 }
700
701 GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 int bimax=GMIN((sd.alen+2), (sd.blen-6));
704 int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705 int b_end = (trim_intent==galn_TrimLeft) ? bimax : sd.blen-6;
706 //gx.init(a_maxlen, b_maxlen);
707 GXSeedTable gx(sd.alen, sd.blen);
708 GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 for (int bi=b_start;bi<=b_end;bi++) {
710 //for each 6-mer of seqb
711 uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712 GVec<uint16>* alocs = sd.amers[bv];
713 if (alocs==NULL) continue;
714 //extend each hit
715 for (int h=0;h<alocs->Count();h++) {
716 int ai=alocs->Get(h);
717 //word match
718 if (gx.x(ai,bi))
719 //already have a previous seed covering this region of this diagonal
720 continue;
721 if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722 continue; //improper positioning for 5' trimming
723 if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724 continue; //improper positioning for 5' trimming
725
726 //TODO: there could be Ns in this seed, should we count/adjust score?
727 for (int i=0;i<6;i++)
728 gx.x(ai+i,bi+i)=1;
729 //see if we can extend to the right
730 int aix=ai+6;
731 int bix=bi+6;
732 int len=6;
733 while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 gx.x(aix,bix)=1;
735 aix++;bix++;
736 len++;
737 }
738 if (len>sd.amlen) {
739 //heuristics: very likely the best we can get
740 //quick match shortcut
741 diagstrips->qmatch=new GXSeed(ai,bi,len);
742 return diagstrips;
743 }
744 if (bi>bimax && bi<bimin && len<9)
745 //ignore mid-sequence seeds that are not high scoring
746 continue;
747
748 GXSeed* newseed=new GXSeed(ai,bi,len);
749 seeds.Add(newseed);
750 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 //keep last resort terminal match to be used if no better alignment is there
752 if (bi<2 && ai+len>=sd.alen-1 &&
753 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754 diagstrips->tmatch_l=newseed;
755 //collectSeeds_R:
756 if (ai<2 && bi+len>sd.blen-2 &&
757 (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758 diagstrips->tmatch_r=newseed;
759 }
760 } //for each 6-mer of the read
761 for (int i=0;i<diagstrips->Count();i++) {
762 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763 }
764 diagstrips->setSorted(true); //sort by score
765 return diagstrips;
766 }
767
768 int cmpSeedScore(const pointer p1, const pointer p2) {
769 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
770 GXSeed* s1=(GXSeed*)p1;
771 GXSeed* s2=(GXSeed*)p2;
772 if (s1->len==s2->len) {
773 return (s1->b_ofs-s2->b_ofs);
774 }
775 else return (s2->len-s1->len);
776 }
777
778 int cmpSeedScore_R(const pointer p1, const pointer p2) {
779 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780 GXSeed* s1=(GXSeed*)p1;
781 GXSeed* s2=(GXSeed*)p2;
782 if (s1->len==s2->len) {
783 return (s2->b_ofs-s1->b_ofs);
784 }
785 else return (s2->len-s1->len);
786 }
787
788
789 int cmpSeedDiag(const pointer p1, const pointer p2) {
790 GXSeed* s1=(GXSeed*)p1;
791 GXSeed* s2=(GXSeed*)p2;
792 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
793 }
794
795
796 int cmpDiagBands_R(const pointer p1, const pointer p2) {
797 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
798 GXBand* b1=(GXBand*)p1;
799 GXBand* b2=(GXBand*)p2;
800 if (b1->score==b2->score) {
801 return (b2->w_min_b-b1->w_min_b);
802 }
803 else return (b2->score-b1->score);
804 }
805
806
807
808 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
809 const char* s_seq, int s_alnstart, int s_max,
810 int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
811 CAlnTrim* trim, bool editscript) {
812 GXEditScript* ed_script_fwd = NULL;
813 GXEditScript* ed_script_rev = NULL;
814 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
815 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
816 q_alnstart--;
817 s_alnstart--;
818 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821 GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822 q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823 */
824 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825 const char* q=q_seq+q_alnstart;
826 int q_avail=q_max-q_alnstart;
827 const char* s=s_seq+s_alnstart;
828 int s_avail=s_max-s_alnstart;
829 if (penalty<0) penalty=-penalty;
830 GXAlnInfo* alninfo=NULL;
831 bool freeAlnMem=(gxmem==NULL);
832 if (freeAlnMem) {
833 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 reward=gxmem->match_reward;
835 penalty=gxmem->mismatch_penalty;
836 xdrop=gxmem->x_drop;
837 }
838 else
839 gxmem->reset();
840 int minMatch= trim ? trim->minMatch : 6;
841 int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842 int retscore = 0;
843 int numdiffs = 0;
844 if (trim!=NULL && trim->type==galn_TrimLeft) {
845 //intent: trimming the left side
846 if (editscript)
847 ed_script_rev=new GXEditScript();
848
849 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850 s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851 //check this extension here and bail out if it's not a good extension
852 if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 q_alnstart+1-q_ext_l>1 &&
854 s_alnstart+1-s_ext_l>trim->l_boundary) {
855 delete ed_script_rev;
856 if (freeAlnMem) delete gxmem;
857 return NULL;
858 }
859
860 if (editscript)
861 ed_script_fwd=new GXEditScript();
862 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863 s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864 numdiffs=numdiffs_r+numdiffs_l;
865 //convert num diffs to actual score
866 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867 if (editscript)
868 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869 }
870 else {
871 if (editscript) {
872 ed_script_fwd=new GXEditScript();
873 }
874 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875 s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876 //check extension here and bail out if not a good right extension
877 //assuming s_max is really at the right end of s_seq
878 if (trim!=NULL && trim->type==galn_TrimRight &&
879 s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 q_alnstart+q_ext_r<q_max-2 &&
881 s_alnstart+s_ext_r<trim->r_boundary) {
882 delete ed_script_fwd;
883 if (freeAlnMem) delete gxmem;
884 return NULL;
885 }
886 if (editscript)
887 ed_script_rev=new GXEditScript();
888 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889 s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890 //convert num diffs to actual score
891 numdiffs=numdiffs_r+numdiffs_l;
892 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893 if (editscript)
894 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895 }
896 if (retscore>=MIN_GREEDY_SCORE) {
897 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899 alninfo->score=retscore;
900 if (gxmem->scaled) alninfo->score >>= 1;
901 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902 #ifdef GDEBUG
903 //if (ed_script_rev) {
904 // GMessage("Final Edit script ::: ");
905 // printEditScript(ed_script_rev);
906 // }
907 #endif
908 alninfo->editscript=ed_script_rev;
909 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910 }
911 else {
912 //if (freeAlnMem) delete gxmem;
913 delete ed_script_rev;
914 delete alninfo;
915 alninfo=NULL;
916 }
917 if (freeAlnMem) delete gxmem;
918 delete ed_script_fwd;
919 return alninfo;
920 }
921
922 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
923 const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924 CAlnTrim* trim, bool editscript) {
925 int reward=2;
926 int penalty=10;
927 int xdrop=32;
928 if (gxmem) {
929 reward=gxmem->match_reward;
930 penalty=gxmem->mismatch_penalty;
931 xdrop=gxmem->x_drop;
932 }
933 return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
934 reward, penalty, xdrop, gxmem, trim, editscript);
935 }
936
937 GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 CGreedyAlignData* gxmem, int min_pid) {
939 bool editscript=false;
940 #ifdef GDEBUG
941 editscript=true;
942 if (trim_type==galn_TrimLeft) {
943 GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944 }
945 else if (trim_type==galn_TrimRight) {
946 GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947 }
948 else if (trim_type==galn_TrimEither) {
949 GMessage("==========> searching both ends : %s\n", sd.aseq);
950 }
951 #endif
952 CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953 GList<GXSeed> rseeds(true,true,false);
954 GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956 //did we find a shortcut?
957 if (alnbands->qmatch) {
958 #ifdef GDEBUG
959 GMessage("::: Found a quick long match at %d, len %d\n",
960 alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961 #endif
962 anchor_seeds.Add(alnbands->qmatch);
963 }
964 else {
965 int max_top_bands=5;
966 int top_band_count=0;
967 for (int b=0;b<alnbands->Count();b++) {
968 if (alnbands->Get(b)->score<6) break;
969 //#ifdef GDEBUG
970 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971 //#endif
972 top_band_count++;
973 GXBand& band=*(alnbands->Get(b));
974 band.seeds.setSorted(cmpSeedScore);
975 anchor_seeds.Add(band.seeds.First());
976 //band.tested=true;
977 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978 }
979 //#ifdef GDEBUG
980 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981 //#endif
982 }
983 GList<GXAlnInfo> galns(true,true,false);
984 for (int i=0;i<anchor_seeds.Count();i++) {
985 GXSeed& aseed=*anchor_seeds[i];
986 int a1=aseed.a_ofs+(aseed.len>>1)+1;
987 int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 trimInfo.seedlen=aseed.len;
989 #ifdef GDEBUG
990 GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991 aseed.len);
992 #endif
993 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994 sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996 galns.AddIfNew(alninfo, true);
997 else delete alninfo;
998 }
999
1000 if (galns.Count()==0) {
1001 //last resort: look for weaker terminal seeds
1002 GPVec<GXSeed> tmatches(2,false);
1003 if (trim_type!=galn_TrimRight) {
1004 if (alnbands->tmatch_l)
1005 tmatches.Add(alnbands->tmatch_l);
1006 }
1007 if (trim_type!=galn_TrimLeft) {
1008 if (alnbands->tmatch_r)
1009 tmatches.Add(alnbands->tmatch_r);
1010 }
1011 for (int i=0;i<tmatches.Count();i++) {
1012 GXSeed& aseed=*tmatches[i];
1013 int halfseed=aseed.len>>1;
1014 int a1=aseed.a_ofs+halfseed+1;
1015 int a2=aseed.b_ofs+halfseed+1;
1016 trimInfo.seedlen=aseed.len;
1017 #ifdef GDEBUG
1018 GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 aseed.len);
1020 #endif
1021 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 galns.AddIfNew(alninfo, true);
1025 else delete alninfo;
1026 }//for each terminal seed
1027 }
1028 //---- found all alignments
1029 delete alnbands;
1030 /*
1031 #ifdef GDEBUG
1032 //print all valid alignments found
1033 for (int i=0;i<galns.Count();i++) {
1034 GXAlnInfo* alninfo=galns[i];
1035 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1036 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1037 if (alninfo->gapinfo!=NULL) {
1038 GMessage("Alignment:\n");
1039 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1040 }
1041 }
1042 #endif
1043 */
1044 if (galns.Count()) {
1045 GXAlnInfo* bestaln=galns.Shift();
1046 #ifdef GDEBUG
1047 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048 bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049 if (bestaln->gapinfo!=NULL) {
1050 bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051 }
1052 #endif
1053 return bestaln;
1054 }
1055 else return NULL;
1056 }