ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 93
Committed: Tue Oct 4 12:27:31 2011 UTC (7 years, 9 months ago) by gpertea
File size: 27397 byte(s)
Log Message:
added gapped alignment extension code

Line File contents
1 #include "GAlnExtend.h"
2
3 //greedy gapped alignment extension
4 //(mostly lifted from NCBI's megablast gapped extension code)
5
6 int GXMemPool::kMinSpace = 1000000;
7
8 #ifdef GDEBUG
9 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10
11 void color_fg(int c,FILE* f) {
12 if (f!=stderr && f!=stdout) return;
13 sprintf((char *)(&buf[2]),"%dm",c+30);
14 fwrite(buf,1,strlen(buf), f);
15 }
16
17 void color_bg(int c, FILE* f) {
18 if (f!=stderr && f!=stdout) return;
19 sprintf((char *)(&buf[2]),"%dm",c+40);
20 fwrite(buf,1,strlen(buf),f);
21 };
22
23 void color_resetfg(FILE* f) {
24 if (f!=stderr && f!=stdout) return;
25 sprintf((char *)(&buf[2]),"39m");
26 fwrite(buf,1,strlen(buf), f);
27 };
28
29 void color_resetbg(FILE* f) {
30 if (f!=stderr && f!=stdout) return;
31 sprintf((char *)(&buf[2]),"49m");
32 fwrite(buf,1,strlen(buf), f);
33 }
34
35 void color_reset(FILE* f) {
36 if (f!=stderr && f!=stdout) return;
37 sprintf((char *)(&buf[2]),"0m");
38 fwrite(buf,1,strlen(buf), f);
39 };
40
41 void color_normal(FILE* f) {
42 if (f!=stderr && f!=stdout) return;
43 sprintf((char *)(&buf[2]),"22m");
44 fwrite(buf,1,strlen(buf), f);
45 };
46
47 #endif
48
49
50 char xgapcodes[4]={'S','I', 'D', 'X'};
51
52 int get_last(int **flast_d, int d, int diag, int *row1) {
53 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54 *row1 = flast_d[d-1][diag-1];
55 return diag-1;
56 }
57 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58 *row1 = flast_d[d-1][diag];
59 return diag;
60 }
61 *row1 = flast_d[d-1][diag+1];
62 return diag+1;
63 }
64
65 void GapXEditScript::print() { //debug
66 GapXEditScript* p=this;
67 do {
68 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69 } while ((p=p->next)!=NULL);
70 GMessage("\n");
71 }
72
73
74 int BLAST_Gcd(int a, int b) {
75 int c;
76
77 b = abs(b);
78 if (b > a)
79 c=a, a=b, b=c;
80
81 while (b != 0) {
82 c = a%b;
83 a = b;
84 b = c;
85 }
86 return a;
87 }
88
89 int BLAST_Gdb3(int* a, int* b, int* c) {
90 int g;
91 if (*b == 0)
92 g = BLAST_Gcd(*a, *c);
93 else
94 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95 if (g > 1) {
96 *a /= g;
97 *b /= g;
98 *c /= g;
99 }
100 return g;
101 }
102
103
104
105 //signal that a diagonal is invalid
106 static const int kInvalidOffset = -2;
107
108 int s_FindFirstMismatch(const char *seq1, int len1,
109 const char *seq2, int len2,
110 int seq1_index, int seq2_index,
111 //bool &fence_hit,
112 bool reverse) {
113 int start_index = seq1_index;
114 /* Sentry detection here should be relatively inexpensive: The
115 sentry value cannot appear in the query, so detection only
116 needs to be done at exit from the subject-query matching loop.
117 For uncompressed sequences, ambiguities in the query (i.e. seq1)
118 always count as mismatches */
119 if (reverse) {
120 while (seq1_index < len1 && seq2_index < len2 &&
121 //seq1[len1-1 - seq1_index] < 4 &&
122 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123 ++seq1_index;
124 ++seq2_index;
125 }
126 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127 //if len2-1-seq2_index<=0) {
128 // fence_hit = true;
129 // }
130 }
131 else { //forward lookup
132 while (seq1_index < len1 && seq2_index < len2 &&
133 //seq1[seq1_index] < 4 &&
134 seq1[seq1_index] == seq2[seq2_index]) {
135 ++seq1_index;
136 ++seq2_index;
137 }
138 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139 //if (seq2_index==len2) {
140 // fence_hit = true;
141 //}
142 }
143 return seq1_index - start_index;
144 }
145
146
147
148 /** During the traceback for a non-affine greedy alignment,
149 compute the diagonal that will result from the next
150 traceback operation
151
152 @param last_seq2_off Array of offsets into the second sequence;
153 last_seq2_off[d][k] gives the largest offset into
154 the second sequence that lies on diagonal k and
155 has distance d [in]
156 @param d Starting distance [in]
157 @param diag Index of diagonal that produced the starting distance [in]
158 @param seq2_index The offset into the second sequence after the traceback
159 operation has completed [out]
160 @return The diagonal resulting from the next traceback operation
161 being applied
162 */
163 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164 int diag, int *seq2_index) {
165 // choose the traceback operation that results in the
166 // largest seq2 offset at this point, then compute the
167 // new diagonal that is implied by the operation
168 if (last_seq2_off[d-1][diag-1] >
169 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170 *seq2_index = last_seq2_off[d-1][diag-1];
171 return diag - 1; // gap in seq2
172 }
173 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174 *seq2_index = last_seq2_off[d-1][diag];
175 return diag; // match
176 }
177 *seq2_index = last_seq2_off[d-1][diag+1];
178 return diag + 1; // gap in seq1
179 }
180
181
182 int GXGreedyExtend(const char* seq1, int len1,
183 const char* seq2, int len2,
184 bool reverse, int xdrop_threshold,
185 int match_cost, int mismatch_cost,
186 int& seq1_align_len, int& seq2_align_len,
187 SGreedyAlignMem& aux_data,
188 GXEditScript *edit_block) {
189 //GapPrelimEditBlock *edit_block,
190 //bool& fence_hit, SGreedySeed *seed) {
191 int seq1_index;
192 int seq2_index;
193 int index;
194 int d;
195 int k;
196 int diag_lower, diag_upper;
197 int max_dist;
198 int diag_origin;
199 int best_dist;
200 int best_diag;
201 int** last_seq2_off;
202 int* max_score;
203 int xdrop_offset;
204 int longest_match_run;
205 bool end1_reached, end2_reached;
206 GXMemPool* mem_pool;
207
208 /* ordinary dynamic programming alignment, for each offset
209 in seq1, walks through offsets in seq2 until an X-dropoff
210 test fails, saving the best score encountered along
211 the way. Instead of score, this code tracks the 'distance'
212 (number of mismatches plus number of gaps) between seq1
213 and seq2. Instead of walking through sequence offsets, it
214 walks through diagonals that can achieve a given distance.
215
216 Note that in what follows, the numbering of diagonals implies
217 a dot matrix where increasing seq1 offsets go to the right on
218 the x axis, and increasing seq2 offsets go up the y axis.
219 The gapped alignment thus proceeds up and to the right in
220 the graph, and diagonals are numbered increasing to the right */
221
222 best_dist = 0;
223 best_diag = 0;
224
225 /* set the number of distinct distances the algorithm will
226 examine in the search for an optimal alignment. The
227 heuristic worst-case running time of the algorithm is
228 O(max_dist**2 + (len1+len2)); for sequences which are
229 very similar, the average running time will be sig-
230 nificantly better than this */
231
232 max_dist = GMIN(GREEDY_MAX_COST,
233 len2 / GREEDY_MAX_COST_FRACTION + 1);
234
235 /* the main loop assumes that the index of all diagonals is
236 biased to lie in the middle of allocated bookkeeping
237 structures */
238
239 diag_origin = max_dist + 2;
240
241 // last_seq2_off[d][k] is the largest offset into seq2 that
242 // lies on diagonal k and has distance d
243
244 last_seq2_off = aux_data.last_seq2_off;
245
246 /* Instead of tracking the best alignment score and using
247 xdrop_theshold directly, track the best score for each
248 unique distance and use the best score for some previously
249 computed distance to implement the X-dropoff test.
250
251 xdrop_offset gives the distance backwards in the score
252 array to look */
253
254 xdrop_offset = (xdrop_threshold + match_cost / 2) /
255 (match_cost + mismatch_cost) + 1;
256
257 // find the offset of the first mismatch between seq1 and seq2
258
259 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260 // fence_hit, reverse, rem);
261
262 // update the extents of the alignment, and bail out
263 // early if no further work is needed
264
265 seq1_align_len = index;
266 seq2_align_len = index;
267 seq1_index = index;
268 /*
269 seed->start_q = 0;
270 seed->start_s = 0;
271 seed->match_length = index;
272 */
273 longest_match_run = index;
274
275 if (index == len1 || index == len2) {
276 /* Return the number of differences, which is zero here */
277 if (edit_block != NULL)
278 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279 edit_block->opRep(index);
280 return 0;
281 }
282
283 // set up the memory pool
284 mem_pool = aux_data.space;
285 if (edit_block == NULL) {
286 mem_pool = NULL;
287 }
288 else if (mem_pool == NULL) {
289 aux_data.space = mem_pool = new GXMemPool();
290 }
291 else {
292 mem_pool->refresh();
293 }
294
295 /* set up the array of per-distance maximum scores. There
296 are max_diags + xdrop_offset distances to track, the first
297 xdrop_offset of which are 0 */
298
299 max_score = aux_data.max_score + xdrop_offset;
300 for (index = 0; index < xdrop_offset; index++)
301 aux_data.max_score[index] = 0;
302
303 // fill in the initial offsets of the distance matrix
304
305 last_seq2_off[0][diag_origin] = seq1_index;
306 max_score[0] = seq1_index * match_cost;
307 diag_lower = diag_origin - 1;
308 diag_upper = diag_origin + 1;
309 end1_reached = end2_reached = false;
310
311 // for each distance
312 for (d = 1; d <= max_dist; d++) {
313 int xdrop_score;
314 int curr_score;
315 int curr_extent = 0;
316 int curr_seq2_index = 0;
317 int curr_diag = 0;
318 int tmp_diag_lower = diag_lower;
319 int tmp_diag_upper = diag_upper;
320
321 // Assign impossible seq2 offsets to any diagonals that
322 // are not in the range (diag_lower,diag_upper).
323 // These will serve as sentinel values for the inner loop
324 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328
329 // compute the score for distance d corresponding to the X-dropoff criterion
330
331 xdrop_score = max_score[d - xdrop_offset] +
332 (match_cost + mismatch_cost) * d - xdrop_threshold;
333 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334
335 // for each diagonal of interest
336 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337 /* find the largest offset into seq2 that increases
338 the distance from d-1 to d (i.e. keeps the alignment
339 from getting worse for as long as possible), then
340 choose the offset into seq1 that will keep the
341 resulting diagonal fixed at k
342
343 Note that this requires kInvalidOffset+1 to be smaller
344 than any valid offset into seq2, i.e. to be negative */
345
346 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347 last_seq2_off[d - 1][k ]) + 1;
348 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349 seq1_index = seq2_index + k - diag_origin;
350
351 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352
353 // if no valid diagonal can reach distance d, or the
354 // X-dropoff test fails, narrow the range of diagonals
355 // to test and skip to the next diagonal
356 if (k == diag_lower)
357 diag_lower++;
358 else
359 last_seq2_off[d][k] = kInvalidOffset;
360 continue;
361 }
362 diag_upper = k;
363
364 /* slide down diagonal k until a mismatch
365 occurs. As long as only matches are encountered,
366 the current distance d will not change */
367
368 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369 seq1_index, seq2_index, reverse);
370 //fence_hit, reverse, rem);
371 if (index > longest_match_run) {
372 //seed->start_q = seq1_index;
373 //seed->start_s = seq2_index;
374 //seed->match_length = index;
375 longest_match_run = index;
376 }
377 seq1_index += index;
378 seq2_index += index;
379
380 // set the new largest seq2 offset that achieves
381 // distance d on diagonal k
382
383 last_seq2_off[d][k] = seq2_index;
384
385 // since all values of k are constrained to have the
386 // same distance d, the value of k which maximizes the
387 // alignment score is the one that covers the most of seq1 and seq2
388 if (seq1_index + seq2_index > curr_extent) {
389 curr_extent = seq1_index + seq2_index;
390 curr_seq2_index = seq2_index;
391 curr_diag = k;
392 }
393
394 /* clamp the bounds on diagonals to avoid walking off
395 either sequence. Because the bounds increase by at
396 most one for each distance, diag_lower and diag_upper
397 can each be of size at most max_diags+2 */
398
399 if (seq2_index == len2) {
400 diag_lower = k + 1;
401 end2_reached = true;
402 }
403 if (seq1_index == len1) {
404 diag_upper = k - 1;
405 end1_reached = true;
406 }
407 } // end loop over diagonals
408
409 // compute the maximum score possible for distance d
410 curr_score = curr_extent * (match_cost / 2) -
411 d * (match_cost + mismatch_cost);
412 // if this is the best score seen so far, update the
413 // statistics of the best alignment
414 if (curr_score > max_score[d - 1]) {
415 max_score[d] = curr_score;
416 best_dist = d;
417 best_diag = curr_diag;
418 seq2_align_len = curr_seq2_index;
419 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420 }
421 else {
422 max_score[d] = max_score[d - 1];
423 }
424
425 // alignment has finished if the lower and upper bounds
426 // on diagonals to check have converged to each other
427
428 if (diag_lower > diag_upper)
429 break;
430
431 /* set up for the next distance to examine. Because the
432 bounds increase by at most one for each distance,
433 diag_lower and diag_upper can each be of size at
434 most max_diags+2 */
435
436 if (!end2_reached)
437 diag_lower--;
438 if (!end1_reached)
439 diag_upper++;
440
441 if (edit_block == NULL) {
442 // if no traceback is specified, the next row of
443 // last_seq2_off can reuse previously allocated memory
444 //TODO FIXME The following assumes two arrays of
445 // at least max_dist+4 int's have already been allocated
446 last_seq2_off[d + 1] = last_seq2_off[d - 1];
447 }
448 else {
449 // traceback requires all rows of last_seq2_off to be saved,
450 // so a new row must be allocated
451 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452 // move the origin for this row backwards
453 //TODO FIXME: dubious pointer arithmetic ?!
454 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455 }
456 } // end loop over distinct distances
457
458
459 if (edit_block == NULL)
460 return best_dist;
461
462 //---- perform traceback
463 d = best_dist;
464 seq1_index = seq1_align_len;
465 seq2_index = seq2_align_len;
466 // for all positive distances
467
468 //if (fence_hit && *fence_hit)
469 // goto done;
470 if (index==len1 || index==len2) d=0;
471 while (d > 0) {
472 int new_diag;
473 int new_seq2_index;
474
475 /* retrieve the value of the diagonal after the next
476 traceback operation. best_diag starts off with the
477 value computed during the alignment process */
478
479 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480 best_diag, &new_seq2_index);
481
482 if (new_diag == best_diag) {
483 // same diagonal: issue a group of substitutions
484 if (seq2_index - new_seq2_index > 0) {
485 edit_block->opRep(seq2_index - new_seq2_index);
486 }
487 }
488 else if (new_diag < best_diag) {
489 // smaller diagonal: issue a group of substitutions
490 // and then a gap in seq2 */
491 if (seq2_index - new_seq2_index > 0) {
492 edit_block->opRep(seq2_index - new_seq2_index);
493 }
494 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495 edit_block->opIns(1);
496 }
497 else {
498 // larger diagonal: issue a group of substitutions
499 // and then a gap in seq1
500 if (seq2_index - new_seq2_index - 1 > 0) {
501 edit_block->opRep(seq2_index - new_seq2_index - 1);
502 }
503 edit_block->opDel(1);
504 }
505 d--;
506 best_diag = new_diag;
507 seq2_index = new_seq2_index;
508 }
509 //done:
510 // handle the final group of substitutions back to distance zero,
511 // i.e. back to offset zero of seq1 and seq2
512 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513 // last_seq2_off[0][diag_origin]);
514 edit_block->opRep(last_seq2_off[0][diag_origin]);
515 if (!reverse)
516 edit_block->reverse();
517 return best_dist;
518 }
519
520 void printEditScript(GXEditScript* ed_script) {
521 uint i;
522 if (ed_script==NULL || ed_script->opnum == 0)
523 return;
524 for (i=0; i<ed_script->opnum; i++) {
525 int num=((ed_script->ops[i]) >> 2);
526 unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527 if (op_type == 3)
528 GError("Error: printEditScript encountered op_type 3 ?!\n");
529 GMessage("%d%c ", num, xgapcodes[op_type]);
530 }
531 GMessage("\n");
532 }
533
534 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535 bool editscript, int reward, int penalty, int xdrop) {
536 int q_max=strlen(q_seq); //query
537 int s_max=strlen(s_seq); //subj
538 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
540 }
541
542 struct GXSeedTable {
543 int a_num, b_num;
544 int a_cap, b_cap;
545 char* xc;
546 GXSeedTable(int a=12, int b=255) {
547 a_cap=0;
548 b_cap=0;
549 a_num=0;
550 b_num=0;
551 xc=NULL;
552 init(a,b);
553 }
554 ~GXSeedTable() {
555 GFREE(xc);
556 }
557 void init(int a, int b) {
558 a_num=a;
559 b_num=b;
560 bool resize=false;
561 if (b_num>b_cap) { resize=true; b_cap=b_num;}
562 if (a_num>a_cap) { resize=true; a_cap=a_num;}
563 if (resize) {
564 GFREE(xc);
565 GCALLOC(xc, (a_num*b_num));
566 }
567 else {
568 //just clear up to a_max, b_max
569 memset((void*)xc, 0, (a_num*b_num));
570 }
571 }
572 char& x(int ax, int by) {
573 return xc[by*a_num+ax];
574 }
575
576 };
577
578
579 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
580 //overlap of right (3') end of seqb
581 //hash the first 12 bases of seqa:
582 int aimin=0;
583 int aimax=GMIN(9,(a_len-4));
584 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
585 int bimax=b_len-4;
586 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
587 int b_maxlen=b_len; //number of cols in the diagonal matrix
588 GXSeedTable gx(a_maxlen, b_maxlen);
589 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
590 for (int ai=aimin;ai<=aimax;ai++) {
591 int32* av=(int32 *)(&seqa[ai]);
592 //for (int bi=b_len-4;bi>=bimin;bi--) {
593 for (int bi=bimin;bi<=bimax;bi++) {
594 if (gx.x(ai,bi))
595 continue; //already have a previous seed covering this region of this diagonal
596 int32* bv=(int32 *)(&seqb[bi]);
597 if (*av == *bv) {
598 //word match
599 //see if we can extend to the right
600 gx.x(ai+1,bi+1)=1;
601 gx.x(ai+2,bi+2)=1;
602 gx.x(ai+3,bi+3)=1;
603 int aix=ai+4;
604 int bix=bi+4;
605 int len=4;
606 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
607 {
608 gx.x(aix,bix)=1;
609 aix++;bix++;
610 len++;
611 }
612 GXSeed* newseed=new GXSeed(ai,bi,len);
613 seeds.Add(newseed);
614 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
615 }
616 }
617 }//for each 4-mer at the beginning of seqa
618 for (int i=0;i<diagstrips->Count();i++) {
619 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
620 }
621 diagstrips->setSorted(true); //sort by score
622 return diagstrips;
623 }
624
625 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
626 //overlap of left (5') end of seqb
627 //hash the last 12 bases of seqa:
628 int aimin=GMAX(0,(a_len-16));
629 int aimax=a_len-4;
630 int bimin=0;
631 int bimax=GMIN((a_len-2), (b_len-4));
632 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
633 int b_maxlen=b_len; //number of cols in the diagonal matrix
634 //gx.init(a_maxlen, b_maxlen);
635 GXSeedTable gx(a_maxlen, b_maxlen);
636 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
637 for (int ai=aimin;ai<=aimax;ai++) {
638 int32* av=(int32 *)(&seqa[ai]);
639 for (int bi=bimin;bi<=bimax;bi++) {
640 if (gx.x(ai,bi))
641 continue; //already have a previous seed covering this region of this diagonal
642 int32* bv=(int32 *)(&seqb[bi]);
643 if (*av == *bv) {
644 //word match
645 //see if we can extend to the right
646 gx.x(ai+1,bi+1)=1;
647 gx.x(ai+2,bi+2)=1;
648 gx.x(ai+3,bi+3)=1;
649 int aix=ai+4;
650 int bix=bi+4;
651 int len=4;
652 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
653 {
654 gx.x(aix,bix)=1;
655 aix++;bix++;
656 len++;
657 }
658 GXSeed* newseed=new GXSeed(ai,bi,len);
659 seeds.Add(newseed);
660 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
661 }
662 }
663 }//for each 4-mer at the end of seqa
664 for (int i=0;i<diagstrips->Count();i++) {
665 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
666 }
667 diagstrips->setSorted(true); //sort by score
668 return diagstrips;
669 }
670
671
672
673 int cmpSeedScore(const pointer p1, const pointer p2) {
674 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
675 GXSeed* s1=(GXSeed*)p1;
676 GXSeed* s2=(GXSeed*)p2;
677 if (s1->len==s2->len) {
678 return (s1->b_ofs-s2->b_ofs);
679 }
680 else return (s2->len-s1->len);
681 }
682
683 int cmpSeedDiag(const pointer p1, const pointer p2) {
684 GXSeed* s1=(GXSeed*)p1;
685 GXSeed* s2=(GXSeed*)p2;
686 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
687 }
688
689 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
690 const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
691 bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
692 GXEditScript* ed_script_fwd = NULL;
693 GXEditScript* ed_script_rev = NULL;
694 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
695 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
696 q_alnstart--;
697 s_alnstart--;
698 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
699 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
700 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
701 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
702 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
703 const char* q=q_seq+q_alnstart;
704 int q_avail=q_max-q_alnstart;
705 const char* s=s_seq+s_alnstart;
706 int s_avail=s_max-s_alnstart;
707 if (penalty<0) penalty=-penalty;
708 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
709 GXAlnInfo* alninfo=NULL;
710 bool freeAlnMem=(gxmem==NULL);
711 if (freeAlnMem) {
712 gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
713 }
714 else gxmem->reset();
715 int retscore = 0;
716 int numdiffs = 0;
717 if (trimtype==galn_TrimLeft) {
718 if (editscript)
719 ed_script_rev=new GXEditScript();
720
721 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
722 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
723 //check this extension here and bail out if it's not a good extension
724 if (s_alnstart+1-s_ext_l>3) {
725 delete ed_script_rev;
726 if (freeAlnMem) delete gxmem;
727 return NULL;
728 }
729
730 if (editscript)
731 ed_script_fwd=new GXEditScript();
732 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
733 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
734 numdiffs=numdiffs_r+numdiffs_l;
735 //convert num diffs to actual score
736 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
737 if (editscript)
738 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
739 }
740 else {
741 if (editscript) {
742 ed_script_fwd=new GXEditScript();
743 }
744 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
745 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
746 //check extension here and bail out if not a good right extension
747 //assuming s_max is really at the right end of s_seq
748 if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
749 delete ed_script_fwd;
750 if (freeAlnMem) delete gxmem;
751 return NULL;
752 }
753 if (editscript)
754 ed_script_rev=new GXEditScript();
755 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
756 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
757 //convert num diffs to actual score
758 numdiffs=numdiffs_r+numdiffs_l;
759 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
760 if (editscript)
761 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
762 }
763
764 if (retscore>=MIN_GREEDY_SCORE) {
765 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
766 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
767 alninfo->score=retscore;
768 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
769 #ifdef GDEBUG
770 if (ed_script_rev) {
771 GMessage("Final Edit script ::: ");
772 printEditScript(ed_script_rev);
773 }
774 #endif
775 alninfo->editscript=ed_script_rev;
776 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
777 }
778 else {
779 if (freeAlnMem) delete gxmem;
780 delete ed_script_rev;
781 delete alninfo;
782 alninfo=NULL;
783 }
784 delete ed_script_fwd;
785 return alninfo;
786 }