ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.cpp
Revision: 89
Committed: Mon Oct 3 18:42:24 2011 UTC (9 years, 6 months ago) by gpertea
File size: 27293 byte(s)
Log Message:
separate wrappers for 5' and  3' alignments

Line File contents
1 #include "GXAlign.h"
2
3 int GXMemPool::kMinSpace = 1000000;
4
5 #ifdef GDEBUG
6 char buf[6]={0x1B,'[', 'n','m','m','\0'};
7
8 void color_fg(int c,FILE* f) {
9 if (f!=stderr && f!=stdout) return;
10 sprintf((char *)(&buf[2]),"%dm",c+30);
11 fwrite(buf,1,strlen(buf), f);
12 }
13
14 void color_bg(int c, FILE* f) {
15 if (f!=stderr && f!=stdout) return;
16 sprintf((char *)(&buf[2]),"%dm",c+40);
17 fwrite(buf,1,strlen(buf),f);
18 };
19
20 void color_resetfg(FILE* f) {
21 if (f!=stderr && f!=stdout) return;
22 sprintf((char *)(&buf[2]),"39m");
23 fwrite(buf,1,strlen(buf), f);
24 };
25
26 void color_resetbg(FILE* f) {
27 if (f!=stderr && f!=stdout) return;
28 sprintf((char *)(&buf[2]),"49m");
29 fwrite(buf,1,strlen(buf), f);
30 }
31
32 void color_reset(FILE* f) {
33 if (f!=stderr && f!=stdout) return;
34 sprintf((char *)(&buf[2]),"0m");
35 fwrite(buf,1,strlen(buf), f);
36 };
37
38 void color_normal(FILE* f) {
39 if (f!=stderr && f!=stdout) return;
40 sprintf((char *)(&buf[2]),"22m");
41 fwrite(buf,1,strlen(buf), f);
42 };
43
44 #endif
45
46
47 char xgapcodes[4]={'S','I', 'D', 'X'};
48
49 int get_last(int **flast_d, int d, int diag, int *row1) {
50 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
51 *row1 = flast_d[d-1][diag-1];
52 return diag-1;
53 }
54 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
55 *row1 = flast_d[d-1][diag];
56 return diag;
57 }
58 *row1 = flast_d[d-1][diag+1];
59 return diag+1;
60 }
61
62 void GapXEditScript::print() { //debug
63 GapXEditScript* p=this;
64 do {
65 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
66 } while ((p=p->next)!=NULL);
67 GMessage("\n");
68 }
69
70
71 int BLAST_Gcd(int a, int b) {
72 int c;
73
74 b = abs(b);
75 if (b > a)
76 c=a, a=b, b=c;
77
78 while (b != 0) {
79 c = a%b;
80 a = b;
81 b = c;
82 }
83 return a;
84 }
85
86 int BLAST_Gdb3(int* a, int* b, int* c) {
87 int g;
88 if (*b == 0)
89 g = BLAST_Gcd(*a, *c);
90 else
91 g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
92 if (g > 1) {
93 *a /= g;
94 *b /= g;
95 *c /= g;
96 }
97 return g;
98 }
99
100
101
102 //signal that a diagonal is invalid
103 static const int kInvalidOffset = -2;
104
105 int s_FindFirstMismatch(const char *seq1, int len1,
106 const char *seq2, int len2,
107 int seq1_index, int seq2_index,
108 //bool &fence_hit,
109 bool reverse) {
110 int start_index = seq1_index;
111 /* Sentry detection here should be relatively inexpensive: The
112 sentry value cannot appear in the query, so detection only
113 needs to be done at exit from the subject-query matching loop.
114 For uncompressed sequences, ambiguities in the query (i.e. seq1)
115 always count as mismatches */
116 if (reverse) {
117 while (seq1_index < len1 && seq2_index < len2 &&
118 //seq1[len1-1 - seq1_index] < 4 &&
119 seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
120 ++seq1_index;
121 ++seq2_index;
122 }
123 //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
124 //if len2-1-seq2_index<=0) {
125 // fence_hit = true;
126 // }
127 }
128 else { //forward lookup
129 while (seq1_index < len1 && seq2_index < len2 &&
130 //seq1[seq1_index] < 4 &&
131 seq1[seq1_index] == seq2[seq2_index]) {
132 ++seq1_index;
133 ++seq2_index;
134 }
135 //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
136 //if (seq2_index==len2) {
137 // fence_hit = true;
138 //}
139 }
140 return seq1_index - start_index;
141 }
142
143
144
145 /** During the traceback for a non-affine greedy alignment,
146 compute the diagonal that will result from the next
147 traceback operation
148
149 @param last_seq2_off Array of offsets into the second sequence;
150 last_seq2_off[d][k] gives the largest offset into
151 the second sequence that lies on diagonal k and
152 has distance d [in]
153 @param d Starting distance [in]
154 @param diag Index of diagonal that produced the starting distance [in]
155 @param seq2_index The offset into the second sequence after the traceback
156 operation has completed [out]
157 @return The diagonal resulting from the next traceback operation
158 being applied
159 */
160 int s_GetNextNonAffineTback(int **last_seq2_off, int d,
161 int diag, int *seq2_index) {
162 // choose the traceback operation that results in the
163 // largest seq2 offset at this point, then compute the
164 // new diagonal that is implied by the operation
165 if (last_seq2_off[d-1][diag-1] >
166 GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
167 *seq2_index = last_seq2_off[d-1][diag-1];
168 return diag - 1; // gap in seq2
169 }
170 if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
171 *seq2_index = last_seq2_off[d-1][diag];
172 return diag; // match
173 }
174 *seq2_index = last_seq2_off[d-1][diag+1];
175 return diag + 1; // gap in seq1
176 }
177
178
179 int GXGreedyExtend(const char* seq1, int len1,
180 const char* seq2, int len2,
181 bool reverse, int xdrop_threshold,
182 int match_cost, int mismatch_cost,
183 int& seq1_align_len, int& seq2_align_len,
184 SGreedyAlignMem& aux_data,
185 GXEditScript *edit_block) {
186 //GapPrelimEditBlock *edit_block,
187 //bool& fence_hit, SGreedySeed *seed) {
188 int seq1_index;
189 int seq2_index;
190 int index;
191 int d;
192 int k;
193 int diag_lower, diag_upper;
194 int max_dist;
195 int diag_origin;
196 int best_dist;
197 int best_diag;
198 int** last_seq2_off;
199 int* max_score;
200 int xdrop_offset;
201 int longest_match_run;
202 bool end1_reached, end2_reached;
203 GXMemPool* mem_pool;
204
205 /* ordinary dynamic programming alignment, for each offset
206 in seq1, walks through offsets in seq2 until an X-dropoff
207 test fails, saving the best score encountered along
208 the way. Instead of score, this code tracks the 'distance'
209 (number of mismatches plus number of gaps) between seq1
210 and seq2. Instead of walking through sequence offsets, it
211 walks through diagonals that can achieve a given distance.
212
213 Note that in what follows, the numbering of diagonals implies
214 a dot matrix where increasing seq1 offsets go to the right on
215 the x axis, and increasing seq2 offsets go up the y axis.
216 The gapped alignment thus proceeds up and to the right in
217 the graph, and diagonals are numbered increasing to the right */
218
219 best_dist = 0;
220 best_diag = 0;
221
222 /* set the number of distinct distances the algorithm will
223 examine in the search for an optimal alignment. The
224 heuristic worst-case running time of the algorithm is
225 O(max_dist**2 + (len1+len2)); for sequences which are
226 very similar, the average running time will be sig-
227 nificantly better than this */
228
229 max_dist = GMIN(GREEDY_MAX_COST,
230 len2 / GREEDY_MAX_COST_FRACTION + 1);
231
232 /* the main loop assumes that the index of all diagonals is
233 biased to lie in the middle of allocated bookkeeping
234 structures */
235
236 diag_origin = max_dist + 2;
237
238 // last_seq2_off[d][k] is the largest offset into seq2 that
239 // lies on diagonal k and has distance d
240
241 last_seq2_off = aux_data.last_seq2_off;
242
243 /* Instead of tracking the best alignment score and using
244 xdrop_theshold directly, track the best score for each
245 unique distance and use the best score for some previously
246 computed distance to implement the X-dropoff test.
247
248 xdrop_offset gives the distance backwards in the score
249 array to look */
250
251 xdrop_offset = (xdrop_threshold + match_cost / 2) /
252 (match_cost + mismatch_cost) + 1;
253
254 // find the offset of the first mismatch between seq1 and seq2
255
256 index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
257 // fence_hit, reverse, rem);
258
259 // update the extents of the alignment, and bail out
260 // early if no further work is needed
261
262 seq1_align_len = index;
263 seq2_align_len = index;
264 seq1_index = index;
265 /*
266 seed->start_q = 0;
267 seed->start_s = 0;
268 seed->match_length = index;
269 */
270 longest_match_run = index;
271
272 if (index == len1 || index == len2) {
273 /* Return the number of differences, which is zero here */
274 if (edit_block != NULL)
275 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
276 edit_block->opRep(index);
277 return 0;
278 }
279
280 // set up the memory pool
281 mem_pool = aux_data.space;
282 if (edit_block == NULL) {
283 mem_pool = NULL;
284 }
285 else if (mem_pool == NULL) {
286 aux_data.space = mem_pool = new GXMemPool();
287 }
288 else {
289 mem_pool->refresh();
290 }
291
292 /* set up the array of per-distance maximum scores. There
293 are max_diags + xdrop_offset distances to track, the first
294 xdrop_offset of which are 0 */
295
296 max_score = aux_data.max_score + xdrop_offset;
297 for (index = 0; index < xdrop_offset; index++)
298 aux_data.max_score[index] = 0;
299
300 // fill in the initial offsets of the distance matrix
301
302 last_seq2_off[0][diag_origin] = seq1_index;
303 max_score[0] = seq1_index * match_cost;
304 diag_lower = diag_origin - 1;
305 diag_upper = diag_origin + 1;
306 end1_reached = end2_reached = false;
307
308 // for each distance
309 for (d = 1; d <= max_dist; d++) {
310 int xdrop_score;
311 int curr_score;
312 int curr_extent = 0;
313 int curr_seq2_index = 0;
314 int curr_diag = 0;
315 int tmp_diag_lower = diag_lower;
316 int tmp_diag_upper = diag_upper;
317
318 // Assign impossible seq2 offsets to any diagonals that
319 // are not in the range (diag_lower,diag_upper).
320 // These will serve as sentinel values for the inner loop
321 last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
322 last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
323 last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
324 last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
325
326 // compute the score for distance d corresponding to the X-dropoff criterion
327
328 xdrop_score = max_score[d - xdrop_offset] +
329 (match_cost + mismatch_cost) * d - xdrop_threshold;
330 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
331
332 // for each diagonal of interest
333 for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
334 /* find the largest offset into seq2 that increases
335 the distance from d-1 to d (i.e. keeps the alignment
336 from getting worse for as long as possible), then
337 choose the offset into seq1 that will keep the
338 resulting diagonal fixed at k
339
340 Note that this requires kInvalidOffset+1 to be smaller
341 than any valid offset into seq2, i.e. to be negative */
342
343 seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
344 last_seq2_off[d - 1][k ]) + 1;
345 seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
346 seq1_index = seq2_index + k - diag_origin;
347
348 if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
349
350 // if no valid diagonal can reach distance d, or the
351 // X-dropoff test fails, narrow the range of diagonals
352 // to test and skip to the next diagonal
353 if (k == diag_lower)
354 diag_lower++;
355 else
356 last_seq2_off[d][k] = kInvalidOffset;
357 continue;
358 }
359 diag_upper = k;
360
361 /* slide down diagonal k until a mismatch
362 occurs. As long as only matches are encountered,
363 the current distance d will not change */
364
365 index = s_FindFirstMismatch(seq1, len1, seq2, len2,
366 seq1_index, seq2_index, reverse);
367 //fence_hit, reverse, rem);
368 if (index > longest_match_run) {
369 //seed->start_q = seq1_index;
370 //seed->start_s = seq2_index;
371 //seed->match_length = index;
372 longest_match_run = index;
373 }
374 seq1_index += index;
375 seq2_index += index;
376
377 // set the new largest seq2 offset that achieves
378 // distance d on diagonal k
379
380 last_seq2_off[d][k] = seq2_index;
381
382 // since all values of k are constrained to have the
383 // same distance d, the value of k which maximizes the
384 // alignment score is the one that covers the most of seq1 and seq2
385 if (seq1_index + seq2_index > curr_extent) {
386 curr_extent = seq1_index + seq2_index;
387 curr_seq2_index = seq2_index;
388 curr_diag = k;
389 }
390
391 /* clamp the bounds on diagonals to avoid walking off
392 either sequence. Because the bounds increase by at
393 most one for each distance, diag_lower and diag_upper
394 can each be of size at most max_diags+2 */
395
396 if (seq2_index == len2) {
397 diag_lower = k + 1;
398 end2_reached = true;
399 }
400 if (seq1_index == len1) {
401 diag_upper = k - 1;
402 end1_reached = true;
403 }
404 } // end loop over diagonals
405
406 // compute the maximum score possible for distance d
407 curr_score = curr_extent * (match_cost / 2) -
408 d * (match_cost + mismatch_cost);
409 // if this is the best score seen so far, update the
410 // statistics of the best alignment
411 if (curr_score > max_score[d - 1]) {
412 max_score[d] = curr_score;
413 best_dist = d;
414 best_diag = curr_diag;
415 seq2_align_len = curr_seq2_index;
416 seq1_align_len = curr_seq2_index + best_diag - diag_origin;
417 }
418 else {
419 max_score[d] = max_score[d - 1];
420 }
421
422 // alignment has finished if the lower and upper bounds
423 // on diagonals to check have converged to each other
424
425 if (diag_lower > diag_upper)
426 break;
427
428 /* set up for the next distance to examine. Because the
429 bounds increase by at most one for each distance,
430 diag_lower and diag_upper can each be of size at
431 most max_diags+2 */
432
433 if (!end2_reached)
434 diag_lower--;
435 if (!end1_reached)
436 diag_upper++;
437
438 if (edit_block == NULL) {
439 // if no traceback is specified, the next row of
440 // last_seq2_off can reuse previously allocated memory
441 //TODO FIXME The following assumes two arrays of
442 // at least max_dist+4 int's have already been allocated
443 last_seq2_off[d + 1] = last_seq2_off[d - 1];
444 }
445 else {
446 // traceback requires all rows of last_seq2_off to be saved,
447 // so a new row must be allocated
448 last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
449 // move the origin for this row backwards
450 //TODO FIXME: dubious pointer arithmetic ?!
451 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
452 }
453 } // end loop over distinct distances
454
455
456 if (edit_block == NULL)
457 return best_dist;
458
459 //---- perform traceback
460 d = best_dist;
461 seq1_index = seq1_align_len;
462 seq2_index = seq2_align_len;
463 // for all positive distances
464
465 //if (fence_hit && *fence_hit)
466 // goto done;
467 if (index==len1 || index==len2) d=0;
468 while (d > 0) {
469 int new_diag;
470 int new_seq2_index;
471
472 /* retrieve the value of the diagonal after the next
473 traceback operation. best_diag starts off with the
474 value computed during the alignment process */
475
476 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
477 best_diag, &new_seq2_index);
478
479 if (new_diag == best_diag) {
480 // same diagonal: issue a group of substitutions
481 if (seq2_index - new_seq2_index > 0) {
482 edit_block->opRep(seq2_index - new_seq2_index);
483 }
484 }
485 else if (new_diag < best_diag) {
486 // smaller diagonal: issue a group of substitutions
487 // and then a gap in seq2 */
488 if (seq2_index - new_seq2_index > 0) {
489 edit_block->opRep(seq2_index - new_seq2_index);
490 }
491 //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
492 edit_block->opIns(1);
493 }
494 else {
495 // larger diagonal: issue a group of substitutions
496 // and then a gap in seq1
497 if (seq2_index - new_seq2_index - 1 > 0) {
498 edit_block->opRep(seq2_index - new_seq2_index - 1);
499 }
500 edit_block->opDel(1);
501 }
502 d--;
503 best_diag = new_diag;
504 seq2_index = new_seq2_index;
505 }
506 //done:
507 // handle the final group of substitutions back to distance zero,
508 // i.e. back to offset zero of seq1 and seq2
509 //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
510 // last_seq2_off[0][diag_origin]);
511 edit_block->opRep(last_seq2_off[0][diag_origin]);
512 if (!reverse)
513 edit_block->reverse();
514 return best_dist;
515 }
516
517 void printEditScript(GXEditScript* ed_script) {
518 uint i;
519 if (ed_script==NULL || ed_script->opnum == 0)
520 return;
521 for (i=0; i<ed_script->opnum; i++) {
522 int num=((ed_script->ops[i]) >> 2);
523 unsigned char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
524 if (op_type == 3)
525 GError("Error: printEditScript encountered op_type 3 ?!\n");
526 GMessage("%d%c ", num, xgapcodes[op_type]);
527 }
528 GMessage("\n");
529 }
530
531 GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
532 bool editscript, int reward, int penalty, int xdrop) {
533 int q_max=strlen(q_seq); //query
534 int s_max=strlen(s_seq); //subj
535 return GreedyAlignRegion(q_seq, q_alnstart, q_max,
536 s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
537 }
538
539 struct GXSeedTable {
540 int a_num, b_num;
541 int a_cap, b_cap;
542 char* xc;
543 GXSeedTable(int a=12, int b=255) {
544 a_cap=0;
545 b_cap=0;
546 a_num=0;
547 b_num=0;
548 xc=NULL;
549 init(a,b);
550 }
551 ~GXSeedTable() {
552 GFREE(xc);
553 }
554 void init(int a, int b) {
555 a_num=a;
556 b_num=b;
557 bool resize=false;
558 if (b_num>b_cap) { resize=true; b_cap=b_num;}
559 if (a_num>a_cap) { resize=true; a_cap=a_num;}
560 if (resize) {
561 GFREE(xc);
562 GCALLOC(xc, (a_num*b_num));
563 }
564 else {
565 //just clear up to a_max, b_max
566 memset((void*)xc, 0, (a_num*b_num));
567 }
568 }
569 char& x(int ax, int by) {
570 return xc[by*a_num+ax];
571 }
572
573 };
574
575
576 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
577 //overlap of right (3') end of seqb
578 //hash the first 12 bases of seqa:
579 int aimin=0;
580 int aimax=GMIN(9,(a_len-4));
581 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
582 int bimax=b_len-4;
583 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
584 int b_maxlen=b_len; //number of cols in the diagonal matrix
585 GXSeedTable gx(a_maxlen, b_maxlen);
586 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
587 for (int ai=aimin;ai<=aimax;ai++) {
588 int32* av=(int32 *)(&seqa[ai]);
589 //for (int bi=b_len-4;bi>=bimin;bi--) {
590 for (int bi=bimin;bi<=bimax;bi++) {
591 if (gx.x(ai,bi))
592 continue; //already have a previous seed covering this region of this diagonal
593 int32* bv=(int32 *)(&seqb[bi]);
594 if (*av == *bv) {
595 //word match
596 //see if we can extend to the right
597 gx.x(ai+1,bi+1)=1;
598 gx.x(ai+2,bi+2)=1;
599 gx.x(ai+3,bi+3)=1;
600 int aix=ai+4;
601 int bix=bi+4;
602 int len=4;
603 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
604 {
605 gx.x(aix,bix)=1;
606 aix++;bix++;
607 len++;
608 }
609 GXSeed* newseed=new GXSeed(ai,bi,len);
610 seeds.Add(newseed);
611 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
612 }
613 }
614 }//for each 4-mer at the beginning of seqa
615 for (int i=0;i<diagstrips->Count();i++) {
616 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
617 }
618 diagstrips->setSorted(true); //sort by score
619 return diagstrips;
620 }
621
622 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
623 //overlap of left (5') end of seqb
624 //hash the last 12 bases of seqa:
625 int aimin=GMAX(0,(a_len-16));
626 int aimax=a_len-4;
627 int bimin=0;
628 int bimax=GMIN((a_len-2), (b_len-4));
629 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
630 int b_maxlen=b_len; //number of cols in the diagonal matrix
631 //gx.init(a_maxlen, b_maxlen);
632 GXSeedTable gx(a_maxlen, b_maxlen);
633 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
634 for (int ai=aimin;ai<=aimax;ai++) {
635 int32* av=(int32 *)(&seqa[ai]);
636 for (int bi=bimin;bi<=bimax;bi++) {
637 if (gx.x(ai,bi))
638 continue; //already have a previous seed covering this region of this diagonal
639 int32* bv=(int32 *)(&seqb[bi]);
640 if (*av == *bv) {
641 //word match
642 //see if we can extend to the right
643 gx.x(ai+1,bi+1)=1;
644 gx.x(ai+2,bi+2)=1;
645 gx.x(ai+3,bi+3)=1;
646 int aix=ai+4;
647 int bix=bi+4;
648 int len=4;
649 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
650 {
651 gx.x(aix,bix)=1;
652 aix++;bix++;
653 len++;
654 }
655 GXSeed* newseed=new GXSeed(ai,bi,len);
656 seeds.Add(newseed);
657 diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
658 }
659 }
660 }//for each 4-mer at the end of seqa
661 for (int i=0;i<diagstrips->Count();i++) {
662 diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
663 }
664 diagstrips->setSorted(true); //sort by score
665 return diagstrips;
666 }
667
668
669
670 int cmpSeedScore(const pointer p1, const pointer p2) {
671 //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
672 GXSeed* s1=(GXSeed*)p1;
673 GXSeed* s2=(GXSeed*)p2;
674 if (s1->len==s2->len) {
675 return (s1->b_ofs-s2->b_ofs);
676 }
677 else return (s2->len-s1->len);
678 }
679
680 int cmpSeedDiag(const pointer p1, const pointer p2) {
681 GXSeed* s1=(GXSeed*)p1;
682 GXSeed* s2=(GXSeed*)p2;
683 return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
684 }
685
686 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
687 const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
688 bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
689 GXEditScript* ed_script_fwd = NULL;
690 GXEditScript* ed_script_rev = NULL;
691 if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
692 GError("GreedyAlign() Error: invalid anchor coordinate.\n");
693 q_alnstart--;
694 s_alnstart--;
695 if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
696 GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
697 if (q_seq[q_alnstart]!=s_seq[s_alnstart])
698 GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
699 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
700 const char* q=q_seq+q_alnstart;
701 int q_avail=q_max-q_alnstart;
702 const char* s=s_seq+s_alnstart;
703 int s_avail=s_max-s_alnstart;
704 if (penalty<0) penalty=-penalty;
705 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
706 GXAlnInfo* alninfo=NULL;
707 bool freeAlnMem=(gxmem==NULL);
708 if (freeAlnMem) {
709 gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
710 }
711 else gxmem->reset();
712 int retscore = 0;
713 int numdiffs = 0;
714 if (trimtype==galn_TrimLeft) {
715 if (editscript)
716 ed_script_rev=new GXEditScript();
717
718 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
719 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
720 //check this extension here and bail out if it's not a good extension
721 if (s_alnstart+1-s_ext_l>3) {
722 delete ed_script_rev;
723 if (freeAlnMem) delete gxmem;
724 return NULL;
725 }
726
727 if (editscript)
728 ed_script_fwd=new GXEditScript();
729 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
730 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
731 numdiffs=numdiffs_r+numdiffs_l;
732 //convert num diffs to actual score
733 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
734 if (editscript)
735 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
736 }
737 else {
738 if (editscript) {
739 ed_script_fwd=new GXEditScript();
740 }
741 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
742 reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
743 //check extension here and bail out if not a good right extension
744 //assuming s_max is really at the right end of s_seq
745 if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
746 delete ed_script_fwd;
747 if (freeAlnMem) delete gxmem;
748 return NULL;
749 }
750 if (editscript)
751 ed_script_rev=new GXEditScript();
752 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
753 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
754 //convert num diffs to actual score
755 numdiffs=numdiffs_r+numdiffs_l;
756 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
757 if (editscript)
758 ed_script_rev->Append(ed_script_fwd); //combine the two extensions
759 }
760
761 if (retscore>=MIN_GREEDY_SCORE) {
762 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
763 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
764 alninfo->score=retscore;
765 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
766 #ifdef GDEBUG
767 if (ed_script_rev) {
768 GMessage("Final Edit script ::: ");
769 printEditScript(ed_script_rev);
770 }
771 #endif
772 alninfo->editscript=ed_script_rev;
773 alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
774 }
775 else {
776 if (freeAlnMem) delete gxmem;
777 delete ed_script_rev;
778 delete alninfo;
779 alninfo=NULL;
780 }
781 delete ed_script_fwd;
782 return alninfo;
783 }