ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/GXAlign.cpp
Revision: 89
Committed: Mon Oct 3 18:42:24 2011 UTC (9 years, 7 months ago) by gpertea
File size: 27293 byte(s)
Log Message:
separate wrappers for 5' and  3' alignments

Line User Rev File contents
1 gpertea 74 #include "GXAlign.h"
2    
3 gpertea 85 int GXMemPool::kMinSpace = 1000000;
4    
5 gpertea 74 #ifdef GDEBUG
6     char buf[6]={0x1B,'[', 'n','m','m','\0'};
7    
8     void color_fg(int c,FILE* f) {
9     if (f!=stderr && f!=stdout) return;
10     sprintf((char *)(&buf[2]),"%dm",c+30);
11     fwrite(buf,1,strlen(buf), f);
12     }
13    
14     void color_bg(int c, FILE* f) {
15     if (f!=stderr && f!=stdout) return;
16     sprintf((char *)(&buf[2]),"%dm",c+40);
17     fwrite(buf,1,strlen(buf),f);
18     };
19    
20     void color_resetfg(FILE* f) {
21     if (f!=stderr && f!=stdout) return;
22     sprintf((char *)(&buf[2]),"39m");
23     fwrite(buf,1,strlen(buf), f);
24     };
25    
26     void color_resetbg(FILE* f) {
27     if (f!=stderr && f!=stdout) return;
28     sprintf((char *)(&buf[2]),"49m");
29     fwrite(buf,1,strlen(buf), f);
30     }
31    
32     void color_reset(FILE* f) {
33     if (f!=stderr && f!=stdout) return;
34     sprintf((char *)(&buf[2]),"0m");
35     fwrite(buf,1,strlen(buf), f);
36     };
37    
38     void color_normal(FILE* f) {
39     if (f!=stderr && f!=stdout) return;
40     sprintf((char *)(&buf[2]),"22m");
41     fwrite(buf,1,strlen(buf), f);
42     };
43    
44     #endif
45    
46    
47     char xgapcodes[4]={'S','I', 'D', 'X'};
48    
49     int get_last(int **flast_d, int d, int diag, int *row1) {
50     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
51     *row1 = flast_d[d-1][diag-1];
52     return diag-1;
53     }
54     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
55     *row1 = flast_d[d-1][diag];
56     return diag;
57     }
58     *row1 = flast_d[d-1][diag+1];
59     return diag+1;
60     }
61    
62     void GapXEditScript::print() { //debug
63     GapXEditScript* p=this;
64     do {
65     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
66     } while ((p=p->next)!=NULL);
67     GMessage("\n");
68     }
69    
70    
71 gpertea 85 int BLAST_Gcd(int a, int b) {
72     int c;
73 gpertea 74
74 gpertea 85 b = abs(b);
75     if (b > a)
76     c=a, a=b, b=c;
77 gpertea 74
78 gpertea 85 while (b != 0) {
79     c = a%b;
80     a = b;
81     b = c;
82     }
83     return a;
84     }
85 gpertea 74
86 gpertea 85 int BLAST_Gdb3(int* a, int* b, int* c) {
87     int g;
88     if (*b == 0)
89     g = BLAST_Gcd(*a, *c);
90     else
91     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
92     if (g > 1) {
93     *a /= g;
94     *b /= g;
95     *c /= g;
96     }
97     return g;
98     }
99    
100    
101    
102     //signal that a diagonal is invalid
103     static const int kInvalidOffset = -2;
104    
105     int s_FindFirstMismatch(const char *seq1, int len1,
106     const char *seq2, int len2,
107     int seq1_index, int seq2_index,
108     //bool &fence_hit,
109     bool reverse) {
110     int start_index = seq1_index;
111     /* Sentry detection here should be relatively inexpensive: The
112     sentry value cannot appear in the query, so detection only
113     needs to be done at exit from the subject-query matching loop.
114     For uncompressed sequences, ambiguities in the query (i.e. seq1)
115     always count as mismatches */
116     if (reverse) {
117     while (seq1_index < len1 && seq2_index < len2 &&
118     //seq1[len1-1 - seq1_index] < 4 &&
119     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
120     ++seq1_index;
121     ++seq2_index;
122     }
123     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
124     //if len2-1-seq2_index<=0) {
125     // fence_hit = true;
126     // }
127 gpertea 74 }
128 gpertea 85 else { //forward lookup
129     while (seq1_index < len1 && seq2_index < len2 &&
130     //seq1[seq1_index] < 4 &&
131     seq1[seq1_index] == seq2[seq2_index]) {
132     ++seq1_index;
133     ++seq2_index;
134     }
135     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
136     //if (seq2_index==len2) {
137     // fence_hit = true;
138     //}
139     }
140     return seq1_index - start_index;
141     }
142 gpertea 74
143    
144    
145 gpertea 85 /** During the traceback for a non-affine greedy alignment,
146     compute the diagonal that will result from the next
147     traceback operation
148 gpertea 74
149 gpertea 85 @param last_seq2_off Array of offsets into the second sequence;
150     last_seq2_off[d][k] gives the largest offset into
151     the second sequence that lies on diagonal k and
152     has distance d [in]
153     @param d Starting distance [in]
154     @param diag Index of diagonal that produced the starting distance [in]
155     @param seq2_index The offset into the second sequence after the traceback
156     operation has completed [out]
157     @return The diagonal resulting from the next traceback operation
158     being applied
159     */
160     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
161     int diag, int *seq2_index) {
162     // choose the traceback operation that results in the
163     // largest seq2 offset at this point, then compute the
164     // new diagonal that is implied by the operation
165     if (last_seq2_off[d-1][diag-1] >
166     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
167     *seq2_index = last_seq2_off[d-1][diag-1];
168     return diag - 1; // gap in seq2
169     }
170     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
171     *seq2_index = last_seq2_off[d-1][diag];
172     return diag; // match
173     }
174     *seq2_index = last_seq2_off[d-1][diag+1];
175     return diag + 1; // gap in seq1
176     }
177    
178    
179     int GXGreedyExtend(const char* seq1, int len1,
180     const char* seq2, int len2,
181     bool reverse, int xdrop_threshold,
182     int match_cost, int mismatch_cost,
183     int& seq1_align_len, int& seq2_align_len,
184     SGreedyAlignMem& aux_data,
185     GXEditScript *edit_block) {
186     //GapPrelimEditBlock *edit_block,
187     //bool& fence_hit, SGreedySeed *seed) {
188     int seq1_index;
189     int seq2_index;
190     int index;
191     int d;
192     int k;
193     int diag_lower, diag_upper;
194     int max_dist;
195     int diag_origin;
196     int best_dist;
197     int best_diag;
198     int** last_seq2_off;
199     int* max_score;
200     int xdrop_offset;
201     int longest_match_run;
202     bool end1_reached, end2_reached;
203     GXMemPool* mem_pool;
204    
205     /* ordinary dynamic programming alignment, for each offset
206     in seq1, walks through offsets in seq2 until an X-dropoff
207     test fails, saving the best score encountered along
208     the way. Instead of score, this code tracks the 'distance'
209     (number of mismatches plus number of gaps) between seq1
210     and seq2. Instead of walking through sequence offsets, it
211     walks through diagonals that can achieve a given distance.
212    
213     Note that in what follows, the numbering of diagonals implies
214     a dot matrix where increasing seq1 offsets go to the right on
215     the x axis, and increasing seq2 offsets go up the y axis.
216     The gapped alignment thus proceeds up and to the right in
217     the graph, and diagonals are numbered increasing to the right */
218    
219     best_dist = 0;
220     best_diag = 0;
221    
222     /* set the number of distinct distances the algorithm will
223     examine in the search for an optimal alignment. The
224     heuristic worst-case running time of the algorithm is
225     O(max_dist**2 + (len1+len2)); for sequences which are
226     very similar, the average running time will be sig-
227     nificantly better than this */
228    
229     max_dist = GMIN(GREEDY_MAX_COST,
230     len2 / GREEDY_MAX_COST_FRACTION + 1);
231    
232     /* the main loop assumes that the index of all diagonals is
233     biased to lie in the middle of allocated bookkeeping
234     structures */
235    
236     diag_origin = max_dist + 2;
237    
238     // last_seq2_off[d][k] is the largest offset into seq2 that
239     // lies on diagonal k and has distance d
240    
241     last_seq2_off = aux_data.last_seq2_off;
242    
243     /* Instead of tracking the best alignment score and using
244     xdrop_theshold directly, track the best score for each
245     unique distance and use the best score for some previously
246     computed distance to implement the X-dropoff test.
247    
248     xdrop_offset gives the distance backwards in the score
249     array to look */
250    
251     xdrop_offset = (xdrop_threshold + match_cost / 2) /
252     (match_cost + mismatch_cost) + 1;
253    
254     // find the offset of the first mismatch between seq1 and seq2
255    
256     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
257     // fence_hit, reverse, rem);
258    
259     // update the extents of the alignment, and bail out
260     // early if no further work is needed
261    
262     seq1_align_len = index;
263     seq2_align_len = index;
264     seq1_index = index;
265     /*
266     seed->start_q = 0;
267     seed->start_s = 0;
268     seed->match_length = index;
269     */
270     longest_match_run = index;
271    
272     if (index == len1 || index == len2) {
273     /* Return the number of differences, which is zero here */
274     if (edit_block != NULL)
275     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
276     edit_block->opRep(index);
277     return 0;
278     }
279    
280     // set up the memory pool
281     mem_pool = aux_data.space;
282     if (edit_block == NULL) {
283     mem_pool = NULL;
284     }
285     else if (mem_pool == NULL) {
286     aux_data.space = mem_pool = new GXMemPool();
287     }
288     else {
289     mem_pool->refresh();
290     }
291    
292     /* set up the array of per-distance maximum scores. There
293     are max_diags + xdrop_offset distances to track, the first
294     xdrop_offset of which are 0 */
295    
296     max_score = aux_data.max_score + xdrop_offset;
297     for (index = 0; index < xdrop_offset; index++)
298     aux_data.max_score[index] = 0;
299    
300     // fill in the initial offsets of the distance matrix
301    
302     last_seq2_off[0][diag_origin] = seq1_index;
303     max_score[0] = seq1_index * match_cost;
304     diag_lower = diag_origin - 1;
305     diag_upper = diag_origin + 1;
306     end1_reached = end2_reached = false;
307    
308     // for each distance
309     for (d = 1; d <= max_dist; d++) {
310     int xdrop_score;
311     int curr_score;
312     int curr_extent = 0;
313     int curr_seq2_index = 0;
314     int curr_diag = 0;
315     int tmp_diag_lower = diag_lower;
316     int tmp_diag_upper = diag_upper;
317    
318     // Assign impossible seq2 offsets to any diagonals that
319     // are not in the range (diag_lower,diag_upper).
320     // These will serve as sentinel values for the inner loop
321     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
322     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
323     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
324     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
325    
326     // compute the score for distance d corresponding to the X-dropoff criterion
327    
328     xdrop_score = max_score[d - xdrop_offset] +
329     (match_cost + mismatch_cost) * d - xdrop_threshold;
330     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
331    
332     // for each diagonal of interest
333     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
334     /* find the largest offset into seq2 that increases
335     the distance from d-1 to d (i.e. keeps the alignment
336     from getting worse for as long as possible), then
337     choose the offset into seq1 that will keep the
338     resulting diagonal fixed at k
339    
340     Note that this requires kInvalidOffset+1 to be smaller
341     than any valid offset into seq2, i.e. to be negative */
342    
343     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
344     last_seq2_off[d - 1][k ]) + 1;
345     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
346     seq1_index = seq2_index + k - diag_origin;
347    
348     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
349    
350     // if no valid diagonal can reach distance d, or the
351     // X-dropoff test fails, narrow the range of diagonals
352     // to test and skip to the next diagonal
353     if (k == diag_lower)
354     diag_lower++;
355     else
356     last_seq2_off[d][k] = kInvalidOffset;
357     continue;
358     }
359     diag_upper = k;
360    
361     /* slide down diagonal k until a mismatch
362     occurs. As long as only matches are encountered,
363     the current distance d will not change */
364    
365     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
366     seq1_index, seq2_index, reverse);
367     //fence_hit, reverse, rem);
368     if (index > longest_match_run) {
369     //seed->start_q = seq1_index;
370     //seed->start_s = seq2_index;
371     //seed->match_length = index;
372     longest_match_run = index;
373 gpertea 74 }
374 gpertea 85 seq1_index += index;
375     seq2_index += index;
376    
377     // set the new largest seq2 offset that achieves
378     // distance d on diagonal k
379    
380     last_seq2_off[d][k] = seq2_index;
381    
382     // since all values of k are constrained to have the
383     // same distance d, the value of k which maximizes the
384     // alignment score is the one that covers the most of seq1 and seq2
385     if (seq1_index + seq2_index > curr_extent) {
386     curr_extent = seq1_index + seq2_index;
387     curr_seq2_index = seq2_index;
388     curr_diag = k;
389     }
390    
391     /* clamp the bounds on diagonals to avoid walking off
392     either sequence. Because the bounds increase by at
393     most one for each distance, diag_lower and diag_upper
394     can each be of size at most max_diags+2 */
395    
396     if (seq2_index == len2) {
397     diag_lower = k + 1;
398     end2_reached = true;
399 gpertea 74 }
400 gpertea 85 if (seq1_index == len1) {
401     diag_upper = k - 1;
402     end1_reached = true;
403 gpertea 74 }
404 gpertea 85 } // end loop over diagonals
405    
406     // compute the maximum score possible for distance d
407     curr_score = curr_extent * (match_cost / 2) -
408     d * (match_cost + mismatch_cost);
409     // if this is the best score seen so far, update the
410     // statistics of the best alignment
411     if (curr_score > max_score[d - 1]) {
412     max_score[d] = curr_score;
413     best_dist = d;
414     best_diag = curr_diag;
415     seq2_align_len = curr_seq2_index;
416     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
417     }
418     else {
419     max_score[d] = max_score[d - 1];
420     }
421    
422     // alignment has finished if the lower and upper bounds
423     // on diagonals to check have converged to each other
424    
425     if (diag_lower > diag_upper)
426     break;
427    
428     /* set up for the next distance to examine. Because the
429     bounds increase by at most one for each distance,
430     diag_lower and diag_upper can each be of size at
431     most max_diags+2 */
432    
433     if (!end2_reached)
434     diag_lower--;
435     if (!end1_reached)
436     diag_upper++;
437    
438     if (edit_block == NULL) {
439     // if no traceback is specified, the next row of
440     // last_seq2_off can reuse previously allocated memory
441     //TODO FIXME The following assumes two arrays of
442     // at least max_dist+4 int's have already been allocated
443     last_seq2_off[d + 1] = last_seq2_off[d - 1];
444 gpertea 74 }
445     else {
446 gpertea 85 // traceback requires all rows of last_seq2_off to be saved,
447     // so a new row must be allocated
448     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
449     // move the origin for this row backwards
450     //TODO FIXME: dubious pointer arithmetic ?!
451 gpertea 87 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
452 gpertea 85 }
453     } // end loop over distinct distances
454    
455    
456     if (edit_block == NULL)
457     return best_dist;
458    
459     //---- perform traceback
460     d = best_dist;
461     seq1_index = seq1_align_len;
462     seq2_index = seq2_align_len;
463     // for all positive distances
464    
465     //if (fence_hit && *fence_hit)
466     // goto done;
467     if (index==len1 || index==len2) d=0;
468     while (d > 0) {
469     int new_diag;
470     int new_seq2_index;
471    
472     /* retrieve the value of the diagonal after the next
473     traceback operation. best_diag starts off with the
474     value computed during the alignment process */
475    
476     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
477     best_diag, &new_seq2_index);
478    
479     if (new_diag == best_diag) {
480     // same diagonal: issue a group of substitutions
481     if (seq2_index - new_seq2_index > 0) {
482     edit_block->opRep(seq2_index - new_seq2_index);
483     }
484     }
485     else if (new_diag < best_diag) {
486     // smaller diagonal: issue a group of substitutions
487     // and then a gap in seq2 */
488     if (seq2_index - new_seq2_index > 0) {
489     edit_block->opRep(seq2_index - new_seq2_index);
490     }
491     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
492     edit_block->opIns(1);
493     }
494     else {
495     // larger diagonal: issue a group of substitutions
496     // and then a gap in seq1
497     if (seq2_index - new_seq2_index - 1 > 0) {
498     edit_block->opRep(seq2_index - new_seq2_index - 1);
499     }
500     edit_block->opDel(1);
501 gpertea 84 }
502 gpertea 85 d--;
503     best_diag = new_diag;
504     seq2_index = new_seq2_index;
505     }
506     //done:
507     // handle the final group of substitutions back to distance zero,
508     // i.e. back to offset zero of seq1 and seq2
509     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
510     // last_seq2_off[0][diag_origin]);
511     edit_block->opRep(last_seq2_off[0][diag_origin]);
512     if (!reverse)
513     edit_block->reverse();
514     return best_dist;
515 gpertea 74 }
516    
517     void printEditScript(GXEditScript* ed_script) {
518     uint i;
519     if (ed_script==NULL || ed_script->opnum == 0)
520     return;
521     for (i=0; i<ed_script->opnum; i++) {
522     int num=((ed_script->ops[i]) >> 2);
523     unsigned char op_type = 3 - ( ed_script->ops[i] & EDIT_OP_MASK );
524     if (op_type == 3)
525     GError("Error: printEditScript encountered op_type 3 ?!\n");
526     GMessage("%d%c ", num, xgapcodes[op_type]);
527     }
528     GMessage("\n");
529     }
530    
531     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
532     bool editscript, int reward, int penalty, int xdrop) {
533     int q_max=strlen(q_seq); //query
534     int s_max=strlen(s_seq); //subj
535     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
536 gpertea 89 s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
537 gpertea 74 }
538    
539 gpertea 75 struct GXSeedTable {
540 gpertea 77 int a_num, b_num;
541 gpertea 75 int a_cap, b_cap;
542 gpertea 77 char* xc;
543 gpertea 75 GXSeedTable(int a=12, int b=255) {
544 gpertea 77 a_cap=0;
545     b_cap=0;
546     a_num=0;
547     b_num=0;
548     xc=NULL;
549     init(a,b);
550     }
551     ~GXSeedTable() {
552     GFREE(xc);
553 gpertea 75 }
554     void init(int a, int b) {
555 gpertea 77 a_num=a;
556     b_num=b;
557     bool resize=false;
558     if (b_num>b_cap) { resize=true; b_cap=b_num;}
559     if (a_num>a_cap) { resize=true; a_cap=a_num;}
560     if (resize) {
561     GFREE(xc);
562     GCALLOC(xc, (a_num*b_num));
563 gpertea 75 }
564     else {
565     //just clear up to a_max, b_max
566 gpertea 77 memset((void*)xc, 0, (a_num*b_num));
567 gpertea 75 }
568     }
569 gpertea 77 char& x(int ax, int by) {
570     return xc[by*a_num+ax];
571     }
572    
573 gpertea 75 };
574 gpertea 74
575 gpertea 75
576 gpertea 77 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
577 gpertea 74 //overlap of right (3') end of seqb
578 gpertea 77 //hash the first 12 bases of seqa:
579 gpertea 89 int aimin=0;
580 gpertea 77 int aimax=GMIN(9,(a_len-4));
581 gpertea 74 int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
582 gpertea 89 int bimax=b_len-4;
583 gpertea 86 int a_maxlen=aimax+4; //number of rows in the diagonal matrix
584     int b_maxlen=b_len; //number of cols in the diagonal matrix
585 gpertea 89 GXSeedTable gx(a_maxlen, b_maxlen);
586 gpertea 86 GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
587 gpertea 89 for (int ai=aimin;ai<=aimax;ai++) {
588 gpertea 74 int32* av=(int32 *)(&seqa[ai]);
589 gpertea 89 //for (int bi=b_len-4;bi>=bimin;bi--) {
590     for (int bi=bimin;bi<=bimax;bi++) {
591 gpertea 77 if (gx.x(ai,bi))
592     continue; //already have a previous seed covering this region of this diagonal
593 gpertea 74 int32* bv=(int32 *)(&seqb[bi]);
594     if (*av == *bv) {
595     //word match
596     //see if we can extend to the right
597 gpertea 89 gx.x(ai+1,bi+1)=1;
598     gx.x(ai+2,bi+2)=1;
599     gx.x(ai+3,bi+3)=1;
600 gpertea 74 int aix=ai+4;
601     int bix=bi+4;
602     int len=4;
603 gpertea 86 while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
604     {
605     gx.x(aix,bix)=1;
606     aix++;bix++;
607     len++;
608     }
609 gpertea 77 GXSeed* newseed=new GXSeed(ai,bi,len);
610     seeds.Add(newseed);
611     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
612 gpertea 74 }
613     }
614     }//for each 4-mer at the beginning of seqa
615 gpertea 78 for (int i=0;i<diagstrips->Count();i++) {
616     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
617     }
618     diagstrips->setSorted(true); //sort by score
619 gpertea 77 return diagstrips;
620 gpertea 74 }
621    
622 gpertea 89 GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
623     //overlap of left (5') end of seqb
624     //hash the last 12 bases of seqa:
625     int aimin=GMAX(0,(a_len-16));
626     int aimax=a_len-4;
627     int bimin=0;
628     int bimax=GMIN((a_len-2), (b_len-4));
629     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
630     int b_maxlen=b_len; //number of cols in the diagonal matrix
631     //gx.init(a_maxlen, b_maxlen);
632     GXSeedTable gx(a_maxlen, b_maxlen);
633     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
634     for (int ai=aimin;ai<=aimax;ai++) {
635     int32* av=(int32 *)(&seqa[ai]);
636     for (int bi=bimin;bi<=bimax;bi++) {
637     if (gx.x(ai,bi))
638     continue; //already have a previous seed covering this region of this diagonal
639     int32* bv=(int32 *)(&seqb[bi]);
640     if (*av == *bv) {
641     //word match
642     //see if we can extend to the right
643     gx.x(ai+1,bi+1)=1;
644     gx.x(ai+2,bi+2)=1;
645     gx.x(ai+3,bi+3)=1;
646     int aix=ai+4;
647     int bix=bi+4;
648     int len=4;
649     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
650     {
651     gx.x(aix,bix)=1;
652     aix++;bix++;
653     len++;
654     }
655     GXSeed* newseed=new GXSeed(ai,bi,len);
656     seeds.Add(newseed);
657     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
658     }
659     }
660     }//for each 4-mer at the end of seqa
661     for (int i=0;i<diagstrips->Count();i++) {
662     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
663     }
664     diagstrips->setSorted(true); //sort by score
665     return diagstrips;
666     }
667    
668    
669    
670 gpertea 81 int cmpSeedScore(const pointer p1, const pointer p2) {
671     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
672     GXSeed* s1=(GXSeed*)p1;
673     GXSeed* s2=(GXSeed*)p2;
674     if (s1->len==s2->len) {
675     return (s1->b_ofs-s2->b_ofs);
676     }
677     else return (s2->len-s1->len);
678 gpertea 78 }
679    
680 gpertea 81 int cmpSeedDiag(const pointer p1, const pointer p2) {
681     GXSeed* s1=(GXSeed*)p1;
682     GXSeed* s2=(GXSeed*)p2;
683     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
684     }
685 gpertea 78
686 gpertea 74 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
687 gpertea 89 const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
688     bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
689 gpertea 74 GXEditScript* ed_script_fwd = NULL;
690     GXEditScript* ed_script_rev = NULL;
691     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
692     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
693     q_alnstart--;
694     s_alnstart--;
695     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
696     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
697     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
698     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
699     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
700     const char* q=q_seq+q_alnstart;
701     int q_avail=q_max-q_alnstart;
702     const char* s=s_seq+s_alnstart;
703     int s_avail=s_max-s_alnstart;
704 gpertea 85 if (penalty<0) penalty=-penalty;
705 gpertea 74 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
706 gpertea 89 GXAlnInfo* alninfo=NULL;
707     bool freeAlnMem=(gxmem==NULL);
708     if (freeAlnMem) {
709     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
710     }
711     else gxmem->reset();
712     int retscore = 0;
713     int numdiffs = 0;
714     if (trimtype==galn_TrimLeft) {
715     if (editscript)
716     ed_script_rev=new GXEditScript();
717 gpertea 81
718 gpertea 89 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
719     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
720     //check this extension here and bail out if it's not a good extension
721     if (s_alnstart+1-s_ext_l>3) {
722     delete ed_script_rev;
723     if (freeAlnMem) delete gxmem;
724     return NULL;
725     }
726    
727     if (editscript)
728     ed_script_fwd=new GXEditScript();
729     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
730     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
731     numdiffs=numdiffs_r+numdiffs_l;
732     //convert num diffs to actual score
733     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
734     if (editscript)
735     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
736 gpertea 74 }
737 gpertea 89 else {
738     if (editscript) {
739     ed_script_fwd=new GXEditScript();
740     }
741     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
742     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
743     //check extension here and bail out if not a good right extension
744     //assuming s_max is really at the right end of s_seq
745     if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
746     delete ed_script_fwd;
747     if (freeAlnMem) delete gxmem;
748     return NULL;
749     }
750     if (editscript)
751     ed_script_rev=new GXEditScript();
752     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
753     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
754     //convert num diffs to actual score
755     numdiffs=numdiffs_r+numdiffs_l;
756     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
757     if (editscript)
758     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
759     }
760    
761 gpertea 74 if (retscore>=MIN_GREEDY_SCORE) {
762     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
763     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
764     alninfo->score=retscore;
765     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
766     #ifdef GDEBUG
767     if (ed_script_rev) {
768     GMessage("Final Edit script ::: ");
769     printEditScript(ed_script_rev);
770     }
771     #endif
772     alninfo->editscript=ed_script_rev;
773     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
774     }
775     else {
776 gpertea 89 if (freeAlnMem) delete gxmem;
777 gpertea 74 delete ed_script_rev;
778 gpertea 78 delete alninfo;
779     alninfo=NULL;
780 gpertea 74 }
781     delete ed_script_fwd;
782     return alninfo;
783     }