ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 96
Committed: Wed Oct 5 18:31:48 2011 UTC (7 years, 9 months ago) by gpertea
File size: 31760 byte(s)
Log Message:
moved match_LeftEnd() and match_RightEnd() to GAlnExtend

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56     }
57     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70     GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96     *a /= g;
97     *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104    
105     //signal that a diagonal is invalid
106     static const int kInvalidOffset = -2;
107    
108     int s_FindFirstMismatch(const char *seq1, int len1,
109     const char *seq2, int len2,
110     int seq1_index, int seq2_index,
111     //bool &fence_hit,
112     bool reverse) {
113     int start_index = seq1_index;
114     /* Sentry detection here should be relatively inexpensive: The
115     sentry value cannot appear in the query, so detection only
116     needs to be done at exit from the subject-query matching loop.
117     For uncompressed sequences, ambiguities in the query (i.e. seq1)
118     always count as mismatches */
119     if (reverse) {
120     while (seq1_index < len1 && seq2_index < len2 &&
121     //seq1[len1-1 - seq1_index] < 4 &&
122     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123     ++seq1_index;
124     ++seq2_index;
125     }
126     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127     //if len2-1-seq2_index<=0) {
128     // fence_hit = true;
129     // }
130     }
131     else { //forward lookup
132     while (seq1_index < len1 && seq2_index < len2 &&
133     //seq1[seq1_index] < 4 &&
134     seq1[seq1_index] == seq2[seq2_index]) {
135     ++seq1_index;
136     ++seq2_index;
137     }
138     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139     //if (seq2_index==len2) {
140     // fence_hit = true;
141     //}
142     }
143     return seq1_index - start_index;
144     }
145    
146    
147    
148     /** During the traceback for a non-affine greedy alignment,
149     compute the diagonal that will result from the next
150     traceback operation
151    
152     @param last_seq2_off Array of offsets into the second sequence;
153     last_seq2_off[d][k] gives the largest offset into
154     the second sequence that lies on diagonal k and
155     has distance d [in]
156     @param d Starting distance [in]
157     @param diag Index of diagonal that produced the starting distance [in]
158     @param seq2_index The offset into the second sequence after the traceback
159     operation has completed [out]
160     @return The diagonal resulting from the next traceback operation
161     being applied
162     */
163     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164     int diag, int *seq2_index) {
165     // choose the traceback operation that results in the
166     // largest seq2 offset at this point, then compute the
167     // new diagonal that is implied by the operation
168     if (last_seq2_off[d-1][diag-1] >
169     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170     *seq2_index = last_seq2_off[d-1][diag-1];
171     return diag - 1; // gap in seq2
172     }
173     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174     *seq2_index = last_seq2_off[d-1][diag];
175     return diag; // match
176     }
177     *seq2_index = last_seq2_off[d-1][diag+1];
178     return diag + 1; // gap in seq1
179     }
180    
181    
182     int GXGreedyExtend(const char* seq1, int len1,
183     const char* seq2, int len2,
184     bool reverse, int xdrop_threshold,
185     int match_cost, int mismatch_cost,
186     int& seq1_align_len, int& seq2_align_len,
187     SGreedyAlignMem& aux_data,
188     GXEditScript *edit_block) {
189     //GapPrelimEditBlock *edit_block,
190     //bool& fence_hit, SGreedySeed *seed) {
191     int seq1_index;
192     int seq2_index;
193     int index;
194     int d;
195     int k;
196     int diag_lower, diag_upper;
197     int max_dist;
198     int diag_origin;
199     int best_dist;
200     int best_diag;
201     int** last_seq2_off;
202     int* max_score;
203     int xdrop_offset;
204     int longest_match_run;
205     bool end1_reached, end2_reached;
206     GXMemPool* mem_pool;
207    
208     /* ordinary dynamic programming alignment, for each offset
209     in seq1, walks through offsets in seq2 until an X-dropoff
210     test fails, saving the best score encountered along
211     the way. Instead of score, this code tracks the 'distance'
212     (number of mismatches plus number of gaps) between seq1
213     and seq2. Instead of walking through sequence offsets, it
214     walks through diagonals that can achieve a given distance.
215    
216     Note that in what follows, the numbering of diagonals implies
217     a dot matrix where increasing seq1 offsets go to the right on
218     the x axis, and increasing seq2 offsets go up the y axis.
219     The gapped alignment thus proceeds up and to the right in
220     the graph, and diagonals are numbered increasing to the right */
221    
222     best_dist = 0;
223     best_diag = 0;
224    
225     /* set the number of distinct distances the algorithm will
226     examine in the search for an optimal alignment. The
227     heuristic worst-case running time of the algorithm is
228     O(max_dist**2 + (len1+len2)); for sequences which are
229     very similar, the average running time will be sig-
230     nificantly better than this */
231    
232     max_dist = GMIN(GREEDY_MAX_COST,
233     len2 / GREEDY_MAX_COST_FRACTION + 1);
234    
235     /* the main loop assumes that the index of all diagonals is
236     biased to lie in the middle of allocated bookkeeping
237     structures */
238    
239     diag_origin = max_dist + 2;
240    
241     // last_seq2_off[d][k] is the largest offset into seq2 that
242     // lies on diagonal k and has distance d
243    
244     last_seq2_off = aux_data.last_seq2_off;
245    
246     /* Instead of tracking the best alignment score and using
247     xdrop_theshold directly, track the best score for each
248     unique distance and use the best score for some previously
249     computed distance to implement the X-dropoff test.
250    
251     xdrop_offset gives the distance backwards in the score
252     array to look */
253    
254     xdrop_offset = (xdrop_threshold + match_cost / 2) /
255     (match_cost + mismatch_cost) + 1;
256    
257     // find the offset of the first mismatch between seq1 and seq2
258    
259     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260     // fence_hit, reverse, rem);
261    
262     // update the extents of the alignment, and bail out
263     // early if no further work is needed
264    
265     seq1_align_len = index;
266     seq2_align_len = index;
267     seq1_index = index;
268     /*
269     seed->start_q = 0;
270     seed->start_s = 0;
271     seed->match_length = index;
272     */
273     longest_match_run = index;
274    
275     if (index == len1 || index == len2) {
276     /* Return the number of differences, which is zero here */
277     if (edit_block != NULL)
278     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279     edit_block->opRep(index);
280     return 0;
281     }
282    
283     // set up the memory pool
284     mem_pool = aux_data.space;
285     if (edit_block == NULL) {
286     mem_pool = NULL;
287     }
288     else if (mem_pool == NULL) {
289     aux_data.space = mem_pool = new GXMemPool();
290     }
291     else {
292     mem_pool->refresh();
293     }
294    
295     /* set up the array of per-distance maximum scores. There
296     are max_diags + xdrop_offset distances to track, the first
297     xdrop_offset of which are 0 */
298    
299     max_score = aux_data.max_score + xdrop_offset;
300     for (index = 0; index < xdrop_offset; index++)
301     aux_data.max_score[index] = 0;
302    
303     // fill in the initial offsets of the distance matrix
304    
305     last_seq2_off[0][diag_origin] = seq1_index;
306     max_score[0] = seq1_index * match_cost;
307     diag_lower = diag_origin - 1;
308     diag_upper = diag_origin + 1;
309     end1_reached = end2_reached = false;
310    
311     // for each distance
312     for (d = 1; d <= max_dist; d++) {
313     int xdrop_score;
314     int curr_score;
315     int curr_extent = 0;
316     int curr_seq2_index = 0;
317     int curr_diag = 0;
318     int tmp_diag_lower = diag_lower;
319     int tmp_diag_upper = diag_upper;
320    
321     // Assign impossible seq2 offsets to any diagonals that
322     // are not in the range (diag_lower,diag_upper).
323     // These will serve as sentinel values for the inner loop
324     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328    
329     // compute the score for distance d corresponding to the X-dropoff criterion
330    
331     xdrop_score = max_score[d - xdrop_offset] +
332     (match_cost + mismatch_cost) * d - xdrop_threshold;
333     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334    
335     // for each diagonal of interest
336     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337     /* find the largest offset into seq2 that increases
338     the distance from d-1 to d (i.e. keeps the alignment
339     from getting worse for as long as possible), then
340     choose the offset into seq1 that will keep the
341     resulting diagonal fixed at k
342    
343     Note that this requires kInvalidOffset+1 to be smaller
344     than any valid offset into seq2, i.e. to be negative */
345    
346     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347     last_seq2_off[d - 1][k ]) + 1;
348     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349     seq1_index = seq2_index + k - diag_origin;
350    
351     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352    
353     // if no valid diagonal can reach distance d, or the
354     // X-dropoff test fails, narrow the range of diagonals
355     // to test and skip to the next diagonal
356     if (k == diag_lower)
357     diag_lower++;
358     else
359     last_seq2_off[d][k] = kInvalidOffset;
360     continue;
361     }
362     diag_upper = k;
363    
364     /* slide down diagonal k until a mismatch
365     occurs. As long as only matches are encountered,
366     the current distance d will not change */
367    
368     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369     seq1_index, seq2_index, reverse);
370     //fence_hit, reverse, rem);
371     if (index > longest_match_run) {
372     //seed->start_q = seq1_index;
373     //seed->start_s = seq2_index;
374     //seed->match_length = index;
375     longest_match_run = index;
376     }
377     seq1_index += index;
378     seq2_index += index;
379    
380     // set the new largest seq2 offset that achieves
381     // distance d on diagonal k
382    
383     last_seq2_off[d][k] = seq2_index;
384    
385     // since all values of k are constrained to have the
386     // same distance d, the value of k which maximizes the
387     // alignment score is the one that covers the most of seq1 and seq2
388     if (seq1_index + seq2_index > curr_extent) {
389     curr_extent = seq1_index + seq2_index;
390     curr_seq2_index = seq2_index;
391     curr_diag = k;
392     }
393    
394     /* clamp the bounds on diagonals to avoid walking off
395     either sequence. Because the bounds increase by at
396     most one for each distance, diag_lower and diag_upper
397     can each be of size at most max_diags+2 */
398    
399     if (seq2_index == len2) {
400     diag_lower = k + 1;
401     end2_reached = true;
402     }
403     if (seq1_index == len1) {
404     diag_upper = k - 1;
405     end1_reached = true;
406     }
407     } // end loop over diagonals
408    
409     // compute the maximum score possible for distance d
410     curr_score = curr_extent * (match_cost / 2) -
411     d * (match_cost + mismatch_cost);
412     // if this is the best score seen so far, update the
413     // statistics of the best alignment
414     if (curr_score > max_score[d - 1]) {
415     max_score[d] = curr_score;
416     best_dist = d;
417     best_diag = curr_diag;
418     seq2_align_len = curr_seq2_index;
419     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420     }
421     else {
422     max_score[d] = max_score[d - 1];
423     }
424    
425     // alignment has finished if the lower and upper bounds
426     // on diagonals to check have converged to each other
427    
428     if (diag_lower > diag_upper)
429     break;
430    
431     /* set up for the next distance to examine. Because the
432     bounds increase by at most one for each distance,
433     diag_lower and diag_upper can each be of size at
434     most max_diags+2 */
435    
436     if (!end2_reached)
437     diag_lower--;
438     if (!end1_reached)
439     diag_upper++;
440    
441     if (edit_block == NULL) {
442     // if no traceback is specified, the next row of
443     // last_seq2_off can reuse previously allocated memory
444     //TODO FIXME The following assumes two arrays of
445     // at least max_dist+4 int's have already been allocated
446     last_seq2_off[d + 1] = last_seq2_off[d - 1];
447     }
448     else {
449     // traceback requires all rows of last_seq2_off to be saved,
450     // so a new row must be allocated
451     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452     // move the origin for this row backwards
453     //TODO FIXME: dubious pointer arithmetic ?!
454     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455     }
456     } // end loop over distinct distances
457    
458    
459     if (edit_block == NULL)
460     return best_dist;
461    
462     //---- perform traceback
463     d = best_dist;
464     seq1_index = seq1_align_len;
465     seq2_index = seq2_align_len;
466     // for all positive distances
467    
468     //if (fence_hit && *fence_hit)
469     // goto done;
470     if (index==len1 || index==len2) d=0;
471     while (d > 0) {
472     int new_diag;
473     int new_seq2_index;
474    
475     /* retrieve the value of the diagonal after the next
476     traceback operation. best_diag starts off with the
477     value computed during the alignment process */
478    
479     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480     best_diag, &new_seq2_index);
481    
482     if (new_diag == best_diag) {
483     // same diagonal: issue a group of substitutions
484     if (seq2_index - new_seq2_index > 0) {
485     edit_block->opRep(seq2_index - new_seq2_index);
486     }
487     }
488     else if (new_diag < best_diag) {
489     // smaller diagonal: issue a group of substitutions
490     // and then a gap in seq2 */
491     if (seq2_index - new_seq2_index > 0) {
492     edit_block->opRep(seq2_index - new_seq2_index);
493     }
494     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495     edit_block->opIns(1);
496     }
497     else {
498     // larger diagonal: issue a group of substitutions
499     // and then a gap in seq1
500     if (seq2_index - new_seq2_index - 1 > 0) {
501     edit_block->opRep(seq2_index - new_seq2_index - 1);
502     }
503     edit_block->opDel(1);
504     }
505     d--;
506     best_diag = new_diag;
507     seq2_index = new_seq2_index;
508     }
509     //done:
510     // handle the final group of substitutions back to distance zero,
511     // i.e. back to offset zero of seq1 and seq2
512     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513     // last_seq2_off[0][diag_origin]);
514     edit_block->opRep(last_seq2_off[0][diag_origin]);
515     if (!reverse)
516     edit_block->reverse();
517     return best_dist;
518     }
519    
520     void printEditScript(GXEditScript* ed_script) {
521     uint i;
522     if (ed_script==NULL || ed_script->opnum == 0)
523     return;
524     for (i=0; i<ed_script->opnum; i++) {
525     int num=((ed_script->ops[i]) >> 2);
526     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527     if (op_type == 3)
528     GError("Error: printEditScript encountered op_type 3 ?!\n");
529     GMessage("%d%c ", num, xgapcodes[op_type]);
530     }
531     GMessage("\n");
532     }
533    
534     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535     bool editscript, int reward, int penalty, int xdrop) {
536     int q_max=strlen(q_seq); //query
537     int s_max=strlen(s_seq); //subj
538     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539     s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
540     }
541    
542     struct GXSeedTable {
543     int a_num, b_num;
544     int a_cap, b_cap;
545     char* xc;
546     GXSeedTable(int a=12, int b=255) {
547     a_cap=0;
548     b_cap=0;
549     a_num=0;
550     b_num=0;
551     xc=NULL;
552     init(a,b);
553     }
554     ~GXSeedTable() {
555     GFREE(xc);
556     }
557     void init(int a, int b) {
558     a_num=a;
559     b_num=b;
560     bool resize=false;
561     if (b_num>b_cap) { resize=true; b_cap=b_num;}
562     if (a_num>a_cap) { resize=true; a_cap=a_num;}
563     if (resize) {
564     GFREE(xc);
565     GCALLOC(xc, (a_num*b_num));
566     }
567     else {
568     //just clear up to a_max, b_max
569     memset((void*)xc, 0, (a_num*b_num));
570     }
571     }
572     char& x(int ax, int by) {
573     return xc[by*a_num+ax];
574     }
575    
576     };
577    
578    
579     GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
580     //overlap of right (3') end of seqb
581     //hash the first 12 bases of seqa:
582     int aimin=0;
583     int aimax=GMIN(9,(a_len-4));
584     int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
585     int bimax=b_len-4;
586     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
587     int b_maxlen=b_len; //number of cols in the diagonal matrix
588     GXSeedTable gx(a_maxlen, b_maxlen);
589     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
590     for (int ai=aimin;ai<=aimax;ai++) {
591     int32* av=(int32 *)(&seqa[ai]);
592     //for (int bi=b_len-4;bi>=bimin;bi--) {
593     for (int bi=bimin;bi<=bimax;bi++) {
594     if (gx.x(ai,bi))
595     continue; //already have a previous seed covering this region of this diagonal
596     int32* bv=(int32 *)(&seqb[bi]);
597     if (*av == *bv) {
598     //word match
599     //see if we can extend to the right
600     gx.x(ai+1,bi+1)=1;
601     gx.x(ai+2,bi+2)=1;
602     gx.x(ai+3,bi+3)=1;
603     int aix=ai+4;
604     int bix=bi+4;
605     int len=4;
606     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
607     {
608     gx.x(aix,bix)=1;
609     aix++;bix++;
610     len++;
611     }
612     GXSeed* newseed=new GXSeed(ai,bi,len);
613     seeds.Add(newseed);
614     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
615     }
616     }
617     }//for each 4-mer at the beginning of seqa
618     for (int i=0;i<diagstrips->Count();i++) {
619     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
620     }
621     diagstrips->setSorted(true); //sort by score
622     return diagstrips;
623     }
624    
625     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
626     //overlap of left (5') end of seqb
627     //hash the last 12 bases of seqa:
628     int aimin=GMAX(0,(a_len-16));
629     int aimax=a_len-4;
630     int bimin=0;
631     int bimax=GMIN((a_len-2), (b_len-4));
632     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
633     int b_maxlen=b_len; //number of cols in the diagonal matrix
634     //gx.init(a_maxlen, b_maxlen);
635     GXSeedTable gx(a_maxlen, b_maxlen);
636     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
637     for (int ai=aimin;ai<=aimax;ai++) {
638     int32* av=(int32 *)(&seqa[ai]);
639     for (int bi=bimin;bi<=bimax;bi++) {
640     if (gx.x(ai,bi))
641     continue; //already have a previous seed covering this region of this diagonal
642     int32* bv=(int32 *)(&seqb[bi]);
643     if (*av == *bv) {
644     //word match
645     //see if we can extend to the right
646     gx.x(ai+1,bi+1)=1;
647     gx.x(ai+2,bi+2)=1;
648     gx.x(ai+3,bi+3)=1;
649     int aix=ai+4;
650     int bix=bi+4;
651     int len=4;
652     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
653     {
654     gx.x(aix,bix)=1;
655     aix++;bix++;
656     len++;
657     }
658     GXSeed* newseed=new GXSeed(ai,bi,len);
659     seeds.Add(newseed);
660     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
661     }
662     }
663     }//for each 4-mer at the end of seqa
664     for (int i=0;i<diagstrips->Count();i++) {
665     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
666     }
667     diagstrips->setSorted(true); //sort by score
668     return diagstrips;
669     }
670    
671    
672    
673     int cmpSeedScore(const pointer p1, const pointer p2) {
674     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
675     GXSeed* s1=(GXSeed*)p1;
676     GXSeed* s2=(GXSeed*)p2;
677     if (s1->len==s2->len) {
678     return (s1->b_ofs-s2->b_ofs);
679     }
680     else return (s2->len-s1->len);
681     }
682    
683     int cmpSeedDiag(const pointer p1, const pointer p2) {
684     GXSeed* s1=(GXSeed*)p1;
685     GXSeed* s2=(GXSeed*)p2;
686     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
687     }
688    
689     GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
690     const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
691     bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
692     GXEditScript* ed_script_fwd = NULL;
693     GXEditScript* ed_script_rev = NULL;
694     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
695     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
696     q_alnstart--;
697     s_alnstart--;
698     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
699     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
700     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
701     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
702     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
703     const char* q=q_seq+q_alnstart;
704     int q_avail=q_max-q_alnstart;
705     const char* s=s_seq+s_alnstart;
706     int s_avail=s_max-s_alnstart;
707     if (penalty<0) penalty=-penalty;
708     int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
709     GXAlnInfo* alninfo=NULL;
710     bool freeAlnMem=(gxmem==NULL);
711     if (freeAlnMem) {
712     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
713     }
714     else gxmem->reset();
715     int retscore = 0;
716     int numdiffs = 0;
717     if (trimtype==galn_TrimLeft) {
718     if (editscript)
719     ed_script_rev=new GXEditScript();
720    
721     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
722     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
723     //check this extension here and bail out if it's not a good extension
724     if (s_alnstart+1-s_ext_l>3) {
725     delete ed_script_rev;
726     if (freeAlnMem) delete gxmem;
727     return NULL;
728     }
729    
730     if (editscript)
731     ed_script_fwd=new GXEditScript();
732     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
733     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
734     numdiffs=numdiffs_r+numdiffs_l;
735     //convert num diffs to actual score
736     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
737     if (editscript)
738     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
739     }
740     else {
741     if (editscript) {
742     ed_script_fwd=new GXEditScript();
743     }
744     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
745     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
746     //check extension here and bail out if not a good right extension
747     //assuming s_max is really at the right end of s_seq
748     if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
749     delete ed_script_fwd;
750     if (freeAlnMem) delete gxmem;
751     return NULL;
752     }
753     if (editscript)
754     ed_script_rev=new GXEditScript();
755     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
756     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
757     //convert num diffs to actual score
758     numdiffs=numdiffs_r+numdiffs_l;
759     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
760     if (editscript)
761     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
762     }
763    
764     if (retscore>=MIN_GREEDY_SCORE) {
765     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
766     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
767     alninfo->score=retscore;
768     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
769     #ifdef GDEBUG
770     if (ed_script_rev) {
771     GMessage("Final Edit script ::: ");
772     printEditScript(ed_script_rev);
773     }
774     #endif
775     alninfo->editscript=ed_script_rev;
776     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
777     }
778     else {
779     if (freeAlnMem) delete gxmem;
780     delete ed_script_rev;
781     delete alninfo;
782     alninfo=NULL;
783     }
784     delete ed_script_fwd;
785     return alninfo;
786     }
787 gpertea 96
788     GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
789     SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
790     bool editscript=false;
791     #ifdef GDEBUG
792     editscript=true;
793     GMessage("==========> matching Right (3') end ========\n");
794     #endif
795     GList<GXSeed> rseeds(true,true,false);
796     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
797     GList<GXAlnInfo> galns(true,true,false);
798     int max_top_bands=5;
799     int top_band_count=0;
800     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
801     for (int b=0;b<alnbands->Count();b++) {
802     if (alnbands->Get(b)->score<4) break;
803     top_band_count++;
804     GXBand& band=*(alnbands->Get(b));
805     band.seeds.setSorted(cmpSeedScore);
806     anchor_seeds.Add(band.seeds.First());
807     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
808     }
809     #ifdef GDEBUG
810     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
811     #endif
812    
813     for (int i=0;i<anchor_seeds.Count();i++) {
814     GXSeed& aseed=*anchor_seeds[i];
815     int a1=aseed.a_ofs+(aseed.len>>1)+1;
816     int a2=aseed.b_ofs+(aseed.len>>1)+1;
817     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
818     seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
819     match_reward, mismatch_penalty, Xdrop);
820     if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
821     galns.AddIfNew(alninfo, true);
822     else delete alninfo;
823     }
824    
825     #ifdef GDEBUG
826     for (int i=0;i<galns.Count();i++) {
827     GXAlnInfo* alninfo=galns[i];
828     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
829     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
830     if (alninfo->gapinfo!=NULL) {
831     GMessage("Alignment:\n");
832     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
833     }
834     }
835     #endif
836     //---- done
837     delete alnbands;
838     if (galns.Count()) {
839     GXAlnInfo* bestaln=galns.Shift();
840     return bestaln;
841     }
842     else return NULL;
843     }
844    
845     GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
846     SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
847     bool editscript=false;
848     #ifdef GDEBUG
849     editscript=true;
850     GMessage("==========> matching Left (5') end ========\n");
851     #endif
852     GList<GXSeed> rseeds(true,true,false);
853     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
854     GList<GXAlnInfo> galns(true,true,false);
855     int max_top_bands=5;
856     int top_band_count=0;
857     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
858     for (int b=0;b<alnbands->Count();b++) {
859     if (alnbands->Get(b)->score<4) break;
860     top_band_count++;
861     GXBand& band=*(alnbands->Get(b));
862     band.seeds.setSorted(cmpSeedScore);
863     anchor_seeds.Add(band.seeds.First());
864     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
865     }
866     #ifdef GDEBUG
867     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
868     #endif
869    
870     for (int i=0;i<anchor_seeds.Count();i++) {
871     GXSeed& aseed=*anchor_seeds[i];
872     int a1=aseed.a_ofs+(aseed.len>>1)+1;
873     int a2=aseed.b_ofs+(aseed.len>>1)+1;
874     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
875     seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
876     match_reward, mismatch_penalty, Xdrop-2);
877     if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
878     //TODO: at 5' end we should use a higher pid threshold
879     // we could also decrease the X-drop value!
880     galns.AddIfNew(alninfo, true);
881     else delete alninfo;
882     }
883    
884     #ifdef GDEBUG
885     for (int i=0;i<galns.Count();i++) {
886     GXAlnInfo* alninfo=galns[i];
887     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
888     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
889     if (alninfo->gapinfo!=NULL) {
890     GMessage("Alignment:\n");
891     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
892     }
893     }
894     #endif
895     //---- done
896     delete alnbands;
897     if (galns.Count()) {
898     GXAlnInfo* bestaln=galns.Shift();
899     return bestaln;
900     }
901     else return NULL;
902     }