ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 99
Committed: Thu Oct 6 20:49:17 2011 UTC (8 years, 2 months ago) by gpertea
File size: 35461 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56     }
57     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70     GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96     *a /= g;
97     *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104    
105     //signal that a diagonal is invalid
106     static const int kInvalidOffset = -2;
107    
108     int s_FindFirstMismatch(const char *seq1, int len1,
109     const char *seq2, int len2,
110     int seq1_index, int seq2_index,
111     //bool &fence_hit,
112     bool reverse) {
113     int start_index = seq1_index;
114     /* Sentry detection here should be relatively inexpensive: The
115     sentry value cannot appear in the query, so detection only
116     needs to be done at exit from the subject-query matching loop.
117     For uncompressed sequences, ambiguities in the query (i.e. seq1)
118     always count as mismatches */
119     if (reverse) {
120     while (seq1_index < len1 && seq2_index < len2 &&
121     //seq1[len1-1 - seq1_index] < 4 &&
122     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123     ++seq1_index;
124     ++seq2_index;
125     }
126     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127     //if len2-1-seq2_index<=0) {
128     // fence_hit = true;
129     // }
130     }
131     else { //forward lookup
132     while (seq1_index < len1 && seq2_index < len2 &&
133     //seq1[seq1_index] < 4 &&
134     seq1[seq1_index] == seq2[seq2_index]) {
135     ++seq1_index;
136     ++seq2_index;
137     }
138     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139     //if (seq2_index==len2) {
140     // fence_hit = true;
141     //}
142     }
143     return seq1_index - start_index;
144     }
145    
146    
147    
148     /** During the traceback for a non-affine greedy alignment,
149     compute the diagonal that will result from the next
150     traceback operation
151    
152     @param last_seq2_off Array of offsets into the second sequence;
153     last_seq2_off[d][k] gives the largest offset into
154     the second sequence that lies on diagonal k and
155     has distance d [in]
156     @param d Starting distance [in]
157     @param diag Index of diagonal that produced the starting distance [in]
158     @param seq2_index The offset into the second sequence after the traceback
159     operation has completed [out]
160     @return The diagonal resulting from the next traceback operation
161     being applied
162     */
163     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164     int diag, int *seq2_index) {
165     // choose the traceback operation that results in the
166     // largest seq2 offset at this point, then compute the
167     // new diagonal that is implied by the operation
168     if (last_seq2_off[d-1][diag-1] >
169     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170     *seq2_index = last_seq2_off[d-1][diag-1];
171     return diag - 1; // gap in seq2
172     }
173     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174     *seq2_index = last_seq2_off[d-1][diag];
175     return diag; // match
176     }
177     *seq2_index = last_seq2_off[d-1][diag+1];
178     return diag + 1; // gap in seq1
179     }
180    
181    
182     int GXGreedyExtend(const char* seq1, int len1,
183     const char* seq2, int len2,
184     bool reverse, int xdrop_threshold,
185     int match_cost, int mismatch_cost,
186     int& seq1_align_len, int& seq2_align_len,
187     SGreedyAlignMem& aux_data,
188     GXEditScript *edit_block) {
189     //GapPrelimEditBlock *edit_block,
190     //bool& fence_hit, SGreedySeed *seed) {
191     int seq1_index;
192     int seq2_index;
193     int index;
194     int d;
195     int k;
196     int diag_lower, diag_upper;
197     int max_dist;
198     int diag_origin;
199     int best_dist;
200     int best_diag;
201     int** last_seq2_off;
202     int* max_score;
203     int xdrop_offset;
204     int longest_match_run;
205     bool end1_reached, end2_reached;
206     GXMemPool* mem_pool;
207    
208     /* ordinary dynamic programming alignment, for each offset
209     in seq1, walks through offsets in seq2 until an X-dropoff
210     test fails, saving the best score encountered along
211     the way. Instead of score, this code tracks the 'distance'
212     (number of mismatches plus number of gaps) between seq1
213     and seq2. Instead of walking through sequence offsets, it
214     walks through diagonals that can achieve a given distance.
215    
216     Note that in what follows, the numbering of diagonals implies
217     a dot matrix where increasing seq1 offsets go to the right on
218     the x axis, and increasing seq2 offsets go up the y axis.
219     The gapped alignment thus proceeds up and to the right in
220     the graph, and diagonals are numbered increasing to the right */
221    
222     best_dist = 0;
223     best_diag = 0;
224    
225     /* set the number of distinct distances the algorithm will
226     examine in the search for an optimal alignment. The
227     heuristic worst-case running time of the algorithm is
228     O(max_dist**2 + (len1+len2)); for sequences which are
229     very similar, the average running time will be sig-
230     nificantly better than this */
231    
232     max_dist = GMIN(GREEDY_MAX_COST,
233     len2 / GREEDY_MAX_COST_FRACTION + 1);
234    
235     /* the main loop assumes that the index of all diagonals is
236     biased to lie in the middle of allocated bookkeeping
237     structures */
238    
239     diag_origin = max_dist + 2;
240    
241     // last_seq2_off[d][k] is the largest offset into seq2 that
242     // lies on diagonal k and has distance d
243    
244     last_seq2_off = aux_data.last_seq2_off;
245    
246     /* Instead of tracking the best alignment score and using
247     xdrop_theshold directly, track the best score for each
248     unique distance and use the best score for some previously
249     computed distance to implement the X-dropoff test.
250    
251     xdrop_offset gives the distance backwards in the score
252     array to look */
253    
254     xdrop_offset = (xdrop_threshold + match_cost / 2) /
255     (match_cost + mismatch_cost) + 1;
256    
257     // find the offset of the first mismatch between seq1 and seq2
258    
259     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260     // fence_hit, reverse, rem);
261    
262     // update the extents of the alignment, and bail out
263     // early if no further work is needed
264    
265     seq1_align_len = index;
266     seq2_align_len = index;
267     seq1_index = index;
268     /*
269     seed->start_q = 0;
270     seed->start_s = 0;
271     seed->match_length = index;
272     */
273     longest_match_run = index;
274    
275     if (index == len1 || index == len2) {
276     /* Return the number of differences, which is zero here */
277     if (edit_block != NULL)
278     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279     edit_block->opRep(index);
280     return 0;
281     }
282    
283     // set up the memory pool
284     mem_pool = aux_data.space;
285     if (edit_block == NULL) {
286     mem_pool = NULL;
287     }
288     else if (mem_pool == NULL) {
289     aux_data.space = mem_pool = new GXMemPool();
290     }
291     else {
292     mem_pool->refresh();
293     }
294    
295     /* set up the array of per-distance maximum scores. There
296     are max_diags + xdrop_offset distances to track, the first
297     xdrop_offset of which are 0 */
298    
299     max_score = aux_data.max_score + xdrop_offset;
300     for (index = 0; index < xdrop_offset; index++)
301     aux_data.max_score[index] = 0;
302    
303     // fill in the initial offsets of the distance matrix
304    
305     last_seq2_off[0][diag_origin] = seq1_index;
306     max_score[0] = seq1_index * match_cost;
307     diag_lower = diag_origin - 1;
308     diag_upper = diag_origin + 1;
309     end1_reached = end2_reached = false;
310    
311     // for each distance
312     for (d = 1; d <= max_dist; d++) {
313     int xdrop_score;
314     int curr_score;
315     int curr_extent = 0;
316     int curr_seq2_index = 0;
317     int curr_diag = 0;
318     int tmp_diag_lower = diag_lower;
319     int tmp_diag_upper = diag_upper;
320    
321     // Assign impossible seq2 offsets to any diagonals that
322     // are not in the range (diag_lower,diag_upper).
323     // These will serve as sentinel values for the inner loop
324     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328    
329     // compute the score for distance d corresponding to the X-dropoff criterion
330    
331     xdrop_score = max_score[d - xdrop_offset] +
332     (match_cost + mismatch_cost) * d - xdrop_threshold;
333     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334    
335     // for each diagonal of interest
336     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337     /* find the largest offset into seq2 that increases
338     the distance from d-1 to d (i.e. keeps the alignment
339     from getting worse for as long as possible), then
340     choose the offset into seq1 that will keep the
341     resulting diagonal fixed at k
342    
343     Note that this requires kInvalidOffset+1 to be smaller
344     than any valid offset into seq2, i.e. to be negative */
345    
346     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347     last_seq2_off[d - 1][k ]) + 1;
348     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349     seq1_index = seq2_index + k - diag_origin;
350    
351     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352    
353     // if no valid diagonal can reach distance d, or the
354     // X-dropoff test fails, narrow the range of diagonals
355     // to test and skip to the next diagonal
356     if (k == diag_lower)
357     diag_lower++;
358     else
359     last_seq2_off[d][k] = kInvalidOffset;
360     continue;
361     }
362     diag_upper = k;
363    
364     /* slide down diagonal k until a mismatch
365     occurs. As long as only matches are encountered,
366     the current distance d will not change */
367    
368     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369     seq1_index, seq2_index, reverse);
370     //fence_hit, reverse, rem);
371     if (index > longest_match_run) {
372     //seed->start_q = seq1_index;
373     //seed->start_s = seq2_index;
374     //seed->match_length = index;
375     longest_match_run = index;
376     }
377     seq1_index += index;
378     seq2_index += index;
379    
380     // set the new largest seq2 offset that achieves
381     // distance d on diagonal k
382    
383     last_seq2_off[d][k] = seq2_index;
384    
385     // since all values of k are constrained to have the
386     // same distance d, the value of k which maximizes the
387     // alignment score is the one that covers the most of seq1 and seq2
388     if (seq1_index + seq2_index > curr_extent) {
389     curr_extent = seq1_index + seq2_index;
390     curr_seq2_index = seq2_index;
391     curr_diag = k;
392     }
393    
394     /* clamp the bounds on diagonals to avoid walking off
395     either sequence. Because the bounds increase by at
396     most one for each distance, diag_lower and diag_upper
397     can each be of size at most max_diags+2 */
398    
399     if (seq2_index == len2) {
400     diag_lower = k + 1;
401     end2_reached = true;
402     }
403     if (seq1_index == len1) {
404     diag_upper = k - 1;
405     end1_reached = true;
406     }
407     } // end loop over diagonals
408    
409     // compute the maximum score possible for distance d
410     curr_score = curr_extent * (match_cost / 2) -
411     d * (match_cost + mismatch_cost);
412     // if this is the best score seen so far, update the
413     // statistics of the best alignment
414     if (curr_score > max_score[d - 1]) {
415     max_score[d] = curr_score;
416     best_dist = d;
417     best_diag = curr_diag;
418     seq2_align_len = curr_seq2_index;
419     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420     }
421     else {
422     max_score[d] = max_score[d - 1];
423     }
424    
425     // alignment has finished if the lower and upper bounds
426     // on diagonals to check have converged to each other
427    
428     if (diag_lower > diag_upper)
429     break;
430    
431     /* set up for the next distance to examine. Because the
432     bounds increase by at most one for each distance,
433     diag_lower and diag_upper can each be of size at
434     most max_diags+2 */
435    
436     if (!end2_reached)
437     diag_lower--;
438     if (!end1_reached)
439     diag_upper++;
440    
441     if (edit_block == NULL) {
442     // if no traceback is specified, the next row of
443     // last_seq2_off can reuse previously allocated memory
444     //TODO FIXME The following assumes two arrays of
445     // at least max_dist+4 int's have already been allocated
446     last_seq2_off[d + 1] = last_seq2_off[d - 1];
447     }
448     else {
449     // traceback requires all rows of last_seq2_off to be saved,
450     // so a new row must be allocated
451     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452     // move the origin for this row backwards
453     //TODO FIXME: dubious pointer arithmetic ?!
454     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455     }
456     } // end loop over distinct distances
457    
458    
459     if (edit_block == NULL)
460     return best_dist;
461    
462     //---- perform traceback
463     d = best_dist;
464     seq1_index = seq1_align_len;
465     seq2_index = seq2_align_len;
466     // for all positive distances
467    
468     //if (fence_hit && *fence_hit)
469     // goto done;
470     if (index==len1 || index==len2) d=0;
471     while (d > 0) {
472     int new_diag;
473     int new_seq2_index;
474    
475     /* retrieve the value of the diagonal after the next
476     traceback operation. best_diag starts off with the
477     value computed during the alignment process */
478    
479     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480     best_diag, &new_seq2_index);
481    
482     if (new_diag == best_diag) {
483     // same diagonal: issue a group of substitutions
484     if (seq2_index - new_seq2_index > 0) {
485     edit_block->opRep(seq2_index - new_seq2_index);
486     }
487     }
488     else if (new_diag < best_diag) {
489     // smaller diagonal: issue a group of substitutions
490     // and then a gap in seq2 */
491     if (seq2_index - new_seq2_index > 0) {
492     edit_block->opRep(seq2_index - new_seq2_index);
493     }
494     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495     edit_block->opIns(1);
496     }
497     else {
498     // larger diagonal: issue a group of substitutions
499     // and then a gap in seq1
500     if (seq2_index - new_seq2_index - 1 > 0) {
501     edit_block->opRep(seq2_index - new_seq2_index - 1);
502     }
503     edit_block->opDel(1);
504     }
505     d--;
506     best_diag = new_diag;
507     seq2_index = new_seq2_index;
508     }
509     //done:
510     // handle the final group of substitutions back to distance zero,
511     // i.e. back to offset zero of seq1 and seq2
512     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513     // last_seq2_off[0][diag_origin]);
514     edit_block->opRep(last_seq2_off[0][diag_origin]);
515     if (!reverse)
516     edit_block->reverse();
517     return best_dist;
518     }
519    
520     void printEditScript(GXEditScript* ed_script) {
521     uint i;
522     if (ed_script==NULL || ed_script->opnum == 0)
523     return;
524     for (i=0; i<ed_script->opnum; i++) {
525     int num=((ed_script->ops[i]) >> 2);
526     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527     if (op_type == 3)
528     GError("Error: printEditScript encountered op_type 3 ?!\n");
529     GMessage("%d%c ", num, xgapcodes[op_type]);
530     }
531     GMessage("\n");
532     }
533    
534     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535     bool editscript, int reward, int penalty, int xdrop) {
536     int q_max=strlen(q_seq); //query
537     int s_max=strlen(s_seq); //subj
538     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539     s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop);
540     }
541    
542     struct GXSeedTable {
543     int a_num, b_num;
544     int a_cap, b_cap;
545     char* xc;
546     GXSeedTable(int a=12, int b=255) {
547     a_cap=0;
548     b_cap=0;
549     a_num=0;
550     b_num=0;
551     xc=NULL;
552     init(a,b);
553     }
554     ~GXSeedTable() {
555     GFREE(xc);
556     }
557     void init(int a, int b) {
558     a_num=a;
559     b_num=b;
560     bool resize=false;
561     if (b_num>b_cap) { resize=true; b_cap=b_num;}
562     if (a_num>a_cap) { resize=true; a_cap=a_num;}
563     if (resize) {
564     GFREE(xc);
565     GCALLOC(xc, (a_num*b_num));
566     }
567     else {
568     //just clear up to a_max, b_max
569     memset((void*)xc, 0, (a_num*b_num));
570     }
571     }
572     char& x(int ax, int by) {
573     return xc[by*a_num+ax];
574     }
575    
576     };
577    
578 gpertea 99 const int a_m_score=2; //match score
579     const int a_mis_score=-3; //mismatch
580     const int a_dropoff_score=7;
581 gpertea 93
582 gpertea 99 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
583     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
584     //check minimum score and
585     //for 3' adapter trimming:
586     // require that the right end of the alignment for either the adaptor OR the read must be
587     // < 3 distance from its right end
588     // for 5' adapter trimming:
589     // require that the left end of the alignment for either the adaptor OR the read must
590     // be at coordinate < 3 from start
591    
592     bool extendUngapped(const char* a, int alen, int ai,
593     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
594     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
595     //if (debug) {
596     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
597     // }
598     int a_l=ai; //alignment coordinates on a
599     int a_r=ai+mlen-1;
600     int b_l=bi; //alignment coordinates on b
601     int b_r=bi+mlen-1;
602     int ai_maxscore=ai;
603     int bi_maxscore=bi;
604     int score=mlen*a_m_score;
605     int maxscore=score;
606     int mism5score=a_mis_score;
607     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
608     //try to extend to the left first, if possible
609     while (ai>0 && bi>0) {
610     ai--;
611     bi--;
612     score+= (a[ai]==b[bi])? a_m_score : mism5score;
613     if (score>maxscore) {
614     ai_maxscore=ai;
615     bi_maxscore=bi;
616     maxscore=score;
617     }
618     else if (maxscore-score>a_dropoff_score) break;
619     }
620     a_l=ai_maxscore;
621     b_l=bi_maxscore;
622     //now extend to the right
623     ai_maxscore=a_r;
624     bi_maxscore=b_r;
625     ai=a_r;
626     bi=b_r;
627     score=maxscore;
628     //sometimes there are extra As at the end of the read, ignore those
629     if (a[alen-2]=='A' && a[alen-1]=='A') {
630     alen-=2;
631     while (a[alen-1]=='A' && alen>ai) alen--;
632     }
633     while (ai<alen-1 && bi<blen-1) {
634     ai++;
635     bi++;
636     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
637     if (a[ai]==b[bi]) { //match
638     score+=a_m_score;
639     if (ai>=alen-2) {
640     score+=a_m_score-(alen-ai-1);
641     }
642     }
643     else { //mismatch
644     score+=a_mis_score;
645     }
646     if (score>maxscore) {
647     ai_maxscore=ai;
648     bi_maxscore=bi;
649     maxscore=score;
650     }
651     else if (maxscore-score>a_dropoff_score) break;
652     }
653     a_r=ai_maxscore;
654     b_r=bi_maxscore;
655     int a_ovh3=alen-a_r-1;
656     int b_ovh3=blen-b_r-1;
657     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
658     int mmovh5=(a_l<b_l)? a_l : b_l;
659     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
660     if (a_l<a_ovh3) {
661     //adapter closer to the left end (typical for 5' adapter)
662     l5=a_r+1;
663     l3=alen-1;
664     }
665     else {
666     //adapter matching at the right end (typical for 3' adapter)
667     l5=0;
668     l3=a_l-1;
669     }
670     return true;
671     }
672     //do not trim:
673     l5=0;
674     l3=alen-1;
675     return false;
676     }
677    
678    
679     bool extendUngapped_R(const char* seqa, int a_len, int &aix,
680     const char* seqb, int b_len, int &bix, int &len) {
681     //must start with a match
682     if (seqa[aix]!=seqb[bix]) GError("Error at extendUngapped_R: no initial match!\n");
683    
684     }
685    
686 gpertea 93 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
687     //overlap of right (3') end of seqb
688     //hash the first 12 bases of seqa:
689     int aimin=0;
690     int aimax=GMIN(9,(a_len-4));
691     int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
692     int bimax=b_len-4;
693     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
694     int b_maxlen=b_len; //number of cols in the diagonal matrix
695     GXSeedTable gx(a_maxlen, b_maxlen);
696     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
697     for (int ai=aimin;ai<=aimax;ai++) {
698     int32* av=(int32 *)(&seqa[ai]);
699     //for (int bi=b_len-4;bi>=bimin;bi--) {
700     for (int bi=bimin;bi<=bimax;bi++) {
701     if (gx.x(ai,bi))
702     continue; //already have a previous seed covering this region of this diagonal
703     int32* bv=(int32 *)(&seqb[bi]);
704     if (*av == *bv) {
705     //word match
706     //see if we can extend to the right
707     gx.x(ai+1,bi+1)=1;
708     gx.x(ai+2,bi+2)=1;
709     gx.x(ai+3,bi+3)=1;
710     int aix=ai+4;
711     int bix=bi+4;
712     int len=4;
713     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
714     {
715     gx.x(aix,bix)=1;
716     aix++;bix++;
717     len++;
718     }
719 gpertea 99 if (len>8) {
720     //heuristics: very likely the best we can get
721     int qlen=len;
722     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
723     aix++;
724     bix++;
725     qlen++;
726     }
727     if (qlen>10) {
728     diagstrips->qmatch=new GXSeed(ai,bi,qlen);
729     return diagstrips;
730     }
731     }
732 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
733     seeds.Add(newseed);
734     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
735     }
736     }
737     }//for each 4-mer at the beginning of seqa
738     for (int i=0;i<diagstrips->Count();i++) {
739     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
740     }
741     diagstrips->setSorted(true); //sort by score
742     return diagstrips;
743     }
744    
745     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
746     //overlap of left (5') end of seqb
747     //hash the last 12 bases of seqa:
748     int aimin=GMAX(0,(a_len-16));
749     int aimax=a_len-4;
750     int bimin=0;
751     int bimax=GMIN((a_len-2), (b_len-4));
752     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
753     int b_maxlen=b_len; //number of cols in the diagonal matrix
754     //gx.init(a_maxlen, b_maxlen);
755     GXSeedTable gx(a_maxlen, b_maxlen);
756     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
757     for (int ai=aimin;ai<=aimax;ai++) {
758     int32* av=(int32 *)(&seqa[ai]);
759     for (int bi=bimin;bi<=bimax;bi++) {
760     if (gx.x(ai,bi))
761     continue; //already have a previous seed covering this region of this diagonal
762     int32* bv=(int32 *)(&seqb[bi]);
763     if (*av == *bv) {
764     //word match
765     //see if we can extend to the right
766     gx.x(ai+1,bi+1)=1;
767     gx.x(ai+2,bi+2)=1;
768     gx.x(ai+3,bi+3)=1;
769     int aix=ai+4;
770     int bix=bi+4;
771     int len=4;
772     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
773     {
774     gx.x(aix,bix)=1;
775     aix++;bix++;
776     len++;
777     }
778     GXSeed* newseed=new GXSeed(ai,bi,len);
779     seeds.Add(newseed);
780     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
781     }
782     }
783     }//for each 4-mer at the end of seqa
784     for (int i=0;i<diagstrips->Count();i++) {
785     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
786     }
787     diagstrips->setSorted(true); //sort by score
788     return diagstrips;
789     }
790    
791    
792    
793     int cmpSeedScore(const pointer p1, const pointer p2) {
794     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
795     GXSeed* s1=(GXSeed*)p1;
796     GXSeed* s2=(GXSeed*)p2;
797     if (s1->len==s2->len) {
798     return (s1->b_ofs-s2->b_ofs);
799     }
800     else return (s2->len-s1->len);
801     }
802    
803     int cmpSeedDiag(const pointer p1, const pointer p2) {
804     GXSeed* s1=(GXSeed*)p1;
805     GXSeed* s2=(GXSeed*)p2;
806     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
807     }
808    
809     GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
810     const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype,
811     bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) {
812     GXEditScript* ed_script_fwd = NULL;
813     GXEditScript* ed_script_rev = NULL;
814     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
815     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
816     q_alnstart--;
817     s_alnstart--;
818     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
822     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
823     const char* q=q_seq+q_alnstart;
824     int q_avail=q_max-q_alnstart;
825     const char* s=s_seq+s_alnstart;
826     int s_avail=s_max-s_alnstart;
827     if (penalty<0) penalty=-penalty;
828     int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported
829     GXAlnInfo* alninfo=NULL;
830     bool freeAlnMem=(gxmem==NULL);
831     if (freeAlnMem) {
832     gxmem=new SGreedyAlignMem(reward, penalty, xdrop);
833     }
834     else gxmem->reset();
835     int retscore = 0;
836     int numdiffs = 0;
837     if (trimtype==galn_TrimLeft) {
838     if (editscript)
839     ed_script_rev=new GXEditScript();
840    
841     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
842     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
843     //check this extension here and bail out if it's not a good extension
844     if (s_alnstart+1-s_ext_l>3) {
845     delete ed_script_rev;
846     if (freeAlnMem) delete gxmem;
847     return NULL;
848     }
849    
850     if (editscript)
851     ed_script_fwd=new GXEditScript();
852     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
853     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
854     numdiffs=numdiffs_r+numdiffs_l;
855     //convert num diffs to actual score
856     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
857     if (editscript)
858     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
859     }
860     else {
861     if (editscript) {
862     ed_script_fwd=new GXEditScript();
863     }
864     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
865     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
866     //check extension here and bail out if not a good right extension
867     //assuming s_max is really at the right end of s_seq
868     if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) {
869     delete ed_script_fwd;
870     if (freeAlnMem) delete gxmem;
871     return NULL;
872     }
873     if (editscript)
874     ed_script_rev=new GXEditScript();
875     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
876     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
877     //convert num diffs to actual score
878     numdiffs=numdiffs_r+numdiffs_l;
879     retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty);
880     if (editscript)
881     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882     }
883    
884     if (retscore>=MIN_GREEDY_SCORE) {
885     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
886     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
887     alninfo->score=retscore;
888     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
889     #ifdef GDEBUG
890     if (ed_script_rev) {
891     GMessage("Final Edit script ::: ");
892     printEditScript(ed_script_rev);
893     }
894     #endif
895     alninfo->editscript=ed_script_rev;
896     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
897     }
898     else {
899     if (freeAlnMem) delete gxmem;
900     delete ed_script_rev;
901     delete alninfo;
902     alninfo=NULL;
903     }
904     delete ed_script_fwd;
905     return alninfo;
906     }
907 gpertea 96
908     GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
909     SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
910     bool editscript=false;
911     #ifdef GDEBUG
912     editscript=true;
913     GMessage("==========> matching Right (3') end ========\n");
914     #endif
915     GList<GXSeed> rseeds(true,true,false);
916     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
917 gpertea 99 //did we find a shortcut?
918     if (alnbands->qmatch) {
919    
920     }
921 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
922     int max_top_bands=5;
923     int top_band_count=0;
924     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
925     for (int b=0;b<alnbands->Count();b++) {
926     if (alnbands->Get(b)->score<4) break;
927     top_band_count++;
928     GXBand& band=*(alnbands->Get(b));
929     band.seeds.setSorted(cmpSeedScore);
930     anchor_seeds.Add(band.seeds.First());
931     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
932     }
933     #ifdef GDEBUG
934     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
935     #endif
936    
937     for (int i=0;i<anchor_seeds.Count();i++) {
938     GXSeed& aseed=*anchor_seeds[i];
939     int a1=aseed.a_ofs+(aseed.len>>1)+1;
940     int a2=aseed.b_ofs+(aseed.len>>1)+1;
941     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
942     seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
943     match_reward, mismatch_penalty, Xdrop);
944     if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
945     galns.AddIfNew(alninfo, true);
946     else delete alninfo;
947     }
948    
949     #ifdef GDEBUG
950     for (int i=0;i<galns.Count();i++) {
951     GXAlnInfo* alninfo=galns[i];
952     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
953     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
954     if (alninfo->gapinfo!=NULL) {
955     GMessage("Alignment:\n");
956     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
957     }
958     }
959     #endif
960     //---- done
961     delete alnbands;
962     if (galns.Count()) {
963     GXAlnInfo* bestaln=galns.Shift();
964     return bestaln;
965     }
966     else return NULL;
967     }
968    
969     GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
970     SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) {
971     bool editscript=false;
972     #ifdef GDEBUG
973     editscript=true;
974     GMessage("==========> matching Left (5') end ========\n");
975     #endif
976     GList<GXSeed> rseeds(true,true,false);
977     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
978     GList<GXAlnInfo> galns(true,true,false);
979     int max_top_bands=5;
980     int top_band_count=0;
981     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
982     for (int b=0;b<alnbands->Count();b++) {
983     if (alnbands->Get(b)->score<4) break;
984     top_band_count++;
985     GXBand& band=*(alnbands->Get(b));
986     band.seeds.setSorted(cmpSeedScore);
987     anchor_seeds.Add(band.seeds.First());
988     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
989     }
990     #ifdef GDEBUG
991     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
992     #endif
993    
994     for (int i=0;i<anchor_seeds.Count();i++) {
995     GXSeed& aseed=*anchor_seeds[i];
996     int a1=aseed.a_ofs+(aseed.len>>1)+1;
997     int a2=aseed.b_ofs+(aseed.len>>1)+1;
998     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
999     seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
1000     match_reward, mismatch_penalty, Xdrop-2);
1001     if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
1002     //TODO: at 5' end we should use a higher pid threshold
1003     // we could also decrease the X-drop value!
1004     galns.AddIfNew(alninfo, true);
1005     else delete alninfo;
1006     }
1007    
1008     #ifdef GDEBUG
1009     for (int i=0;i<galns.Count();i++) {
1010     GXAlnInfo* alninfo=galns[i];
1011     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1012     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1013     if (alninfo->gapinfo!=NULL) {
1014     GMessage("Alignment:\n");
1015     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1016     }
1017     }
1018     #endif
1019     //---- done
1020     delete alnbands;
1021     if (galns.Count()) {
1022     GXAlnInfo* bestaln=galns.Shift();
1023     return bestaln;
1024     }
1025     else return NULL;
1026     }