ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 104
Committed: Mon Oct 10 19:34:20 2011 UTC (7 years, 9 months ago) by gpertea
File size: 39728 byte(s)
Log Message:
fixed GVec, GArray to use new[] instead of malloc

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56     }
57     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70     GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96     *a /= g;
97     *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104    
105     //signal that a diagonal is invalid
106     static const int kInvalidOffset = -2;
107    
108     int s_FindFirstMismatch(const char *seq1, int len1,
109     const char *seq2, int len2,
110     int seq1_index, int seq2_index,
111     //bool &fence_hit,
112     bool reverse) {
113     int start_index = seq1_index;
114     /* Sentry detection here should be relatively inexpensive: The
115     sentry value cannot appear in the query, so detection only
116     needs to be done at exit from the subject-query matching loop.
117     For uncompressed sequences, ambiguities in the query (i.e. seq1)
118     always count as mismatches */
119     if (reverse) {
120     while (seq1_index < len1 && seq2_index < len2 &&
121     //seq1[len1-1 - seq1_index] < 4 &&
122     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123     ++seq1_index;
124     ++seq2_index;
125     }
126     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127     //if len2-1-seq2_index<=0) {
128     // fence_hit = true;
129     // }
130     }
131     else { //forward lookup
132     while (seq1_index < len1 && seq2_index < len2 &&
133     //seq1[seq1_index] < 4 &&
134     seq1[seq1_index] == seq2[seq2_index]) {
135     ++seq1_index;
136     ++seq2_index;
137     }
138     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139     //if (seq2_index==len2) {
140     // fence_hit = true;
141     //}
142     }
143     return seq1_index - start_index;
144     }
145    
146    
147    
148     /** During the traceback for a non-affine greedy alignment,
149     compute the diagonal that will result from the next
150     traceback operation
151    
152     @param last_seq2_off Array of offsets into the second sequence;
153     last_seq2_off[d][k] gives the largest offset into
154     the second sequence that lies on diagonal k and
155     has distance d [in]
156     @param d Starting distance [in]
157     @param diag Index of diagonal that produced the starting distance [in]
158     @param seq2_index The offset into the second sequence after the traceback
159     operation has completed [out]
160     @return The diagonal resulting from the next traceback operation
161     being applied
162     */
163     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164     int diag, int *seq2_index) {
165     // choose the traceback operation that results in the
166     // largest seq2 offset at this point, then compute the
167     // new diagonal that is implied by the operation
168     if (last_seq2_off[d-1][diag-1] >
169     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170     *seq2_index = last_seq2_off[d-1][diag-1];
171     return diag - 1; // gap in seq2
172     }
173     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174     *seq2_index = last_seq2_off[d-1][diag];
175     return diag; // match
176     }
177     *seq2_index = last_seq2_off[d-1][diag+1];
178     return diag + 1; // gap in seq1
179     }
180    
181    
182     int GXGreedyExtend(const char* seq1, int len1,
183     const char* seq2, int len2,
184     bool reverse, int xdrop_threshold,
185     int match_cost, int mismatch_cost,
186     int& seq1_align_len, int& seq2_align_len,
187 gpertea 101 CGreedyAlignData& aux_data,
188 gpertea 93 GXEditScript *edit_block) {
189     //GapPrelimEditBlock *edit_block,
190     //bool& fence_hit, SGreedySeed *seed) {
191     int seq1_index;
192     int seq2_index;
193     int index;
194     int d;
195     int k;
196     int diag_lower, diag_upper;
197     int max_dist;
198     int diag_origin;
199     int best_dist;
200     int best_diag;
201     int** last_seq2_off;
202     int* max_score;
203     int xdrop_offset;
204     int longest_match_run;
205     bool end1_reached, end2_reached;
206     GXMemPool* mem_pool;
207    
208     /* ordinary dynamic programming alignment, for each offset
209     in seq1, walks through offsets in seq2 until an X-dropoff
210     test fails, saving the best score encountered along
211     the way. Instead of score, this code tracks the 'distance'
212     (number of mismatches plus number of gaps) between seq1
213     and seq2. Instead of walking through sequence offsets, it
214     walks through diagonals that can achieve a given distance.
215    
216     Note that in what follows, the numbering of diagonals implies
217     a dot matrix where increasing seq1 offsets go to the right on
218     the x axis, and increasing seq2 offsets go up the y axis.
219     The gapped alignment thus proceeds up and to the right in
220     the graph, and diagonals are numbered increasing to the right */
221    
222     best_dist = 0;
223     best_diag = 0;
224    
225     /* set the number of distinct distances the algorithm will
226     examine in the search for an optimal alignment. The
227     heuristic worst-case running time of the algorithm is
228     O(max_dist**2 + (len1+len2)); for sequences which are
229     very similar, the average running time will be sig-
230     nificantly better than this */
231    
232     max_dist = GMIN(GREEDY_MAX_COST,
233     len2 / GREEDY_MAX_COST_FRACTION + 1);
234    
235     /* the main loop assumes that the index of all diagonals is
236     biased to lie in the middle of allocated bookkeeping
237     structures */
238    
239     diag_origin = max_dist + 2;
240    
241     // last_seq2_off[d][k] is the largest offset into seq2 that
242     // lies on diagonal k and has distance d
243    
244     last_seq2_off = aux_data.last_seq2_off;
245    
246     /* Instead of tracking the best alignment score and using
247     xdrop_theshold directly, track the best score for each
248     unique distance and use the best score for some previously
249     computed distance to implement the X-dropoff test.
250    
251     xdrop_offset gives the distance backwards in the score
252     array to look */
253    
254     xdrop_offset = (xdrop_threshold + match_cost / 2) /
255     (match_cost + mismatch_cost) + 1;
256    
257     // find the offset of the first mismatch between seq1 and seq2
258    
259     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260     // fence_hit, reverse, rem);
261    
262     // update the extents of the alignment, and bail out
263     // early if no further work is needed
264    
265     seq1_align_len = index;
266     seq2_align_len = index;
267     seq1_index = index;
268     /*
269     seed->start_q = 0;
270     seed->start_s = 0;
271     seed->match_length = index;
272     */
273     longest_match_run = index;
274    
275     if (index == len1 || index == len2) {
276     /* Return the number of differences, which is zero here */
277     if (edit_block != NULL)
278     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279     edit_block->opRep(index);
280     return 0;
281     }
282    
283     // set up the memory pool
284     mem_pool = aux_data.space;
285     if (edit_block == NULL) {
286     mem_pool = NULL;
287     }
288     else if (mem_pool == NULL) {
289     aux_data.space = mem_pool = new GXMemPool();
290     }
291     else {
292     mem_pool->refresh();
293     }
294    
295     /* set up the array of per-distance maximum scores. There
296     are max_diags + xdrop_offset distances to track, the first
297     xdrop_offset of which are 0 */
298    
299     max_score = aux_data.max_score + xdrop_offset;
300     for (index = 0; index < xdrop_offset; index++)
301     aux_data.max_score[index] = 0;
302    
303     // fill in the initial offsets of the distance matrix
304    
305     last_seq2_off[0][diag_origin] = seq1_index;
306     max_score[0] = seq1_index * match_cost;
307     diag_lower = diag_origin - 1;
308     diag_upper = diag_origin + 1;
309     end1_reached = end2_reached = false;
310    
311     // for each distance
312     for (d = 1; d <= max_dist; d++) {
313     int xdrop_score;
314     int curr_score;
315     int curr_extent = 0;
316     int curr_seq2_index = 0;
317     int curr_diag = 0;
318     int tmp_diag_lower = diag_lower;
319     int tmp_diag_upper = diag_upper;
320    
321     // Assign impossible seq2 offsets to any diagonals that
322     // are not in the range (diag_lower,diag_upper).
323     // These will serve as sentinel values for the inner loop
324     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328    
329     // compute the score for distance d corresponding to the X-dropoff criterion
330    
331     xdrop_score = max_score[d - xdrop_offset] +
332     (match_cost + mismatch_cost) * d - xdrop_threshold;
333     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334    
335     // for each diagonal of interest
336     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337     /* find the largest offset into seq2 that increases
338     the distance from d-1 to d (i.e. keeps the alignment
339     from getting worse for as long as possible), then
340     choose the offset into seq1 that will keep the
341     resulting diagonal fixed at k
342    
343     Note that this requires kInvalidOffset+1 to be smaller
344     than any valid offset into seq2, i.e. to be negative */
345    
346     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347     last_seq2_off[d - 1][k ]) + 1;
348     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349     seq1_index = seq2_index + k - diag_origin;
350    
351     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352    
353     // if no valid diagonal can reach distance d, or the
354     // X-dropoff test fails, narrow the range of diagonals
355     // to test and skip to the next diagonal
356     if (k == diag_lower)
357     diag_lower++;
358     else
359     last_seq2_off[d][k] = kInvalidOffset;
360     continue;
361     }
362     diag_upper = k;
363    
364     /* slide down diagonal k until a mismatch
365     occurs. As long as only matches are encountered,
366     the current distance d will not change */
367    
368     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369     seq1_index, seq2_index, reverse);
370     //fence_hit, reverse, rem);
371     if (index > longest_match_run) {
372     //seed->start_q = seq1_index;
373     //seed->start_s = seq2_index;
374     //seed->match_length = index;
375     longest_match_run = index;
376     }
377     seq1_index += index;
378     seq2_index += index;
379    
380     // set the new largest seq2 offset that achieves
381     // distance d on diagonal k
382    
383     last_seq2_off[d][k] = seq2_index;
384    
385     // since all values of k are constrained to have the
386     // same distance d, the value of k which maximizes the
387     // alignment score is the one that covers the most of seq1 and seq2
388     if (seq1_index + seq2_index > curr_extent) {
389     curr_extent = seq1_index + seq2_index;
390     curr_seq2_index = seq2_index;
391     curr_diag = k;
392     }
393    
394     /* clamp the bounds on diagonals to avoid walking off
395     either sequence. Because the bounds increase by at
396     most one for each distance, diag_lower and diag_upper
397     can each be of size at most max_diags+2 */
398    
399     if (seq2_index == len2) {
400     diag_lower = k + 1;
401     end2_reached = true;
402     }
403     if (seq1_index == len1) {
404     diag_upper = k - 1;
405     end1_reached = true;
406     }
407     } // end loop over diagonals
408    
409     // compute the maximum score possible for distance d
410     curr_score = curr_extent * (match_cost / 2) -
411     d * (match_cost + mismatch_cost);
412     // if this is the best score seen so far, update the
413     // statistics of the best alignment
414     if (curr_score > max_score[d - 1]) {
415     max_score[d] = curr_score;
416     best_dist = d;
417     best_diag = curr_diag;
418     seq2_align_len = curr_seq2_index;
419     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420     }
421     else {
422     max_score[d] = max_score[d - 1];
423     }
424    
425     // alignment has finished if the lower and upper bounds
426     // on diagonals to check have converged to each other
427    
428     if (diag_lower > diag_upper)
429     break;
430    
431     /* set up for the next distance to examine. Because the
432     bounds increase by at most one for each distance,
433     diag_lower and diag_upper can each be of size at
434     most max_diags+2 */
435    
436     if (!end2_reached)
437     diag_lower--;
438     if (!end1_reached)
439     diag_upper++;
440    
441     if (edit_block == NULL) {
442     // if no traceback is specified, the next row of
443     // last_seq2_off can reuse previously allocated memory
444     //TODO FIXME The following assumes two arrays of
445     // at least max_dist+4 int's have already been allocated
446     last_seq2_off[d + 1] = last_seq2_off[d - 1];
447     }
448     else {
449     // traceback requires all rows of last_seq2_off to be saved,
450     // so a new row must be allocated
451     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452     // move the origin for this row backwards
453     //TODO FIXME: dubious pointer arithmetic ?!
454     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455     }
456     } // end loop over distinct distances
457    
458    
459     if (edit_block == NULL)
460     return best_dist;
461    
462     //---- perform traceback
463     d = best_dist;
464     seq1_index = seq1_align_len;
465     seq2_index = seq2_align_len;
466     // for all positive distances
467    
468     //if (fence_hit && *fence_hit)
469     // goto done;
470     if (index==len1 || index==len2) d=0;
471     while (d > 0) {
472     int new_diag;
473     int new_seq2_index;
474    
475     /* retrieve the value of the diagonal after the next
476     traceback operation. best_diag starts off with the
477     value computed during the alignment process */
478    
479     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480     best_diag, &new_seq2_index);
481    
482     if (new_diag == best_diag) {
483     // same diagonal: issue a group of substitutions
484     if (seq2_index - new_seq2_index > 0) {
485     edit_block->opRep(seq2_index - new_seq2_index);
486     }
487     }
488     else if (new_diag < best_diag) {
489     // smaller diagonal: issue a group of substitutions
490     // and then a gap in seq2 */
491     if (seq2_index - new_seq2_index > 0) {
492     edit_block->opRep(seq2_index - new_seq2_index);
493     }
494     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495     edit_block->opIns(1);
496     }
497     else {
498     // larger diagonal: issue a group of substitutions
499     // and then a gap in seq1
500     if (seq2_index - new_seq2_index - 1 > 0) {
501     edit_block->opRep(seq2_index - new_seq2_index - 1);
502     }
503     edit_block->opDel(1);
504     }
505     d--;
506     best_diag = new_diag;
507     seq2_index = new_seq2_index;
508     }
509     //done:
510     // handle the final group of substitutions back to distance zero,
511     // i.e. back to offset zero of seq1 and seq2
512     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513     // last_seq2_off[0][diag_origin]);
514     edit_block->opRep(last_seq2_off[0][diag_origin]);
515     if (!reverse)
516     edit_block->reverse();
517     return best_dist;
518     }
519    
520     void printEditScript(GXEditScript* ed_script) {
521     uint i;
522     if (ed_script==NULL || ed_script->opnum == 0)
523     return;
524     for (i=0; i<ed_script->opnum; i++) {
525     int num=((ed_script->ops[i]) >> 2);
526     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527     if (op_type == 3)
528     GError("Error: printEditScript encountered op_type 3 ?!\n");
529     GMessage("%d%c ", num, xgapcodes[op_type]);
530     }
531     GMessage("\n");
532     }
533    
534     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535     bool editscript, int reward, int penalty, int xdrop) {
536     int q_max=strlen(q_seq); //query
537     int s_max=strlen(s_seq); //subj
538     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 gpertea 101 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540 gpertea 93 }
541    
542     struct GXSeedTable {
543     int a_num, b_num;
544     int a_cap, b_cap;
545     char* xc;
546     GXSeedTable(int a=12, int b=255) {
547     a_cap=0;
548     b_cap=0;
549     a_num=0;
550     b_num=0;
551     xc=NULL;
552     init(a,b);
553     }
554     ~GXSeedTable() {
555     GFREE(xc);
556     }
557     void init(int a, int b) {
558     a_num=a;
559     b_num=b;
560     bool resize=false;
561     if (b_num>b_cap) { resize=true; b_cap=b_num;}
562     if (a_num>a_cap) { resize=true; a_cap=a_num;}
563     if (resize) {
564     GFREE(xc);
565     GCALLOC(xc, (a_num*b_num));
566     }
567     else {
568     //just clear up to a_max, b_max
569     memset((void*)xc, 0, (a_num*b_num));
570     }
571     }
572     char& x(int ax, int by) {
573     return xc[by*a_num+ax];
574     }
575    
576     };
577    
578 gpertea 99 const int a_m_score=2; //match score
579     const int a_mis_score=-3; //mismatch
580     const int a_dropoff_score=7;
581 gpertea 101 const int a_min_score=12; //at least 6 bases full match
582 gpertea 93
583 gpertea 99 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585     //check minimum score and
586     //for 3' adapter trimming:
587     // require that the right end of the alignment for either the adaptor OR the read must be
588     // < 3 distance from its right end
589     // for 5' adapter trimming:
590     // require that the left end of the alignment for either the adaptor OR the read must
591     // be at coordinate < 3 from start
592    
593     bool extendUngapped(const char* a, int alen, int ai,
594     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596     //if (debug) {
597     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598     // }
599     int a_l=ai; //alignment coordinates on a
600     int a_r=ai+mlen-1;
601     int b_l=bi; //alignment coordinates on b
602     int b_r=bi+mlen-1;
603     int ai_maxscore=ai;
604     int bi_maxscore=bi;
605     int score=mlen*a_m_score;
606     int maxscore=score;
607     int mism5score=a_mis_score;
608     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609     //try to extend to the left first, if possible
610     while (ai>0 && bi>0) {
611     ai--;
612     bi--;
613     score+= (a[ai]==b[bi])? a_m_score : mism5score;
614     if (score>maxscore) {
615     ai_maxscore=ai;
616     bi_maxscore=bi;
617     maxscore=score;
618     }
619     else if (maxscore-score>a_dropoff_score) break;
620     }
621     a_l=ai_maxscore;
622     b_l=bi_maxscore;
623     //now extend to the right
624     ai_maxscore=a_r;
625     bi_maxscore=b_r;
626     ai=a_r;
627     bi=b_r;
628     score=maxscore;
629     //sometimes there are extra As at the end of the read, ignore those
630     if (a[alen-2]=='A' && a[alen-1]=='A') {
631     alen-=2;
632     while (a[alen-1]=='A' && alen>ai) alen--;
633     }
634     while (ai<alen-1 && bi<blen-1) {
635     ai++;
636     bi++;
637     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638     if (a[ai]==b[bi]) { //match
639     score+=a_m_score;
640     if (ai>=alen-2) {
641     score+=a_m_score-(alen-ai-1);
642     }
643     }
644     else { //mismatch
645     score+=a_mis_score;
646     }
647     if (score>maxscore) {
648     ai_maxscore=ai;
649     bi_maxscore=bi;
650     maxscore=score;
651     }
652     else if (maxscore-score>a_dropoff_score) break;
653     }
654     a_r=ai_maxscore;
655     b_r=bi_maxscore;
656     int a_ovh3=alen-a_r-1;
657     int b_ovh3=blen-b_r-1;
658     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659     int mmovh5=(a_l<b_l)? a_l : b_l;
660     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661     if (a_l<a_ovh3) {
662     //adapter closer to the left end (typical for 5' adapter)
663     l5=a_r+1;
664     l3=alen-1;
665     }
666     else {
667     //adapter matching at the right end (typical for 3' adapter)
668     l5=0;
669     l3=a_l-1;
670     }
671     return true;
672     }
673     //do not trim:
674     l5=0;
675     l3=alen-1;
676     return false;
677     }
678    
679    
680 gpertea 93 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681     //overlap of right (3') end of seqb
682     //hash the first 12 bases of seqa:
683     int aimin=0;
684     int aimax=GMIN(9,(a_len-4));
685     int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686     int bimax=b_len-4;
687     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688     int b_maxlen=b_len; //number of cols in the diagonal matrix
689     GXSeedTable gx(a_maxlen, b_maxlen);
690     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691     for (int ai=aimin;ai<=aimax;ai++) {
692     int32* av=(int32 *)(&seqa[ai]);
693     //for (int bi=b_len-4;bi>=bimin;bi--) {
694     for (int bi=bimin;bi<=bimax;bi++) {
695     if (gx.x(ai,bi))
696     continue; //already have a previous seed covering this region of this diagonal
697     int32* bv=(int32 *)(&seqb[bi]);
698     if (*av == *bv) {
699     //word match
700     //see if we can extend to the right
701     gx.x(ai+1,bi+1)=1;
702     gx.x(ai+2,bi+2)=1;
703     gx.x(ai+3,bi+3)=1;
704     int aix=ai+4;
705     int bix=bi+4;
706     int len=4;
707     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708     {
709     gx.x(aix,bix)=1;
710     aix++;bix++;
711     len++;
712     }
713 gpertea 101 if (len>7) {
714 gpertea 99 //heuristics: very likely the best we can get
715     int qlen=len;
716     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717     aix++;
718     bix++;
719     qlen++;
720     }
721 gpertea 101 if (qlen>10) { //found a quick match shortcut
722 gpertea 99 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723     return diagstrips;
724     }
725     }
726 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
727     seeds.Add(newseed);
728     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 gpertea 104 //special last resort terminal match to be used if no better alignment is there
730     if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
731 gpertea 93 }
732     }
733     }//for each 4-mer at the beginning of seqa
734     for (int i=0;i<diagstrips->Count();i++) {
735     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
736     }
737     diagstrips->setSorted(true); //sort by score
738     return diagstrips;
739     }
740    
741     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
742     //overlap of left (5') end of seqb
743     //hash the last 12 bases of seqa:
744     int aimin=GMAX(0,(a_len-16));
745     int aimax=a_len-4;
746     int bimin=0;
747     int bimax=GMIN((a_len-2), (b_len-4));
748     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
749     int b_maxlen=b_len; //number of cols in the diagonal matrix
750     //gx.init(a_maxlen, b_maxlen);
751     GXSeedTable gx(a_maxlen, b_maxlen);
752     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
753     for (int ai=aimin;ai<=aimax;ai++) {
754     int32* av=(int32 *)(&seqa[ai]);
755     for (int bi=bimin;bi<=bimax;bi++) {
756     if (gx.x(ai,bi))
757     continue; //already have a previous seed covering this region of this diagonal
758     int32* bv=(int32 *)(&seqb[bi]);
759     if (*av == *bv) {
760     //word match
761     //see if we can extend to the right
762     gx.x(ai+1,bi+1)=1;
763     gx.x(ai+2,bi+2)=1;
764     gx.x(ai+3,bi+3)=1;
765     int aix=ai+4;
766     int bix=bi+4;
767     int len=4;
768     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
769     {
770     gx.x(aix,bix)=1;
771     aix++;bix++;
772     len++;
773     }
774 gpertea 101 if (len>7) {
775     //heuristics: very likely the best we can get
776     int qlen=len;
777     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
778     aix++;
779     bix++;
780     qlen++;
781     }
782     if (qlen>10) { //found a quick match shortcut
783     diagstrips->qmatch=new GXSeed(ai,bi,qlen);
784     return diagstrips;
785     }
786     }
787 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
788     seeds.Add(newseed);
789     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
790 gpertea 104 //special last resort terminal match to be used if no better alignment is there
791     if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
792 gpertea 93 }
793     }
794     }//for each 4-mer at the end of seqa
795     for (int i=0;i<diagstrips->Count();i++) {
796     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
797     }
798     diagstrips->setSorted(true); //sort by score
799     return diagstrips;
800     }
801    
802     int cmpSeedScore(const pointer p1, const pointer p2) {
803     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
804     GXSeed* s1=(GXSeed*)p1;
805     GXSeed* s2=(GXSeed*)p2;
806     if (s1->len==s2->len) {
807     return (s1->b_ofs-s2->b_ofs);
808     }
809     else return (s2->len-s1->len);
810     }
811    
812     int cmpSeedDiag(const pointer p1, const pointer p2) {
813     GXSeed* s1=(GXSeed*)p1;
814     GXSeed* s2=(GXSeed*)p2;
815     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
816     }
817    
818 gpertea 101
819     int cmpDiagBands_R(const pointer p1, const pointer p2) {
820     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
821     GXBand* b1=(GXBand*)p1;
822     GXBand* b2=(GXBand*)p2;
823     if (b1->score==b2->score) {
824     return (b2->w_min_b-b1->w_min_b);
825     }
826     else return (b2->score-b1->score);
827     }
828    
829    
830    
831 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
832 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
833     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
834     CAlnTrim* trim, bool editscript) {
835 gpertea 93 GXEditScript* ed_script_fwd = NULL;
836     GXEditScript* ed_script_rev = NULL;
837     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
838     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
839     q_alnstart--;
840     s_alnstart--;
841     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
842     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
843     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
844     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
845     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
846     const char* q=q_seq+q_alnstart;
847     int q_avail=q_max-q_alnstart;
848     const char* s=s_seq+s_alnstart;
849     int s_avail=s_max-s_alnstart;
850     if (penalty<0) penalty=-penalty;
851 gpertea 101 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
852 gpertea 93 GXAlnInfo* alninfo=NULL;
853     bool freeAlnMem=(gxmem==NULL);
854     if (freeAlnMem) {
855 gpertea 101 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
856 gpertea 93 }
857     else gxmem->reset();
858     int retscore = 0;
859     int numdiffs = 0;
860 gpertea 101 if (trim!=NULL && trim->type==galn_TrimLeft) {
861 gpertea 93 if (editscript)
862     ed_script_rev=new GXEditScript();
863    
864 gpertea 101 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
865 gpertea 93 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
866     //check this extension here and bail out if it's not a good extension
867 gpertea 101 if (s_alnstart+1-s_ext_l>trim->boundary) {
868 gpertea 93 delete ed_script_rev;
869     if (freeAlnMem) delete gxmem;
870     return NULL;
871     }
872    
873     if (editscript)
874     ed_script_fwd=new GXEditScript();
875     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
876     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
877     numdiffs=numdiffs_r+numdiffs_l;
878     //convert num diffs to actual score
879 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
880 gpertea 93 if (editscript)
881     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882     }
883     else {
884     if (editscript) {
885     ed_script_fwd=new GXEditScript();
886     }
887     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
888     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
889     //check extension here and bail out if not a good right extension
890     //assuming s_max is really at the right end of s_seq
891 gpertea 101 if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
892 gpertea 93 delete ed_script_fwd;
893     if (freeAlnMem) delete gxmem;
894     return NULL;
895     }
896     if (editscript)
897     ed_script_rev=new GXEditScript();
898     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
899     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
900     //convert num diffs to actual score
901     numdiffs=numdiffs_r+numdiffs_l;
902 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
903 gpertea 93 if (editscript)
904     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
905     }
906    
907     if (retscore>=MIN_GREEDY_SCORE) {
908     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
909     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
910     alninfo->score=retscore;
911     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
912     #ifdef GDEBUG
913 gpertea 104 //if (ed_script_rev) {
914     // GMessage("Final Edit script ::: ");
915     // printEditScript(ed_script_rev);
916     // }
917 gpertea 93 #endif
918     alninfo->editscript=ed_script_rev;
919     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
920     }
921     else {
922     if (freeAlnMem) delete gxmem;
923     delete ed_script_rev;
924     delete alninfo;
925     alninfo=NULL;
926     }
927     delete ed_script_fwd;
928     return alninfo;
929     }
930 gpertea 96
931 gpertea 101 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
932     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
933     CAlnTrim* trim, bool editscript) {
934     int reward=2;
935     int penalty=3;
936     int xdrop=8;
937     if (gxmem) {
938     reward=gxmem->match_reward;
939     penalty=gxmem->mismatch_penalty;
940     xdrop=gxmem->x_drop;
941     }
942     return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
943     reward, penalty, xdrop, gxmem, trim, editscript);
944     }
945    
946 gpertea 96 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
947 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
948 gpertea 96 bool editscript=false;
949     #ifdef GDEBUG
950     editscript=true;
951 gpertea 104 // GMessage("==========> matching Right (3') end ========\n");
952 gpertea 96 #endif
953 gpertea 101
954     CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
955 gpertea 96 GList<GXSeed> rseeds(true,true,false);
956     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
957 gpertea 101 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
958 gpertea 99 //did we find a shortcut?
959     if (alnbands->qmatch) {
960 gpertea 104 //#ifdef GDEBUG
961     // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
962     //#endif
963 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
964     }
965     else {
966     int max_top_bands=5;
967     int top_band_count=0;
968     for (int b=0;b<alnbands->Count();b++) {
969     if (alnbands->Get(b)->score<4) break;
970 gpertea 104 //#ifdef GDEBUG
971     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
972     //#endif
973 gpertea 101 top_band_count++;
974     GXBand& band=*(alnbands->Get(b));
975     band.seeds.setSorted(cmpSeedScore);
976     anchor_seeds.Add(band.seeds.First());
977     band.tested=true;
978     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
979     }
980 gpertea 104 //#ifdef GDEBUG
981     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
982     //#endif
983 gpertea 101 }
984 gpertea 99
985 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
986     for (int i=0;i<anchor_seeds.Count();i++) {
987     GXSeed& aseed=*anchor_seeds[i];
988     int a1=aseed.a_ofs+(aseed.len>>1)+1;
989     int a2=aseed.b_ofs+(aseed.len>>1)+1;
990     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
991 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
992     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
993 gpertea 96 galns.AddIfNew(alninfo, true);
994     else delete alninfo;
995     }
996 gpertea 104 //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
997     //we should also try some seeds closer to the 3' end
998 gpertea 101 if (galns.Count()==0) {
999 gpertea 104 anchor_seeds.Clear();
1000 gpertea 101 alnbands->setSorted(cmpDiagBands_R);
1001     int max_top_bands=4;
1002     int top_band_count=0;
1003 gpertea 104 //#ifdef GDEBUG
1004     //GMessage(":::>> Retrying adjusting sort order.\n");
1005     //#endif
1006     if (alnbands->tmatch) {
1007     //anchor_seeds.setSorted(false);
1008     anchor_seeds.Add(alnbands->tmatch);
1009     }
1010 gpertea 101 for (int b=0;b<alnbands->Count();b++) {
1011     if (alnbands->Get(b)->score<4) break;
1012 gpertea 104 //#ifdef GDEBUG
1013     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1014     //#endif
1015 gpertea 101 if (alnbands->Get(b)->tested) continue;
1016     top_band_count++;
1017     GXBand& band=*(alnbands->Get(b));
1018     band.seeds.setSorted(cmpSeedScore);
1019     anchor_seeds.Add(band.seeds.First());
1020     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1021     }
1022 gpertea 104 //#ifdef GDEBUG
1023     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1024     //#endif
1025 gpertea 101 for (int i=0;i<anchor_seeds.Count();i++) {
1026     GXSeed& aseed=*anchor_seeds[i];
1027     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1028     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1029     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1030     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1031     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1032     galns.AddIfNew(alninfo, true);
1033     else delete alninfo;
1034     }
1035     }
1036 gpertea 96 //---- done
1037     delete alnbands;
1038     if (galns.Count()) {
1039     GXAlnInfo* bestaln=galns.Shift();
1040 gpertea 104 #ifdef GDEBUG
1041     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1042     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1043     if (bestaln->gapinfo!=NULL) {
1044     bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1045     }
1046     #endif
1047    
1048 gpertea 96 return bestaln;
1049     }
1050     else return NULL;
1051     }
1052    
1053     GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1054 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
1055 gpertea 96 bool editscript=false;
1056     #ifdef GDEBUG
1057     editscript=true;
1058 gpertea 104 //GMessage("==========> matching Left (5') end ========\n");
1059 gpertea 96 #endif
1060 gpertea 101 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1061 gpertea 96 GList<GXSeed> rseeds(true,true,false);
1062     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1063     GList<GXAlnInfo> galns(true,true,false);
1064     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1065    
1066 gpertea 101 if (alnbands->qmatch) {
1067 gpertea 104 //#ifdef GDEBUG
1068     // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1069     //#endif
1070 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
1071     }
1072     else {
1073     int max_top_bands=5;
1074     int top_band_count=0;
1075     for (int b=0;b<alnbands->Count();b++) {
1076     if (alnbands->Get(b)->score<4) break;
1077 gpertea 104 //#ifdef GDEBUG
1078     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1079     //#endif
1080 gpertea 101 top_band_count++;
1081     GXBand& band=*(alnbands->Get(b));
1082     band.seeds.setSorted(cmpSeedScore);
1083     anchor_seeds.Add(band.seeds.First());
1084     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1085     }
1086 gpertea 104 //#ifdef GDEBUG
1087     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1088     //#endif
1089 gpertea 101 }
1090 gpertea 96 for (int i=0;i<anchor_seeds.Count();i++) {
1091     GXSeed& aseed=*anchor_seeds[i];
1092     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1093     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1094     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1095 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1096     if (alninfo && alninfo->pid>=min_pid
1097     && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1098 gpertea 96 galns.AddIfNew(alninfo, true);
1099     else delete alninfo;
1100     }
1101 gpertea 104 if (galns.Count()==0 && alnbands->tmatch) {
1102     GXSeed& aseed=*alnbands->tmatch;
1103     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1104     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1105     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1106     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1107     if (alninfo) galns.Add(alninfo);
1108     }
1109 gpertea 96 #ifdef GDEBUG
1110 gpertea 104 /*
1111 gpertea 96 for (int i=0;i<galns.Count();i++) {
1112     GXAlnInfo* alninfo=galns[i];
1113     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1114     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1115     if (alninfo->gapinfo!=NULL) {
1116     GMessage("Alignment:\n");
1117     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1118     }
1119     }
1120 gpertea 104 */
1121 gpertea 96 #endif
1122     //---- done
1123     delete alnbands;
1124     if (galns.Count()) {
1125     GXAlnInfo* bestaln=galns.Shift();
1126 gpertea 104 #ifdef GDEBUG
1127     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1128     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1129     if (bestaln->gapinfo!=NULL) {
1130     bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1131     }
1132     #endif
1133 gpertea 96 return bestaln;
1134     }
1135     else return NULL;
1136     }