ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 109
Committed: Tue Oct 11 19:50:14 2011 UTC (8 years, 1 month ago) by gpertea
File size: 39739 byte(s)
Log Message:
added GStr::reverse(), GAlnExtend stringency tweaks

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56     }
57     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70     GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96     *a /= g;
97     *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104    
105     //signal that a diagonal is invalid
106     static const int kInvalidOffset = -2;
107    
108     int s_FindFirstMismatch(const char *seq1, int len1,
109     const char *seq2, int len2,
110     int seq1_index, int seq2_index,
111     //bool &fence_hit,
112     bool reverse) {
113     int start_index = seq1_index;
114     /* Sentry detection here should be relatively inexpensive: The
115     sentry value cannot appear in the query, so detection only
116     needs to be done at exit from the subject-query matching loop.
117     For uncompressed sequences, ambiguities in the query (i.e. seq1)
118     always count as mismatches */
119     if (reverse) {
120     while (seq1_index < len1 && seq2_index < len2 &&
121     //seq1[len1-1 - seq1_index] < 4 &&
122     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123     ++seq1_index;
124     ++seq2_index;
125     }
126     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127     //if len2-1-seq2_index<=0) {
128     // fence_hit = true;
129     // }
130     }
131     else { //forward lookup
132     while (seq1_index < len1 && seq2_index < len2 &&
133     //seq1[seq1_index] < 4 &&
134     seq1[seq1_index] == seq2[seq2_index]) {
135     ++seq1_index;
136     ++seq2_index;
137     }
138     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139     //if (seq2_index==len2) {
140     // fence_hit = true;
141     //}
142     }
143     return seq1_index - start_index;
144     }
145    
146    
147    
148     /** During the traceback for a non-affine greedy alignment,
149     compute the diagonal that will result from the next
150     traceback operation
151    
152     @param last_seq2_off Array of offsets into the second sequence;
153     last_seq2_off[d][k] gives the largest offset into
154     the second sequence that lies on diagonal k and
155     has distance d [in]
156     @param d Starting distance [in]
157     @param diag Index of diagonal that produced the starting distance [in]
158     @param seq2_index The offset into the second sequence after the traceback
159     operation has completed [out]
160     @return The diagonal resulting from the next traceback operation
161     being applied
162     */
163     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164     int diag, int *seq2_index) {
165     // choose the traceback operation that results in the
166     // largest seq2 offset at this point, then compute the
167     // new diagonal that is implied by the operation
168     if (last_seq2_off[d-1][diag-1] >
169     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170     *seq2_index = last_seq2_off[d-1][diag-1];
171     return diag - 1; // gap in seq2
172     }
173     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174     *seq2_index = last_seq2_off[d-1][diag];
175     return diag; // match
176     }
177     *seq2_index = last_seq2_off[d-1][diag+1];
178     return diag + 1; // gap in seq1
179     }
180    
181    
182     int GXGreedyExtend(const char* seq1, int len1,
183     const char* seq2, int len2,
184     bool reverse, int xdrop_threshold,
185     int match_cost, int mismatch_cost,
186     int& seq1_align_len, int& seq2_align_len,
187 gpertea 101 CGreedyAlignData& aux_data,
188 gpertea 93 GXEditScript *edit_block) {
189     //GapPrelimEditBlock *edit_block,
190     //bool& fence_hit, SGreedySeed *seed) {
191     int seq1_index;
192     int seq2_index;
193     int index;
194     int d;
195     int k;
196     int diag_lower, diag_upper;
197     int max_dist;
198     int diag_origin;
199     int best_dist;
200     int best_diag;
201     int** last_seq2_off;
202     int* max_score;
203     int xdrop_offset;
204     int longest_match_run;
205     bool end1_reached, end2_reached;
206     GXMemPool* mem_pool;
207    
208     /* ordinary dynamic programming alignment, for each offset
209     in seq1, walks through offsets in seq2 until an X-dropoff
210     test fails, saving the best score encountered along
211     the way. Instead of score, this code tracks the 'distance'
212     (number of mismatches plus number of gaps) between seq1
213     and seq2. Instead of walking through sequence offsets, it
214     walks through diagonals that can achieve a given distance.
215    
216     Note that in what follows, the numbering of diagonals implies
217     a dot matrix where increasing seq1 offsets go to the right on
218     the x axis, and increasing seq2 offsets go up the y axis.
219     The gapped alignment thus proceeds up and to the right in
220     the graph, and diagonals are numbered increasing to the right */
221    
222     best_dist = 0;
223     best_diag = 0;
224    
225     /* set the number of distinct distances the algorithm will
226     examine in the search for an optimal alignment. The
227     heuristic worst-case running time of the algorithm is
228     O(max_dist**2 + (len1+len2)); for sequences which are
229     very similar, the average running time will be sig-
230     nificantly better than this */
231    
232     max_dist = GMIN(GREEDY_MAX_COST,
233 gpertea 109 (len2/GREEDY_MAX_COST_FRACTION + 1));
234 gpertea 93
235     /* the main loop assumes that the index of all diagonals is
236     biased to lie in the middle of allocated bookkeeping
237     structures */
238    
239     diag_origin = max_dist + 2;
240    
241     // last_seq2_off[d][k] is the largest offset into seq2 that
242     // lies on diagonal k and has distance d
243    
244     last_seq2_off = aux_data.last_seq2_off;
245    
246     /* Instead of tracking the best alignment score and using
247     xdrop_theshold directly, track the best score for each
248     unique distance and use the best score for some previously
249     computed distance to implement the X-dropoff test.
250    
251     xdrop_offset gives the distance backwards in the score
252     array to look */
253    
254     xdrop_offset = (xdrop_threshold + match_cost / 2) /
255     (match_cost + mismatch_cost) + 1;
256    
257     // find the offset of the first mismatch between seq1 and seq2
258    
259     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260     // fence_hit, reverse, rem);
261    
262     // update the extents of the alignment, and bail out
263     // early if no further work is needed
264    
265     seq1_align_len = index;
266     seq2_align_len = index;
267     seq1_index = index;
268     /*
269     seed->start_q = 0;
270     seed->start_s = 0;
271     seed->match_length = index;
272     */
273     longest_match_run = index;
274    
275     if (index == len1 || index == len2) {
276     /* Return the number of differences, which is zero here */
277     if (edit_block != NULL)
278     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279     edit_block->opRep(index);
280     return 0;
281     }
282    
283     // set up the memory pool
284     mem_pool = aux_data.space;
285     if (edit_block == NULL) {
286     mem_pool = NULL;
287     }
288     else if (mem_pool == NULL) {
289     aux_data.space = mem_pool = new GXMemPool();
290     }
291     else {
292     mem_pool->refresh();
293     }
294    
295     /* set up the array of per-distance maximum scores. There
296     are max_diags + xdrop_offset distances to track, the first
297     xdrop_offset of which are 0 */
298    
299     max_score = aux_data.max_score + xdrop_offset;
300     for (index = 0; index < xdrop_offset; index++)
301     aux_data.max_score[index] = 0;
302    
303     // fill in the initial offsets of the distance matrix
304    
305     last_seq2_off[0][diag_origin] = seq1_index;
306     max_score[0] = seq1_index * match_cost;
307     diag_lower = diag_origin - 1;
308     diag_upper = diag_origin + 1;
309     end1_reached = end2_reached = false;
310    
311     // for each distance
312     for (d = 1; d <= max_dist; d++) {
313     int xdrop_score;
314     int curr_score;
315     int curr_extent = 0;
316     int curr_seq2_index = 0;
317     int curr_diag = 0;
318     int tmp_diag_lower = diag_lower;
319     int tmp_diag_upper = diag_upper;
320    
321     // Assign impossible seq2 offsets to any diagonals that
322     // are not in the range (diag_lower,diag_upper).
323     // These will serve as sentinel values for the inner loop
324     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328    
329     // compute the score for distance d corresponding to the X-dropoff criterion
330    
331     xdrop_score = max_score[d - xdrop_offset] +
332     (match_cost + mismatch_cost) * d - xdrop_threshold;
333     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334    
335     // for each diagonal of interest
336     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337     /* find the largest offset into seq2 that increases
338     the distance from d-1 to d (i.e. keeps the alignment
339     from getting worse for as long as possible), then
340     choose the offset into seq1 that will keep the
341     resulting diagonal fixed at k
342    
343     Note that this requires kInvalidOffset+1 to be smaller
344     than any valid offset into seq2, i.e. to be negative */
345    
346     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347     last_seq2_off[d - 1][k ]) + 1;
348     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349     seq1_index = seq2_index + k - diag_origin;
350    
351     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352    
353     // if no valid diagonal can reach distance d, or the
354     // X-dropoff test fails, narrow the range of diagonals
355     // to test and skip to the next diagonal
356     if (k == diag_lower)
357     diag_lower++;
358     else
359     last_seq2_off[d][k] = kInvalidOffset;
360     continue;
361     }
362     diag_upper = k;
363    
364     /* slide down diagonal k until a mismatch
365     occurs. As long as only matches are encountered,
366     the current distance d will not change */
367    
368     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369     seq1_index, seq2_index, reverse);
370     //fence_hit, reverse, rem);
371     if (index > longest_match_run) {
372     //seed->start_q = seq1_index;
373     //seed->start_s = seq2_index;
374     //seed->match_length = index;
375     longest_match_run = index;
376     }
377     seq1_index += index;
378     seq2_index += index;
379    
380     // set the new largest seq2 offset that achieves
381     // distance d on diagonal k
382    
383     last_seq2_off[d][k] = seq2_index;
384    
385     // since all values of k are constrained to have the
386     // same distance d, the value of k which maximizes the
387     // alignment score is the one that covers the most of seq1 and seq2
388     if (seq1_index + seq2_index > curr_extent) {
389     curr_extent = seq1_index + seq2_index;
390     curr_seq2_index = seq2_index;
391     curr_diag = k;
392     }
393    
394     /* clamp the bounds on diagonals to avoid walking off
395     either sequence. Because the bounds increase by at
396     most one for each distance, diag_lower and diag_upper
397     can each be of size at most max_diags+2 */
398    
399     if (seq2_index == len2) {
400     diag_lower = k + 1;
401     end2_reached = true;
402     }
403     if (seq1_index == len1) {
404     diag_upper = k - 1;
405     end1_reached = true;
406     }
407     } // end loop over diagonals
408    
409     // compute the maximum score possible for distance d
410     curr_score = curr_extent * (match_cost / 2) -
411     d * (match_cost + mismatch_cost);
412     // if this is the best score seen so far, update the
413     // statistics of the best alignment
414     if (curr_score > max_score[d - 1]) {
415     max_score[d] = curr_score;
416     best_dist = d;
417     best_diag = curr_diag;
418     seq2_align_len = curr_seq2_index;
419     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420     }
421     else {
422     max_score[d] = max_score[d - 1];
423     }
424    
425     // alignment has finished if the lower and upper bounds
426     // on diagonals to check have converged to each other
427    
428     if (diag_lower > diag_upper)
429     break;
430    
431     /* set up for the next distance to examine. Because the
432     bounds increase by at most one for each distance,
433     diag_lower and diag_upper can each be of size at
434     most max_diags+2 */
435    
436     if (!end2_reached)
437     diag_lower--;
438     if (!end1_reached)
439     diag_upper++;
440    
441     if (edit_block == NULL) {
442     // if no traceback is specified, the next row of
443     // last_seq2_off can reuse previously allocated memory
444     //TODO FIXME The following assumes two arrays of
445     // at least max_dist+4 int's have already been allocated
446     last_seq2_off[d + 1] = last_seq2_off[d - 1];
447     }
448     else {
449     // traceback requires all rows of last_seq2_off to be saved,
450     // so a new row must be allocated
451     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452     // move the origin for this row backwards
453     //TODO FIXME: dubious pointer arithmetic ?!
454     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455     }
456     } // end loop over distinct distances
457    
458    
459     if (edit_block == NULL)
460     return best_dist;
461    
462     //---- perform traceback
463     d = best_dist;
464     seq1_index = seq1_align_len;
465     seq2_index = seq2_align_len;
466     // for all positive distances
467    
468     //if (fence_hit && *fence_hit)
469     // goto done;
470     if (index==len1 || index==len2) d=0;
471     while (d > 0) {
472     int new_diag;
473     int new_seq2_index;
474    
475     /* retrieve the value of the diagonal after the next
476     traceback operation. best_diag starts off with the
477     value computed during the alignment process */
478    
479     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480     best_diag, &new_seq2_index);
481    
482     if (new_diag == best_diag) {
483     // same diagonal: issue a group of substitutions
484     if (seq2_index - new_seq2_index > 0) {
485     edit_block->opRep(seq2_index - new_seq2_index);
486     }
487     }
488     else if (new_diag < best_diag) {
489     // smaller diagonal: issue a group of substitutions
490     // and then a gap in seq2 */
491     if (seq2_index - new_seq2_index > 0) {
492     edit_block->opRep(seq2_index - new_seq2_index);
493     }
494     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495     edit_block->opIns(1);
496     }
497     else {
498     // larger diagonal: issue a group of substitutions
499     // and then a gap in seq1
500     if (seq2_index - new_seq2_index - 1 > 0) {
501     edit_block->opRep(seq2_index - new_seq2_index - 1);
502     }
503     edit_block->opDel(1);
504     }
505     d--;
506     best_diag = new_diag;
507     seq2_index = new_seq2_index;
508     }
509     //done:
510     // handle the final group of substitutions back to distance zero,
511     // i.e. back to offset zero of seq1 and seq2
512     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513     // last_seq2_off[0][diag_origin]);
514     edit_block->opRep(last_seq2_off[0][diag_origin]);
515     if (!reverse)
516     edit_block->reverse();
517     return best_dist;
518     }
519    
520     void printEditScript(GXEditScript* ed_script) {
521     uint i;
522     if (ed_script==NULL || ed_script->opnum == 0)
523     return;
524     for (i=0; i<ed_script->opnum; i++) {
525     int num=((ed_script->ops[i]) >> 2);
526     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527     if (op_type == 3)
528     GError("Error: printEditScript encountered op_type 3 ?!\n");
529     GMessage("%d%c ", num, xgapcodes[op_type]);
530     }
531     GMessage("\n");
532     }
533    
534     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535     bool editscript, int reward, int penalty, int xdrop) {
536     int q_max=strlen(q_seq); //query
537     int s_max=strlen(s_seq); //subj
538     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 gpertea 101 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540 gpertea 93 }
541    
542     struct GXSeedTable {
543     int a_num, b_num;
544     int a_cap, b_cap;
545     char* xc;
546     GXSeedTable(int a=12, int b=255) {
547     a_cap=0;
548     b_cap=0;
549     a_num=0;
550     b_num=0;
551     xc=NULL;
552     init(a,b);
553     }
554     ~GXSeedTable() {
555     GFREE(xc);
556     }
557     void init(int a, int b) {
558     a_num=a;
559     b_num=b;
560     bool resize=false;
561     if (b_num>b_cap) { resize=true; b_cap=b_num;}
562     if (a_num>a_cap) { resize=true; a_cap=a_num;}
563     if (resize) {
564     GFREE(xc);
565     GCALLOC(xc, (a_num*b_num));
566     }
567     else {
568     //just clear up to a_max, b_max
569     memset((void*)xc, 0, (a_num*b_num));
570     }
571     }
572     char& x(int ax, int by) {
573     return xc[by*a_num+ax];
574     }
575    
576     };
577    
578 gpertea 99 const int a_m_score=2; //match score
579     const int a_mis_score=-3; //mismatch
580     const int a_dropoff_score=7;
581 gpertea 101 const int a_min_score=12; //at least 6 bases full match
582 gpertea 93
583 gpertea 99 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585     //check minimum score and
586     //for 3' adapter trimming:
587     // require that the right end of the alignment for either the adaptor OR the read must be
588     // < 3 distance from its right end
589     // for 5' adapter trimming:
590     // require that the left end of the alignment for either the adaptor OR the read must
591     // be at coordinate < 3 from start
592    
593     bool extendUngapped(const char* a, int alen, int ai,
594     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596     //if (debug) {
597     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598     // }
599     int a_l=ai; //alignment coordinates on a
600     int a_r=ai+mlen-1;
601     int b_l=bi; //alignment coordinates on b
602     int b_r=bi+mlen-1;
603     int ai_maxscore=ai;
604     int bi_maxscore=bi;
605     int score=mlen*a_m_score;
606     int maxscore=score;
607     int mism5score=a_mis_score;
608     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609     //try to extend to the left first, if possible
610     while (ai>0 && bi>0) {
611     ai--;
612     bi--;
613     score+= (a[ai]==b[bi])? a_m_score : mism5score;
614     if (score>maxscore) {
615     ai_maxscore=ai;
616     bi_maxscore=bi;
617     maxscore=score;
618     }
619     else if (maxscore-score>a_dropoff_score) break;
620     }
621     a_l=ai_maxscore;
622     b_l=bi_maxscore;
623     //now extend to the right
624     ai_maxscore=a_r;
625     bi_maxscore=b_r;
626     ai=a_r;
627     bi=b_r;
628     score=maxscore;
629     //sometimes there are extra As at the end of the read, ignore those
630     if (a[alen-2]=='A' && a[alen-1]=='A') {
631     alen-=2;
632     while (a[alen-1]=='A' && alen>ai) alen--;
633     }
634     while (ai<alen-1 && bi<blen-1) {
635     ai++;
636     bi++;
637     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638     if (a[ai]==b[bi]) { //match
639     score+=a_m_score;
640     if (ai>=alen-2) {
641     score+=a_m_score-(alen-ai-1);
642     }
643     }
644     else { //mismatch
645     score+=a_mis_score;
646     }
647     if (score>maxscore) {
648     ai_maxscore=ai;
649     bi_maxscore=bi;
650     maxscore=score;
651     }
652     else if (maxscore-score>a_dropoff_score) break;
653     }
654     a_r=ai_maxscore;
655     b_r=bi_maxscore;
656     int a_ovh3=alen-a_r-1;
657     int b_ovh3=blen-b_r-1;
658     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659     int mmovh5=(a_l<b_l)? a_l : b_l;
660     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661     if (a_l<a_ovh3) {
662     //adapter closer to the left end (typical for 5' adapter)
663     l5=a_r+1;
664     l3=alen-1;
665     }
666     else {
667     //adapter matching at the right end (typical for 3' adapter)
668     l5=0;
669     l3=a_l-1;
670     }
671     return true;
672     }
673     //do not trim:
674     l5=0;
675     l3=alen-1;
676     return false;
677     }
678    
679    
680 gpertea 93 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681     //overlap of right (3') end of seqb
682     //hash the first 12 bases of seqa:
683     int aimin=0;
684     int aimax=GMIN(9,(a_len-4));
685     int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686     int bimax=b_len-4;
687     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688     int b_maxlen=b_len; //number of cols in the diagonal matrix
689     GXSeedTable gx(a_maxlen, b_maxlen);
690     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691     for (int ai=aimin;ai<=aimax;ai++) {
692     int32* av=(int32 *)(&seqa[ai]);
693     //for (int bi=b_len-4;bi>=bimin;bi--) {
694     for (int bi=bimin;bi<=bimax;bi++) {
695     if (gx.x(ai,bi))
696     continue; //already have a previous seed covering this region of this diagonal
697     int32* bv=(int32 *)(&seqb[bi]);
698     if (*av == *bv) {
699     //word match
700     //see if we can extend to the right
701     gx.x(ai+1,bi+1)=1;
702     gx.x(ai+2,bi+2)=1;
703     gx.x(ai+3,bi+3)=1;
704     int aix=ai+4;
705     int bix=bi+4;
706     int len=4;
707     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708     {
709     gx.x(aix,bix)=1;
710     aix++;bix++;
711     len++;
712     }
713 gpertea 101 if (len>7) {
714 gpertea 99 //heuristics: very likely the best we can get
715     int qlen=len;
716     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717     aix++;
718     bix++;
719     qlen++;
720     }
721 gpertea 101 if (qlen>10) { //found a quick match shortcut
722 gpertea 99 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723     return diagstrips;
724     }
725     }
726 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
727     seeds.Add(newseed);
728     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729 gpertea 104 //special last resort terminal match to be used if no better alignment is there
730     if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed;
731 gpertea 93 }
732     }
733     }//for each 4-mer at the beginning of seqa
734     for (int i=0;i<diagstrips->Count();i++) {
735     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
736     }
737     diagstrips->setSorted(true); //sort by score
738     return diagstrips;
739     }
740    
741     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
742     //overlap of left (5') end of seqb
743     //hash the last 12 bases of seqa:
744     int aimin=GMAX(0,(a_len-16));
745     int aimax=a_len-4;
746     int bimin=0;
747     int bimax=GMIN((a_len-2), (b_len-4));
748     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
749     int b_maxlen=b_len; //number of cols in the diagonal matrix
750     //gx.init(a_maxlen, b_maxlen);
751     GXSeedTable gx(a_maxlen, b_maxlen);
752     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
753     for (int ai=aimin;ai<=aimax;ai++) {
754     int32* av=(int32 *)(&seqa[ai]);
755     for (int bi=bimin;bi<=bimax;bi++) {
756     if (gx.x(ai,bi))
757     continue; //already have a previous seed covering this region of this diagonal
758     int32* bv=(int32 *)(&seqb[bi]);
759     if (*av == *bv) {
760     //word match
761     //see if we can extend to the right
762     gx.x(ai+1,bi+1)=1;
763     gx.x(ai+2,bi+2)=1;
764     gx.x(ai+3,bi+3)=1;
765     int aix=ai+4;
766     int bix=bi+4;
767     int len=4;
768     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
769     {
770     gx.x(aix,bix)=1;
771     aix++;bix++;
772     len++;
773     }
774 gpertea 101 if (len>7) {
775     //heuristics: very likely the best we can get
776     int qlen=len;
777     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
778     aix++;
779     bix++;
780     qlen++;
781     }
782     if (qlen>10) { //found a quick match shortcut
783     diagstrips->qmatch=new GXSeed(ai,bi,qlen);
784     return diagstrips;
785     }
786     }
787 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
788     seeds.Add(newseed);
789     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
790 gpertea 104 //special last resort terminal match to be used if no better alignment is there
791     if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed;
792 gpertea 93 }
793     }
794     }//for each 4-mer at the end of seqa
795     for (int i=0;i<diagstrips->Count();i++) {
796     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
797     }
798     diagstrips->setSorted(true); //sort by score
799     return diagstrips;
800     }
801    
802     int cmpSeedScore(const pointer p1, const pointer p2) {
803     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
804     GXSeed* s1=(GXSeed*)p1;
805     GXSeed* s2=(GXSeed*)p2;
806     if (s1->len==s2->len) {
807     return (s1->b_ofs-s2->b_ofs);
808     }
809     else return (s2->len-s1->len);
810     }
811    
812     int cmpSeedDiag(const pointer p1, const pointer p2) {
813     GXSeed* s1=(GXSeed*)p1;
814     GXSeed* s2=(GXSeed*)p2;
815     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
816     }
817    
818 gpertea 101
819     int cmpDiagBands_R(const pointer p1, const pointer p2) {
820     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
821     GXBand* b1=(GXBand*)p1;
822     GXBand* b2=(GXBand*)p2;
823     if (b1->score==b2->score) {
824     return (b2->w_min_b-b1->w_min_b);
825     }
826     else return (b2->score-b1->score);
827     }
828    
829    
830    
831 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
832 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
833     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
834     CAlnTrim* trim, bool editscript) {
835 gpertea 93 GXEditScript* ed_script_fwd = NULL;
836     GXEditScript* ed_script_rev = NULL;
837     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
838     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
839     q_alnstart--;
840     s_alnstart--;
841     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
842     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
843     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
844     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
845     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
846     const char* q=q_seq+q_alnstart;
847     int q_avail=q_max-q_alnstart;
848     const char* s=s_seq+s_alnstart;
849     int s_avail=s_max-s_alnstart;
850     if (penalty<0) penalty=-penalty;
851 gpertea 101 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
852 gpertea 93 GXAlnInfo* alninfo=NULL;
853     bool freeAlnMem=(gxmem==NULL);
854     if (freeAlnMem) {
855 gpertea 101 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
856 gpertea 93 }
857     else gxmem->reset();
858     int retscore = 0;
859     int numdiffs = 0;
860 gpertea 101 if (trim!=NULL && trim->type==galn_TrimLeft) {
861 gpertea 93 if (editscript)
862     ed_script_rev=new GXEditScript();
863    
864 gpertea 101 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
865 gpertea 93 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
866     //check this extension here and bail out if it's not a good extension
867 gpertea 101 if (s_alnstart+1-s_ext_l>trim->boundary) {
868 gpertea 93 delete ed_script_rev;
869     if (freeAlnMem) delete gxmem;
870     return NULL;
871     }
872    
873     if (editscript)
874     ed_script_fwd=new GXEditScript();
875     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
876     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
877     numdiffs=numdiffs_r+numdiffs_l;
878     //convert num diffs to actual score
879 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
880 gpertea 93 if (editscript)
881     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
882     }
883     else {
884     if (editscript) {
885     ed_script_fwd=new GXEditScript();
886     }
887     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
888     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
889     //check extension here and bail out if not a good right extension
890     //assuming s_max is really at the right end of s_seq
891 gpertea 109 if (trim!=NULL && trim->type==galn_TrimRight &&
892     s_alnstart+s_ext_r<trim->boundary) {
893 gpertea 93 delete ed_script_fwd;
894     if (freeAlnMem) delete gxmem;
895     return NULL;
896     }
897     if (editscript)
898     ed_script_rev=new GXEditScript();
899     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
900     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
901     //convert num diffs to actual score
902     numdiffs=numdiffs_r+numdiffs_l;
903 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
904 gpertea 93 if (editscript)
905     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
906     }
907    
908     if (retscore>=MIN_GREEDY_SCORE) {
909     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
910     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
911     alninfo->score=retscore;
912     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
913     #ifdef GDEBUG
914 gpertea 104 //if (ed_script_rev) {
915     // GMessage("Final Edit script ::: ");
916     // printEditScript(ed_script_rev);
917     // }
918 gpertea 93 #endif
919     alninfo->editscript=ed_script_rev;
920     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
921     }
922     else {
923     if (freeAlnMem) delete gxmem;
924     delete ed_script_rev;
925     delete alninfo;
926     alninfo=NULL;
927     }
928     delete ed_script_fwd;
929     return alninfo;
930     }
931 gpertea 96
932 gpertea 101 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
933     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
934     CAlnTrim* trim, bool editscript) {
935     int reward=2;
936     int penalty=3;
937     int xdrop=8;
938     if (gxmem) {
939     reward=gxmem->match_reward;
940     penalty=gxmem->mismatch_penalty;
941     xdrop=gxmem->x_drop;
942     }
943     return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
944     reward, penalty, xdrop, gxmem, trim, editscript);
945     }
946    
947 gpertea 96 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
948 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
949 gpertea 96 bool editscript=false;
950     #ifdef GDEBUG
951     editscript=true;
952 gpertea 104 // GMessage("==========> matching Right (3') end ========\n");
953 gpertea 96 #endif
954 gpertea 101
955     CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
956 gpertea 96 GList<GXSeed> rseeds(true,true,false);
957     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
958 gpertea 101 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
959 gpertea 99 //did we find a shortcut?
960     if (alnbands->qmatch) {
961 gpertea 104 //#ifdef GDEBUG
962     // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
963     //#endif
964 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
965     }
966     else {
967     int max_top_bands=5;
968     int top_band_count=0;
969     for (int b=0;b<alnbands->Count();b++) {
970     if (alnbands->Get(b)->score<4) break;
971 gpertea 104 //#ifdef GDEBUG
972     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
973     //#endif
974 gpertea 101 top_band_count++;
975     GXBand& band=*(alnbands->Get(b));
976     band.seeds.setSorted(cmpSeedScore);
977     anchor_seeds.Add(band.seeds.First());
978     band.tested=true;
979     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
980     }
981 gpertea 104 //#ifdef GDEBUG
982     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
983     //#endif
984 gpertea 101 }
985 gpertea 99
986 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
987     for (int i=0;i<anchor_seeds.Count();i++) {
988     GXSeed& aseed=*anchor_seeds[i];
989     int a1=aseed.a_ofs+(aseed.len>>1)+1;
990     int a2=aseed.b_ofs+(aseed.len>>1)+1;
991     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
992 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
993     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
994 gpertea 96 galns.AddIfNew(alninfo, true);
995     else delete alninfo;
996     }
997 gpertea 104 //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
998     //we should also try some seeds closer to the 3' end
999 gpertea 101 if (galns.Count()==0) {
1000 gpertea 104 anchor_seeds.Clear();
1001 gpertea 101 alnbands->setSorted(cmpDiagBands_R);
1002     int max_top_bands=4;
1003     int top_band_count=0;
1004 gpertea 104 //#ifdef GDEBUG
1005     //GMessage(":::>> Retrying adjusting sort order.\n");
1006     //#endif
1007     if (alnbands->tmatch) {
1008     //anchor_seeds.setSorted(false);
1009     anchor_seeds.Add(alnbands->tmatch);
1010     }
1011 gpertea 101 for (int b=0;b<alnbands->Count();b++) {
1012     if (alnbands->Get(b)->score<4) break;
1013 gpertea 104 //#ifdef GDEBUG
1014     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1015     //#endif
1016 gpertea 101 if (alnbands->Get(b)->tested) continue;
1017     top_band_count++;
1018     GXBand& band=*(alnbands->Get(b));
1019     band.seeds.setSorted(cmpSeedScore);
1020     anchor_seeds.Add(band.seeds.First());
1021     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1022     }
1023 gpertea 104 //#ifdef GDEBUG
1024     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1025     //#endif
1026 gpertea 101 for (int i=0;i<anchor_seeds.Count();i++) {
1027     GXSeed& aseed=*anchor_seeds[i];
1028     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1029     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1030     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1031     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1032     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1033     galns.AddIfNew(alninfo, true);
1034     else delete alninfo;
1035     }
1036     }
1037 gpertea 96 //---- done
1038     delete alnbands;
1039     if (galns.Count()) {
1040     GXAlnInfo* bestaln=galns.Shift();
1041 gpertea 104 #ifdef GDEBUG
1042     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1043     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1044     if (bestaln->gapinfo!=NULL) {
1045     bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1046     }
1047     #endif
1048    
1049 gpertea 96 return bestaln;
1050     }
1051     else return NULL;
1052     }
1053    
1054     GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1055 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
1056 gpertea 96 bool editscript=false;
1057     #ifdef GDEBUG
1058     editscript=true;
1059 gpertea 104 //GMessage("==========> matching Left (5') end ========\n");
1060 gpertea 96 #endif
1061 gpertea 101 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1062 gpertea 96 GList<GXSeed> rseeds(true,true,false);
1063     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1064     GList<GXAlnInfo> galns(true,true,false);
1065     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1066    
1067 gpertea 101 if (alnbands->qmatch) {
1068 gpertea 104 //#ifdef GDEBUG
1069     // GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1070     //#endif
1071 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
1072     }
1073     else {
1074     int max_top_bands=5;
1075     int top_band_count=0;
1076     for (int b=0;b<alnbands->Count();b++) {
1077     if (alnbands->Get(b)->score<4) break;
1078 gpertea 104 //#ifdef GDEBUG
1079     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1080     //#endif
1081 gpertea 101 top_band_count++;
1082     GXBand& band=*(alnbands->Get(b));
1083     band.seeds.setSorted(cmpSeedScore);
1084     anchor_seeds.Add(band.seeds.First());
1085     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1086     }
1087 gpertea 104 //#ifdef GDEBUG
1088     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1089     //#endif
1090 gpertea 101 }
1091 gpertea 96 for (int i=0;i<anchor_seeds.Count();i++) {
1092     GXSeed& aseed=*anchor_seeds[i];
1093     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1094     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1095     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1096 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1097     if (alninfo && alninfo->pid>=min_pid
1098     && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1099 gpertea 96 galns.AddIfNew(alninfo, true);
1100     else delete alninfo;
1101     }
1102 gpertea 104 if (galns.Count()==0 && alnbands->tmatch) {
1103     GXSeed& aseed=*alnbands->tmatch;
1104     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1105     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1106     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1107     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1108     if (alninfo) galns.Add(alninfo);
1109     }
1110 gpertea 96 #ifdef GDEBUG
1111 gpertea 104 /*
1112 gpertea 96 for (int i=0;i<galns.Count();i++) {
1113     GXAlnInfo* alninfo=galns[i];
1114     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1115     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1116     if (alninfo->gapinfo!=NULL) {
1117     GMessage("Alignment:\n");
1118     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1119     }
1120     }
1121 gpertea 104 */
1122 gpertea 96 #endif
1123     //---- done
1124     delete alnbands;
1125     if (galns.Count()) {
1126     GXAlnInfo* bestaln=galns.Shift();
1127 gpertea 104 #ifdef GDEBUG
1128     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1129     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1130     if (bestaln->gapinfo!=NULL) {
1131     bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1132     }
1133     #endif
1134 gpertea 96 return bestaln;
1135     }
1136     else return NULL;
1137     }