ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 101
Committed: Fri Oct 7 21:13:55 2011 UTC (8 years, 1 month ago) by gpertea
File size: 38519 byte(s)
Log Message:
adjusting params

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56     }
57     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68     GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70     GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96     *a /= g;
97     *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104    
105     //signal that a diagonal is invalid
106     static const int kInvalidOffset = -2;
107    
108     int s_FindFirstMismatch(const char *seq1, int len1,
109     const char *seq2, int len2,
110     int seq1_index, int seq2_index,
111     //bool &fence_hit,
112     bool reverse) {
113     int start_index = seq1_index;
114     /* Sentry detection here should be relatively inexpensive: The
115     sentry value cannot appear in the query, so detection only
116     needs to be done at exit from the subject-query matching loop.
117     For uncompressed sequences, ambiguities in the query (i.e. seq1)
118     always count as mismatches */
119     if (reverse) {
120     while (seq1_index < len1 && seq2_index < len2 &&
121     //seq1[len1-1 - seq1_index] < 4 &&
122     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
123     ++seq1_index;
124     ++seq2_index;
125     }
126     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
127     //if len2-1-seq2_index<=0) {
128     // fence_hit = true;
129     // }
130     }
131     else { //forward lookup
132     while (seq1_index < len1 && seq2_index < len2 &&
133     //seq1[seq1_index] < 4 &&
134     seq1[seq1_index] == seq2[seq2_index]) {
135     ++seq1_index;
136     ++seq2_index;
137     }
138     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
139     //if (seq2_index==len2) {
140     // fence_hit = true;
141     //}
142     }
143     return seq1_index - start_index;
144     }
145    
146    
147    
148     /** During the traceback for a non-affine greedy alignment,
149     compute the diagonal that will result from the next
150     traceback operation
151    
152     @param last_seq2_off Array of offsets into the second sequence;
153     last_seq2_off[d][k] gives the largest offset into
154     the second sequence that lies on diagonal k and
155     has distance d [in]
156     @param d Starting distance [in]
157     @param diag Index of diagonal that produced the starting distance [in]
158     @param seq2_index The offset into the second sequence after the traceback
159     operation has completed [out]
160     @return The diagonal resulting from the next traceback operation
161     being applied
162     */
163     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
164     int diag, int *seq2_index) {
165     // choose the traceback operation that results in the
166     // largest seq2 offset at this point, then compute the
167     // new diagonal that is implied by the operation
168     if (last_seq2_off[d-1][diag-1] >
169     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
170     *seq2_index = last_seq2_off[d-1][diag-1];
171     return diag - 1; // gap in seq2
172     }
173     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
174     *seq2_index = last_seq2_off[d-1][diag];
175     return diag; // match
176     }
177     *seq2_index = last_seq2_off[d-1][diag+1];
178     return diag + 1; // gap in seq1
179     }
180    
181    
182     int GXGreedyExtend(const char* seq1, int len1,
183     const char* seq2, int len2,
184     bool reverse, int xdrop_threshold,
185     int match_cost, int mismatch_cost,
186     int& seq1_align_len, int& seq2_align_len,
187 gpertea 101 CGreedyAlignData& aux_data,
188 gpertea 93 GXEditScript *edit_block) {
189     //GapPrelimEditBlock *edit_block,
190     //bool& fence_hit, SGreedySeed *seed) {
191     int seq1_index;
192     int seq2_index;
193     int index;
194     int d;
195     int k;
196     int diag_lower, diag_upper;
197     int max_dist;
198     int diag_origin;
199     int best_dist;
200     int best_diag;
201     int** last_seq2_off;
202     int* max_score;
203     int xdrop_offset;
204     int longest_match_run;
205     bool end1_reached, end2_reached;
206     GXMemPool* mem_pool;
207    
208     /* ordinary dynamic programming alignment, for each offset
209     in seq1, walks through offsets in seq2 until an X-dropoff
210     test fails, saving the best score encountered along
211     the way. Instead of score, this code tracks the 'distance'
212     (number of mismatches plus number of gaps) between seq1
213     and seq2. Instead of walking through sequence offsets, it
214     walks through diagonals that can achieve a given distance.
215    
216     Note that in what follows, the numbering of diagonals implies
217     a dot matrix where increasing seq1 offsets go to the right on
218     the x axis, and increasing seq2 offsets go up the y axis.
219     The gapped alignment thus proceeds up and to the right in
220     the graph, and diagonals are numbered increasing to the right */
221    
222     best_dist = 0;
223     best_diag = 0;
224    
225     /* set the number of distinct distances the algorithm will
226     examine in the search for an optimal alignment. The
227     heuristic worst-case running time of the algorithm is
228     O(max_dist**2 + (len1+len2)); for sequences which are
229     very similar, the average running time will be sig-
230     nificantly better than this */
231    
232     max_dist = GMIN(GREEDY_MAX_COST,
233     len2 / GREEDY_MAX_COST_FRACTION + 1);
234    
235     /* the main loop assumes that the index of all diagonals is
236     biased to lie in the middle of allocated bookkeeping
237     structures */
238    
239     diag_origin = max_dist + 2;
240    
241     // last_seq2_off[d][k] is the largest offset into seq2 that
242     // lies on diagonal k and has distance d
243    
244     last_seq2_off = aux_data.last_seq2_off;
245    
246     /* Instead of tracking the best alignment score and using
247     xdrop_theshold directly, track the best score for each
248     unique distance and use the best score for some previously
249     computed distance to implement the X-dropoff test.
250    
251     xdrop_offset gives the distance backwards in the score
252     array to look */
253    
254     xdrop_offset = (xdrop_threshold + match_cost / 2) /
255     (match_cost + mismatch_cost) + 1;
256    
257     // find the offset of the first mismatch between seq1 and seq2
258    
259     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
260     // fence_hit, reverse, rem);
261    
262     // update the extents of the alignment, and bail out
263     // early if no further work is needed
264    
265     seq1_align_len = index;
266     seq2_align_len = index;
267     seq1_index = index;
268     /*
269     seed->start_q = 0;
270     seed->start_s = 0;
271     seed->match_length = index;
272     */
273     longest_match_run = index;
274    
275     if (index == len1 || index == len2) {
276     /* Return the number of differences, which is zero here */
277     if (edit_block != NULL)
278     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
279     edit_block->opRep(index);
280     return 0;
281     }
282    
283     // set up the memory pool
284     mem_pool = aux_data.space;
285     if (edit_block == NULL) {
286     mem_pool = NULL;
287     }
288     else if (mem_pool == NULL) {
289     aux_data.space = mem_pool = new GXMemPool();
290     }
291     else {
292     mem_pool->refresh();
293     }
294    
295     /* set up the array of per-distance maximum scores. There
296     are max_diags + xdrop_offset distances to track, the first
297     xdrop_offset of which are 0 */
298    
299     max_score = aux_data.max_score + xdrop_offset;
300     for (index = 0; index < xdrop_offset; index++)
301     aux_data.max_score[index] = 0;
302    
303     // fill in the initial offsets of the distance matrix
304    
305     last_seq2_off[0][diag_origin] = seq1_index;
306     max_score[0] = seq1_index * match_cost;
307     diag_lower = diag_origin - 1;
308     diag_upper = diag_origin + 1;
309     end1_reached = end2_reached = false;
310    
311     // for each distance
312     for (d = 1; d <= max_dist; d++) {
313     int xdrop_score;
314     int curr_score;
315     int curr_extent = 0;
316     int curr_seq2_index = 0;
317     int curr_diag = 0;
318     int tmp_diag_lower = diag_lower;
319     int tmp_diag_upper = diag_upper;
320    
321     // Assign impossible seq2 offsets to any diagonals that
322     // are not in the range (diag_lower,diag_upper).
323     // These will serve as sentinel values for the inner loop
324     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
325     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
326     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
327     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
328    
329     // compute the score for distance d corresponding to the X-dropoff criterion
330    
331     xdrop_score = max_score[d - xdrop_offset] +
332     (match_cost + mismatch_cost) * d - xdrop_threshold;
333     xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
334    
335     // for each diagonal of interest
336     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
337     /* find the largest offset into seq2 that increases
338     the distance from d-1 to d (i.e. keeps the alignment
339     from getting worse for as long as possible), then
340     choose the offset into seq1 that will keep the
341     resulting diagonal fixed at k
342    
343     Note that this requires kInvalidOffset+1 to be smaller
344     than any valid offset into seq2, i.e. to be negative */
345    
346     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
347     last_seq2_off[d - 1][k ]) + 1;
348     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
349     seq1_index = seq2_index + k - diag_origin;
350    
351     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
352    
353     // if no valid diagonal can reach distance d, or the
354     // X-dropoff test fails, narrow the range of diagonals
355     // to test and skip to the next diagonal
356     if (k == diag_lower)
357     diag_lower++;
358     else
359     last_seq2_off[d][k] = kInvalidOffset;
360     continue;
361     }
362     diag_upper = k;
363    
364     /* slide down diagonal k until a mismatch
365     occurs. As long as only matches are encountered,
366     the current distance d will not change */
367    
368     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
369     seq1_index, seq2_index, reverse);
370     //fence_hit, reverse, rem);
371     if (index > longest_match_run) {
372     //seed->start_q = seq1_index;
373     //seed->start_s = seq2_index;
374     //seed->match_length = index;
375     longest_match_run = index;
376     }
377     seq1_index += index;
378     seq2_index += index;
379    
380     // set the new largest seq2 offset that achieves
381     // distance d on diagonal k
382    
383     last_seq2_off[d][k] = seq2_index;
384    
385     // since all values of k are constrained to have the
386     // same distance d, the value of k which maximizes the
387     // alignment score is the one that covers the most of seq1 and seq2
388     if (seq1_index + seq2_index > curr_extent) {
389     curr_extent = seq1_index + seq2_index;
390     curr_seq2_index = seq2_index;
391     curr_diag = k;
392     }
393    
394     /* clamp the bounds on diagonals to avoid walking off
395     either sequence. Because the bounds increase by at
396     most one for each distance, diag_lower and diag_upper
397     can each be of size at most max_diags+2 */
398    
399     if (seq2_index == len2) {
400     diag_lower = k + 1;
401     end2_reached = true;
402     }
403     if (seq1_index == len1) {
404     diag_upper = k - 1;
405     end1_reached = true;
406     }
407     } // end loop over diagonals
408    
409     // compute the maximum score possible for distance d
410     curr_score = curr_extent * (match_cost / 2) -
411     d * (match_cost + mismatch_cost);
412     // if this is the best score seen so far, update the
413     // statistics of the best alignment
414     if (curr_score > max_score[d - 1]) {
415     max_score[d] = curr_score;
416     best_dist = d;
417     best_diag = curr_diag;
418     seq2_align_len = curr_seq2_index;
419     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
420     }
421     else {
422     max_score[d] = max_score[d - 1];
423     }
424    
425     // alignment has finished if the lower and upper bounds
426     // on diagonals to check have converged to each other
427    
428     if (diag_lower > diag_upper)
429     break;
430    
431     /* set up for the next distance to examine. Because the
432     bounds increase by at most one for each distance,
433     diag_lower and diag_upper can each be of size at
434     most max_diags+2 */
435    
436     if (!end2_reached)
437     diag_lower--;
438     if (!end1_reached)
439     diag_upper++;
440    
441     if (edit_block == NULL) {
442     // if no traceback is specified, the next row of
443     // last_seq2_off can reuse previously allocated memory
444     //TODO FIXME The following assumes two arrays of
445     // at least max_dist+4 int's have already been allocated
446     last_seq2_off[d + 1] = last_seq2_off[d - 1];
447     }
448     else {
449     // traceback requires all rows of last_seq2_off to be saved,
450     // so a new row must be allocated
451     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
452     // move the origin for this row backwards
453     //TODO FIXME: dubious pointer arithmetic ?!
454     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
455     }
456     } // end loop over distinct distances
457    
458    
459     if (edit_block == NULL)
460     return best_dist;
461    
462     //---- perform traceback
463     d = best_dist;
464     seq1_index = seq1_align_len;
465     seq2_index = seq2_align_len;
466     // for all positive distances
467    
468     //if (fence_hit && *fence_hit)
469     // goto done;
470     if (index==len1 || index==len2) d=0;
471     while (d > 0) {
472     int new_diag;
473     int new_seq2_index;
474    
475     /* retrieve the value of the diagonal after the next
476     traceback operation. best_diag starts off with the
477     value computed during the alignment process */
478    
479     new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
480     best_diag, &new_seq2_index);
481    
482     if (new_diag == best_diag) {
483     // same diagonal: issue a group of substitutions
484     if (seq2_index - new_seq2_index > 0) {
485     edit_block->opRep(seq2_index - new_seq2_index);
486     }
487     }
488     else if (new_diag < best_diag) {
489     // smaller diagonal: issue a group of substitutions
490     // and then a gap in seq2 */
491     if (seq2_index - new_seq2_index > 0) {
492     edit_block->opRep(seq2_index - new_seq2_index);
493     }
494     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
495     edit_block->opIns(1);
496     }
497     else {
498     // larger diagonal: issue a group of substitutions
499     // and then a gap in seq1
500     if (seq2_index - new_seq2_index - 1 > 0) {
501     edit_block->opRep(seq2_index - new_seq2_index - 1);
502     }
503     edit_block->opDel(1);
504     }
505     d--;
506     best_diag = new_diag;
507     seq2_index = new_seq2_index;
508     }
509     //done:
510     // handle the final group of substitutions back to distance zero,
511     // i.e. back to offset zero of seq1 and seq2
512     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
513     // last_seq2_off[0][diag_origin]);
514     edit_block->opRep(last_seq2_off[0][diag_origin]);
515     if (!reverse)
516     edit_block->reverse();
517     return best_dist;
518     }
519    
520     void printEditScript(GXEditScript* ed_script) {
521     uint i;
522     if (ed_script==NULL || ed_script->opnum == 0)
523     return;
524     for (i=0; i<ed_script->opnum; i++) {
525     int num=((ed_script->ops[i]) >> 2);
526     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
527     if (op_type == 3)
528     GError("Error: printEditScript encountered op_type 3 ?!\n");
529     GMessage("%d%c ", num, xgapcodes[op_type]);
530     }
531     GMessage("\n");
532     }
533    
534     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
535     bool editscript, int reward, int penalty, int xdrop) {
536     int q_max=strlen(q_seq); //query
537     int s_max=strlen(s_seq); //subj
538     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
539 gpertea 101 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
540 gpertea 93 }
541    
542     struct GXSeedTable {
543     int a_num, b_num;
544     int a_cap, b_cap;
545     char* xc;
546     GXSeedTable(int a=12, int b=255) {
547     a_cap=0;
548     b_cap=0;
549     a_num=0;
550     b_num=0;
551     xc=NULL;
552     init(a,b);
553     }
554     ~GXSeedTable() {
555     GFREE(xc);
556     }
557     void init(int a, int b) {
558     a_num=a;
559     b_num=b;
560     bool resize=false;
561     if (b_num>b_cap) { resize=true; b_cap=b_num;}
562     if (a_num>a_cap) { resize=true; a_cap=a_num;}
563     if (resize) {
564     GFREE(xc);
565     GCALLOC(xc, (a_num*b_num));
566     }
567     else {
568     //just clear up to a_max, b_max
569     memset((void*)xc, 0, (a_num*b_num));
570     }
571     }
572     char& x(int ax, int by) {
573     return xc[by*a_num+ax];
574     }
575    
576     };
577    
578 gpertea 99 const int a_m_score=2; //match score
579     const int a_mis_score=-3; //mismatch
580     const int a_dropoff_score=7;
581 gpertea 101 const int a_min_score=12; //at least 6 bases full match
582 gpertea 93
583 gpertea 99 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
584     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
585     //check minimum score and
586     //for 3' adapter trimming:
587     // require that the right end of the alignment for either the adaptor OR the read must be
588     // < 3 distance from its right end
589     // for 5' adapter trimming:
590     // require that the left end of the alignment for either the adaptor OR the read must
591     // be at coordinate < 3 from start
592    
593     bool extendUngapped(const char* a, int alen, int ai,
594     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
595     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
596     //if (debug) {
597     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
598     // }
599     int a_l=ai; //alignment coordinates on a
600     int a_r=ai+mlen-1;
601     int b_l=bi; //alignment coordinates on b
602     int b_r=bi+mlen-1;
603     int ai_maxscore=ai;
604     int bi_maxscore=bi;
605     int score=mlen*a_m_score;
606     int maxscore=score;
607     int mism5score=a_mis_score;
608     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
609     //try to extend to the left first, if possible
610     while (ai>0 && bi>0) {
611     ai--;
612     bi--;
613     score+= (a[ai]==b[bi])? a_m_score : mism5score;
614     if (score>maxscore) {
615     ai_maxscore=ai;
616     bi_maxscore=bi;
617     maxscore=score;
618     }
619     else if (maxscore-score>a_dropoff_score) break;
620     }
621     a_l=ai_maxscore;
622     b_l=bi_maxscore;
623     //now extend to the right
624     ai_maxscore=a_r;
625     bi_maxscore=b_r;
626     ai=a_r;
627     bi=b_r;
628     score=maxscore;
629     //sometimes there are extra As at the end of the read, ignore those
630     if (a[alen-2]=='A' && a[alen-1]=='A') {
631     alen-=2;
632     while (a[alen-1]=='A' && alen>ai) alen--;
633     }
634     while (ai<alen-1 && bi<blen-1) {
635     ai++;
636     bi++;
637     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
638     if (a[ai]==b[bi]) { //match
639     score+=a_m_score;
640     if (ai>=alen-2) {
641     score+=a_m_score-(alen-ai-1);
642     }
643     }
644     else { //mismatch
645     score+=a_mis_score;
646     }
647     if (score>maxscore) {
648     ai_maxscore=ai;
649     bi_maxscore=bi;
650     maxscore=score;
651     }
652     else if (maxscore-score>a_dropoff_score) break;
653     }
654     a_r=ai_maxscore;
655     b_r=bi_maxscore;
656     int a_ovh3=alen-a_r-1;
657     int b_ovh3=blen-b_r-1;
658     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
659     int mmovh5=(a_l<b_l)? a_l : b_l;
660     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
661     if (a_l<a_ovh3) {
662     //adapter closer to the left end (typical for 5' adapter)
663     l5=a_r+1;
664     l3=alen-1;
665     }
666     else {
667     //adapter matching at the right end (typical for 3' adapter)
668     l5=0;
669     l3=a_l-1;
670     }
671     return true;
672     }
673     //do not trim:
674     l5=0;
675     l3=alen-1;
676     return false;
677     }
678    
679    
680 gpertea 93 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
681     //overlap of right (3') end of seqb
682     //hash the first 12 bases of seqa:
683     int aimin=0;
684     int aimax=GMIN(9,(a_len-4));
685     int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end
686     int bimax=b_len-4;
687     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
688     int b_maxlen=b_len; //number of cols in the diagonal matrix
689     GXSeedTable gx(a_maxlen, b_maxlen);
690     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
691     for (int ai=aimin;ai<=aimax;ai++) {
692     int32* av=(int32 *)(&seqa[ai]);
693     //for (int bi=b_len-4;bi>=bimin;bi--) {
694     for (int bi=bimin;bi<=bimax;bi++) {
695     if (gx.x(ai,bi))
696     continue; //already have a previous seed covering this region of this diagonal
697     int32* bv=(int32 *)(&seqb[bi]);
698     if (*av == *bv) {
699     //word match
700     //see if we can extend to the right
701     gx.x(ai+1,bi+1)=1;
702     gx.x(ai+2,bi+2)=1;
703     gx.x(ai+3,bi+3)=1;
704     int aix=ai+4;
705     int bix=bi+4;
706     int len=4;
707     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
708     {
709     gx.x(aix,bix)=1;
710     aix++;bix++;
711     len++;
712     }
713 gpertea 101 if (len>7) {
714 gpertea 99 //heuristics: very likely the best we can get
715     int qlen=len;
716     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
717     aix++;
718     bix++;
719     qlen++;
720     }
721 gpertea 101 if (qlen>10) { //found a quick match shortcut
722 gpertea 99 diagstrips->qmatch=new GXSeed(ai,bi,qlen);
723     return diagstrips;
724     }
725     }
726 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
727     seeds.Add(newseed);
728     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
729     }
730     }
731     }//for each 4-mer at the beginning of seqa
732     for (int i=0;i<diagstrips->Count();i++) {
733     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
734     }
735     diagstrips->setSorted(true); //sort by score
736     return diagstrips;
737     }
738    
739     GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) {
740     //overlap of left (5') end of seqb
741     //hash the last 12 bases of seqa:
742     int aimin=GMAX(0,(a_len-16));
743     int aimax=a_len-4;
744     int bimin=0;
745     int bimax=GMIN((a_len-2), (b_len-4));
746     int a_maxlen=aimax+4; //number of rows in the diagonal matrix
747     int b_maxlen=b_len; //number of cols in the diagonal matrix
748     //gx.init(a_maxlen, b_maxlen);
749     GXSeedTable gx(a_maxlen, b_maxlen);
750     GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips
751     for (int ai=aimin;ai<=aimax;ai++) {
752     int32* av=(int32 *)(&seqa[ai]);
753     for (int bi=bimin;bi<=bimax;bi++) {
754     if (gx.x(ai,bi))
755     continue; //already have a previous seed covering this region of this diagonal
756     int32* bv=(int32 *)(&seqb[bi]);
757     if (*av == *bv) {
758     //word match
759     //see if we can extend to the right
760     gx.x(ai+1,bi+1)=1;
761     gx.x(ai+2,bi+2)=1;
762     gx.x(ai+3,bi+3)=1;
763     int aix=ai+4;
764     int bix=bi+4;
765     int len=4;
766     while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix])
767     {
768     gx.x(aix,bix)=1;
769     aix++;bix++;
770     len++;
771     }
772 gpertea 101 if (len>7) {
773     //heuristics: very likely the best we can get
774     int qlen=len;
775     while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
776     aix++;
777     bix++;
778     qlen++;
779     }
780     if (qlen>10) { //found a quick match shortcut
781     diagstrips->qmatch=new GXSeed(ai,bi,qlen);
782     return diagstrips;
783     }
784     }
785 gpertea 93 GXSeed* newseed=new GXSeed(ai,bi,len);
786     seeds.Add(newseed);
787     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
788     }
789     }
790     }//for each 4-mer at the end of seqa
791     for (int i=0;i<diagstrips->Count();i++) {
792     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
793     }
794     diagstrips->setSorted(true); //sort by score
795     return diagstrips;
796     }
797    
798     int cmpSeedScore(const pointer p1, const pointer p2) {
799     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
800     GXSeed* s1=(GXSeed*)p1;
801     GXSeed* s2=(GXSeed*)p2;
802     if (s1->len==s2->len) {
803     return (s1->b_ofs-s2->b_ofs);
804     }
805     else return (s2->len-s1->len);
806     }
807    
808     int cmpSeedDiag(const pointer p1, const pointer p2) {
809     GXSeed* s1=(GXSeed*)p1;
810     GXSeed* s2=(GXSeed*)p2;
811     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
812     }
813    
814 gpertea 101
815     int cmpDiagBands_R(const pointer p1, const pointer p2) {
816     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
817     GXBand* b1=(GXBand*)p1;
818     GXBand* b2=(GXBand*)p2;
819     if (b1->score==b2->score) {
820     return (b2->w_min_b-b1->w_min_b);
821     }
822     else return (b2->score-b1->score);
823     }
824    
825    
826    
827 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
828 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
829     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
830     CAlnTrim* trim, bool editscript) {
831 gpertea 93 GXEditScript* ed_script_fwd = NULL;
832     GXEditScript* ed_script_rev = NULL;
833     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
834     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
835     q_alnstart--;
836     s_alnstart--;
837     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
838     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
839     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
840     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
841     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
842     const char* q=q_seq+q_alnstart;
843     int q_avail=q_max-q_alnstart;
844     const char* s=s_seq+s_alnstart;
845     int s_avail=s_max-s_alnstart;
846     if (penalty<0) penalty=-penalty;
847 gpertea 101 int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs
848 gpertea 93 GXAlnInfo* alninfo=NULL;
849     bool freeAlnMem=(gxmem==NULL);
850     if (freeAlnMem) {
851 gpertea 101 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
852 gpertea 93 }
853     else gxmem->reset();
854     int retscore = 0;
855     int numdiffs = 0;
856 gpertea 101 if (trim!=NULL && trim->type==galn_TrimLeft) {
857 gpertea 93 if (editscript)
858     ed_script_rev=new GXEditScript();
859    
860 gpertea 101 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
861 gpertea 93 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
862     //check this extension here and bail out if it's not a good extension
863 gpertea 101 if (s_alnstart+1-s_ext_l>trim->boundary) {
864 gpertea 93 delete ed_script_rev;
865     if (freeAlnMem) delete gxmem;
866     return NULL;
867     }
868    
869     if (editscript)
870     ed_script_fwd=new GXEditScript();
871     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
872     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
873     numdiffs=numdiffs_r+numdiffs_l;
874     //convert num diffs to actual score
875 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
876 gpertea 93 if (editscript)
877     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
878     }
879     else {
880     if (editscript) {
881     ed_script_fwd=new GXEditScript();
882     }
883     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
884     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
885     //check extension here and bail out if not a good right extension
886     //assuming s_max is really at the right end of s_seq
887 gpertea 101 if (trim!=NULL && trim->type==galn_TrimRight && s_alnstart+s_ext_r<trim->boundary) {
888 gpertea 93 delete ed_script_fwd;
889     if (freeAlnMem) delete gxmem;
890     return NULL;
891     }
892     if (editscript)
893     ed_script_rev=new GXEditScript();
894     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
895     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
896     //convert num diffs to actual score
897     numdiffs=numdiffs_r+numdiffs_l;
898 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
899 gpertea 93 if (editscript)
900     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
901     }
902    
903     if (retscore>=MIN_GREEDY_SCORE) {
904     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
905     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
906     alninfo->score=retscore;
907     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
908     #ifdef GDEBUG
909     if (ed_script_rev) {
910     GMessage("Final Edit script ::: ");
911     printEditScript(ed_script_rev);
912     }
913     #endif
914     alninfo->editscript=ed_script_rev;
915     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
916     }
917     else {
918     if (freeAlnMem) delete gxmem;
919     delete ed_script_rev;
920     delete alninfo;
921     alninfo=NULL;
922     }
923     delete ed_script_fwd;
924     return alninfo;
925     }
926 gpertea 96
927 gpertea 101 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
928     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
929     CAlnTrim* trim, bool editscript) {
930     int reward=2;
931     int penalty=3;
932     int xdrop=8;
933     if (gxmem) {
934     reward=gxmem->match_reward;
935     penalty=gxmem->mismatch_penalty;
936     xdrop=gxmem->x_drop;
937     }
938     return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
939     reward, penalty, xdrop, gxmem, trim, editscript);
940     }
941    
942 gpertea 96 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
943 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
944 gpertea 96 bool editscript=false;
945     #ifdef GDEBUG
946     editscript=true;
947     GMessage("==========> matching Right (3') end ========\n");
948     #endif
949 gpertea 101
950     CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
951 gpertea 96 GList<GXSeed> rseeds(true,true,false);
952     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
953 gpertea 101 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
954 gpertea 99 //did we find a shortcut?
955     if (alnbands->qmatch) {
956 gpertea 101 #ifdef GDEBUG
957     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
958     #endif
959     anchor_seeds.Add(alnbands->qmatch);
960     }
961     else {
962     int max_top_bands=5;
963     int top_band_count=0;
964     for (int b=0;b<alnbands->Count();b++) {
965     if (alnbands->Get(b)->score<4) break;
966     #ifdef GDEBUG
967     GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
968     #endif
969     top_band_count++;
970     GXBand& band=*(alnbands->Get(b));
971     band.seeds.setSorted(cmpSeedScore);
972     anchor_seeds.Add(band.seeds.First());
973     band.tested=true;
974     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
975     }
976     #ifdef GDEBUG
977     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
978     #endif
979     }
980 gpertea 99
981 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
982     for (int i=0;i<anchor_seeds.Count();i++) {
983     GXSeed& aseed=*anchor_seeds[i];
984     int a1=aseed.a_ofs+(aseed.len>>1)+1;
985     int a2=aseed.b_ofs+(aseed.len>>1)+1;
986     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
987 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
988     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
989 gpertea 96 galns.AddIfNew(alninfo, true);
990     else delete alninfo;
991     }
992 gpertea 101 //special case: due to the scoring scheme bias towards the 5' end of the read,
993     //also try some seeds at the 3' end
994     if (galns.Count()==0) {
995     alnbands->setSorted(cmpDiagBands_R);
996     int max_top_bands=4;
997     int top_band_count=0;
998     #ifdef GDEBUG
999     GMessage(":::>> Retrying adjusting sort order.\n");
1000     #endif
1001     for (int b=0;b<alnbands->Count();b++) {
1002     if (alnbands->Get(b)->score<4) break;
1003     #ifdef GDEBUG
1004     GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1005     #endif
1006     if (alnbands->Get(b)->tested) continue;
1007     top_band_count++;
1008     GXBand& band=*(alnbands->Get(b));
1009     band.seeds.setSorted(cmpSeedScore);
1010     anchor_seeds.Add(band.seeds.First());
1011     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1012     }
1013     #ifdef GDEBUG
1014     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1015     #endif
1016     for (int i=0;i<anchor_seeds.Count();i++) {
1017     GXSeed& aseed=*anchor_seeds[i];
1018     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1019     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1020     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1021     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1023     galns.AddIfNew(alninfo, true);
1024     else delete alninfo;
1025     }
1026     }
1027 gpertea 96 #ifdef GDEBUG
1028     for (int i=0;i<galns.Count();i++) {
1029     GXAlnInfo* alninfo=galns[i];
1030     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1031     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1032     if (alninfo->gapinfo!=NULL) {
1033     GMessage("Alignment:\n");
1034     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1035     }
1036     }
1037     #endif
1038     //---- done
1039     delete alnbands;
1040     if (galns.Count()) {
1041     GXAlnInfo* bestaln=galns.Shift();
1042     return bestaln;
1043     }
1044     else return NULL;
1045     }
1046    
1047     GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len,
1048 gpertea 101 CGreedyAlignData* gxmem, int min_pid) {
1049 gpertea 96 bool editscript=false;
1050     #ifdef GDEBUG
1051     editscript=true;
1052     GMessage("==========> matching Left (5') end ========\n");
1053     #endif
1054 gpertea 101 CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1055 gpertea 96 GList<GXSeed> rseeds(true,true,false);
1056     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
1057     GList<GXAlnInfo> galns(true,true,false);
1058     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1059    
1060 gpertea 101 if (alnbands->qmatch) {
1061     #ifdef GDEBUG
1062     GMessage("::: Found a quick long match -- using it instead of diagonal bands\n");
1063     #endif
1064     anchor_seeds.Add(alnbands->qmatch);
1065     }
1066     else {
1067     int max_top_bands=5;
1068     int top_band_count=0;
1069     for (int b=0;b<alnbands->Count();b++) {
1070     if (alnbands->Get(b)->score<4) break;
1071     #ifdef GDEBUG
1072     GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1073     #endif
1074     top_band_count++;
1075     GXBand& band=*(alnbands->Get(b));
1076     band.seeds.setSorted(cmpSeedScore);
1077     anchor_seeds.Add(band.seeds.First());
1078     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1079     }
1080     #ifdef GDEBUG
1081     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1082     #endif
1083     }
1084 gpertea 96 for (int i=0;i<anchor_seeds.Count();i++) {
1085     GXSeed& aseed=*anchor_seeds[i];
1086     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1087     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1088     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1089 gpertea 101 seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1090     if (alninfo && alninfo->pid>=min_pid
1091     && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1092 gpertea 96 galns.AddIfNew(alninfo, true);
1093     else delete alninfo;
1094     }
1095    
1096     #ifdef GDEBUG
1097     for (int i=0;i<galns.Count();i++) {
1098     GXAlnInfo* alninfo=galns[i];
1099     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1100     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1101     if (alninfo->gapinfo!=NULL) {
1102     GMessage("Alignment:\n");
1103     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1104     }
1105     }
1106     #endif
1107     //---- done
1108     delete alnbands;
1109     if (galns.Count()) {
1110     GXAlnInfo* bestaln=galns.Shift();
1111     return bestaln;
1112     }
1113     else return NULL;
1114     }