ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 178
Committed: Thu Feb 16 07:10:45 2012 UTC (7 years, 5 months ago) by gpertea
File size: 40982 byte(s)
Log Message:
fqtrim

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8     #ifdef GDEBUG
9     char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56 gpertea 171 }
57 gpertea 93 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68 gpertea 171 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70 gpertea 93 GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96 gpertea 173 *a /= g;
97 gpertea 93 *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104 gpertea 171 uint16 get6mer(char* p) {
105     uint16 r=gdna2bit(p,3);
106 gpertea 173 r <<= 6;
107     r |= gdna2bit(p,3);
108 gpertea 171 return r;
109     }
110 gpertea 93
111 gpertea 173
112     void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 gpertea 174 for (uint16 i=0; i <= slen-6; i++) {
114 gpertea 173 char* p = (char*)(s+i);
115     uint16 v=get6mer(p);
116     if (amers[v]==NULL) {
117     amers[v]=new GVec<uint16>(1);
118     }
119     amers[v]->Add(i);
120     }
121     }
122    
123     GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124     //careful: this is broken if start+5 falls beyond the end of the string!
125     uint16 r=get6mer(start);
126     return amers[r];
127     }
128    
129 gpertea 93 //signal that a diagonal is invalid
130     static const int kInvalidOffset = -2;
131    
132     int s_FindFirstMismatch(const char *seq1, int len1,
133     const char *seq2, int len2,
134     int seq1_index, int seq2_index,
135     //bool &fence_hit,
136     bool reverse) {
137     int start_index = seq1_index;
138     /* Sentry detection here should be relatively inexpensive: The
139     sentry value cannot appear in the query, so detection only
140     needs to be done at exit from the subject-query matching loop.
141     For uncompressed sequences, ambiguities in the query (i.e. seq1)
142     always count as mismatches */
143     if (reverse) {
144     while (seq1_index < len1 && seq2_index < len2 &&
145     //seq1[len1-1 - seq1_index] < 4 &&
146     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
147     ++seq1_index;
148     ++seq2_index;
149     }
150     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
151     //if len2-1-seq2_index<=0) {
152     // fence_hit = true;
153     // }
154     }
155     else { //forward lookup
156 gpertea 171 while (seq1_index < len1 && seq2_index < len2 &&
157 gpertea 93 //seq1[seq1_index] < 4 &&
158     seq1[seq1_index] == seq2[seq2_index]) {
159     ++seq1_index;
160     ++seq2_index;
161     }
162     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
163     //if (seq2_index==len2) {
164     // fence_hit = true;
165     //}
166     }
167     return seq1_index - start_index;
168     }
169    
170    
171    
172     /** During the traceback for a non-affine greedy alignment,
173     compute the diagonal that will result from the next
174     traceback operation
175    
176     @param last_seq2_off Array of offsets into the second sequence;
177     last_seq2_off[d][k] gives the largest offset into
178     the second sequence that lies on diagonal k and
179     has distance d [in]
180     @param d Starting distance [in]
181     @param diag Index of diagonal that produced the starting distance [in]
182     @param seq2_index The offset into the second sequence after the traceback
183     operation has completed [out]
184     @return The diagonal resulting from the next traceback operation
185     being applied
186     */
187     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
188     int diag, int *seq2_index) {
189     // choose the traceback operation that results in the
190     // largest seq2 offset at this point, then compute the
191     // new diagonal that is implied by the operation
192     if (last_seq2_off[d-1][diag-1] >
193     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
194     *seq2_index = last_seq2_off[d-1][diag-1];
195     return diag - 1; // gap in seq2
196     }
197     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
198     *seq2_index = last_seq2_off[d-1][diag];
199     return diag; // match
200     }
201     *seq2_index = last_seq2_off[d-1][diag+1];
202     return diag + 1; // gap in seq1
203     }
204    
205    
206     int GXGreedyExtend(const char* seq1, int len1,
207     const char* seq2, int len2,
208     bool reverse, int xdrop_threshold,
209     int match_cost, int mismatch_cost,
210     int& seq1_align_len, int& seq2_align_len,
211 gpertea 101 CGreedyAlignData& aux_data,
212 gpertea 93 GXEditScript *edit_block) {
213     //GapPrelimEditBlock *edit_block,
214     //bool& fence_hit, SGreedySeed *seed) {
215     int seq1_index;
216     int seq2_index;
217     int index;
218     int d;
219     int k;
220     int diag_lower, diag_upper;
221     int max_dist;
222     int diag_origin;
223     int best_dist;
224     int best_diag;
225     int** last_seq2_off;
226     int* max_score;
227     int xdrop_offset;
228     int longest_match_run;
229     bool end1_reached, end2_reached;
230     GXMemPool* mem_pool;
231 gpertea 171
232 gpertea 93 /* ordinary dynamic programming alignment, for each offset
233     in seq1, walks through offsets in seq2 until an X-dropoff
234 gpertea 171 test fails, saving the best score encountered along
235 gpertea 93 the way. Instead of score, this code tracks the 'distance'
236     (number of mismatches plus number of gaps) between seq1
237     and seq2. Instead of walking through sequence offsets, it
238     walks through diagonals that can achieve a given distance.
239 gpertea 171
240 gpertea 93 Note that in what follows, the numbering of diagonals implies
241 gpertea 171 a dot matrix where increasing seq1 offsets go to the right on
242 gpertea 93 the x axis, and increasing seq2 offsets go up the y axis.
243     The gapped alignment thus proceeds up and to the right in
244     the graph, and diagonals are numbered increasing to the right */
245    
246     best_dist = 0;
247     best_diag = 0;
248    
249     /* set the number of distinct distances the algorithm will
250 gpertea 171 examine in the search for an optimal alignment. The
251     heuristic worst-case running time of the algorithm is
252 gpertea 93 O(max_dist**2 + (len1+len2)); for sequences which are
253     very similar, the average running time will be sig-
254     nificantly better than this */
255    
256     max_dist = GMIN(GREEDY_MAX_COST,
257 gpertea 109 (len2/GREEDY_MAX_COST_FRACTION + 1));
258 gpertea 93
259     /* the main loop assumes that the index of all diagonals is
260 gpertea 171 biased to lie in the middle of allocated bookkeeping
261 gpertea 93 structures */
262    
263     diag_origin = max_dist + 2;
264    
265     // last_seq2_off[d][k] is the largest offset into seq2 that
266     // lies on diagonal k and has distance d
267    
268     last_seq2_off = aux_data.last_seq2_off;
269    
270     /* Instead of tracking the best alignment score and using
271     xdrop_theshold directly, track the best score for each
272     unique distance and use the best score for some previously
273     computed distance to implement the X-dropoff test.
274    
275     xdrop_offset gives the distance backwards in the score
276     array to look */
277    
278 gpertea 171 xdrop_offset = (xdrop_threshold + match_cost / 2) /
279 gpertea 93 (match_cost + mismatch_cost) + 1;
280 gpertea 171
281 gpertea 93 // find the offset of the first mismatch between seq1 and seq2
282    
283     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
284     // fence_hit, reverse, rem);
285    
286     // update the extents of the alignment, and bail out
287     // early if no further work is needed
288    
289     seq1_align_len = index;
290     seq2_align_len = index;
291     seq1_index = index;
292     /*
293     seed->start_q = 0;
294     seed->start_s = 0;
295     seed->match_length = index;
296     */
297     longest_match_run = index;
298    
299     if (index == len1 || index == len2) {
300     /* Return the number of differences, which is zero here */
301     if (edit_block != NULL)
302     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
303     edit_block->opRep(index);
304 gpertea 171 return 0;
305 gpertea 93 }
306    
307     // set up the memory pool
308     mem_pool = aux_data.space;
309     if (edit_block == NULL) {
310     mem_pool = NULL;
311     }
312     else if (mem_pool == NULL) {
313     aux_data.space = mem_pool = new GXMemPool();
314 gpertea 171 }
315 gpertea 93 else {
316     mem_pool->refresh();
317     }
318 gpertea 171
319 gpertea 93 /* set up the array of per-distance maximum scores. There
320     are max_diags + xdrop_offset distances to track, the first
321     xdrop_offset of which are 0 */
322    
323     max_score = aux_data.max_score + xdrop_offset;
324     for (index = 0; index < xdrop_offset; index++)
325     aux_data.max_score[index] = 0;
326 gpertea 171
327 gpertea 93 // fill in the initial offsets of the distance matrix
328    
329     last_seq2_off[0][diag_origin] = seq1_index;
330     max_score[0] = seq1_index * match_cost;
331     diag_lower = diag_origin - 1;
332     diag_upper = diag_origin + 1;
333     end1_reached = end2_reached = false;
334    
335     // for each distance
336     for (d = 1; d <= max_dist; d++) {
337     int xdrop_score;
338     int curr_score;
339     int curr_extent = 0;
340     int curr_seq2_index = 0;
341     int curr_diag = 0;
342     int tmp_diag_lower = diag_lower;
343     int tmp_diag_upper = diag_upper;
344    
345     // Assign impossible seq2 offsets to any diagonals that
346     // are not in the range (diag_lower,diag_upper).
347     // These will serve as sentinel values for the inner loop
348     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
349     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
350     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
351     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
352    
353     // compute the score for distance d corresponding to the X-dropoff criterion
354    
355 gpertea 171 xdrop_score = max_score[d - xdrop_offset] +
356 gpertea 93 (match_cost + mismatch_cost) * d - xdrop_threshold;
357 gpertea 171 xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2));
358 gpertea 93
359     // for each diagonal of interest
360     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
361     /* find the largest offset into seq2 that increases
362     the distance from d-1 to d (i.e. keeps the alignment
363 gpertea 171 from getting worse for as long as possible), then
364 gpertea 93 choose the offset into seq1 that will keep the
365 gpertea 171 resulting diagonal fixed at k
366    
367 gpertea 93 Note that this requires kInvalidOffset+1 to be smaller
368     than any valid offset into seq2, i.e. to be negative */
369    
370     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
371     last_seq2_off[d - 1][k ]) + 1;
372     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
373     seq1_index = seq2_index + k - diag_origin;
374    
375     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
376    
377     // if no valid diagonal can reach distance d, or the
378     // X-dropoff test fails, narrow the range of diagonals
379     // to test and skip to the next diagonal
380     if (k == diag_lower)
381     diag_lower++;
382     else
383     last_seq2_off[d][k] = kInvalidOffset;
384     continue;
385     }
386     diag_upper = k;
387 gpertea 171
388     /* slide down diagonal k until a mismatch
389 gpertea 93 occurs. As long as only matches are encountered,
390     the current distance d will not change */
391    
392     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
393     seq1_index, seq2_index, reverse);
394     //fence_hit, reverse, rem);
395     if (index > longest_match_run) {
396     //seed->start_q = seq1_index;
397     //seed->start_s = seq2_index;
398     //seed->match_length = index;
399     longest_match_run = index;
400     }
401     seq1_index += index;
402     seq2_index += index;
403    
404     // set the new largest seq2 offset that achieves
405     // distance d on diagonal k
406    
407     last_seq2_off[d][k] = seq2_index;
408    
409     // since all values of k are constrained to have the
410     // same distance d, the value of k which maximizes the
411     // alignment score is the one that covers the most of seq1 and seq2
412     if (seq1_index + seq2_index > curr_extent) {
413     curr_extent = seq1_index + seq2_index;
414     curr_seq2_index = seq2_index;
415     curr_diag = k;
416     }
417    
418     /* clamp the bounds on diagonals to avoid walking off
419     either sequence. Because the bounds increase by at
420     most one for each distance, diag_lower and diag_upper
421     can each be of size at most max_diags+2 */
422    
423     if (seq2_index == len2) {
424 gpertea 171 diag_lower = k + 1;
425 gpertea 93 end2_reached = true;
426     }
427     if (seq1_index == len1) {
428 gpertea 171 diag_upper = k - 1;
429 gpertea 93 end1_reached = true;
430     }
431     } // end loop over diagonals
432    
433     // compute the maximum score possible for distance d
434 gpertea 171 curr_score = curr_extent * (match_cost / 2) -
435 gpertea 93 d * (match_cost + mismatch_cost);
436     // if this is the best score seen so far, update the
437     // statistics of the best alignment
438     if (curr_score > max_score[d - 1]) {
439     max_score[d] = curr_score;
440     best_dist = d;
441     best_diag = curr_diag;
442     seq2_align_len = curr_seq2_index;
443     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
444 gpertea 171 }
445 gpertea 93 else {
446     max_score[d] = max_score[d - 1];
447     }
448    
449     // alignment has finished if the lower and upper bounds
450     // on diagonals to check have converged to each other
451    
452     if (diag_lower > diag_upper)
453     break;
454    
455 gpertea 171 /* set up for the next distance to examine. Because the
456     bounds increase by at most one for each distance,
457     diag_lower and diag_upper can each be of size at
458 gpertea 93 most max_diags+2 */
459    
460     if (!end2_reached)
461 gpertea 171 diag_lower--;
462 gpertea 93 if (!end1_reached)
463     diag_upper++;
464    
465     if (edit_block == NULL) {
466     // if no traceback is specified, the next row of
467     // last_seq2_off can reuse previously allocated memory
468     //TODO FIXME The following assumes two arrays of
469     // at least max_dist+4 int's have already been allocated
470     last_seq2_off[d + 1] = last_seq2_off[d - 1];
471     }
472     else {
473     // traceback requires all rows of last_seq2_off to be saved,
474     // so a new row must be allocated
475     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
476     // move the origin for this row backwards
477     //TODO FIXME: dubious pointer arithmetic ?!
478     //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
479     }
480     } // end loop over distinct distances
481    
482 gpertea 171
483 gpertea 93 if (edit_block == NULL)
484     return best_dist;
485    
486     //---- perform traceback
487 gpertea 171 d = best_dist;
488 gpertea 93 seq1_index = seq1_align_len;
489     seq2_index = seq2_align_len;
490     // for all positive distances
491    
492     //if (fence_hit && *fence_hit)
493     // goto done;
494     if (index==len1 || index==len2) d=0;
495     while (d > 0) {
496     int new_diag;
497     int new_seq2_index;
498    
499     /* retrieve the value of the diagonal after the next
500     traceback operation. best_diag starts off with the
501     value computed during the alignment process */
502    
503 gpertea 171 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
504 gpertea 93 best_diag, &new_seq2_index);
505    
506     if (new_diag == best_diag) {
507     // same diagonal: issue a group of substitutions
508     if (seq2_index - new_seq2_index > 0) {
509     edit_block->opRep(seq2_index - new_seq2_index);
510     }
511 gpertea 171 }
512 gpertea 93 else if (new_diag < best_diag) {
513     // smaller diagonal: issue a group of substitutions
514     // and then a gap in seq2 */
515     if (seq2_index - new_seq2_index > 0) {
516     edit_block->opRep(seq2_index - new_seq2_index);
517     }
518     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
519     edit_block->opIns(1);
520 gpertea 171 }
521 gpertea 93 else {
522     // larger diagonal: issue a group of substitutions
523     // and then a gap in seq1
524     if (seq2_index - new_seq2_index - 1 > 0) {
525     edit_block->opRep(seq2_index - new_seq2_index - 1);
526     }
527     edit_block->opDel(1);
528     }
529 gpertea 171 d--;
530     best_diag = new_diag;
531     seq2_index = new_seq2_index;
532 gpertea 93 }
533     //done:
534     // handle the final group of substitutions back to distance zero,
535     // i.e. back to offset zero of seq1 and seq2
536     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
537     // last_seq2_off[0][diag_origin]);
538     edit_block->opRep(last_seq2_off[0][diag_origin]);
539     if (!reverse)
540     edit_block->reverse();
541     return best_dist;
542     }
543    
544     void printEditScript(GXEditScript* ed_script) {
545     uint i;
546     if (ed_script==NULL || ed_script->opnum == 0)
547     return;
548     for (i=0; i<ed_script->opnum; i++) {
549     int num=((ed_script->ops[i]) >> 2);
550     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
551     if (op_type == 3)
552     GError("Error: printEditScript encountered op_type 3 ?!\n");
553 gpertea 171 GMessage("%d%c ", num, xgapcodes[op_type]);
554 gpertea 93 }
555     GMessage("\n");
556     }
557    
558     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
559     bool editscript, int reward, int penalty, int xdrop) {
560     int q_max=strlen(q_seq); //query
561     int s_max=strlen(s_seq); //subj
562     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
563 gpertea 101 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
564 gpertea 93 }
565    
566     struct GXSeedTable {
567     int a_num, b_num;
568     int a_cap, b_cap;
569     char* xc;
570     GXSeedTable(int a=12, int b=255) {
571     a_cap=0;
572     b_cap=0;
573     a_num=0;
574     b_num=0;
575     xc=NULL;
576     init(a,b);
577     }
578     ~GXSeedTable() {
579     GFREE(xc);
580     }
581     void init(int a, int b) {
582     a_num=a;
583     b_num=b;
584     bool resize=false;
585     if (b_num>b_cap) { resize=true; b_cap=b_num;}
586     if (a_num>a_cap) { resize=true; a_cap=a_num;}
587     if (resize) {
588     GFREE(xc);
589     GCALLOC(xc, (a_num*b_num));
590     }
591     else {
592     //just clear up to a_max, b_max
593     memset((void*)xc, 0, (a_num*b_num));
594     }
595     }
596     char& x(int ax, int by) {
597     return xc[by*a_num+ax];
598     }
599    
600     };
601    
602 gpertea 99 const int a_m_score=2; //match score
603     const int a_mis_score=-3; //mismatch
604     const int a_dropoff_score=7;
605 gpertea 101 const int a_min_score=12; //at least 6 bases full match
606 gpertea 93
607 gpertea 99 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
608     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
609     //check minimum score and
610     //for 3' adapter trimming:
611     // require that the right end of the alignment for either the adaptor OR the read must be
612     // < 3 distance from its right end
613     // for 5' adapter trimming:
614     // require that the left end of the alignment for either the adaptor OR the read must
615     // be at coordinate < 3 from start
616    
617     bool extendUngapped(const char* a, int alen, int ai,
618     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
619     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
620     //if (debug) {
621     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
622     // }
623     int a_l=ai; //alignment coordinates on a
624     int a_r=ai+mlen-1;
625     int b_l=bi; //alignment coordinates on b
626     int b_r=bi+mlen-1;
627     int ai_maxscore=ai;
628     int bi_maxscore=bi;
629     int score=mlen*a_m_score;
630     int maxscore=score;
631     int mism5score=a_mis_score;
632     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
633     //try to extend to the left first, if possible
634     while (ai>0 && bi>0) {
635     ai--;
636     bi--;
637     score+= (a[ai]==b[bi])? a_m_score : mism5score;
638     if (score>maxscore) {
639     ai_maxscore=ai;
640     bi_maxscore=bi;
641     maxscore=score;
642     }
643     else if (maxscore-score>a_dropoff_score) break;
644     }
645     a_l=ai_maxscore;
646     b_l=bi_maxscore;
647     //now extend to the right
648     ai_maxscore=a_r;
649     bi_maxscore=b_r;
650     ai=a_r;
651     bi=b_r;
652     score=maxscore;
653     //sometimes there are extra As at the end of the read, ignore those
654     if (a[alen-2]=='A' && a[alen-1]=='A') {
655     alen-=2;
656     while (a[alen-1]=='A' && alen>ai) alen--;
657     }
658     while (ai<alen-1 && bi<blen-1) {
659     ai++;
660     bi++;
661     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
662     if (a[ai]==b[bi]) { //match
663     score+=a_m_score;
664     if (ai>=alen-2) {
665     score+=a_m_score-(alen-ai-1);
666     }
667     }
668     else { //mismatch
669     score+=a_mis_score;
670     }
671     if (score>maxscore) {
672     ai_maxscore=ai;
673     bi_maxscore=bi;
674     maxscore=score;
675     }
676     else if (maxscore-score>a_dropoff_score) break;
677     }
678     a_r=ai_maxscore;
679     b_r=bi_maxscore;
680     int a_ovh3=alen-a_r-1;
681     int b_ovh3=blen-b_r-1;
682     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
683     int mmovh5=(a_l<b_l)? a_l : b_l;
684     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
685     if (a_l<a_ovh3) {
686     //adapter closer to the left end (typical for 5' adapter)
687     l5=a_r+1;
688     l3=alen-1;
689     }
690     else {
691     //adapter matching at the right end (typical for 3' adapter)
692     l5=0;
693     l3=a_l-1;
694     }
695     return true;
696     }
697     //do not trim:
698     l5=0;
699     l3=alen-1;
700     return false;
701     }
702    
703 gpertea 178 /*
704 gpertea 176 GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd) {
705     int bimin=GMAX(0,(sd.blen-sd.alen-6));
706     GXSeedTable gx(sd.alen, sd.blen);
707     GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
708     //for (int bi=0;bi<=b_len-6;bi++) {
709     for (int bi=sd.blen-6;bi>=0;bi--) {
710 gpertea 174 //for each 6-mer of seqb
711 gpertea 176 uint16 bv = get6mer((char*)&(sd.bseq[bi]));
712     GVec<uint16>* alocs = sd.amers[bv];
713 gpertea 174 if (alocs==NULL) continue;
714     //extend each hit
715     for (int h=0;h<alocs->Count();h++) {
716     int ai=alocs->Get(h);
717     //word match
718     if (gx.x(ai,bi))
719     //already have a previous seed covering this region of this diagonal
720     continue;
721     for (int i=0;i<6;i++)
722     gx.x(ai+i,bi+i)=1;
723 gpertea 176 //see if we can extend to the left
724     int aix=ai-1;
725     int bix=bi-1;
726 gpertea 174 int len=6;
727 gpertea 176 while (bix>=0 && aix>=0 && sd.aseq[aix]==sd.bseq[bix]) {
728 gpertea 174 gx.x(aix,bix)=1;
729 gpertea 176 ai=aix;
730     bi=bix;
731     aix--;bix--;
732 gpertea 174 len++;
733     }
734 gpertea 176 if (len>sd.amlen) {
735     //heuristics: likely the best we can get
736 gpertea 174 //quick match shortcut
737     diagstrips->qmatch=new GXSeed(ai,bi,len);
738     return diagstrips;
739 gpertea 176 }
740 gpertea 174 if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
741     GXSeed* newseed=new GXSeed(ai,bi,len);
742     seeds.Add(newseed);
743     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
744     //special last resort terminal match to be used if no better alignment is there
745 gpertea 176 if (ai<2 && bi+len>sd.blen-2 &&
746 gpertea 174 (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
747     }
748     }
749 gpertea 93 for (int i=0;i<diagstrips->Count();i++) {
750     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
751     }
752     diagstrips->setSorted(true); //sort by score
753     return diagstrips;
754     }
755 gpertea 178 */
756     GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd) {
757 gpertea 176 int bimax=GMIN((sd.alen+2), (sd.blen-6));
758 gpertea 178 int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
759 gpertea 93 //gx.init(a_maxlen, b_maxlen);
760 gpertea 176 GXSeedTable gx(sd.alen, sd.blen);
761     GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
762     for (int bi=0;bi<=sd.blen-6;bi++) {
763 gpertea 174 //for each 6-mer of seqb
764 gpertea 176 uint16 bv = get6mer((char*) & (sd.bseq[bi]));
765     GVec<uint16>* alocs = sd.amers[bv];
766 gpertea 174 if (alocs==NULL) continue;
767     //extend each hit
768     for (int h=0;h<alocs->Count();h++) {
769     int ai=alocs->Get(h);
770     //word match
771     if (gx.x(ai,bi))
772     //already have a previous seed covering this region of this diagonal
773     continue;
774     for (int i=0;i<6;i++)
775     gx.x(ai+i,bi+i)=1;
776     //see if we can extend to the right
777     int aix=ai+6;
778     int bix=bi+6;
779     int len=6;
780 gpertea 176 while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
781 gpertea 174 gx.x(aix,bix)=1;
782     aix++;bix++;
783     len++;
784     }
785 gpertea 176 if (len>sd.amlen) {
786 gpertea 174 //heuristics: very likely the best we can get
787     //quick match shortcut
788     diagstrips->qmatch=new GXSeed(ai,bi,len);
789     return diagstrips;
790 gpertea 176 }
791 gpertea 174 if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
792 gpertea 178 if (bi<bimin && len<9) continue; //collectSeeds_R
793    
794 gpertea 174 GXSeed* newseed=new GXSeed(ai,bi,len);
795     seeds.Add(newseed);
796     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
797     //special last resort terminal match to be used if no better alignment is there
798 gpertea 176 if (bi<2 && ai+len>=sd.alen-1 &&
799 gpertea 178 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
800     diagstrips->tmatch_l=newseed;
801     //collectSeeds_R:
802     if (ai<2 && bi+len>sd.blen-2 &&
803     (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
804     diagstrips->tmatch_r=newseed;
805 gpertea 174 }
806     } //for each 6-mer of the read
807 gpertea 93 for (int i=0;i<diagstrips->Count();i++) {
808     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
809     }
810     diagstrips->setSorted(true); //sort by score
811     return diagstrips;
812     }
813    
814     int cmpSeedScore(const pointer p1, const pointer p2) {
815     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
816     GXSeed* s1=(GXSeed*)p1;
817     GXSeed* s2=(GXSeed*)p2;
818     if (s1->len==s2->len) {
819     return (s1->b_ofs-s2->b_ofs);
820     }
821     else return (s2->len-s1->len);
822     }
823    
824 gpertea 174 int cmpSeedScore_R(const pointer p1, const pointer p2) {
825     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
826     GXSeed* s1=(GXSeed*)p1;
827     GXSeed* s2=(GXSeed*)p2;
828     if (s1->len==s2->len) {
829     return (s2->b_ofs-s1->b_ofs);
830     }
831     else return (s2->len-s1->len);
832     }
833    
834    
835 gpertea 93 int cmpSeedDiag(const pointer p1, const pointer p2) {
836     GXSeed* s1=(GXSeed*)p1;
837     GXSeed* s2=(GXSeed*)p2;
838     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
839     }
840    
841 gpertea 101
842     int cmpDiagBands_R(const pointer p1, const pointer p2) {
843     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
844     GXBand* b1=(GXBand*)p1;
845     GXBand* b2=(GXBand*)p2;
846     if (b1->score==b2->score) {
847     return (b2->w_min_b-b1->w_min_b);
848     }
849     else return (b2->score-b1->score);
850     }
851    
852    
853    
854 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
855 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
856     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
857     CAlnTrim* trim, bool editscript) {
858 gpertea 93 GXEditScript* ed_script_fwd = NULL;
859     GXEditScript* ed_script_rev = NULL;
860     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
861     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
862     q_alnstart--;
863     s_alnstart--;
864     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
865     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
866     if (q_seq[q_alnstart]!=s_seq[s_alnstart])
867     GError("GreedyAlign() Error: improper anchor (mismatch)!\n");
868     int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
869     const char* q=q_seq+q_alnstart;
870     int q_avail=q_max-q_alnstart;
871     const char* s=s_seq+s_alnstart;
872     int s_avail=s_max-s_alnstart;
873     if (penalty<0) penalty=-penalty;
874 gpertea 178 int MIN_GREEDY_SCORE=6*reward; //minimum score for an alignment to be reported for 0 diffs
875 gpertea 93 GXAlnInfo* alninfo=NULL;
876     bool freeAlnMem=(gxmem==NULL);
877     if (freeAlnMem) {
878 gpertea 101 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
879 gpertea 93 }
880     else gxmem->reset();
881     int retscore = 0;
882     int numdiffs = 0;
883 gpertea 101 if (trim!=NULL && trim->type==galn_TrimLeft) {
884 gpertea 176 //intent: trimming the left side
885 gpertea 93 if (editscript)
886     ed_script_rev=new GXEditScript();
887    
888 gpertea 101 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
889 gpertea 93 reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890     //check this extension here and bail out if it's not a good extension
891 gpertea 178 if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
892     q_alnstart+1-q_ext_l>trim->boundary &&
893     s_alnstart+1-s_ext_l>trim->boundary) {
894 gpertea 93 delete ed_script_rev;
895     if (freeAlnMem) delete gxmem;
896     return NULL;
897     }
898    
899     if (editscript)
900     ed_script_fwd=new GXEditScript();
901     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
902     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
903     numdiffs=numdiffs_r+numdiffs_l;
904     //convert num diffs to actual score
905 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
906 gpertea 93 if (editscript)
907     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
908     }
909     else {
910     if (editscript) {
911     ed_script_fwd=new GXEditScript();
912     }
913     int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, xdrop,
914     reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
915     //check extension here and bail out if not a good right extension
916     //assuming s_max is really at the right end of s_seq
917 gpertea 109 if (trim!=NULL && trim->type==galn_TrimRight &&
918 gpertea 176 s_ext_r+(trim->seedlen>>1) < trim->safelen &&
919 gpertea 178 q_alnstart+q_ext_r<q_max-3 &&
920 gpertea 176 s_alnstart+s_ext_r<trim->boundary) {
921 gpertea 93 delete ed_script_fwd;
922     if (freeAlnMem) delete gxmem;
923     return NULL;
924     }
925     if (editscript)
926     ed_script_rev=new GXEditScript();
927     int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
928     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
929     //convert num diffs to actual score
930     numdiffs=numdiffs_r+numdiffs_l;
931 gpertea 101 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty);
932 gpertea 93 if (editscript)
933     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
934     }
935    
936     if (retscore>=MIN_GREEDY_SCORE) {
937     alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l,q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
938     int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
939     alninfo->score=retscore;
940     alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
941     #ifdef GDEBUG
942 gpertea 104 //if (ed_script_rev) {
943     // GMessage("Final Edit script ::: ");
944     // printEditScript(ed_script_rev);
945     // }
946 gpertea 93 #endif
947     alninfo->editscript=ed_script_rev;
948     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
949     }
950     else {
951     if (freeAlnMem) delete gxmem;
952     delete ed_script_rev;
953     delete alninfo;
954     alninfo=NULL;
955     }
956     delete ed_script_fwd;
957     return alninfo;
958     }
959 gpertea 96
960 gpertea 101 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
961     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
962     CAlnTrim* trim, bool editscript) {
963     int reward=2;
964     int penalty=3;
965     int xdrop=8;
966     if (gxmem) {
967     reward=gxmem->match_reward;
968     penalty=gxmem->mismatch_penalty;
969     xdrop=gxmem->x_drop;
970     }
971     return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
972     reward, penalty, xdrop, gxmem, trim, editscript);
973     }
974    
975 gpertea 178 GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
976 gpertea 96 bool editscript=false;
977     #ifdef GDEBUG
978     editscript=true;
979 gpertea 176 GMessage("==========> matching Right (3') end : %s\n", sd.aseq);
980 gpertea 96 #endif
981 gpertea 176 CAlnTrim trimInfo(galn_TrimRight, sd.bseq, sd.blen, sd.amlen);
982     GList<GXSeed> rseeds(true,true,false);
983 gpertea 101
984 gpertea 178 GXBandSet* alnbands=collectSeeds(rseeds, sd);
985 gpertea 101 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
986 gpertea 99 //did we find a shortcut?
987     if (alnbands->qmatch) {
988 gpertea 174 #ifdef GDEBUG
989     GMessage("::: Found a quick long match at %d, len %d\n",
990     alnbands->qmatch->b_ofs, alnbands->qmatch->len);
991     #endif
992 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
993     }
994     else {
995     int max_top_bands=5;
996     int top_band_count=0;
997     for (int b=0;b<alnbands->Count();b++) {
998 gpertea 174 if (alnbands->Get(b)->score<6) break;
999 gpertea 104 //#ifdef GDEBUG
1000     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1001     //#endif
1002 gpertea 101 top_band_count++;
1003     GXBand& band=*(alnbands->Get(b));
1004     band.seeds.setSorted(cmpSeedScore);
1005     anchor_seeds.Add(band.seeds.First());
1006     band.tested=true;
1007     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1008     }
1009 gpertea 104 //#ifdef GDEBUG
1010     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1011     //#endif
1012 gpertea 101 }
1013 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
1014     for (int i=0;i<anchor_seeds.Count();i++) {
1015     GXSeed& aseed=*anchor_seeds[i];
1016     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1017     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1018 gpertea 176 trimInfo.seedlen=aseed.len;
1019 gpertea 178 #ifdef GDEBUG
1020     GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1021     aseed.len);
1022     #endif
1023 gpertea 176 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1024     sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1025 gpertea 174 if (alninfo && alninfo->pid>=min_pid &&
1026 gpertea 176 trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1027 gpertea 96 galns.AddIfNew(alninfo, true);
1028     else delete alninfo;
1029     }
1030 gpertea 178 if (galns.Count()==0 && alnbands->tmatch_r) {
1031 gpertea 174 //last resort seed
1032 gpertea 178 GXSeed& aseed=*alnbands->tmatch_r;
1033 gpertea 176 int halfseed=aseed.len>>1;
1034     int a1=aseed.a_ofs+halfseed+1;
1035     int a2=aseed.b_ofs+halfseed+1;
1036     trimInfo.seedlen=aseed.len;
1037 gpertea 178 #ifdef GDEBUG
1038     GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1039     aseed.len);
1040     #endif
1041 gpertea 176 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1042     sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1043 gpertea 174 if (alninfo && alninfo->pid>=min_pid &&
1044 gpertea 176 trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1045 gpertea 174 galns.AddIfNew(alninfo, true);
1046     else delete alninfo;
1047     }
1048    
1049     /*
1050 gpertea 104 //special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read,
1051     //we should also try some seeds closer to the 3' end
1052 gpertea 101 if (galns.Count()==0) {
1053 gpertea 104 anchor_seeds.Clear();
1054 gpertea 101 alnbands->setSorted(cmpDiagBands_R);
1055     int max_top_bands=4;
1056     int top_band_count=0;
1057 gpertea 104 //#ifdef GDEBUG
1058     //GMessage(":::>> Retrying adjusting sort order.\n");
1059     //#endif
1060     if (alnbands->tmatch) {
1061     //anchor_seeds.setSorted(false);
1062     anchor_seeds.Add(alnbands->tmatch);
1063     }
1064 gpertea 101 for (int b=0;b<alnbands->Count();b++) {
1065     if (alnbands->Get(b)->score<4) break;
1066 gpertea 104 //#ifdef GDEBUG
1067     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1068     //#endif
1069 gpertea 101 if (alnbands->Get(b)->tested) continue;
1070     top_band_count++;
1071     GXBand& band=*(alnbands->Get(b));
1072     band.seeds.setSorted(cmpSeedScore);
1073     anchor_seeds.Add(band.seeds.First());
1074     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1075     }
1076 gpertea 104 //#ifdef GDEBUG
1077     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1078     //#endif
1079 gpertea 101 for (int i=0;i<anchor_seeds.Count();i++) {
1080     GXSeed& aseed=*anchor_seeds[i];
1081     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1082     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1083     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1084     seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1085     if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1086     galns.AddIfNew(alninfo, true);
1087     else delete alninfo;
1088     }
1089 gpertea 174 } */
1090    
1091 gpertea 96 //---- done
1092     delete alnbands;
1093     if (galns.Count()) {
1094     GXAlnInfo* bestaln=galns.Shift();
1095 gpertea 104 #ifdef GDEBUG
1096     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1097     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1098     if (bestaln->gapinfo!=NULL) {
1099 gpertea 176 bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1100 gpertea 104 }
1101     #endif
1102    
1103 gpertea 96 return bestaln;
1104     }
1105     else return NULL;
1106     }
1107    
1108 gpertea 176 GXAlnInfo* match_LeftEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1109 gpertea 96 bool editscript=false;
1110     #ifdef GDEBUG
1111     editscript=true;
1112 gpertea 176 GMessage("==========> matching Left (5') end : %s\n", sd.aseq);
1113 gpertea 96 #endif
1114 gpertea 176 CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.amlen);
1115 gpertea 96 GList<GXSeed> rseeds(true,true,false);
1116 gpertea 178 GXBandSet* alnbands = collectSeeds(rseeds, sd);
1117 gpertea 96 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1118 gpertea 101 if (alnbands->qmatch) {
1119 gpertea 174 #ifdef GDEBUG
1120     GMessage("::: Found a quick long match at %d, len %d\n",
1121     alnbands->qmatch->b_ofs, alnbands->qmatch->len);
1122     #endif
1123 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
1124     }
1125     else {
1126     int max_top_bands=5;
1127     int top_band_count=0;
1128     for (int b=0;b<alnbands->Count();b++) {
1129 gpertea 174 if (alnbands->Get(b)->score<6) break;
1130 gpertea 104 //#ifdef GDEBUG
1131     //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
1132     //#endif
1133 gpertea 101 top_band_count++;
1134     GXBand& band=*(alnbands->Get(b));
1135     band.seeds.setSorted(cmpSeedScore);
1136     anchor_seeds.Add(band.seeds.First());
1137 gpertea 174 band.tested=true;
1138 gpertea 101 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
1139     }
1140 gpertea 104 //#ifdef GDEBUG
1141     //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
1142     //#endif
1143 gpertea 101 }
1144 gpertea 174 GList<GXAlnInfo> galns(true,true,false);
1145     for (int i=0;i<anchor_seeds.Count();i++) {
1146 gpertea 96 GXSeed& aseed=*anchor_seeds[i];
1147     int a1=aseed.a_ofs+(aseed.len>>1)+1;
1148     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1149 gpertea 176 trimInfo.seedlen=aseed.len;
1150 gpertea 178 #ifdef GDEBUG
1151     GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1152     aseed.len);
1153     #endif
1154 gpertea 176 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1155     sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1156 gpertea 101 if (alninfo && alninfo->pid>=min_pid
1157 gpertea 176 && trimInfo.validate(alninfo->sl, alninfo->sr,
1158     alninfo->pid, sd.alen-alninfo->qr))
1159 gpertea 96 galns.AddIfNew(alninfo, true);
1160     else delete alninfo;
1161     }
1162 gpertea 178 if (galns.Count()==0 && alnbands->tmatch_l) {
1163 gpertea 174 //last resort seed
1164 gpertea 178 GXSeed& aseed=*alnbands->tmatch_l;
1165 gpertea 104 int a1=aseed.a_ofs+(aseed.len>>1)+1;
1166     int a2=aseed.b_ofs+(aseed.len>>1)+1;
1167 gpertea 176 trimInfo.seedlen=aseed.len;
1168     GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1169     sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1170 gpertea 174 if (alninfo && alninfo->pid>=min_pid &&
1171 gpertea 176 trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, sd.alen-alninfo->qr))
1172 gpertea 174 galns.Add(alninfo);
1173 gpertea 104 }
1174 gpertea 174 /*
1175 gpertea 96 #ifdef GDEBUG
1176 gpertea 174 //print valid alignments found
1177 gpertea 96 for (int i=0;i<galns.Count();i++) {
1178     GXAlnInfo* alninfo=galns[i];
1179     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1180     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1181     if (alninfo->gapinfo!=NULL) {
1182     GMessage("Alignment:\n");
1183     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1184     }
1185     }
1186 gpertea 174 #endif
1187 gpertea 104 */
1188 gpertea 96 //---- done
1189     delete alnbands;
1190     if (galns.Count()) {
1191     GXAlnInfo* bestaln=galns.Shift();
1192 gpertea 104 #ifdef GDEBUG
1193     GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1194     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1195     if (bestaln->gapinfo!=NULL) {
1196 gpertea 176 bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen,
1197     sd.bseq, sd.blen);
1198 gpertea 104 }
1199     #endif
1200 gpertea 96 return bestaln;
1201     }
1202     else return NULL;
1203     }