53 |
|
if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) { |
54 |
|
*row1 = flast_d[d-1][diag-1]; |
55 |
|
return diag-1; |
56 |
< |
} |
56 |
> |
} |
57 |
|
if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) { |
58 |
|
*row1 = flast_d[d-1][diag]; |
59 |
|
return diag; |
65 |
|
void GapXEditScript::print() { //debug |
66 |
|
GapXEditScript* p=this; |
67 |
|
do { |
68 |
< |
GMessage("%d%c ",p->num, xgapcodes[p->op_type]); |
69 |
< |
} while ((p=p->next)!=NULL); |
68 |
> |
GMessage("%d%c ",p->num, xgapcodes[p->op_type]); |
69 |
> |
} while ((p=p->next)!=NULL); |
70 |
|
GMessage("\n"); |
71 |
|
} |
72 |
|
|
93 |
|
else |
94 |
|
g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c)); |
95 |
|
if (g > 1) { |
96 |
< |
*a /= g; |
96 |
> |
*a /= g; |
97 |
|
*b /= g; |
98 |
|
*c /= g; |
99 |
|
} |
101 |
|
} |
102 |
|
|
103 |
|
|
104 |
+ |
uint16 get6mer(char* p) { |
105 |
+ |
uint16 r=gdna2bit(p,3); |
106 |
+ |
r <<= 6; |
107 |
+ |
r |= gdna2bit(p,3); |
108 |
+ |
return r; |
109 |
+ |
} |
110 |
+ |
|
111 |
+ |
|
112 |
+ |
void table6mers(const char* s, int slen, GVec<uint16>* amers[]) { |
113 |
+ |
for (uint16 i=0; i <= slen-6; i++) { |
114 |
+ |
char* p = (char*)(s+i); |
115 |
+ |
uint16 v=get6mer(p); |
116 |
+ |
if (amers[v]==NULL) { |
117 |
+ |
amers[v]=new GVec<uint16>(1); |
118 |
+ |
} |
119 |
+ |
amers[v]->Add(i); |
120 |
+ |
} |
121 |
+ |
} |
122 |
+ |
|
123 |
+ |
GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) { |
124 |
+ |
//careful: this is broken if start+5 falls beyond the end of the string! |
125 |
+ |
uint16 r=get6mer(start); |
126 |
+ |
return amers[r]; |
127 |
+ |
} |
128 |
|
|
129 |
|
//signal that a diagonal is invalid |
130 |
|
static const int kInvalidOffset = -2; |
153 |
|
// } |
154 |
|
} |
155 |
|
else { //forward lookup |
156 |
< |
while (seq1_index < len1 && seq2_index < len2 && |
156 |
> |
while (seq1_index < len1 && seq2_index < len2 && |
157 |
|
//seq1[seq1_index] < 4 && |
158 |
|
seq1[seq1_index] == seq2[seq2_index]) { |
159 |
|
++seq1_index; |
208 |
|
bool reverse, int xdrop_threshold, |
209 |
|
int match_cost, int mismatch_cost, |
210 |
|
int& seq1_align_len, int& seq2_align_len, |
211 |
< |
SGreedyAlignMem& aux_data, |
211 |
> |
CGreedyAlignData& aux_data, |
212 |
|
GXEditScript *edit_block) { |
213 |
|
//GapPrelimEditBlock *edit_block, |
214 |
|
//bool& fence_hit, SGreedySeed *seed) { |
228 |
|
int longest_match_run; |
229 |
|
bool end1_reached, end2_reached; |
230 |
|
GXMemPool* mem_pool; |
231 |
< |
|
231 |
> |
|
232 |
|
/* ordinary dynamic programming alignment, for each offset |
233 |
|
in seq1, walks through offsets in seq2 until an X-dropoff |
234 |
< |
test fails, saving the best score encountered along |
234 |
> |
test fails, saving the best score encountered along |
235 |
|
the way. Instead of score, this code tracks the 'distance' |
236 |
|
(number of mismatches plus number of gaps) between seq1 |
237 |
|
and seq2. Instead of walking through sequence offsets, it |
238 |
|
walks through diagonals that can achieve a given distance. |
239 |
< |
|
239 |
> |
|
240 |
|
Note that in what follows, the numbering of diagonals implies |
241 |
< |
a dot matrix where increasing seq1 offsets go to the right on |
241 |
> |
a dot matrix where increasing seq1 offsets go to the right on |
242 |
|
the x axis, and increasing seq2 offsets go up the y axis. |
243 |
|
The gapped alignment thus proceeds up and to the right in |
244 |
|
the graph, and diagonals are numbered increasing to the right */ |
247 |
|
best_diag = 0; |
248 |
|
|
249 |
|
/* set the number of distinct distances the algorithm will |
250 |
< |
examine in the search for an optimal alignment. The |
251 |
< |
heuristic worst-case running time of the algorithm is |
250 |
> |
examine in the search for an optimal alignment. The |
251 |
> |
heuristic worst-case running time of the algorithm is |
252 |
|
O(max_dist**2 + (len1+len2)); for sequences which are |
253 |
|
very similar, the average running time will be sig- |
254 |
|
nificantly better than this */ |
255 |
|
|
256 |
|
max_dist = GMIN(GREEDY_MAX_COST, |
257 |
< |
len2 / GREEDY_MAX_COST_FRACTION + 1); |
257 |
> |
(len2/GREEDY_MAX_COST_FRACTION + 1)); |
258 |
|
|
259 |
|
/* the main loop assumes that the index of all diagonals is |
260 |
< |
biased to lie in the middle of allocated bookkeeping |
260 |
> |
biased to lie in the middle of allocated bookkeeping |
261 |
|
structures */ |
262 |
|
|
263 |
|
diag_origin = max_dist + 2; |
275 |
|
xdrop_offset gives the distance backwards in the score |
276 |
|
array to look */ |
277 |
|
|
278 |
< |
xdrop_offset = (xdrop_threshold + match_cost / 2) / |
278 |
> |
xdrop_offset = (xdrop_threshold + match_cost / 2) / |
279 |
|
(match_cost + mismatch_cost) + 1; |
280 |
< |
|
280 |
> |
|
281 |
|
// find the offset of the first mismatch between seq1 and seq2 |
282 |
|
|
283 |
|
index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse); |
301 |
|
if (edit_block != NULL) |
302 |
|
//GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index); |
303 |
|
edit_block->opRep(index); |
304 |
< |
return 0; |
304 |
> |
return 0; |
305 |
|
} |
306 |
|
|
307 |
|
// set up the memory pool |
311 |
|
} |
312 |
|
else if (mem_pool == NULL) { |
313 |
|
aux_data.space = mem_pool = new GXMemPool(); |
314 |
< |
} |
314 |
> |
} |
315 |
|
else { |
316 |
|
mem_pool->refresh(); |
317 |
|
} |
318 |
< |
|
318 |
> |
|
319 |
|
/* set up the array of per-distance maximum scores. There |
320 |
|
are max_diags + xdrop_offset distances to track, the first |
321 |
|
xdrop_offset of which are 0 */ |
323 |
|
max_score = aux_data.max_score + xdrop_offset; |
324 |
|
for (index = 0; index < xdrop_offset; index++) |
325 |
|
aux_data.max_score[index] = 0; |
326 |
< |
|
326 |
> |
|
327 |
|
// fill in the initial offsets of the distance matrix |
328 |
|
|
329 |
|
last_seq2_off[0][diag_origin] = seq1_index; |
352 |
|
|
353 |
|
// compute the score for distance d corresponding to the X-dropoff criterion |
354 |
|
|
355 |
< |
xdrop_score = max_score[d - xdrop_offset] + |
355 |
> |
xdrop_score = max_score[d - xdrop_offset] + |
356 |
|
(match_cost + mismatch_cost) * d - xdrop_threshold; |
357 |
< |
xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2)); |
357 |
> |
xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2)); |
358 |
|
|
359 |
|
// for each diagonal of interest |
360 |
|
for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) { |
361 |
|
/* find the largest offset into seq2 that increases |
362 |
|
the distance from d-1 to d (i.e. keeps the alignment |
363 |
< |
from getting worse for as long as possible), then |
363 |
> |
from getting worse for as long as possible), then |
364 |
|
choose the offset into seq1 that will keep the |
365 |
< |
resulting diagonal fixed at k |
366 |
< |
|
365 |
> |
resulting diagonal fixed at k |
366 |
> |
|
367 |
|
Note that this requires kInvalidOffset+1 to be smaller |
368 |
|
than any valid offset into seq2, i.e. to be negative */ |
369 |
|
|
384 |
|
continue; |
385 |
|
} |
386 |
|
diag_upper = k; |
387 |
< |
|
388 |
< |
/* slide down diagonal k until a mismatch |
387 |
> |
|
388 |
> |
/* slide down diagonal k until a mismatch |
389 |
|
occurs. As long as only matches are encountered, |
390 |
|
the current distance d will not change */ |
391 |
|
|
421 |
|
can each be of size at most max_diags+2 */ |
422 |
|
|
423 |
|
if (seq2_index == len2) { |
424 |
< |
diag_lower = k + 1; |
424 |
> |
diag_lower = k + 1; |
425 |
|
end2_reached = true; |
426 |
|
} |
427 |
|
if (seq1_index == len1) { |
428 |
< |
diag_upper = k - 1; |
428 |
> |
diag_upper = k - 1; |
429 |
|
end1_reached = true; |
430 |
|
} |
431 |
|
} // end loop over diagonals |
432 |
|
|
433 |
|
// compute the maximum score possible for distance d |
434 |
< |
curr_score = curr_extent * (match_cost / 2) - |
434 |
> |
curr_score = curr_extent * (match_cost / 2) - |
435 |
|
d * (match_cost + mismatch_cost); |
436 |
|
// if this is the best score seen so far, update the |
437 |
|
// statistics of the best alignment |
441 |
|
best_diag = curr_diag; |
442 |
|
seq2_align_len = curr_seq2_index; |
443 |
|
seq1_align_len = curr_seq2_index + best_diag - diag_origin; |
444 |
< |
} |
444 |
> |
} |
445 |
|
else { |
446 |
|
max_score[d] = max_score[d - 1]; |
447 |
|
} |
452 |
|
if (diag_lower > diag_upper) |
453 |
|
break; |
454 |
|
|
455 |
< |
/* set up for the next distance to examine. Because the |
456 |
< |
bounds increase by at most one for each distance, |
457 |
< |
diag_lower and diag_upper can each be of size at |
455 |
> |
/* set up for the next distance to examine. Because the |
456 |
> |
bounds increase by at most one for each distance, |
457 |
> |
diag_lower and diag_upper can each be of size at |
458 |
|
most max_diags+2 */ |
459 |
|
|
460 |
|
if (!end2_reached) |
461 |
< |
diag_lower--; |
461 |
> |
diag_lower--; |
462 |
|
if (!end1_reached) |
463 |
|
diag_upper++; |
464 |
|
|
479 |
|
} |
480 |
|
} // end loop over distinct distances |
481 |
|
|
482 |
< |
|
482 |
> |
|
483 |
|
if (edit_block == NULL) |
484 |
|
return best_dist; |
485 |
|
|
486 |
|
//---- perform traceback |
487 |
< |
d = best_dist; |
487 |
> |
d = best_dist; |
488 |
|
seq1_index = seq1_align_len; |
489 |
|
seq2_index = seq2_align_len; |
490 |
|
// for all positive distances |
500 |
|
traceback operation. best_diag starts off with the |
501 |
|
value computed during the alignment process */ |
502 |
|
|
503 |
< |
new_diag = s_GetNextNonAffineTback(last_seq2_off, d, |
503 |
> |
new_diag = s_GetNextNonAffineTback(last_seq2_off, d, |
504 |
|
best_diag, &new_seq2_index); |
505 |
|
|
506 |
|
if (new_diag == best_diag) { |
508 |
|
if (seq2_index - new_seq2_index > 0) { |
509 |
|
edit_block->opRep(seq2_index - new_seq2_index); |
510 |
|
} |
511 |
< |
} |
511 |
> |
} |
512 |
|
else if (new_diag < best_diag) { |
513 |
|
// smaller diagonal: issue a group of substitutions |
514 |
|
// and then a gap in seq2 */ |
517 |
|
} |
518 |
|
//GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1); |
519 |
|
edit_block->opIns(1); |
520 |
< |
} |
520 |
> |
} |
521 |
|
else { |
522 |
|
// larger diagonal: issue a group of substitutions |
523 |
|
// and then a gap in seq1 |
526 |
|
} |
527 |
|
edit_block->opDel(1); |
528 |
|
} |
529 |
< |
d--; |
530 |
< |
best_diag = new_diag; |
531 |
< |
seq2_index = new_seq2_index; |
529 |
> |
d--; |
530 |
> |
best_diag = new_diag; |
531 |
> |
seq2_index = new_seq2_index; |
532 |
|
} |
533 |
|
//done: |
534 |
|
// handle the final group of substitutions back to distance zero, |
550 |
|
unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK ); |
551 |
|
if (op_type == 3) |
552 |
|
GError("Error: printEditScript encountered op_type 3 ?!\n"); |
553 |
< |
GMessage("%d%c ", num, xgapcodes[op_type]); |
553 |
> |
GMessage("%d%c ", num, xgapcodes[op_type]); |
554 |
|
} |
555 |
|
GMessage("\n"); |
556 |
|
} |
560 |
|
int q_max=strlen(q_seq); //query |
561 |
|
int s_max=strlen(s_seq); //subj |
562 |
|
return GreedyAlignRegion(q_seq, q_alnstart, q_max, |
563 |
< |
s_seq, s_alnstart, s_max, galn_NoTrim, editscript, NULL, reward, penalty, xdrop); |
563 |
> |
s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript); |
564 |
|
} |
565 |
|
|
566 |
|
struct GXSeedTable { |
602 |
|
const int a_m_score=2; //match score |
603 |
|
const int a_mis_score=-3; //mismatch |
604 |
|
const int a_dropoff_score=7; |
605 |
+ |
const int a_min_score=12; //at least 6 bases full match |
606 |
|
|
607 |
|
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
608 |
|
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
701 |
|
} |
702 |
|
|
703 |
|
|
704 |
< |
bool extendUngapped_R(const char* seqa, int a_len, int &aix, |
705 |
< |
const char* seqb, int b_len, int &bix, int &len) { |
706 |
< |
//must start with a match |
707 |
< |
if (seqa[aix]!=seqb[bix]) GError("Error at extendUngapped_R: no initial match!\n"); |
708 |
< |
|
709 |
< |
} |
710 |
< |
|
711 |
< |
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
712 |
< |
//overlap of right (3') end of seqb |
713 |
< |
//hash the first 12 bases of seqa: |
714 |
< |
int aimin=0; |
715 |
< |
int aimax=GMIN(9,(a_len-4)); |
716 |
< |
int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end |
717 |
< |
int bimax=b_len-4; |
718 |
< |
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
719 |
< |
int b_maxlen=b_len; //number of cols in the diagonal matrix |
720 |
< |
GXSeedTable gx(a_maxlen, b_maxlen); |
721 |
< |
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
722 |
< |
for (int ai=aimin;ai<=aimax;ai++) { |
723 |
< |
int32* av=(int32 *)(&seqa[ai]); |
724 |
< |
//for (int bi=b_len-4;bi>=bimin;bi--) { |
725 |
< |
for (int bi=bimin;bi<=bimax;bi++) { |
726 |
< |
if (gx.x(ai,bi)) |
727 |
< |
continue; //already have a previous seed covering this region of this diagonal |
728 |
< |
int32* bv=(int32 *)(&seqb[bi]); |
729 |
< |
if (*av == *bv) { |
730 |
< |
//word match |
731 |
< |
//see if we can extend to the right |
732 |
< |
gx.x(ai+1,bi+1)=1; |
733 |
< |
gx.x(ai+2,bi+2)=1; |
734 |
< |
gx.x(ai+3,bi+3)=1; |
735 |
< |
int aix=ai+4; |
736 |
< |
int bix=bi+4; |
737 |
< |
int len=4; |
738 |
< |
while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix]) |
739 |
< |
{ |
740 |
< |
gx.x(aix,bix)=1; |
741 |
< |
aix++;bix++; |
742 |
< |
len++; |
743 |
< |
} |
744 |
< |
if (len>8) { |
745 |
< |
//heuristics: very likely the best we can get |
746 |
< |
int qlen=len; |
722 |
< |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
723 |
< |
aix++; |
724 |
< |
bix++; |
725 |
< |
qlen++; |
726 |
< |
} |
727 |
< |
if (qlen>10) { |
728 |
< |
diagstrips->qmatch=new GXSeed(ai,bi,qlen); |
729 |
< |
return diagstrips; |
730 |
< |
} |
731 |
< |
} |
732 |
< |
GXSeed* newseed=new GXSeed(ai,bi,len); |
733 |
< |
seeds.Add(newseed); |
734 |
< |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
735 |
< |
} |
736 |
< |
} |
737 |
< |
}//for each 4-mer at the beginning of seqa |
704 |
> |
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, |
705 |
> |
GVec<uint16>* amers[], const char* seqb, int b_len) { |
706 |
> |
int bimin=GMAX(0,(b_len-a_len-6)); |
707 |
> |
GXSeedTable gx(a_len, b_len); |
708 |
> |
GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips |
709 |
> |
for (int bi=0;bi<=b_len-6;bi++) { |
710 |
> |
//for each 6-mer of seqb |
711 |
> |
uint16 bv = get6mer((char*)&seqb[bi]); |
712 |
> |
GVec<uint16>* alocs = amers[bv]; |
713 |
> |
if (alocs==NULL) continue; |
714 |
> |
//extend each hit |
715 |
> |
for (int h=0;h<alocs->Count();h++) { |
716 |
> |
int ai=alocs->Get(h); |
717 |
> |
//word match |
718 |
> |
if (gx.x(ai,bi)) |
719 |
> |
//already have a previous seed covering this region of this diagonal |
720 |
> |
continue; |
721 |
> |
for (int i=0;i<6;i++) |
722 |
> |
gx.x(ai+i,bi+i)=1; |
723 |
> |
//see if we can extend to the right |
724 |
> |
int aix=ai+6; |
725 |
> |
int bix=bi+6; |
726 |
> |
int len=6; |
727 |
> |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
728 |
> |
gx.x(aix,bix)=1; |
729 |
> |
aix++;bix++; |
730 |
> |
len++; |
731 |
> |
} |
732 |
> |
/*if (len>22) { |
733 |
> |
//heuristics: very likely the best we can get |
734 |
> |
//quick match shortcut |
735 |
> |
diagstrips->qmatch=new GXSeed(ai,bi,len); |
736 |
> |
return diagstrips; |
737 |
> |
} */ |
738 |
> |
if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough |
739 |
> |
GXSeed* newseed=new GXSeed(ai,bi,len); |
740 |
> |
seeds.Add(newseed); |
741 |
> |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
742 |
> |
//special last resort terminal match to be used if no better alignment is there |
743 |
> |
if (ai<2 && bi+len>b_len-2 && |
744 |
> |
(!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed; |
745 |
> |
} |
746 |
> |
} |
747 |
|
for (int i=0;i<diagstrips->Count();i++) { |
748 |
|
diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds |
749 |
|
} |
751 |
|
return diagstrips; |
752 |
|
} |
753 |
|
|
754 |
< |
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
754 |
> |
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, |
755 |
> |
GVec<uint16>* amers[], const char* seqb, int b_len) { |
756 |
|
//overlap of left (5') end of seqb |
757 |
|
//hash the last 12 bases of seqa: |
758 |
< |
int aimin=GMAX(0,(a_len-16)); |
749 |
< |
int aimax=a_len-4; |
750 |
< |
int bimin=0; |
751 |
< |
int bimax=GMIN((a_len-2), (b_len-4)); |
752 |
< |
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
753 |
< |
int b_maxlen=b_len; //number of cols in the diagonal matrix |
758 |
> |
int bimax=GMIN((a_len+2), (b_len-6)); |
759 |
|
//gx.init(a_maxlen, b_maxlen); |
760 |
< |
GXSeedTable gx(a_maxlen, b_maxlen); |
761 |
< |
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
762 |
< |
for (int ai=aimin;ai<=aimax;ai++) { |
763 |
< |
int32* av=(int32 *)(&seqa[ai]); |
764 |
< |
for (int bi=bimin;bi<=bimax;bi++) { |
765 |
< |
if (gx.x(ai,bi)) |
766 |
< |
continue; //already have a previous seed covering this region of this diagonal |
767 |
< |
int32* bv=(int32 *)(&seqb[bi]); |
768 |
< |
if (*av == *bv) { |
769 |
< |
//word match |
770 |
< |
//see if we can extend to the right |
771 |
< |
gx.x(ai+1,bi+1)=1; |
772 |
< |
gx.x(ai+2,bi+2)=1; |
773 |
< |
gx.x(ai+3,bi+3)=1; |
774 |
< |
int aix=ai+4; |
775 |
< |
int bix=bi+4; |
776 |
< |
int len=4; |
777 |
< |
while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix]) |
778 |
< |
{ |
779 |
< |
gx.x(aix,bix)=1; |
780 |
< |
aix++;bix++; |
781 |
< |
len++; |
782 |
< |
} |
783 |
< |
GXSeed* newseed=new GXSeed(ai,bi,len); |
784 |
< |
seeds.Add(newseed); |
785 |
< |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
786 |
< |
} |
787 |
< |
} |
788 |
< |
}//for each 4-mer at the end of seqa |
760 |
> |
GXSeedTable gx(a_len, b_len); |
761 |
> |
GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips |
762 |
> |
for (int bi=0;bi<=b_len-6;bi++) { |
763 |
> |
//for each 6-mer of seqb |
764 |
> |
uint16 bv = get6mer((char*)&seqb[bi]); |
765 |
> |
GVec<uint16>* alocs = amers[bv]; |
766 |
> |
if (alocs==NULL) continue; |
767 |
> |
//extend each hit |
768 |
> |
for (int h=0;h<alocs->Count();h++) { |
769 |
> |
int ai=alocs->Get(h); |
770 |
> |
//word match |
771 |
> |
if (gx.x(ai,bi)) |
772 |
> |
//already have a previous seed covering this region of this diagonal |
773 |
> |
continue; |
774 |
> |
for (int i=0;i<6;i++) |
775 |
> |
gx.x(ai+i,bi+i)=1; |
776 |
> |
//see if we can extend to the right |
777 |
> |
int aix=ai+6; |
778 |
> |
int bix=bi+6; |
779 |
> |
int len=6; |
780 |
> |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
781 |
> |
gx.x(aix,bix)=1; |
782 |
> |
aix++;bix++; |
783 |
> |
len++; |
784 |
> |
} |
785 |
> |
/* if (len>22) { |
786 |
> |
//heuristics: very likely the best we can get |
787 |
> |
//quick match shortcut |
788 |
> |
diagstrips->qmatch=new GXSeed(ai,bi,len); |
789 |
> |
return diagstrips; |
790 |
> |
} */ |
791 |
> |
if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough |
792 |
> |
GXSeed* newseed=new GXSeed(ai,bi,len); |
793 |
> |
seeds.Add(newseed); |
794 |
> |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
795 |
> |
//special last resort terminal match to be used if no better alignment is there |
796 |
> |
if (bi<2 && ai+len>=a_len-1 && |
797 |
> |
(!diagstrips->tmatch || diagstrips->tmatch->len<len)) |
798 |
> |
diagstrips->tmatch=newseed; |
799 |
> |
} |
800 |
> |
} //for each 6-mer of the read |
801 |
|
for (int i=0;i<diagstrips->Count();i++) { |
802 |
|
diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds |
803 |
|
} |
805 |
|
return diagstrips; |
806 |
|
} |
807 |
|
|
791 |
– |
|
792 |
– |
|
808 |
|
int cmpSeedScore(const pointer p1, const pointer p2) { |
809 |
|
//return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); |
810 |
|
GXSeed* s1=(GXSeed*)p1; |
815 |
|
else return (s2->len-s1->len); |
816 |
|
} |
817 |
|
|
818 |
+ |
int cmpSeedScore_R(const pointer p1, const pointer p2) { |
819 |
+ |
//return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); |
820 |
+ |
GXSeed* s1=(GXSeed*)p1; |
821 |
+ |
GXSeed* s2=(GXSeed*)p2; |
822 |
+ |
if (s1->len==s2->len) { |
823 |
+ |
return (s2->b_ofs-s1->b_ofs); |
824 |
+ |
} |
825 |
+ |
else return (s2->len-s1->len); |
826 |
+ |
} |
827 |
+ |
|
828 |
+ |
|
829 |
|
int cmpSeedDiag(const pointer p1, const pointer p2) { |
830 |
|
GXSeed* s1=(GXSeed*)p1; |
831 |
|
GXSeed* s2=(GXSeed*)p2; |
832 |
|
return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs)); |
833 |
|
} |
834 |
|
|
835 |
+ |
|
836 |
+ |
int cmpDiagBands_R(const pointer p1, const pointer p2) { |
837 |
+ |
//return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); |
838 |
+ |
GXBand* b1=(GXBand*)p1; |
839 |
+ |
GXBand* b2=(GXBand*)p2; |
840 |
+ |
if (b1->score==b2->score) { |
841 |
+ |
return (b2->w_min_b-b1->w_min_b); |
842 |
+ |
} |
843 |
+ |
else return (b2->score-b1->score); |
844 |
+ |
} |
845 |
+ |
|
846 |
+ |
|
847 |
+ |
|
848 |
|
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
849 |
< |
const char* s_seq, int s_alnstart, int s_max, GAlnTrimType trimtype, |
850 |
< |
bool editscript, SGreedyAlignMem* gxmem, int reward, int penalty, int xdrop) { |
849 |
> |
const char* s_seq, int s_alnstart, int s_max, |
850 |
> |
int reward, int penalty, int xdrop, CGreedyAlignData* gxmem, |
851 |
> |
CAlnTrim* trim, bool editscript) { |
852 |
|
GXEditScript* ed_script_fwd = NULL; |
853 |
|
GXEditScript* ed_script_rev = NULL; |
854 |
|
if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 ) |
865 |
|
const char* s=s_seq+s_alnstart; |
866 |
|
int s_avail=s_max-s_alnstart; |
867 |
|
if (penalty<0) penalty=-penalty; |
868 |
< |
int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported |
868 |
> |
int MIN_GREEDY_SCORE=5*reward; //minimum score for an alignment to be reported for 0 diffs |
869 |
|
GXAlnInfo* alninfo=NULL; |
870 |
|
bool freeAlnMem=(gxmem==NULL); |
871 |
|
if (freeAlnMem) { |
872 |
< |
gxmem=new SGreedyAlignMem(reward, penalty, xdrop); |
872 |
> |
gxmem=new CGreedyAlignData(reward, penalty, xdrop); |
873 |
|
} |
874 |
|
else gxmem->reset(); |
875 |
|
int retscore = 0; |
876 |
|
int numdiffs = 0; |
877 |
< |
if (trimtype==galn_TrimLeft) { |
877 |
> |
if (trim!=NULL && trim->type==galn_TrimLeft) { |
878 |
|
if (editscript) |
879 |
|
ed_script_rev=new GXEditScript(); |
880 |
|
|
881 |
< |
int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
881 |
> |
int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop, |
882 |
|
reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); |
883 |
|
//check this extension here and bail out if it's not a good extension |
884 |
< |
if (s_alnstart+1-s_ext_l>3) { |
884 |
> |
if (s_alnstart+1-s_ext_l>trim->boundary) { |
885 |
|
delete ed_script_rev; |
886 |
|
if (freeAlnMem) delete gxmem; |
887 |
|
return NULL; |
893 |
|
reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); |
894 |
|
numdiffs=numdiffs_r+numdiffs_l; |
895 |
|
//convert num diffs to actual score |
896 |
< |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty); |
896 |
> |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty); |
897 |
|
if (editscript) |
898 |
|
ed_script_rev->Append(ed_script_fwd); //combine the two extensions |
899 |
|
} |
905 |
|
reward, penalty, s_ext_r, q_ext_r, *gxmem, ed_script_fwd); |
906 |
|
//check extension here and bail out if not a good right extension |
907 |
|
//assuming s_max is really at the right end of s_seq |
908 |
< |
if (trimtype==galn_TrimRight && s_alnstart+s_ext_r<s_max-2) { |
908 |
> |
if (trim!=NULL && trim->type==galn_TrimRight && |
909 |
> |
s_alnstart+s_ext_r<trim->boundary) { |
910 |
|
delete ed_script_fwd; |
911 |
|
if (freeAlnMem) delete gxmem; |
912 |
|
return NULL; |
917 |
|
reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev); |
918 |
|
//convert num diffs to actual score |
919 |
|
numdiffs=numdiffs_r+numdiffs_l; |
920 |
< |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward-penalty); |
920 |
> |
retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - numdiffs*(reward+penalty); |
921 |
|
if (editscript) |
922 |
|
ed_script_rev->Append(ed_script_fwd); //combine the two extensions |
923 |
|
} |
928 |
|
alninfo->score=retscore; |
929 |
|
alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length); |
930 |
|
#ifdef GDEBUG |
931 |
< |
if (ed_script_rev) { |
932 |
< |
GMessage("Final Edit script ::: "); |
933 |
< |
printEditScript(ed_script_rev); |
934 |
< |
} |
931 |
> |
//if (ed_script_rev) { |
932 |
> |
// GMessage("Final Edit script ::: "); |
933 |
> |
// printEditScript(ed_script_rev); |
934 |
> |
// } |
935 |
|
#endif |
936 |
|
alninfo->editscript=ed_script_rev; |
937 |
|
alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1); |
946 |
|
return alninfo; |
947 |
|
} |
948 |
|
|
949 |
< |
GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
950 |
< |
SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) { |
949 |
> |
GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max, |
950 |
> |
const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem, |
951 |
> |
CAlnTrim* trim, bool editscript) { |
952 |
> |
int reward=2; |
953 |
> |
int penalty=3; |
954 |
> |
int xdrop=8; |
955 |
> |
if (gxmem) { |
956 |
> |
reward=gxmem->match_reward; |
957 |
> |
penalty=gxmem->mismatch_penalty; |
958 |
> |
xdrop=gxmem->x_drop; |
959 |
> |
} |
960 |
> |
return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max, |
961 |
> |
reward, penalty, xdrop, gxmem, trim, editscript); |
962 |
> |
} |
963 |
> |
|
964 |
> |
GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[], |
965 |
> |
const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) { |
966 |
|
bool editscript=false; |
967 |
|
#ifdef GDEBUG |
968 |
|
editscript=true; |
969 |
< |
GMessage("==========> matching Right (3') end ========\n"); |
969 |
> |
GMessage("==========> matching Right (3') end : %s\n", seqa); |
970 |
|
#endif |
971 |
+ |
|
972 |
+ |
CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len); |
973 |
|
GList<GXSeed> rseeds(true,true,false); |
974 |
< |
GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len); |
974 |
> |
GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len); |
975 |
> |
GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal |
976 |
|
//did we find a shortcut? |
977 |
|
if (alnbands->qmatch) { |
978 |
< |
|
978 |
> |
#ifdef GDEBUG |
979 |
> |
GMessage("::: Found a quick long match at %d, len %d\n", |
980 |
> |
alnbands->qmatch->b_ofs, alnbands->qmatch->len); |
981 |
> |
#endif |
982 |
> |
anchor_seeds.Add(alnbands->qmatch); |
983 |
> |
} |
984 |
> |
else { |
985 |
> |
int max_top_bands=5; |
986 |
> |
int top_band_count=0; |
987 |
> |
for (int b=0;b<alnbands->Count();b++) { |
988 |
> |
if (alnbands->Get(b)->score<6) break; |
989 |
> |
//#ifdef GDEBUG |
990 |
> |
//GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); |
991 |
> |
//#endif |
992 |
> |
top_band_count++; |
993 |
> |
GXBand& band=*(alnbands->Get(b)); |
994 |
> |
band.seeds.setSorted(cmpSeedScore); |
995 |
> |
anchor_seeds.Add(band.seeds.First()); |
996 |
> |
band.tested=true; |
997 |
> |
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
998 |
> |
} |
999 |
> |
//#ifdef GDEBUG |
1000 |
> |
//GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1001 |
> |
//#endif |
1002 |
|
} |
1003 |
|
GList<GXAlnInfo> galns(true,true,false); |
922 |
– |
int max_top_bands=5; |
923 |
– |
int top_band_count=0; |
924 |
– |
GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal |
925 |
– |
for (int b=0;b<alnbands->Count();b++) { |
926 |
– |
if (alnbands->Get(b)->score<4) break; |
927 |
– |
top_band_count++; |
928 |
– |
GXBand& band=*(alnbands->Get(b)); |
929 |
– |
band.seeds.setSorted(cmpSeedScore); |
930 |
– |
anchor_seeds.Add(band.seeds.First()); |
931 |
– |
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
932 |
– |
} |
933 |
– |
#ifdef GDEBUG |
934 |
– |
GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
935 |
– |
#endif |
936 |
– |
|
1004 |
|
for (int i=0;i<anchor_seeds.Count();i++) { |
1005 |
|
GXSeed& aseed=*anchor_seeds[i]; |
1006 |
|
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1007 |
|
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1008 |
|
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1009 |
< |
seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem, |
1010 |
< |
match_reward, mismatch_penalty, Xdrop); |
1011 |
< |
if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4) |
1009 |
> |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1010 |
> |
if (alninfo && alninfo->pid>=min_pid && |
1011 |
> |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1012 |
|
galns.AddIfNew(alninfo, true); |
1013 |
|
else delete alninfo; |
1014 |
|
} |
1015 |
+ |
if (galns.Count()==0 && alnbands->tmatch) { |
1016 |
+ |
//last resort seed |
1017 |
+ |
GXSeed& aseed=*alnbands->tmatch; |
1018 |
+ |
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1019 |
+ |
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1020 |
+ |
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1021 |
+ |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1022 |
+ |
if (alninfo && alninfo->pid>=min_pid && |
1023 |
+ |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1024 |
+ |
galns.AddIfNew(alninfo, true); |
1025 |
+ |
else delete alninfo; |
1026 |
+ |
} |
1027 |
|
|
1028 |
< |
#ifdef GDEBUG |
1029 |
< |
for (int i=0;i<galns.Count();i++) { |
1030 |
< |
GXAlnInfo* alninfo=galns[i]; |
1031 |
< |
GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr, |
1032 |
< |
alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid); |
1033 |
< |
if (alninfo->gapinfo!=NULL) { |
1034 |
< |
GMessage("Alignment:\n"); |
1035 |
< |
alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len); |
1028 |
> |
/* |
1029 |
> |
//special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read, |
1030 |
> |
//we should also try some seeds closer to the 3' end |
1031 |
> |
if (galns.Count()==0) { |
1032 |
> |
anchor_seeds.Clear(); |
1033 |
> |
alnbands->setSorted(cmpDiagBands_R); |
1034 |
> |
int max_top_bands=4; |
1035 |
> |
int top_band_count=0; |
1036 |
> |
//#ifdef GDEBUG |
1037 |
> |
//GMessage(":::>> Retrying adjusting sort order.\n"); |
1038 |
> |
//#endif |
1039 |
> |
if (alnbands->tmatch) { |
1040 |
> |
//anchor_seeds.setSorted(false); |
1041 |
> |
anchor_seeds.Add(alnbands->tmatch); |
1042 |
|
} |
1043 |
< |
} |
1044 |
< |
#endif |
1043 |
> |
for (int b=0;b<alnbands->Count();b++) { |
1044 |
> |
if (alnbands->Get(b)->score<4) break; |
1045 |
> |
//#ifdef GDEBUG |
1046 |
> |
//GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); |
1047 |
> |
//#endif |
1048 |
> |
if (alnbands->Get(b)->tested) continue; |
1049 |
> |
top_band_count++; |
1050 |
> |
GXBand& band=*(alnbands->Get(b)); |
1051 |
> |
band.seeds.setSorted(cmpSeedScore); |
1052 |
> |
anchor_seeds.Add(band.seeds.First()); |
1053 |
> |
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
1054 |
> |
} |
1055 |
> |
//#ifdef GDEBUG |
1056 |
> |
//GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1057 |
> |
//#endif |
1058 |
> |
for (int i=0;i<anchor_seeds.Count();i++) { |
1059 |
> |
GXSeed& aseed=*anchor_seeds[i]; |
1060 |
> |
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1061 |
> |
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1062 |
> |
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1063 |
> |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1064 |
> |
if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1065 |
> |
galns.AddIfNew(alninfo, true); |
1066 |
> |
else delete alninfo; |
1067 |
> |
} |
1068 |
> |
} */ |
1069 |
> |
|
1070 |
|
//---- done |
1071 |
|
delete alnbands; |
1072 |
|
if (galns.Count()) { |
1073 |
|
GXAlnInfo* bestaln=galns.Shift(); |
1074 |
+ |
#ifdef GDEBUG |
1075 |
+ |
GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr, |
1076 |
+ |
bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid); |
1077 |
+ |
if (bestaln->gapinfo!=NULL) { |
1078 |
+ |
bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len); |
1079 |
+ |
} |
1080 |
+ |
#endif |
1081 |
+ |
|
1082 |
|
return bestaln; |
1083 |
|
} |
1084 |
|
else return NULL; |
1085 |
|
} |
1086 |
|
|
1087 |
< |
GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
1088 |
< |
SGreedyAlignMem* gxmem, int match_reward, int mismatch_penalty, int Xdrop) { |
1087 |
> |
GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[], |
1088 |
> |
const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) { |
1089 |
|
bool editscript=false; |
1090 |
|
#ifdef GDEBUG |
1091 |
|
editscript=true; |
1092 |
< |
GMessage("==========> matching Left (5') end ========\n"); |
1092 |
> |
GMessage("==========> matching Left (5') end : %s\n", seqa); |
1093 |
|
#endif |
1094 |
+ |
CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len); |
1095 |
|
GList<GXSeed> rseeds(true,true,false); |
1096 |
< |
GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len); |
978 |
< |
GList<GXAlnInfo> galns(true,true,false); |
979 |
< |
int max_top_bands=5; |
980 |
< |
int top_band_count=0; |
1096 |
> |
GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len); |
1097 |
|
GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal |
1098 |
< |
for (int b=0;b<alnbands->Count();b++) { |
1099 |
< |
if (alnbands->Get(b)->score<4) break; |
1100 |
< |
top_band_count++; |
1101 |
< |
GXBand& band=*(alnbands->Get(b)); |
1102 |
< |
band.seeds.setSorted(cmpSeedScore); |
1103 |
< |
anchor_seeds.Add(band.seeds.First()); |
1104 |
< |
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
1105 |
< |
} |
1106 |
< |
#ifdef GDEBUG |
1107 |
< |
GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1108 |
< |
#endif |
1109 |
< |
|
1110 |
< |
for (int i=0;i<anchor_seeds.Count();i++) { |
1098 |
> |
if (alnbands->qmatch) { |
1099 |
> |
#ifdef GDEBUG |
1100 |
> |
GMessage("::: Found a quick long match at %d, len %d\n", |
1101 |
> |
alnbands->qmatch->b_ofs, alnbands->qmatch->len); |
1102 |
> |
#endif |
1103 |
> |
anchor_seeds.Add(alnbands->qmatch); |
1104 |
> |
} |
1105 |
> |
else { |
1106 |
> |
int max_top_bands=5; |
1107 |
> |
int top_band_count=0; |
1108 |
> |
for (int b=0;b<alnbands->Count();b++) { |
1109 |
> |
if (alnbands->Get(b)->score<6) break; |
1110 |
> |
//#ifdef GDEBUG |
1111 |
> |
//GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); |
1112 |
> |
//#endif |
1113 |
> |
top_band_count++; |
1114 |
> |
GXBand& band=*(alnbands->Get(b)); |
1115 |
> |
band.seeds.setSorted(cmpSeedScore); |
1116 |
> |
anchor_seeds.Add(band.seeds.First()); |
1117 |
> |
band.tested=true; |
1118 |
> |
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
1119 |
> |
} |
1120 |
> |
//#ifdef GDEBUG |
1121 |
> |
//GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1122 |
> |
//#endif |
1123 |
> |
} |
1124 |
> |
GList<GXAlnInfo> galns(true,true,false); |
1125 |
> |
for (int i=0;i<anchor_seeds.Count();i++) { |
1126 |
|
GXSeed& aseed=*anchor_seeds[i]; |
1127 |
|
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1128 |
|
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1129 |
|
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1130 |
< |
seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem, |
1131 |
< |
match_reward, mismatch_penalty, Xdrop-2); |
1132 |
< |
if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4) |
1002 |
< |
//TODO: at 5' end we should use a higher pid threshold |
1003 |
< |
// we could also decrease the X-drop value! |
1130 |
> |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1131 |
> |
if (alninfo && alninfo->pid>=min_pid |
1132 |
> |
&& trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr)) |
1133 |
|
galns.AddIfNew(alninfo, true); |
1134 |
|
else delete alninfo; |
1135 |
|
} |
1136 |
< |
|
1136 |
> |
if (galns.Count()==0 && alnbands->tmatch) { |
1137 |
> |
//last resort seed |
1138 |
> |
GXSeed& aseed=*alnbands->tmatch; |
1139 |
> |
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1140 |
> |
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1141 |
> |
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1142 |
> |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1143 |
> |
if (alninfo && alninfo->pid>=min_pid && |
1144 |
> |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr)) |
1145 |
> |
galns.Add(alninfo); |
1146 |
> |
} |
1147 |
> |
/* |
1148 |
|
#ifdef GDEBUG |
1149 |
+ |
//print valid alignments found |
1150 |
|
for (int i=0;i<galns.Count();i++) { |
1151 |
|
GXAlnInfo* alninfo=galns[i]; |
1152 |
|
GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr, |
1157 |
|
} |
1158 |
|
} |
1159 |
|
#endif |
1160 |
+ |
*/ |
1161 |
|
//---- done |
1162 |
|
delete alnbands; |
1163 |
|
if (galns.Count()) { |
1164 |
|
GXAlnInfo* bestaln=galns.Shift(); |
1165 |
+ |
#ifdef GDEBUG |
1166 |
+ |
GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr, |
1167 |
+ |
bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid); |
1168 |
+ |
if (bestaln->gapinfo!=NULL) { |
1169 |
+ |
bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len); |
1170 |
+ |
} |
1171 |
+ |
#endif |
1172 |
|
return bestaln; |
1173 |
|
} |
1174 |
|
else return NULL; |