53 |
|
if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) { |
54 |
|
*row1 = flast_d[d-1][diag-1]; |
55 |
|
return diag-1; |
56 |
< |
} |
56 |
> |
} |
57 |
|
if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) { |
58 |
|
*row1 = flast_d[d-1][diag]; |
59 |
|
return diag; |
65 |
|
void GapXEditScript::print() { //debug |
66 |
|
GapXEditScript* p=this; |
67 |
|
do { |
68 |
< |
GMessage("%d%c ",p->num, xgapcodes[p->op_type]); |
69 |
< |
} while ((p=p->next)!=NULL); |
68 |
> |
GMessage("%d%c ",p->num, xgapcodes[p->op_type]); |
69 |
> |
} while ((p=p->next)!=NULL); |
70 |
|
GMessage("\n"); |
71 |
|
} |
72 |
|
|
93 |
|
else |
94 |
|
g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c)); |
95 |
|
if (g > 1) { |
96 |
< |
*a /= g; |
96 |
> |
*a /= g; |
97 |
|
*b /= g; |
98 |
|
*c /= g; |
99 |
|
} |
101 |
|
} |
102 |
|
|
103 |
|
|
104 |
+ |
uint16 get6mer(char* p) { |
105 |
+ |
uint16 r=gdna2bit(p,3); |
106 |
+ |
r <<= 6; |
107 |
+ |
r |= gdna2bit(p,3); |
108 |
+ |
return r; |
109 |
+ |
} |
110 |
+ |
|
111 |
+ |
|
112 |
+ |
void table6mers(const char* s, int slen, GVec<uint16>* amers[]) { |
113 |
+ |
for (uint16 i=0; i <= slen-6; i++) { |
114 |
+ |
char* p = (char*)(s+i); |
115 |
+ |
uint16 v=get6mer(p); |
116 |
+ |
if (amers[v]==NULL) { |
117 |
+ |
amers[v]=new GVec<uint16>(1); |
118 |
+ |
} |
119 |
+ |
amers[v]->Add(i); |
120 |
+ |
} |
121 |
+ |
} |
122 |
+ |
|
123 |
+ |
GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) { |
124 |
+ |
//careful: this is broken if start+5 falls beyond the end of the string! |
125 |
+ |
uint16 r=get6mer(start); |
126 |
+ |
return amers[r]; |
127 |
+ |
} |
128 |
|
|
129 |
|
//signal that a diagonal is invalid |
130 |
|
static const int kInvalidOffset = -2; |
153 |
|
// } |
154 |
|
} |
155 |
|
else { //forward lookup |
156 |
< |
while (seq1_index < len1 && seq2_index < len2 && |
156 |
> |
while (seq1_index < len1 && seq2_index < len2 && |
157 |
|
//seq1[seq1_index] < 4 && |
158 |
|
seq1[seq1_index] == seq2[seq2_index]) { |
159 |
|
++seq1_index; |
228 |
|
int longest_match_run; |
229 |
|
bool end1_reached, end2_reached; |
230 |
|
GXMemPool* mem_pool; |
231 |
< |
|
231 |
> |
|
232 |
|
/* ordinary dynamic programming alignment, for each offset |
233 |
|
in seq1, walks through offsets in seq2 until an X-dropoff |
234 |
< |
test fails, saving the best score encountered along |
234 |
> |
test fails, saving the best score encountered along |
235 |
|
the way. Instead of score, this code tracks the 'distance' |
236 |
|
(number of mismatches plus number of gaps) between seq1 |
237 |
|
and seq2. Instead of walking through sequence offsets, it |
238 |
|
walks through diagonals that can achieve a given distance. |
239 |
< |
|
239 |
> |
|
240 |
|
Note that in what follows, the numbering of diagonals implies |
241 |
< |
a dot matrix where increasing seq1 offsets go to the right on |
241 |
> |
a dot matrix where increasing seq1 offsets go to the right on |
242 |
|
the x axis, and increasing seq2 offsets go up the y axis. |
243 |
|
The gapped alignment thus proceeds up and to the right in |
244 |
|
the graph, and diagonals are numbered increasing to the right */ |
247 |
|
best_diag = 0; |
248 |
|
|
249 |
|
/* set the number of distinct distances the algorithm will |
250 |
< |
examine in the search for an optimal alignment. The |
251 |
< |
heuristic worst-case running time of the algorithm is |
250 |
> |
examine in the search for an optimal alignment. The |
251 |
> |
heuristic worst-case running time of the algorithm is |
252 |
|
O(max_dist**2 + (len1+len2)); for sequences which are |
253 |
|
very similar, the average running time will be sig- |
254 |
|
nificantly better than this */ |
257 |
|
(len2/GREEDY_MAX_COST_FRACTION + 1)); |
258 |
|
|
259 |
|
/* the main loop assumes that the index of all diagonals is |
260 |
< |
biased to lie in the middle of allocated bookkeeping |
260 |
> |
biased to lie in the middle of allocated bookkeeping |
261 |
|
structures */ |
262 |
|
|
263 |
|
diag_origin = max_dist + 2; |
275 |
|
xdrop_offset gives the distance backwards in the score |
276 |
|
array to look */ |
277 |
|
|
278 |
< |
xdrop_offset = (xdrop_threshold + match_cost / 2) / |
278 |
> |
xdrop_offset = (xdrop_threshold + match_cost / 2) / |
279 |
|
(match_cost + mismatch_cost) + 1; |
280 |
< |
|
280 |
> |
|
281 |
|
// find the offset of the first mismatch between seq1 and seq2 |
282 |
|
|
283 |
|
index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse); |
301 |
|
if (edit_block != NULL) |
302 |
|
//GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index); |
303 |
|
edit_block->opRep(index); |
304 |
< |
return 0; |
304 |
> |
return 0; |
305 |
|
} |
306 |
|
|
307 |
|
// set up the memory pool |
311 |
|
} |
312 |
|
else if (mem_pool == NULL) { |
313 |
|
aux_data.space = mem_pool = new GXMemPool(); |
314 |
< |
} |
314 |
> |
} |
315 |
|
else { |
316 |
|
mem_pool->refresh(); |
317 |
|
} |
318 |
< |
|
318 |
> |
|
319 |
|
/* set up the array of per-distance maximum scores. There |
320 |
|
are max_diags + xdrop_offset distances to track, the first |
321 |
|
xdrop_offset of which are 0 */ |
323 |
|
max_score = aux_data.max_score + xdrop_offset; |
324 |
|
for (index = 0; index < xdrop_offset; index++) |
325 |
|
aux_data.max_score[index] = 0; |
326 |
< |
|
326 |
> |
|
327 |
|
// fill in the initial offsets of the distance matrix |
328 |
|
|
329 |
|
last_seq2_off[0][diag_origin] = seq1_index; |
352 |
|
|
353 |
|
// compute the score for distance d corresponding to the X-dropoff criterion |
354 |
|
|
355 |
< |
xdrop_score = max_score[d - xdrop_offset] + |
355 |
> |
xdrop_score = max_score[d - xdrop_offset] + |
356 |
|
(match_cost + mismatch_cost) * d - xdrop_threshold; |
357 |
< |
xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2)); |
357 |
> |
xdrop_score = (int)ceil((double)xdrop_score / (match_cost / 2)); |
358 |
|
|
359 |
|
// for each diagonal of interest |
360 |
|
for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) { |
361 |
|
/* find the largest offset into seq2 that increases |
362 |
|
the distance from d-1 to d (i.e. keeps the alignment |
363 |
< |
from getting worse for as long as possible), then |
363 |
> |
from getting worse for as long as possible), then |
364 |
|
choose the offset into seq1 that will keep the |
365 |
< |
resulting diagonal fixed at k |
366 |
< |
|
365 |
> |
resulting diagonal fixed at k |
366 |
> |
|
367 |
|
Note that this requires kInvalidOffset+1 to be smaller |
368 |
|
than any valid offset into seq2, i.e. to be negative */ |
369 |
|
|
384 |
|
continue; |
385 |
|
} |
386 |
|
diag_upper = k; |
387 |
< |
|
388 |
< |
/* slide down diagonal k until a mismatch |
387 |
> |
|
388 |
> |
/* slide down diagonal k until a mismatch |
389 |
|
occurs. As long as only matches are encountered, |
390 |
|
the current distance d will not change */ |
391 |
|
|
421 |
|
can each be of size at most max_diags+2 */ |
422 |
|
|
423 |
|
if (seq2_index == len2) { |
424 |
< |
diag_lower = k + 1; |
424 |
> |
diag_lower = k + 1; |
425 |
|
end2_reached = true; |
426 |
|
} |
427 |
|
if (seq1_index == len1) { |
428 |
< |
diag_upper = k - 1; |
428 |
> |
diag_upper = k - 1; |
429 |
|
end1_reached = true; |
430 |
|
} |
431 |
|
} // end loop over diagonals |
432 |
|
|
433 |
|
// compute the maximum score possible for distance d |
434 |
< |
curr_score = curr_extent * (match_cost / 2) - |
434 |
> |
curr_score = curr_extent * (match_cost / 2) - |
435 |
|
d * (match_cost + mismatch_cost); |
436 |
|
// if this is the best score seen so far, update the |
437 |
|
// statistics of the best alignment |
441 |
|
best_diag = curr_diag; |
442 |
|
seq2_align_len = curr_seq2_index; |
443 |
|
seq1_align_len = curr_seq2_index + best_diag - diag_origin; |
444 |
< |
} |
444 |
> |
} |
445 |
|
else { |
446 |
|
max_score[d] = max_score[d - 1]; |
447 |
|
} |
452 |
|
if (diag_lower > diag_upper) |
453 |
|
break; |
454 |
|
|
455 |
< |
/* set up for the next distance to examine. Because the |
456 |
< |
bounds increase by at most one for each distance, |
457 |
< |
diag_lower and diag_upper can each be of size at |
455 |
> |
/* set up for the next distance to examine. Because the |
456 |
> |
bounds increase by at most one for each distance, |
457 |
> |
diag_lower and diag_upper can each be of size at |
458 |
|
most max_diags+2 */ |
459 |
|
|
460 |
|
if (!end2_reached) |
461 |
< |
diag_lower--; |
461 |
> |
diag_lower--; |
462 |
|
if (!end1_reached) |
463 |
|
diag_upper++; |
464 |
|
|
479 |
|
} |
480 |
|
} // end loop over distinct distances |
481 |
|
|
482 |
< |
|
482 |
> |
|
483 |
|
if (edit_block == NULL) |
484 |
|
return best_dist; |
485 |
|
|
486 |
|
//---- perform traceback |
487 |
< |
d = best_dist; |
487 |
> |
d = best_dist; |
488 |
|
seq1_index = seq1_align_len; |
489 |
|
seq2_index = seq2_align_len; |
490 |
|
// for all positive distances |
500 |
|
traceback operation. best_diag starts off with the |
501 |
|
value computed during the alignment process */ |
502 |
|
|
503 |
< |
new_diag = s_GetNextNonAffineTback(last_seq2_off, d, |
503 |
> |
new_diag = s_GetNextNonAffineTback(last_seq2_off, d, |
504 |
|
best_diag, &new_seq2_index); |
505 |
|
|
506 |
|
if (new_diag == best_diag) { |
508 |
|
if (seq2_index - new_seq2_index > 0) { |
509 |
|
edit_block->opRep(seq2_index - new_seq2_index); |
510 |
|
} |
511 |
< |
} |
511 |
> |
} |
512 |
|
else if (new_diag < best_diag) { |
513 |
|
// smaller diagonal: issue a group of substitutions |
514 |
|
// and then a gap in seq2 */ |
517 |
|
} |
518 |
|
//GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1); |
519 |
|
edit_block->opIns(1); |
520 |
< |
} |
520 |
> |
} |
521 |
|
else { |
522 |
|
// larger diagonal: issue a group of substitutions |
523 |
|
// and then a gap in seq1 |
526 |
|
} |
527 |
|
edit_block->opDel(1); |
528 |
|
} |
529 |
< |
d--; |
530 |
< |
best_diag = new_diag; |
531 |
< |
seq2_index = new_seq2_index; |
529 |
> |
d--; |
530 |
> |
best_diag = new_diag; |
531 |
> |
seq2_index = new_seq2_index; |
532 |
|
} |
533 |
|
//done: |
534 |
|
// handle the final group of substitutions back to distance zero, |
550 |
|
unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK ); |
551 |
|
if (op_type == 3) |
552 |
|
GError("Error: printEditScript encountered op_type 3 ?!\n"); |
553 |
< |
GMessage("%d%c ", num, xgapcodes[op_type]); |
553 |
> |
GMessage("%d%c ", num, xgapcodes[op_type]); |
554 |
|
} |
555 |
|
GMessage("\n"); |
556 |
|
} |
701 |
|
} |
702 |
|
|
703 |
|
|
704 |
< |
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
705 |
< |
//overlap of right (3') end of seqb |
706 |
< |
//hash the first 12 bases of seqa: |
707 |
< |
int aimin=0; |
708 |
< |
int aimax=GMIN(9,(a_len-4)); |
709 |
< |
int bimin=GMAX(0,(b_len-a_len-2));//allow 2 extraneous bases, just in case there is some garbage at the end |
710 |
< |
int bimax=b_len-4; |
711 |
< |
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
712 |
< |
int b_maxlen=b_len; //number of cols in the diagonal matrix |
713 |
< |
GXSeedTable gx(a_maxlen, b_maxlen); |
714 |
< |
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
715 |
< |
for (int ai=aimin;ai<=aimax;ai++) { |
716 |
< |
int32* av=(int32 *)(&seqa[ai]); |
717 |
< |
//for (int bi=b_len-4;bi>=bimin;bi--) { |
718 |
< |
for (int bi=bimin;bi<=bimax;bi++) { |
719 |
< |
if (gx.x(ai,bi)) |
720 |
< |
continue; //already have a previous seed covering this region of this diagonal |
721 |
< |
int32* bv=(int32 *)(&seqb[bi]); |
722 |
< |
if (*av == *bv) { |
723 |
< |
//word match |
724 |
< |
//see if we can extend to the right |
725 |
< |
gx.x(ai+1,bi+1)=1; |
726 |
< |
gx.x(ai+2,bi+2)=1; |
727 |
< |
gx.x(ai+3,bi+3)=1; |
728 |
< |
int aix=ai+4; |
729 |
< |
int bix=bi+4; |
730 |
< |
int len=4; |
731 |
< |
while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix]) |
732 |
< |
{ |
733 |
< |
gx.x(aix,bix)=1; |
734 |
< |
aix++;bix++; |
735 |
< |
len++; |
736 |
< |
} |
737 |
< |
if (len>7) { |
738 |
< |
//heuristics: very likely the best we can get |
739 |
< |
int qlen=len; |
740 |
< |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
741 |
< |
aix++; |
742 |
< |
bix++; |
743 |
< |
qlen++; |
744 |
< |
} |
745 |
< |
if (qlen>10) { //found a quick match shortcut |
746 |
< |
diagstrips->qmatch=new GXSeed(ai,bi,qlen); |
723 |
< |
return diagstrips; |
724 |
< |
} |
725 |
< |
} |
726 |
< |
GXSeed* newseed=new GXSeed(ai,bi,len); |
727 |
< |
seeds.Add(newseed); |
728 |
< |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
729 |
< |
//special last resort terminal match to be used if no better alignment is there |
730 |
< |
if (len>4 && ai==0 && (bi+len==b_len || bi+len==b_len-1)) diagstrips->tmatch=newseed; |
731 |
< |
} |
732 |
< |
} |
733 |
< |
}//for each 4-mer at the beginning of seqa |
704 |
> |
GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len, |
705 |
> |
GVec<uint16>* amers[], const char* seqb, int b_len) { |
706 |
> |
int bimin=GMAX(0,(b_len-a_len-6)); |
707 |
> |
GXSeedTable gx(a_len, b_len); |
708 |
> |
GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips |
709 |
> |
for (int bi=0;bi<=b_len-6;bi++) { |
710 |
> |
//for each 6-mer of seqb |
711 |
> |
uint16 bv = get6mer((char*)&seqb[bi]); |
712 |
> |
GVec<uint16>* alocs = amers[bv]; |
713 |
> |
if (alocs==NULL) continue; |
714 |
> |
//extend each hit |
715 |
> |
for (int h=0;h<alocs->Count();h++) { |
716 |
> |
int ai=alocs->Get(h); |
717 |
> |
//word match |
718 |
> |
if (gx.x(ai,bi)) |
719 |
> |
//already have a previous seed covering this region of this diagonal |
720 |
> |
continue; |
721 |
> |
for (int i=0;i<6;i++) |
722 |
> |
gx.x(ai+i,bi+i)=1; |
723 |
> |
//see if we can extend to the right |
724 |
> |
int aix=ai+6; |
725 |
> |
int bix=bi+6; |
726 |
> |
int len=6; |
727 |
> |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
728 |
> |
gx.x(aix,bix)=1; |
729 |
> |
aix++;bix++; |
730 |
> |
len++; |
731 |
> |
} |
732 |
> |
/*if (len>22) { |
733 |
> |
//heuristics: very likely the best we can get |
734 |
> |
//quick match shortcut |
735 |
> |
diagstrips->qmatch=new GXSeed(ai,bi,len); |
736 |
> |
return diagstrips; |
737 |
> |
} */ |
738 |
> |
if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough |
739 |
> |
GXSeed* newseed=new GXSeed(ai,bi,len); |
740 |
> |
seeds.Add(newseed); |
741 |
> |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
742 |
> |
//special last resort terminal match to be used if no better alignment is there |
743 |
> |
if (ai<2 && bi+len>b_len-2 && |
744 |
> |
(!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed; |
745 |
> |
} |
746 |
> |
} |
747 |
|
for (int i=0;i<diagstrips->Count();i++) { |
748 |
|
diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds |
749 |
|
} |
751 |
|
return diagstrips; |
752 |
|
} |
753 |
|
|
754 |
< |
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, const char* seqb, int b_len) { |
754 |
> |
GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len, |
755 |
> |
GVec<uint16>* amers[], const char* seqb, int b_len) { |
756 |
|
//overlap of left (5') end of seqb |
757 |
|
//hash the last 12 bases of seqa: |
758 |
< |
int aimin=GMAX(0,(a_len-16)); |
745 |
< |
int aimax=a_len-4; |
746 |
< |
int bimin=0; |
747 |
< |
int bimax=GMIN((a_len-2), (b_len-4)); |
748 |
< |
int a_maxlen=aimax+4; //number of rows in the diagonal matrix |
749 |
< |
int b_maxlen=b_len; //number of cols in the diagonal matrix |
758 |
> |
int bimax=GMIN((a_len+2), (b_len-6)); |
759 |
|
//gx.init(a_maxlen, b_maxlen); |
760 |
< |
GXSeedTable gx(a_maxlen, b_maxlen); |
761 |
< |
GXBandSet* diagstrips=new GXBandSet(a_maxlen, b_maxlen); //set of overlapping 3-diagonal strips |
762 |
< |
for (int ai=aimin;ai<=aimax;ai++) { |
763 |
< |
int32* av=(int32 *)(&seqa[ai]); |
764 |
< |
for (int bi=bimin;bi<=bimax;bi++) { |
765 |
< |
if (gx.x(ai,bi)) |
766 |
< |
continue; //already have a previous seed covering this region of this diagonal |
767 |
< |
int32* bv=(int32 *)(&seqb[bi]); |
768 |
< |
if (*av == *bv) { |
769 |
< |
//word match |
770 |
< |
//see if we can extend to the right |
771 |
< |
gx.x(ai+1,bi+1)=1; |
772 |
< |
gx.x(ai+2,bi+2)=1; |
773 |
< |
gx.x(ai+3,bi+3)=1; |
774 |
< |
int aix=ai+4; |
775 |
< |
int bix=bi+4; |
776 |
< |
int len=4; |
777 |
< |
while (bix<b_maxlen && aix<a_maxlen && seqa[aix]==seqb[bix]) |
778 |
< |
{ |
779 |
< |
gx.x(aix,bix)=1; |
780 |
< |
aix++;bix++; |
781 |
< |
len++; |
782 |
< |
} |
783 |
< |
if (len>7) { |
784 |
< |
//heuristics: very likely the best we can get |
785 |
< |
int qlen=len; |
786 |
< |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
787 |
< |
aix++; |
788 |
< |
bix++; |
789 |
< |
qlen++; |
790 |
< |
} |
791 |
< |
if (qlen>10) { //found a quick match shortcut |
792 |
< |
diagstrips->qmatch=new GXSeed(ai,bi,qlen); |
793 |
< |
return diagstrips; |
794 |
< |
} |
795 |
< |
} |
796 |
< |
GXSeed* newseed=new GXSeed(ai,bi,len); |
797 |
< |
seeds.Add(newseed); |
798 |
< |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
799 |
< |
//special last resort terminal match to be used if no better alignment is there |
800 |
< |
if (len>4 && bi==0 && ai+len==a_len) diagstrips->tmatch=newseed; |
792 |
< |
} |
793 |
< |
} |
794 |
< |
}//for each 4-mer at the end of seqa |
760 |
> |
GXSeedTable gx(a_len, b_len); |
761 |
> |
GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips |
762 |
> |
for (int bi=0;bi<=b_len-6;bi++) { |
763 |
> |
//for each 6-mer of seqb |
764 |
> |
uint16 bv = get6mer((char*)&seqb[bi]); |
765 |
> |
GVec<uint16>* alocs = amers[bv]; |
766 |
> |
if (alocs==NULL) continue; |
767 |
> |
//extend each hit |
768 |
> |
for (int h=0;h<alocs->Count();h++) { |
769 |
> |
int ai=alocs->Get(h); |
770 |
> |
//word match |
771 |
> |
if (gx.x(ai,bi)) |
772 |
> |
//already have a previous seed covering this region of this diagonal |
773 |
> |
continue; |
774 |
> |
for (int i=0;i<6;i++) |
775 |
> |
gx.x(ai+i,bi+i)=1; |
776 |
> |
//see if we can extend to the right |
777 |
> |
int aix=ai+6; |
778 |
> |
int bix=bi+6; |
779 |
> |
int len=6; |
780 |
> |
while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) { |
781 |
> |
gx.x(aix,bix)=1; |
782 |
> |
aix++;bix++; |
783 |
> |
len++; |
784 |
> |
} |
785 |
> |
/* if (len>22) { |
786 |
> |
//heuristics: very likely the best we can get |
787 |
> |
//quick match shortcut |
788 |
> |
diagstrips->qmatch=new GXSeed(ai,bi,len); |
789 |
> |
return diagstrips; |
790 |
> |
} */ |
791 |
> |
if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough |
792 |
> |
GXSeed* newseed=new GXSeed(ai,bi,len); |
793 |
> |
seeds.Add(newseed); |
794 |
> |
diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals |
795 |
> |
//special last resort terminal match to be used if no better alignment is there |
796 |
> |
if (bi<2 && ai+len>=a_len-1 && |
797 |
> |
(!diagstrips->tmatch || diagstrips->tmatch->len<len)) |
798 |
> |
diagstrips->tmatch=newseed; |
799 |
> |
} |
800 |
> |
} //for each 6-mer of the read |
801 |
|
for (int i=0;i<diagstrips->Count();i++) { |
802 |
|
diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds |
803 |
|
} |
815 |
|
else return (s2->len-s1->len); |
816 |
|
} |
817 |
|
|
818 |
+ |
int cmpSeedScore_R(const pointer p1, const pointer p2) { |
819 |
+ |
//return (((GXSeed*)s2)->len-((GXSeed*)s1)->len); |
820 |
+ |
GXSeed* s1=(GXSeed*)p1; |
821 |
+ |
GXSeed* s2=(GXSeed*)p2; |
822 |
+ |
if (s1->len==s2->len) { |
823 |
+ |
return (s2->b_ofs-s1->b_ofs); |
824 |
+ |
} |
825 |
+ |
else return (s2->len-s1->len); |
826 |
+ |
} |
827 |
+ |
|
828 |
+ |
|
829 |
|
int cmpSeedDiag(const pointer p1, const pointer p2) { |
830 |
|
GXSeed* s1=(GXSeed*)p1; |
831 |
|
GXSeed* s2=(GXSeed*)p2; |
961 |
|
reward, penalty, xdrop, gxmem, trim, editscript); |
962 |
|
} |
963 |
|
|
964 |
< |
GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
965 |
< |
CGreedyAlignData* gxmem, int min_pid) { |
964 |
> |
GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[], |
965 |
> |
const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) { |
966 |
|
bool editscript=false; |
967 |
|
#ifdef GDEBUG |
968 |
|
editscript=true; |
969 |
< |
// GMessage("==========> matching Right (3') end ========\n"); |
969 |
> |
GMessage("==========> matching Right (3') end : %s\n", seqa); |
970 |
|
#endif |
971 |
|
|
972 |
|
CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len); |
973 |
|
GList<GXSeed> rseeds(true,true,false); |
974 |
< |
GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len); |
974 |
> |
GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len); |
975 |
|
GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal |
976 |
|
//did we find a shortcut? |
977 |
|
if (alnbands->qmatch) { |
978 |
< |
//#ifdef GDEBUG |
979 |
< |
// GMessage("::: Found a quick long match -- using it instead of diagonal bands\n"); |
980 |
< |
//#endif |
978 |
> |
#ifdef GDEBUG |
979 |
> |
GMessage("::: Found a quick long match at %d, len %d\n", |
980 |
> |
alnbands->qmatch->b_ofs, alnbands->qmatch->len); |
981 |
> |
#endif |
982 |
|
anchor_seeds.Add(alnbands->qmatch); |
983 |
|
} |
984 |
|
else { |
985 |
|
int max_top_bands=5; |
986 |
|
int top_band_count=0; |
987 |
|
for (int b=0;b<alnbands->Count();b++) { |
988 |
< |
if (alnbands->Get(b)->score<4) break; |
988 |
> |
if (alnbands->Get(b)->score<6) break; |
989 |
|
//#ifdef GDEBUG |
990 |
|
//GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); |
991 |
|
//#endif |
1000 |
|
//GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1001 |
|
//#endif |
1002 |
|
} |
985 |
– |
|
1003 |
|
GList<GXAlnInfo> galns(true,true,false); |
1004 |
|
for (int i=0;i<anchor_seeds.Count();i++) { |
1005 |
|
GXSeed& aseed=*anchor_seeds[i]; |
1007 |
|
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1008 |
|
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1009 |
|
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1010 |
< |
if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1010 |
> |
if (alninfo && alninfo->pid>=min_pid && |
1011 |
> |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1012 |
|
galns.AddIfNew(alninfo, true); |
1013 |
|
else delete alninfo; |
1014 |
|
} |
1015 |
+ |
if (galns.Count()==0 && alnbands->tmatch) { |
1016 |
+ |
//last resort seed |
1017 |
+ |
GXSeed& aseed=*alnbands->tmatch; |
1018 |
+ |
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1019 |
+ |
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1020 |
+ |
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1021 |
+ |
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1022 |
+ |
if (alninfo && alninfo->pid>=min_pid && |
1023 |
+ |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1)) |
1024 |
+ |
galns.AddIfNew(alninfo, true); |
1025 |
+ |
else delete alninfo; |
1026 |
+ |
} |
1027 |
+ |
|
1028 |
+ |
/* |
1029 |
|
//special 3' end case: due to the seed scoring scheme being biased towards the 5' end of the read, |
1030 |
|
//we should also try some seeds closer to the 3' end |
1031 |
|
if (galns.Count()==0) { |
1065 |
|
galns.AddIfNew(alninfo, true); |
1066 |
|
else delete alninfo; |
1067 |
|
} |
1068 |
< |
} |
1068 |
> |
} */ |
1069 |
> |
|
1070 |
|
//---- done |
1071 |
|
delete alnbands; |
1072 |
|
if (galns.Count()) { |
1084 |
|
else return NULL; |
1085 |
|
} |
1086 |
|
|
1087 |
< |
GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, |
1088 |
< |
CGreedyAlignData* gxmem, int min_pid) { |
1087 |
> |
GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[], |
1088 |
> |
const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) { |
1089 |
|
bool editscript=false; |
1090 |
|
#ifdef GDEBUG |
1091 |
|
editscript=true; |
1092 |
< |
//GMessage("==========> matching Left (5') end ========\n"); |
1092 |
> |
GMessage("==========> matching Left (5') end : %s\n", seqa); |
1093 |
|
#endif |
1094 |
|
CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len); |
1095 |
|
GList<GXSeed> rseeds(true,true,false); |
1096 |
< |
GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len); |
1064 |
< |
GList<GXAlnInfo> galns(true,true,false); |
1096 |
> |
GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len); |
1097 |
|
GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal |
1066 |
– |
|
1098 |
|
if (alnbands->qmatch) { |
1099 |
< |
//#ifdef GDEBUG |
1100 |
< |
// GMessage("::: Found a quick long match -- using it instead of diagonal bands\n"); |
1101 |
< |
//#endif |
1099 |
> |
#ifdef GDEBUG |
1100 |
> |
GMessage("::: Found a quick long match at %d, len %d\n", |
1101 |
> |
alnbands->qmatch->b_ofs, alnbands->qmatch->len); |
1102 |
> |
#endif |
1103 |
|
anchor_seeds.Add(alnbands->qmatch); |
1104 |
|
} |
1105 |
|
else { |
1106 |
|
int max_top_bands=5; |
1107 |
|
int top_band_count=0; |
1108 |
|
for (int b=0;b<alnbands->Count();b++) { |
1109 |
< |
if (alnbands->Get(b)->score<4) break; |
1109 |
> |
if (alnbands->Get(b)->score<6) break; |
1110 |
|
//#ifdef GDEBUG |
1111 |
|
//GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score); |
1112 |
|
//#endif |
1114 |
|
GXBand& band=*(alnbands->Get(b)); |
1115 |
|
band.seeds.setSorted(cmpSeedScore); |
1116 |
|
anchor_seeds.Add(band.seeds.First()); |
1117 |
+ |
band.tested=true; |
1118 |
|
if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break; |
1119 |
|
} |
1120 |
|
//#ifdef GDEBUG |
1121 |
|
//GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count()); |
1122 |
|
//#endif |
1123 |
|
} |
1124 |
< |
for (int i=0;i<anchor_seeds.Count();i++) { |
1124 |
> |
GList<GXAlnInfo> galns(true,true,false); |
1125 |
> |
for (int i=0;i<anchor_seeds.Count();i++) { |
1126 |
|
GXSeed& aseed=*anchor_seeds[i]; |
1127 |
|
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1128 |
|
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1129 |
|
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1130 |
|
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1131 |
|
if (alninfo && alninfo->pid>=min_pid |
1132 |
< |
&& trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr)) |
1132 |
> |
&& trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr)) |
1133 |
|
galns.AddIfNew(alninfo, true); |
1134 |
|
else delete alninfo; |
1135 |
|
} |
1136 |
|
if (galns.Count()==0 && alnbands->tmatch) { |
1137 |
+ |
//last resort seed |
1138 |
|
GXSeed& aseed=*alnbands->tmatch; |
1139 |
|
int a1=aseed.a_ofs+(aseed.len>>1)+1; |
1140 |
|
int a2=aseed.b_ofs+(aseed.len>>1)+1; |
1141 |
|
GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len, |
1142 |
|
seqb, a2, seqb_len, gxmem, &trimInfo, editscript); |
1143 |
< |
if (alninfo) galns.Add(alninfo); |
1143 |
> |
if (alninfo && alninfo->pid>=min_pid && |
1144 |
> |
trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr)) |
1145 |
> |
galns.Add(alninfo); |
1146 |
|
} |
1110 |
– |
#ifdef GDEBUG |
1147 |
|
/* |
1148 |
+ |
#ifdef GDEBUG |
1149 |
+ |
//print valid alignments found |
1150 |
|
for (int i=0;i<galns.Count();i++) { |
1151 |
|
GXAlnInfo* alninfo=galns[i]; |
1152 |
|
GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr, |
1156 |
|
alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len); |
1157 |
|
} |
1158 |
|
} |
1121 |
– |
*/ |
1159 |
|
#endif |
1160 |
+ |
*/ |
1161 |
|
//---- done |
1162 |
|
delete alnbands; |
1163 |
|
if (galns.Count()) { |