ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
Revision: 310
Committed: Fri Mar 22 20:06:27 2013 UTC (6 years, 6 months ago) by gpertea
File size: 35900 byte(s)
Log Message:
sync with igm repo

Line User Rev File contents
1 gpertea 93 #include "GAlnExtend.h"
2    
3     //greedy gapped alignment extension
4     //(mostly lifted from NCBI's megablast gapped extension code)
5    
6     int GXMemPool::kMinSpace = 1000000;
7    
8 gpertea 310 #ifdef TRIMDEBUG
9 gpertea 93 char buf[6]={0x1B,'[', 'n','m','m','\0'};
10    
11     void color_fg(int c,FILE* f) {
12     if (f!=stderr && f!=stdout) return;
13     sprintf((char *)(&buf[2]),"%dm",c+30);
14     fwrite(buf,1,strlen(buf), f);
15     }
16    
17     void color_bg(int c, FILE* f) {
18     if (f!=stderr && f!=stdout) return;
19     sprintf((char *)(&buf[2]),"%dm",c+40);
20     fwrite(buf,1,strlen(buf),f);
21     };
22    
23     void color_resetfg(FILE* f) {
24     if (f!=stderr && f!=stdout) return;
25     sprintf((char *)(&buf[2]),"39m");
26     fwrite(buf,1,strlen(buf), f);
27     };
28    
29     void color_resetbg(FILE* f) {
30     if (f!=stderr && f!=stdout) return;
31     sprintf((char *)(&buf[2]),"49m");
32     fwrite(buf,1,strlen(buf), f);
33     }
34    
35     void color_reset(FILE* f) {
36     if (f!=stderr && f!=stdout) return;
37     sprintf((char *)(&buf[2]),"0m");
38     fwrite(buf,1,strlen(buf), f);
39     };
40    
41     void color_normal(FILE* f) {
42     if (f!=stderr && f!=stdout) return;
43     sprintf((char *)(&buf[2]),"22m");
44     fwrite(buf,1,strlen(buf), f);
45     };
46    
47     #endif
48    
49    
50     char xgapcodes[4]={'S','I', 'D', 'X'};
51    
52     int get_last(int **flast_d, int d, int diag, int *row1) {
53     if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
54     *row1 = flast_d[d-1][diag-1];
55     return diag-1;
56 gpertea 171 }
57 gpertea 93 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
58     *row1 = flast_d[d-1][diag];
59     return diag;
60     }
61     *row1 = flast_d[d-1][diag+1];
62     return diag+1;
63     }
64    
65     void GapXEditScript::print() { //debug
66     GapXEditScript* p=this;
67     do {
68 gpertea 171 GMessage("%d%c ",p->num, xgapcodes[p->op_type]);
69     } while ((p=p->next)!=NULL);
70 gpertea 93 GMessage("\n");
71     }
72    
73    
74     int BLAST_Gcd(int a, int b) {
75     int c;
76    
77     b = abs(b);
78     if (b > a)
79     c=a, a=b, b=c;
80    
81     while (b != 0) {
82     c = a%b;
83     a = b;
84     b = c;
85     }
86     return a;
87     }
88    
89     int BLAST_Gdb3(int* a, int* b, int* c) {
90     int g;
91     if (*b == 0)
92     g = BLAST_Gcd(*a, *c);
93     else
94     g = BLAST_Gcd(*a, BLAST_Gcd(*b, *c));
95     if (g > 1) {
96 gpertea 173 *a /= g;
97 gpertea 93 *b /= g;
98     *c /= g;
99     }
100     return g;
101     }
102    
103    
104 gpertea 171 uint16 get6mer(char* p) {
105     uint16 r=gdna2bit(p,3);
106 gpertea 173 r <<= 6;
107     r |= gdna2bit(p,3);
108 gpertea 171 return r;
109     }
110 gpertea 93
111 gpertea 173
112     void table6mers(const char* s, int slen, GVec<uint16>* amers[]) {
113 gpertea 174 for (uint16 i=0; i <= slen-6; i++) {
114 gpertea 173 char* p = (char*)(s+i);
115     uint16 v=get6mer(p);
116     if (amers[v]==NULL) {
117     amers[v]=new GVec<uint16>(1);
118     }
119     amers[v]->Add(i);
120     }
121     }
122    
123     GVec<uint16>* match6mer(char* start, GVec<uint16>* amers[]) {
124     //careful: this is broken if start+5 falls beyond the end of the string!
125     uint16 r=get6mer(start);
126     return amers[r];
127     }
128    
129 gpertea 93 //signal that a diagonal is invalid
130     static const int kInvalidOffset = -2;
131    
132     int s_FindFirstMismatch(const char *seq1, int len1,
133     const char *seq2, int len2,
134     int seq1_index, int seq2_index,
135     //bool &fence_hit,
136     bool reverse) {
137     int start_index = seq1_index;
138     /* Sentry detection here should be relatively inexpensive: The
139     sentry value cannot appear in the query, so detection only
140     needs to be done at exit from the subject-query matching loop.
141     For uncompressed sequences, ambiguities in the query (i.e. seq1)
142     always count as mismatches */
143     if (reverse) {
144     while (seq1_index < len1 && seq2_index < len2 &&
145     //seq1[len1-1 - seq1_index] < 4 &&
146     seq1[len1-1 - seq1_index] == seq2[len2-1 - seq2_index]) {
147     ++seq1_index;
148     ++seq2_index;
149     }
150     //if (seq2_index < len2 && seq2[len2-1-seq2_index] == FENCE_SENTRY) {
151     //if len2-1-seq2_index<=0) {
152     // fence_hit = true;
153     // }
154     }
155     else { //forward lookup
156 gpertea 171 while (seq1_index < len1 && seq2_index < len2 &&
157 gpertea 93 //seq1[seq1_index] < 4 &&
158     seq1[seq1_index] == seq2[seq2_index]) {
159     ++seq1_index;
160     ++seq2_index;
161     }
162     //if (seq2_index < len2 && seq2[seq2_index] == FENCE_SENTRY) {
163     //if (seq2_index==len2) {
164     // fence_hit = true;
165     //}
166     }
167     return seq1_index - start_index;
168     }
169    
170    
171    
172     /** During the traceback for a non-affine greedy alignment,
173     compute the diagonal that will result from the next
174     traceback operation
175    
176     @param last_seq2_off Array of offsets into the second sequence;
177     last_seq2_off[d][k] gives the largest offset into
178     the second sequence that lies on diagonal k and
179     has distance d [in]
180     @param d Starting distance [in]
181     @param diag Index of diagonal that produced the starting distance [in]
182     @param seq2_index The offset into the second sequence after the traceback
183     operation has completed [out]
184     @return The diagonal resulting from the next traceback operation
185     being applied
186     */
187     int s_GetNextNonAffineTback(int **last_seq2_off, int d,
188     int diag, int *seq2_index) {
189     // choose the traceback operation that results in the
190     // largest seq2 offset at this point, then compute the
191     // new diagonal that is implied by the operation
192     if (last_seq2_off[d-1][diag-1] >
193     GMAX(last_seq2_off[d-1][diag], last_seq2_off[d-1][diag+1])) {
194     *seq2_index = last_seq2_off[d-1][diag-1];
195     return diag - 1; // gap in seq2
196     }
197     if (last_seq2_off[d-1][diag] > last_seq2_off[d-1][diag+1]) {
198     *seq2_index = last_seq2_off[d-1][diag];
199     return diag; // match
200     }
201     *seq2_index = last_seq2_off[d-1][diag+1];
202     return diag + 1; // gap in seq1
203     }
204    
205    
206     int GXGreedyExtend(const char* seq1, int len1,
207     const char* seq2, int len2,
208 gpertea 191 bool reverse, //int xdrop_threshold, int match_cost, int mismatch_cost,
209 gpertea 93 int& seq1_align_len, int& seq2_align_len,
210 gpertea 101 CGreedyAlignData& aux_data,
211 gpertea 93 GXEditScript *edit_block) {
212     //GapPrelimEditBlock *edit_block,
213     //bool& fence_hit, SGreedySeed *seed) {
214     int seq1_index;
215     int seq2_index;
216     int index;
217     int d;
218     int k;
219     int diag_lower, diag_upper;
220     int max_dist;
221     int diag_origin;
222     int best_dist;
223     int best_diag;
224     int** last_seq2_off;
225     int* max_score;
226     int xdrop_offset;
227     int longest_match_run;
228     bool end1_reached, end2_reached;
229     GXMemPool* mem_pool;
230 gpertea 171
231 gpertea 93 /* ordinary dynamic programming alignment, for each offset
232     in seq1, walks through offsets in seq2 until an X-dropoff
233 gpertea 171 test fails, saving the best score encountered along
234 gpertea 93 the way. Instead of score, this code tracks the 'distance'
235     (number of mismatches plus number of gaps) between seq1
236     and seq2. Instead of walking through sequence offsets, it
237     walks through diagonals that can achieve a given distance.
238 gpertea 171
239 gpertea 93 Note that in what follows, the numbering of diagonals implies
240 gpertea 171 a dot matrix where increasing seq1 offsets go to the right on
241 gpertea 93 the x axis, and increasing seq2 offsets go up the y axis.
242     The gapped alignment thus proceeds up and to the right in
243     the graph, and diagonals are numbered increasing to the right */
244    
245     best_dist = 0;
246     best_diag = 0;
247    
248     /* set the number of distinct distances the algorithm will
249 gpertea 171 examine in the search for an optimal alignment. The
250     heuristic worst-case running time of the algorithm is
251 gpertea 93 O(max_dist**2 + (len1+len2)); for sequences which are
252     very similar, the average running time will be sig-
253     nificantly better than this */
254    
255     max_dist = GMIN(GREEDY_MAX_COST,
256 gpertea 109 (len2/GREEDY_MAX_COST_FRACTION + 1));
257 gpertea 93
258     /* the main loop assumes that the index of all diagonals is
259 gpertea 171 biased to lie in the middle of allocated bookkeeping
260 gpertea 93 structures */
261    
262     diag_origin = max_dist + 2;
263    
264     // last_seq2_off[d][k] is the largest offset into seq2 that
265     // lies on diagonal k and has distance d
266    
267     last_seq2_off = aux_data.last_seq2_off;
268    
269     /* Instead of tracking the best alignment score and using
270     xdrop_theshold directly, track the best score for each
271     unique distance and use the best score for some previously
272     computed distance to implement the X-dropoff test.
273    
274     xdrop_offset gives the distance backwards in the score
275     array to look */
276    
277 gpertea 191 xdrop_offset = aux_data.xdrop_ofs;
278 gpertea 171
279 gpertea 93 // find the offset of the first mismatch between seq1 and seq2
280    
281     index = s_FindFirstMismatch(seq1, len1, seq2, len2, 0, 0, reverse);
282     // fence_hit, reverse, rem);
283    
284     // update the extents of the alignment, and bail out
285     // early if no further work is needed
286    
287     seq1_align_len = index;
288     seq2_align_len = index;
289     seq1_index = index;
290     /*
291     seed->start_q = 0;
292     seed->start_s = 0;
293     seed->match_length = index;
294     */
295     longest_match_run = index;
296    
297     if (index == len1 || index == len2) {
298     /* Return the number of differences, which is zero here */
299     if (edit_block != NULL)
300     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
301     edit_block->opRep(index);
302 gpertea 171 return 0;
303 gpertea 93 }
304    
305     // set up the memory pool
306     mem_pool = aux_data.space;
307     if (edit_block == NULL) {
308     mem_pool = NULL;
309     }
310     else if (mem_pool == NULL) {
311     aux_data.space = mem_pool = new GXMemPool();
312 gpertea 171 }
313 gpertea 93 else {
314     mem_pool->refresh();
315     }
316 gpertea 171
317 gpertea 93 /* set up the array of per-distance maximum scores. There
318     are max_diags + xdrop_offset distances to track, the first
319     xdrop_offset of which are 0 */
320    
321     max_score = aux_data.max_score + xdrop_offset;
322     for (index = 0; index < xdrop_offset; index++)
323     aux_data.max_score[index] = 0;
324 gpertea 171
325 gpertea 93 // fill in the initial offsets of the distance matrix
326    
327     last_seq2_off[0][diag_origin] = seq1_index;
328 gpertea 191 max_score[0] = seq1_index * aux_data.match_reward;
329 gpertea 93 diag_lower = diag_origin - 1;
330     diag_upper = diag_origin + 1;
331     end1_reached = end2_reached = false;
332    
333     // for each distance
334     for (d = 1; d <= max_dist; d++) {
335     int xdrop_score;
336     int curr_score;
337     int curr_extent = 0;
338     int curr_seq2_index = 0;
339     int curr_diag = 0;
340     int tmp_diag_lower = diag_lower;
341     int tmp_diag_upper = diag_upper;
342    
343     // Assign impossible seq2 offsets to any diagonals that
344     // are not in the range (diag_lower,diag_upper).
345     // These will serve as sentinel values for the inner loop
346     last_seq2_off[d - 1][diag_lower-1] = kInvalidOffset;
347     last_seq2_off[d - 1][diag_lower] = kInvalidOffset;
348     last_seq2_off[d - 1][diag_upper] = kInvalidOffset;
349     last_seq2_off[d - 1][diag_upper+1] = kInvalidOffset;
350    
351     // compute the score for distance d corresponding to the X-dropoff criterion
352    
353 gpertea 171 xdrop_score = max_score[d - xdrop_offset] +
354 gpertea 191 (aux_data.match_reward + aux_data.mismatch_penalty) * d - aux_data.x_drop;
355     xdrop_score = (int)ceil((double)xdrop_score / (aux_data.match_reward>>1));
356 gpertea 93
357     // for each diagonal of interest
358     for (k = tmp_diag_lower; k <= tmp_diag_upper; k++) {
359     /* find the largest offset into seq2 that increases
360     the distance from d-1 to d (i.e. keeps the alignment
361 gpertea 171 from getting worse for as long as possible), then
362 gpertea 93 choose the offset into seq1 that will keep the
363 gpertea 171 resulting diagonal fixed at k
364    
365 gpertea 93 Note that this requires kInvalidOffset+1 to be smaller
366     than any valid offset into seq2, i.e. to be negative */
367    
368     seq2_index = GMAX(last_seq2_off[d - 1][k + 1],
369     last_seq2_off[d - 1][k ]) + 1;
370     seq2_index = GMAX(seq2_index, last_seq2_off[d - 1][k - 1]);
371     seq1_index = seq2_index + k - diag_origin;
372    
373     if (seq2_index < 0 || seq1_index + seq2_index < xdrop_score) {
374    
375     // if no valid diagonal can reach distance d, or the
376     // X-dropoff test fails, narrow the range of diagonals
377     // to test and skip to the next diagonal
378     if (k == diag_lower)
379     diag_lower++;
380     else
381     last_seq2_off[d][k] = kInvalidOffset;
382     continue;
383     }
384     diag_upper = k;
385 gpertea 171
386     /* slide down diagonal k until a mismatch
387 gpertea 93 occurs. As long as only matches are encountered,
388     the current distance d will not change */
389    
390     index = s_FindFirstMismatch(seq1, len1, seq2, len2,
391     seq1_index, seq2_index, reverse);
392     //fence_hit, reverse, rem);
393     if (index > longest_match_run) {
394     //seed->start_q = seq1_index;
395     //seed->start_s = seq2_index;
396     //seed->match_length = index;
397     longest_match_run = index;
398     }
399     seq1_index += index;
400     seq2_index += index;
401    
402     // set the new largest seq2 offset that achieves
403     // distance d on diagonal k
404    
405     last_seq2_off[d][k] = seq2_index;
406    
407     // since all values of k are constrained to have the
408     // same distance d, the value of k which maximizes the
409     // alignment score is the one that covers the most of seq1 and seq2
410     if (seq1_index + seq2_index > curr_extent) {
411     curr_extent = seq1_index + seq2_index;
412     curr_seq2_index = seq2_index;
413     curr_diag = k;
414     }
415    
416     /* clamp the bounds on diagonals to avoid walking off
417     either sequence. Because the bounds increase by at
418     most one for each distance, diag_lower and diag_upper
419     can each be of size at most max_diags+2 */
420    
421     if (seq2_index == len2) {
422 gpertea 171 diag_lower = k + 1;
423 gpertea 93 end2_reached = true;
424     }
425     if (seq1_index == len1) {
426 gpertea 171 diag_upper = k - 1;
427 gpertea 93 end1_reached = true;
428     }
429     } // end loop over diagonals
430    
431     // compute the maximum score possible for distance d
432 gpertea 191 curr_score = curr_extent * (aux_data.match_reward / 2) -
433     d * (aux_data.match_reward + aux_data.mismatch_penalty);
434 gpertea 93 // if this is the best score seen so far, update the
435     // statistics of the best alignment
436     if (curr_score > max_score[d - 1]) {
437     max_score[d] = curr_score;
438     best_dist = d;
439     best_diag = curr_diag;
440     seq2_align_len = curr_seq2_index;
441     seq1_align_len = curr_seq2_index + best_diag - diag_origin;
442 gpertea 171 }
443 gpertea 93 else {
444     max_score[d] = max_score[d - 1];
445     }
446    
447     // alignment has finished if the lower and upper bounds
448     // on diagonals to check have converged to each other
449    
450     if (diag_lower > diag_upper)
451     break;
452    
453 gpertea 171 /* set up for the next distance to examine. Because the
454     bounds increase by at most one for each distance,
455     diag_lower and diag_upper can each be of size at
456 gpertea 93 most max_diags+2 */
457    
458     if (!end2_reached)
459 gpertea 171 diag_lower--;
460 gpertea 93 if (!end1_reached)
461     diag_upper++;
462    
463     if (edit_block == NULL) {
464     // if no traceback is specified, the next row of
465     // last_seq2_off can reuse previously allocated memory
466 gpertea 301 //WARNING The following assumes two arrays of
467 gpertea 93 // at least max_dist+4 int's have already been allocated
468     last_seq2_off[d + 1] = last_seq2_off[d - 1];
469     }
470     else {
471     // traceback requires all rows of last_seq2_off to be saved,
472     // so a new row must be allocated
473     last_seq2_off[d + 1] = (int*)mem_pool->getByteSpace((diag_upper - diag_lower + 7)*sizeof(int));
474     // move the origin for this row backwards
475 gpertea 301 // dubious pointer arithmetic ?!
476 gpertea 93 //last_seq2_off[d + 1] = last_seq2_off[d + 1] - diag_lower + 2;
477     }
478     } // end loop over distinct distances
479    
480 gpertea 171
481 gpertea 93 if (edit_block == NULL)
482     return best_dist;
483    
484     //---- perform traceback
485 gpertea 171 d = best_dist;
486 gpertea 93 seq1_index = seq1_align_len;
487     seq2_index = seq2_align_len;
488     // for all positive distances
489    
490     //if (fence_hit && *fence_hit)
491     // goto done;
492     if (index==len1 || index==len2) d=0;
493     while (d > 0) {
494     int new_diag;
495     int new_seq2_index;
496    
497     /* retrieve the value of the diagonal after the next
498     traceback operation. best_diag starts off with the
499     value computed during the alignment process */
500    
501 gpertea 171 new_diag = s_GetNextNonAffineTback(last_seq2_off, d,
502 gpertea 93 best_diag, &new_seq2_index);
503    
504     if (new_diag == best_diag) {
505     // same diagonal: issue a group of substitutions
506     if (seq2_index - new_seq2_index > 0) {
507     edit_block->opRep(seq2_index - new_seq2_index);
508     }
509 gpertea 171 }
510 gpertea 93 else if (new_diag < best_diag) {
511     // smaller diagonal: issue a group of substitutions
512     // and then a gap in seq2 */
513     if (seq2_index - new_seq2_index > 0) {
514     edit_block->opRep(seq2_index - new_seq2_index);
515     }
516     //GapPrelimEditBlockAdd(edit_block, eGapAlignIns, 1);
517     edit_block->opIns(1);
518 gpertea 171 }
519 gpertea 93 else {
520     // larger diagonal: issue a group of substitutions
521     // and then a gap in seq1
522     if (seq2_index - new_seq2_index - 1 > 0) {
523     edit_block->opRep(seq2_index - new_seq2_index - 1);
524     }
525     edit_block->opDel(1);
526     }
527 gpertea 171 d--;
528     best_diag = new_diag;
529     seq2_index = new_seq2_index;
530 gpertea 93 }
531     //done:
532     // handle the final group of substitutions back to distance zero,
533     // i.e. back to offset zero of seq1 and seq2
534     //GapPrelimEditBlockAdd(edit_block, eGapAlignSub,
535     // last_seq2_off[0][diag_origin]);
536     edit_block->opRep(last_seq2_off[0][diag_origin]);
537     if (!reverse)
538     edit_block->reverse();
539     return best_dist;
540     }
541    
542     void printEditScript(GXEditScript* ed_script) {
543     uint i;
544     if (ed_script==NULL || ed_script->opnum == 0)
545     return;
546     for (i=0; i<ed_script->opnum; i++) {
547     int num=((ed_script->ops[i]) >> 2);
548     unsigned char op_type = 3 - ( ed_script->ops[i] & gxEDIT_OP_MASK );
549     if (op_type == 3)
550     GError("Error: printEditScript encountered op_type 3 ?!\n");
551 gpertea 171 GMessage("%d%c ", num, xgapcodes[op_type]);
552 gpertea 93 }
553     GMessage("\n");
554     }
555    
556     GXAlnInfo* GreedyAlign(const char* q_seq, int q_alnstart, const char* s_seq, int s_alnstart,
557     bool editscript, int reward, int penalty, int xdrop) {
558     int q_max=strlen(q_seq); //query
559     int s_max=strlen(s_seq); //subj
560     return GreedyAlignRegion(q_seq, q_alnstart, q_max,
561 gpertea 101 s_seq, s_alnstart, s_max, reward, penalty, xdrop, NULL, NULL, editscript);
562 gpertea 93 }
563    
564     struct GXSeedTable {
565     int a_num, b_num;
566     int a_cap, b_cap;
567     char* xc;
568     GXSeedTable(int a=12, int b=255) {
569     a_cap=0;
570     b_cap=0;
571     a_num=0;
572     b_num=0;
573     xc=NULL;
574     init(a,b);
575     }
576     ~GXSeedTable() {
577     GFREE(xc);
578     }
579     void init(int a, int b) {
580     a_num=a;
581     b_num=b;
582     bool resize=false;
583     if (b_num>b_cap) { resize=true; b_cap=b_num;}
584     if (a_num>a_cap) { resize=true; a_cap=a_num;}
585     if (resize) {
586     GFREE(xc);
587     GCALLOC(xc, (a_num*b_num));
588     }
589     else {
590     //just clear up to a_max, b_max
591     memset((void*)xc, 0, (a_num*b_num));
592     }
593     }
594     char& x(int ax, int by) {
595     return xc[by*a_num+ax];
596     }
597    
598     };
599    
600 gpertea 99 const int a_m_score=2; //match score
601     const int a_mis_score=-3; //mismatch
602     const int a_dropoff_score=7;
603 gpertea 101 const int a_min_score=12; //at least 6 bases full match
604 gpertea 93
605 gpertea 181 // ------------------ adaptor matching - simple k-mer seed & extend, no indels for now
606 gpertea 99 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
607     //check minimum score and
608 gpertea 181 //for 3' adaptor trimming:
609 gpertea 99 // require that the right end of the alignment for either the adaptor OR the read must be
610     // < 3 distance from its right end
611 gpertea 181 // for 5' adaptor trimming:
612 gpertea 99 // require that the left end of the alignment for either the adaptor OR the read must
613     // be at coordinate < 3 from start
614    
615     bool extendUngapped(const char* a, int alen, int ai,
616     const char* b, int blen, int bi, int& mlen, int& l5, int& l3, bool end5=false) {
617     //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
618     //if (debug) {
619     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
620     // }
621     int a_l=ai; //alignment coordinates on a
622     int a_r=ai+mlen-1;
623     int b_l=bi; //alignment coordinates on b
624     int b_r=bi+mlen-1;
625     int ai_maxscore=ai;
626     int bi_maxscore=bi;
627     int score=mlen*a_m_score;
628     int maxscore=score;
629     int mism5score=a_mis_score;
630     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
631     //try to extend to the left first, if possible
632     while (ai>0 && bi>0) {
633     ai--;
634     bi--;
635     score+= (a[ai]==b[bi])? a_m_score : mism5score;
636     if (score>maxscore) {
637     ai_maxscore=ai;
638     bi_maxscore=bi;
639     maxscore=score;
640     }
641     else if (maxscore-score>a_dropoff_score) break;
642     }
643     a_l=ai_maxscore;
644     b_l=bi_maxscore;
645     //now extend to the right
646     ai_maxscore=a_r;
647     bi_maxscore=b_r;
648     ai=a_r;
649     bi=b_r;
650     score=maxscore;
651     //sometimes there are extra As at the end of the read, ignore those
652     if (a[alen-2]=='A' && a[alen-1]=='A') {
653     alen-=2;
654     while (a[alen-1]=='A' && alen>ai) alen--;
655     }
656     while (ai<alen-1 && bi<blen-1) {
657     ai++;
658     bi++;
659     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
660     if (a[ai]==b[bi]) { //match
661     score+=a_m_score;
662     if (ai>=alen-2) {
663     score+=a_m_score-(alen-ai-1);
664     }
665     }
666     else { //mismatch
667     score+=a_mis_score;
668     }
669     if (score>maxscore) {
670     ai_maxscore=ai;
671     bi_maxscore=bi;
672     maxscore=score;
673     }
674     else if (maxscore-score>a_dropoff_score) break;
675     }
676     a_r=ai_maxscore;
677     b_r=bi_maxscore;
678     int a_ovh3=alen-a_r-1;
679     int b_ovh3=blen-b_r-1;
680     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
681     int mmovh5=(a_l<b_l)? a_l : b_l;
682     if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
683     if (a_l<a_ovh3) {
684 gpertea 181 //adaptor closer to the left end (typical for 5' adaptor)
685 gpertea 99 l5=a_r+1;
686     l3=alen-1;
687     }
688     else {
689 gpertea 181 //adaptor matching at the right end (typical for 3' adaptor)
690 gpertea 99 l5=0;
691     l3=a_l-1;
692     }
693     return true;
694     }
695     //do not trim:
696     l5=0;
697     l3=alen-1;
698     return false;
699     }
700    
701 gpertea 191 GXBandSet* collectSeeds(GList<GXSeed>& seeds, GXSeqData& sd, GAlnTrimType trim_intent) {
702 gpertea 178 int bimin=GMAX(0,(sd.blen-sd.alen-6)); //from collectSeeds_R
703 gpertea 191 int bimax=GMIN((sd.alen+2), (sd.blen-6));
704     int b_start = (trim_intent==galn_TrimRight) ? bimin : 0;
705     int b_end = (trim_intent==galn_TrimLeft) ? bimax : sd.blen-6;
706 gpertea 93 //gx.init(a_maxlen, b_maxlen);
707 gpertea 176 GXSeedTable gx(sd.alen, sd.blen);
708     GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
709 gpertea 191 for (int bi=b_start;bi<=b_end;bi++) {
710 gpertea 174 //for each 6-mer of seqb
711 gpertea 176 uint16 bv = get6mer((char*) & (sd.bseq[bi]));
712     GVec<uint16>* alocs = sd.amers[bv];
713 gpertea 174 if (alocs==NULL) continue;
714     //extend each hit
715     for (int h=0;h<alocs->Count();h++) {
716     int ai=alocs->Get(h);
717     //word match
718     if (gx.x(ai,bi))
719     //already have a previous seed covering this region of this diagonal
720     continue;
721 gpertea 191 if (trim_intent==galn_TrimLeft && sd.blen>sd.alen+6 && bi>ai+6)
722     continue; //improper positioning for 5' trimming
723     if (trim_intent==galn_TrimRight && sd.blen>sd.alen+6 && bi<ai-6)
724     continue; //improper positioning for 5' trimming
725    
726 gpertea 188 //TODO: there could be Ns in this seed, should we count/adjust score?
727 gpertea 174 for (int i=0;i<6;i++)
728     gx.x(ai+i,bi+i)=1;
729     //see if we can extend to the right
730     int aix=ai+6;
731     int bix=bi+6;
732     int len=6;
733 gpertea 176 while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
734 gpertea 174 gx.x(aix,bix)=1;
735     aix++;bix++;
736     len++;
737     }
738 gpertea 176 if (len>sd.amlen) {
739 gpertea 174 //heuristics: very likely the best we can get
740     //quick match shortcut
741     diagstrips->qmatch=new GXSeed(ai,bi,len);
742     return diagstrips;
743 gpertea 176 }
744 gpertea 181 if (bi>bimax && bi<bimin && len<9)
745 gpertea 191 //ignore mid-sequence seeds that are not high scoring
746 gpertea 181 continue;
747 gpertea 178
748 gpertea 174 GXSeed* newseed=new GXSeed(ai,bi,len);
749     seeds.Add(newseed);
750     diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
751 gpertea 191 //keep last resort terminal match to be used if no better alignment is there
752 gpertea 176 if (bi<2 && ai+len>=sd.alen-1 &&
753 gpertea 178 (!diagstrips->tmatch_l || diagstrips->tmatch_l->len<len))
754     diagstrips->tmatch_l=newseed;
755     //collectSeeds_R:
756     if (ai<2 && bi+len>sd.blen-2 &&
757     (!diagstrips->tmatch_r || diagstrips->tmatch_r->len<len))
758     diagstrips->tmatch_r=newseed;
759 gpertea 174 }
760     } //for each 6-mer of the read
761 gpertea 93 for (int i=0;i<diagstrips->Count();i++) {
762     diagstrips->Get(i)->finalize(); //adjust scores due to overlaps or gaps between seeds
763     }
764     diagstrips->setSorted(true); //sort by score
765     return diagstrips;
766     }
767    
768     int cmpSeedScore(const pointer p1, const pointer p2) {
769     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
770     GXSeed* s1=(GXSeed*)p1;
771     GXSeed* s2=(GXSeed*)p2;
772     if (s1->len==s2->len) {
773     return (s1->b_ofs-s2->b_ofs);
774     }
775     else return (s2->len-s1->len);
776     }
777    
778 gpertea 174 int cmpSeedScore_R(const pointer p1, const pointer p2) {
779     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
780     GXSeed* s1=(GXSeed*)p1;
781     GXSeed* s2=(GXSeed*)p2;
782     if (s1->len==s2->len) {
783     return (s2->b_ofs-s1->b_ofs);
784     }
785     else return (s2->len-s1->len);
786     }
787    
788    
789 gpertea 93 int cmpSeedDiag(const pointer p1, const pointer p2) {
790     GXSeed* s1=(GXSeed*)p1;
791     GXSeed* s2=(GXSeed*)p2;
792     return ((s1->b_ofs-s1->a_ofs)-(s2->b_ofs-s2->a_ofs));
793     }
794    
795 gpertea 101
796     int cmpDiagBands_R(const pointer p1, const pointer p2) {
797     //return (((GXSeed*)s2)->len-((GXSeed*)s1)->len);
798     GXBand* b1=(GXBand*)p1;
799     GXBand* b2=(GXBand*)p2;
800     if (b1->score==b2->score) {
801     return (b2->w_min_b-b1->w_min_b);
802     }
803     else return (b2->score-b1->score);
804     }
805    
806    
807    
808 gpertea 93 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
809 gpertea 101 const char* s_seq, int s_alnstart, int s_max,
810     int reward, int penalty, int xdrop, CGreedyAlignData* gxmem,
811     CAlnTrim* trim, bool editscript) {
812 gpertea 93 GXEditScript* ed_script_fwd = NULL;
813     GXEditScript* ed_script_rev = NULL;
814     if ( q_alnstart>q_max || q_alnstart<1 || s_alnstart>s_max || s_alnstart<1 )
815     GError("GreedyAlign() Error: invalid anchor coordinate.\n");
816     q_alnstart--;
817     s_alnstart--;
818     if (q_seq==NULL || q_seq[0]==0 || s_seq==NULL || s_seq[0]==0)
819     GError("GreedyAlign() Error: attempt to use an empty sequence string!\n");
820 gpertea 188 /*if (q_seq[q_alnstart]!=s_seq[s_alnstart])
821     GError("GreedyAlign() Error: improper anchor (mismatch):\n%s (start %d len %d)\n%s (start %d len %d)\n",
822     q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max);
823     */
824 gpertea 93 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
825     const char* q=q_seq+q_alnstart;
826     int q_avail=q_max-q_alnstart;
827     const char* s=s_seq+s_alnstart;
828     int s_avail=s_max-s_alnstart;
829     if (penalty<0) penalty=-penalty;
830     GXAlnInfo* alninfo=NULL;
831     bool freeAlnMem=(gxmem==NULL);
832     if (freeAlnMem) {
833 gpertea 101 gxmem=new CGreedyAlignData(reward, penalty, xdrop);
834 gpertea 191 reward=gxmem->match_reward;
835     penalty=gxmem->mismatch_penalty;
836     xdrop=gxmem->x_drop;
837 gpertea 93 }
838 gpertea 191 else
839     gxmem->reset();
840     int minMatch= trim ? trim->minMatch : 6;
841     int MIN_GREEDY_SCORE=minMatch*reward; //minimum score for an alignment to be reported for 0 diffs
842 gpertea 93 int retscore = 0;
843     int numdiffs = 0;
844 gpertea 101 if (trim!=NULL && trim->type==galn_TrimLeft) {
845 gpertea 176 //intent: trimming the left side
846 gpertea 93 if (editscript)
847     ed_script_rev=new GXEditScript();
848    
849 gpertea 191 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
850     s_ext_l, q_ext_l, *gxmem, ed_script_rev);
851 gpertea 93 //check this extension here and bail out if it's not a good extension
852 gpertea 178 if (s_ext_l+(trim->seedlen>>1) < trim->safelen &&
853 gpertea 181 q_alnstart+1-q_ext_l>1 &&
854     s_alnstart+1-s_ext_l>trim->l_boundary) {
855 gpertea 93 delete ed_script_rev;
856     if (freeAlnMem) delete gxmem;
857     return NULL;
858     }
859    
860     if (editscript)
861     ed_script_fwd=new GXEditScript();
862 gpertea 191 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, //xdrop, reward, penalty,
863     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
864 gpertea 93 numdiffs=numdiffs_r+numdiffs_l;
865     //convert num diffs to actual score
866 gpertea 191 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
867 gpertea 93 if (editscript)
868     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
869     }
870     else {
871     if (editscript) {
872     ed_script_fwd=new GXEditScript();
873     }
874 gpertea 191 int numdiffs_r = GXGreedyExtend(s, s_avail, q, q_avail, false, // xdrop, reward, penalty,
875     s_ext_r, q_ext_r, *gxmem, ed_script_fwd);
876 gpertea 93 //check extension here and bail out if not a good right extension
877     //assuming s_max is really at the right end of s_seq
878 gpertea 109 if (trim!=NULL && trim->type==galn_TrimRight &&
879 gpertea 176 s_ext_r+(trim->seedlen>>1) < trim->safelen &&
880 gpertea 181 q_alnstart+q_ext_r<q_max-2 &&
881     s_alnstart+s_ext_r<trim->r_boundary) {
882 gpertea 93 delete ed_script_fwd;
883     if (freeAlnMem) delete gxmem;
884     return NULL;
885     }
886     if (editscript)
887     ed_script_rev=new GXEditScript();
888 gpertea 191 int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, // xdrop, reward, penalty,
889     s_ext_l, q_ext_l, *gxmem, ed_script_rev);
890 gpertea 93 //convert num diffs to actual score
891     numdiffs=numdiffs_r+numdiffs_l;
892 gpertea 191 retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*(reward>>1) - numdiffs*(reward+penalty);
893 gpertea 93 if (editscript)
894     ed_script_rev->Append(ed_script_fwd); //combine the two extensions
895     }
896     if (retscore>=MIN_GREEDY_SCORE) {
897 gpertea 191 alninfo=new GXAlnInfo(q_seq, q_alnstart+1-q_ext_l, q_alnstart+q_ext_r, s_seq, s_alnstart+1-s_ext_l, s_alnstart+s_ext_r);
898 gpertea 93 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
899     alninfo->score=retscore;
900 gpertea 191 if (gxmem->scaled) alninfo->score >>= 1;
901 gpertea 93 alninfo->pid = 100 * (1 - ((double) numdiffs) / hsp_length);
902 gpertea 310 #ifdef TRIMDEBUG
903 gpertea 104 //if (ed_script_rev) {
904     // GMessage("Final Edit script ::: ");
905     // printEditScript(ed_script_rev);
906     // }
907 gpertea 93 #endif
908     alninfo->editscript=ed_script_rev;
909     alninfo->gapinfo = new CAlnGapInfo(ed_script_rev, alninfo->ql-1, alninfo->sl-1);
910     }
911     else {
912 gpertea 191 //if (freeAlnMem) delete gxmem;
913 gpertea 93 delete ed_script_rev;
914     delete alninfo;
915     alninfo=NULL;
916     }
917 gpertea 191 if (freeAlnMem) delete gxmem;
918 gpertea 93 delete ed_script_fwd;
919     return alninfo;
920     }
921 gpertea 96
922 gpertea 101 GXAlnInfo* GreedyAlignRegion(const char* q_seq, int q_alnstart, int q_max,
923     const char* s_seq, int s_alnstart, int s_max, CGreedyAlignData* gxmem,
924     CAlnTrim* trim, bool editscript) {
925     int reward=2;
926 gpertea 191 int penalty=10;
927     int xdrop=32;
928 gpertea 101 if (gxmem) {
929     reward=gxmem->match_reward;
930     penalty=gxmem->mismatch_penalty;
931     xdrop=gxmem->x_drop;
932     }
933     return GreedyAlignRegion(q_seq, q_alnstart, q_max, s_seq, s_alnstart, s_max,
934     reward, penalty, xdrop, gxmem, trim, editscript);
935     }
936    
937 gpertea 191 GXAlnInfo* match_adaptor(GXSeqData& sd, GAlnTrimType trim_type, int minMatch,
938 gpertea 181 CGreedyAlignData* gxmem, int min_pid) {
939 gpertea 96 bool editscript=false;
940 gpertea 310 #ifdef TRIMDEBUG
941 gpertea 96 editscript=true;
942 gpertea 181 if (trim_type==galn_TrimLeft) {
943     GMessage("=======> searching left (5') end : %s\n", sd.aseq);
944     }
945     else if (trim_type==galn_TrimRight) {
946     GMessage("=======> searching right(3') end : %s\n", sd.aseq);
947     }
948     else if (trim_type==galn_TrimEither) {
949     GMessage("==========> searching both ends : %s\n", sd.aseq);
950     }
951 gpertea 96 #endif
952 gpertea 191 CAlnTrim trimInfo(trim_type, sd.bseq, sd.blen, sd.alen, minMatch, sd.amlen);
953 gpertea 176 GList<GXSeed> rseeds(true,true,false);
954 gpertea 191 GXBandSet* alnbands=collectSeeds(rseeds, sd, trim_type);
955 gpertea 101 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
956 gpertea 99 //did we find a shortcut?
957     if (alnbands->qmatch) {
958 gpertea 310 #ifdef TRIMDEBUG
959 gpertea 174 GMessage("::: Found a quick long match at %d, len %d\n",
960     alnbands->qmatch->b_ofs, alnbands->qmatch->len);
961     #endif
962 gpertea 101 anchor_seeds.Add(alnbands->qmatch);
963     }
964     else {
965     int max_top_bands=5;
966     int top_band_count=0;
967     for (int b=0;b<alnbands->Count();b++) {
968 gpertea 174 if (alnbands->Get(b)->score<6) break;
969 gpertea 310 //#ifdef TRIMDEBUG
970 gpertea 104 //GMessage("\tBand %d score: %d\n", b, alnbands->Get(b)->score);
971     //#endif
972 gpertea 101 top_band_count++;
973     GXBand& band=*(alnbands->Get(b));
974     band.seeds.setSorted(cmpSeedScore);
975     anchor_seeds.Add(band.seeds.First());
976 gpertea 181 //band.tested=true;
977 gpertea 101 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
978     }
979 gpertea 310 //#ifdef TRIMDEBUG
980 gpertea 104 //GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
981     //#endif
982 gpertea 101 }
983 gpertea 96 GList<GXAlnInfo> galns(true,true,false);
984     for (int i=0;i<anchor_seeds.Count();i++) {
985     GXSeed& aseed=*anchor_seeds[i];
986     int a1=aseed.a_ofs+(aseed.len>>1)+1;
987     int a2=aseed.b_ofs+(aseed.len>>1)+1;
988 gpertea 176 trimInfo.seedlen=aseed.len;
989 gpertea 310 #ifdef TRIMDEBUG
990 gpertea 178 GMessage("\t::: align from seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
991     aseed.len);
992     #endif
993 gpertea 176 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
994     sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
995 gpertea 181 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
996 gpertea 96 galns.AddIfNew(alninfo, true);
997     else delete alninfo;
998     }
999 gpertea 181
1000     if (galns.Count()==0) {
1001     //last resort: look for weaker terminal seeds
1002     GPVec<GXSeed> tmatches(2,false);
1003     if (trim_type!=galn_TrimRight) {
1004     if (alnbands->tmatch_l)
1005     tmatches.Add(alnbands->tmatch_l);
1006     }
1007     if (trim_type!=galn_TrimLeft) {
1008     if (alnbands->tmatch_r)
1009     tmatches.Add(alnbands->tmatch_r);
1010     }
1011     for (int i=0;i<tmatches.Count();i++) {
1012     GXSeed& aseed=*tmatches[i];
1013     int halfseed=aseed.len>>1;
1014     int a1=aseed.a_ofs+halfseed+1;
1015     int a2=aseed.b_ofs+halfseed+1;
1016     trimInfo.seedlen=aseed.len;
1017 gpertea 310 #ifdef TRIMDEBUG
1018 gpertea 181 GMessage("\t::: align from terminal seed (%d, %d)of len %d.\n",aseed.a_ofs, aseed.b_ofs,
1019 gpertea 178 aseed.len);
1020     #endif
1021 gpertea 181 GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1022 gpertea 176 sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1023 gpertea 181 if (alninfo && alninfo->pid>=min_pid && trimInfo.validate(alninfo))
1024 gpertea 174 galns.AddIfNew(alninfo, true);
1025 gpertea 181 else delete alninfo;
1026     }//for each terminal seed
1027 gpertea 174 }
1028 gpertea 181 //---- found all alignments
1029     delete alnbands;
1030 gpertea 174 /*
1031 gpertea 310 #ifdef TRIMDEBUG
1032 gpertea 181 //print all valid alignments found
1033     for (int i=0;i<galns.Count();i++) {
1034     GXAlnInfo* alninfo=galns[i];
1035     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
1036     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
1037     if (alninfo->gapinfo!=NULL) {
1038     GMessage("Alignment:\n");
1039     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1040 gpertea 104 }
1041 gpertea 181 }
1042     #endif
1043     */
1044 gpertea 96 if (galns.Count()) {
1045     GXAlnInfo* bestaln=galns.Shift();
1046 gpertea 310 #ifdef TRIMDEBUG
1047 gpertea 104 GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1048     bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1049     if (bestaln->gapinfo!=NULL) {
1050 gpertea 176 bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1051 gpertea 104 }
1052     #endif
1053 gpertea 96 return bestaln;
1054     }
1055     else return NULL;
1056     }