ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/select.cc
Revision: 1.2
Committed: Wed May 4 18:03:45 2005 UTC (10 years, 11 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +27 -12 lines
Log Message:
Small bug fixes, plus codon based edit distance for peptide searching.

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include "select.h"
23
24 #include "keyword_tree.h"
25 #include "keyword_tree.t"
26 #include "shift_and.h"
27 #include "shift_and_inexact.h"
28 #include "exact_bases.h"
29 #include "exact_halves.h"
30 #include "filter_bitvec.h"
31 #include "hash_table.h"
32 #include "math.h"
33
34 PatternMatch * pick_pattern_index(CharacterProducer * const & ff,
35 int pmselect,
36 int nmismatch,
37 std::vector<std::pair<int,int> > *exact_const,
38 std::vector<int> *patlen,
39 int seedlen,
40 bool wildcard,
41 bool textn,
42 bool indels,
43 bool dna_mut,
44 char eos,
45 bool verbose) {
46 long int min_exact_const=0;
47 long int cumbooldiff=0;
48 long int cumdiff=0;
49 long int cum_exact_const=0;
50 long int min_inexact_bases=MAXINT;
51 double avexact=0;
52 double avbool=0;
53 double avdiff=0;
54 if (exact_const) {
55 min_exact_const=MAXINT;
56 for (int i=1;i<(*exact_const).size();i++) {
57 if ((*exact_const)[i].first >= (*exact_const)[i].second) {
58 if (min_exact_const > (*exact_const)[i].first) {
59 min_exact_const = (*exact_const)[i].first;
60 }
61 cum_exact_const += (*exact_const)[i].first;
62 cumdiff += ((*exact_const)[i].first - ((*patlen)[i]/2));
63 cumbooldiff += ((((*exact_const)[i].first - ((*patlen)[i]/2))>=0)?1:0);
64 if (min_inexact_bases > -((*exact_const)[i].first - (*patlen)[i])) {
65 min_inexact_bases = -((*exact_const)[i].first - (*patlen)[i]);
66 }
67 } else /* (*exact_const)[i].first < (*exact_const)[i].second */ {
68 if (min_exact_const > (*exact_const)[i].second) {
69 min_exact_const = (*exact_const)[i].second;
70 }
71 cum_exact_const += (*exact_const)[i].second;
72 cumdiff += ((*exact_const)[i].second - ((*patlen)[i]/2));
73 cumbooldiff += ((((*exact_const)[i].second - ((*patlen)[i]/2))>=0)?1:0);
74 if (min_inexact_bases > -((*exact_const)[i].second - (*patlen)[i])) {
75 min_inexact_bases = -((*exact_const)[i].second - (*patlen)[i]);
76 }
77 }
78 }
79 avexact=((double)cum_exact_const)/((*exact_const).size()-1);
80 avbool=((double)cumbooldiff)/((*exact_const).size()-1);
81 avdiff=((double)cumdiff)/((*exact_const).size()-1);
82 }
83
84 long int min_length=0;
85 long int cum_len=0;
86 double avlen=0;
87 if (patlen) {
88 min_length=MAXINT;
89 for (int i=1;i<(*patlen).size();i++) {
90 if (min_length > (*patlen)[i]) {
91 min_length = (*patlen)[i];
92 }
93 cum_len += (*patlen)[i];
94 }
95 avlen=((double)cum_len)/((*patlen).size()-1);
96 }
97
98 if (min_inexact_bases > min_length) {
99 min_inexact_bases = min_length;
100 }
101
102 if (exact_const && nmismatch >= min_inexact_bases && nmismatch > 0) {
103 timestampli("Fatal error: Number of edits >= Minimum number of inexact bases: ",min_inexact_bases);
104 exit(1);
105 }
106
107 if (dna_mut && ff->size() < 20) {
108 timestamp("Fatal error: DNA Mutations for non amino-acid alphabet (size < 20)");
109 exit(1);
110 }
111
112 PatternMatch * pm=0;
113 if (pmselect == 0) {
114 if (wildcard) {
115 pmselect = 4;
116 } else if (ff->size() < 255) {
117 if (ff->nch('A') == 0 &&
118 ff->nch('C') == 1 &&
119 ff->nch('G') == 2 &&
120 ff->nch('T') == 3) {
121 pmselect = 2;
122 } else {
123 pmselect = 3;
124 }
125 } else {
126 pmselect = 3;
127 }
128 // pmselect is now the offset of the best exact pattern lookup.
129 // Now determine if we can use any of our tricks for inexact lookup.
130 if (nmismatch>0) {
131 if (nmismatch == 1 &&
132 ((min_length >= 12 && ff->size() < 10) ||
133 (min_length >= 8 && ff->size() >=10)) &&
134 (cumbooldiff <= 0 || cumdiff <= 0)) {
135 // Use exact_halves...
136 pmselect = 11 + pmselect - 1;
137 } else if (min_exact_const >= 6) {
138 // Use exact_bases...
139 pmselect = 7 + pmselect - 1;
140 } else if (seedlen > 0) {
141 // use hash table
142 pmselect = 6;
143 } else {
144 // Use inexact bitvector...
145 pmselect = 5;
146 }
147 }
148 }
149
150 if (dna_mut && pmselect == 5) {
151 timestamp("Fatal error: DNA Mutations incompatible with inexact bitvector iimplementation");
152 exit(1);
153 }
154
155 if (verbose && patlen) {
156 timestampli("Primer stats: min length: ",min_length);
157 timestampd(" average len: ",floor(avlen*10+.5)/10);
158 if (nmismatch>0) {
159 timestampli(" min exact bases: ",min_exact_const);
160 timestampd(" average exact: ",floor(avexact*10+.5)/10);
161 timestampd(" average (exact - len/2): ",floor(avdiff*10+.5)/10);
162 timestampli(" count (exact >= len/2): ",cumbooldiff);
163 timestampli(" seed length: ",seedlen);
164 }
165 timestampli(" number of primers: ",((*patlen).size()-1));
166 }
167
168 if (verbose) {
169 if (indels) {
170 timestampi("Options summary: string edits: ",nmismatch);
171 } else {
172 timestampi("Options summary: mismatches: ",nmismatch);
173 }
174 if (dna_mut) {
175 timestamp(" DNA mutation scoring");
176 }
177 if (wildcard) {
178 if (textn) {
179 timestamp(" wildcard, w/ text N");
180 } else {
181 timestamp(" wildcard, no text N");
182 }
183 } else {
184 timestamp(" no wildcard");
185 }
186 }
187
188 switch (pmselect) {
189 case 1:
190 pm = new keyword_tree<ktnode_list>;
191 if (verbose) timestamp("Using keyword tree with list nodes...");
192 break;
193 case 2:
194 pm = new keyword_tree<ktnode_dna_list>;
195 if (verbose) timestamp("Using keyword tree with nodes optimized for DNA...");
196 break;
197 case 3:
198 pm = new keyword_tree<ktnode_jtable>;
199 if (verbose) timestamp("Using keyword tree with jump table nodes...");
200 break;
201 case 4:
202 pm = new shift_and(wildcard,textn);
203 if (verbose) timestamp("Using bitvector...");
204 break;
205 case 5:
206 pm = new filter_bitvec(nmismatch,eos,wildcard,textn,indels,dna_mut);
207 if (verbose) timestamp("Using inexact bitvector...");
208 break;
209 case 6:
210 pm = new hash_table(seedlen,nmismatch,eos,wildcard,textn,indels,dna_mut);
211 if (verbose) timestamp("Using exact seed with hash table...");
212 break;
213 case 7:
214 pm = new exact_bases(new keyword_tree<ktnode_list>,nmismatch,eos,wildcard,textn,indels,dna_mut);
215 if (verbose) timestamp("Using keyword tree with list nodes for exact portion...");
216 break;
217 case 8:
218 pm = new exact_bases(new keyword_tree<ktnode_dna_list>,nmismatch,eos,wildcard,textn,indels,dna_mut);
219 if (verbose) timestamp("Using keyword tree with nodes optimized for DNA for exact portion...");
220 break;
221 case 9:
222 pm = new exact_bases(new keyword_tree<ktnode_jtable>,nmismatch,eos,wildcard,textn,indels,dna_mut);
223 if (verbose) timestamp("Using keyword tree with jump table nodes for exact portion...");
224 break;
225 case 10:
226 pm = new exact_bases(new shift_and(wildcard,textn),nmismatch,eos,wildcard,textn,indels,dna_mut);
227 if (verbose) timestamp("Using bitvector for exact portion...");
228 break;
229 case 11:
230 pm = new exact_halves(new keyword_tree<ktnode_list>,nmismatch,eos,wildcard,textn,indels,dna_mut);
231 if (verbose) timestamp("Using keyword tree with list nodes for exact halves...");
232 break;
233 case 12:
234 pm = new exact_halves(new keyword_tree<ktnode_dna_list>,nmismatch,eos,wildcard,textn,indels,dna_mut);
235 if (verbose) timestamp("Using keyword tree with nodes optimized for DNA for exact halves...");
236 break;
237 case 13:
238 pm = new exact_halves(new keyword_tree<ktnode_jtable>,nmismatch,eos,wildcard,textn,indels,dna_mut);
239 if (verbose) timestamp("Using keyword tree with jump table nodes for exact halves...");
240 break;
241 case 14:
242 pm = new exact_halves(new shift_and(wildcard,textn),nmismatch,eos,wildcard,textn,indels,dna_mut);
243 if (verbose) timestamp("Using bitvector for exact halves...");
244 break;
245 default:
246 timestamp("Bad pattern index selected...");
247 assert(0);
248 break;
249 }
250 return pm;
251 }