ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/shift_and_inexact.cc
Revision: 1.2
Committed: Wed May 4 18:03:45 2005 UTC (11 years, 2 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +10 -1 lines
Log Message:
Small bug fixes, plus codon based edit distance for peptide searching.

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include <assert.h>
23 #include <string.h>
24 #include "shift_and_inexact.h"
25 #include "util.h"
26
27 shift_and_inexact::shift_and_inexact(unsigned int k, unsigned char eos, bool wc, bool tn, bool id, bool dm) {
28 m_ = 0;
29 u_ = 0;
30 mask_=0;
31 k_ = k;
32 eos_ = eos;
33 _wc = wc;
34 _textn = tn;
35 _indels = id;
36 _dna_mut = dm;
37 }
38
39 shift_and_inexact::~shift_and_inexact() {
40 clearu();
41 }
42
43 unsigned int shift_and_inexact::mismatches() const {
44 return k_;
45 }
46
47 void shift_and_inexact::mismatches(unsigned int k) {
48 k_ = k;
49 }
50
51 bool shift_and_inexact::wildcards() const {
52 return _wc;
53 }
54
55 void shift_and_inexact::wildcards(bool wc) {
56 _wc = wc;
57 }
58
59 bool shift_and_inexact::wildcard_text_N() const {
60 return _textn;
61 }
62
63 void shift_and_inexact::wildcard_text_N(bool tn) {
64 _textn = tn;
65 }
66
67 bool shift_and_inexact::indels() const {
68 return _indels;
69 }
70
71 void shift_and_inexact::indels(bool id) {
72 _indels = id;
73 }
74
75 bool shift_and_inexact::dna_mut() const {
76 return _dna_mut;
77 }
78
79 void shift_and_inexact::dna_mut(bool dm) {
80 _dna_mut = dm;
81 }
82
83 char shift_and_inexact::eos_char() const {
84 return eos_;
85 }
86
87 void shift_and_inexact::eos_char(char c) {
88 eos_ = c;
89 }
90
91 void shift_and_inexact::clearu() {
92 if (u_) {
93 if (u_[0]) delete [] u_[0];
94 delete [] u_;
95 }
96 if (m_) {
97 if (m_[0]) delete [] m_[0];
98 delete [] m_;
99 }
100 if (mask_) delete [] mask_;
101 }
102
103 unsigned long shift_and_inexact::add_pattern(std::string const & pat,
104 long unsigned id,
105 int esb, int eeb) {
106 add_pattern_(pat,id,esb,eeb);
107 return id;
108 }
109
110 void shift_and_inexact::computeu(CharacterProducer & cp) {
111 clearu();
112 long unsigned int patterns_length=0;
113 long unsigned int patterns_count=0;
114 tinylist<pattern_list_element>::const_iterator it;
115 for (it=patterns().begin();it!=patterns().end();++it) {
116 patterns_length+=it->pattern().length();
117 patterns_count++;
118 }
119 unsigned int wordbits = (sizeof(bigword)*8);
120 long unsigned int patterns_wordcount;
121 patterns_wordcount = patterns_length/wordbits +
122 ((patterns_length%wordbits)?1:0);
123 // assert(patterns_wordcount*wordbits >= patterns_length);
124 _wordcount = patterns_wordcount;
125 // cerr << "# " << patterns_count << " L " << patterns_length
126 // << " W " << _wordcount << " w " << wordbits << endl;
127 _highbit = (((bigword)1)<<(wordbits-1));
128 // cerr << "_highbit " << binary(_highbit) << endl;
129
130 bigword *buffer = new bigword[_wordcount*cp.size()];
131 memset(buffer,0,_wordcount*cp.size()*sizeof(bigword));
132 u_ = new bigword*[cp.size()];
133 for (unsigned int i=0;i<cp.size();i++) {
134 u_[i] = buffer+_wordcount*i;
135 }
136 mask_=new bigword[_wordcount];
137 memset(mask_,0,_wordcount*sizeof(bigword));
138 s_=new bigword[_wordcount];
139 memset(s_,0,_wordcount*sizeof(bigword));
140 buffer=new bigword[_wordcount*(k_+1)];
141 memset(buffer,0,_wordcount*(k_+1)*sizeof(bigword));
142 m_ = new bigword*[k_+1];
143 for (unsigned int i=0;i<=k_;i++) {
144 m_[i] = buffer+_wordcount*i;
145 }
146 _patbits = new patbit[patterns_count];
147 memset(_patbits,0,patterns_count*sizeof(patbit));
148 _patbitind = new long unsigned int[_wordcount+1];
149 memset(_patbitind,0,(_wordcount+1)*sizeof(long unsigned int));
150
151 eos_ = cp.nch(eos_);
152
153 unsigned int bitposition=0;
154 unsigned int wordposition=0;
155 unsigned int wordbitposition=0;
156 unsigned int patbitsposition=0;
157 _patbitind[0] = 0;
158
159 for (it=patterns().begin();it!=patterns().end();++it) {
160 std::string const & keyword = it->pattern();
161 unsigned int pbits=keyword.length();
162
163 for (unsigned int i=0;i<pbits;i++) {
164 char *wccompat;
165 if (_wc && ((wccompat=iupac_compatible(keyword[i])) != 0)) {
166 unsigned int j=0;
167 while (wccompat[j]) {
168 int nch1 = cp.nch(wccompat[j]);
169 if (nch1 >= 0 && (wccompat[j]!='N' || _textn)) {
170 u_[nch1][wordposition]
171 |= ( ((bigword)1) << wordbitposition);
172 }
173 j++;
174 }
175 } else {
176 int nch = cp.nch(keyword[i]);
177 if (nch >=0 ) {
178 u_[nch][wordposition]
179 |= ( ((bigword)1) << wordbitposition);
180 }
181 }
182 for (unsigned int k=i+1;k<=k_;k++) {
183 m_[k][wordposition] |= (((bigword)1) << wordbitposition);
184 }
185 if (i==0) { /* first position of this pattern */
186 s_[wordposition] |= (((bigword)1) << wordbitposition);
187 // cerr << wordposition << " " << binary(s_[wordposition]) << endl;
188 }
189 if (i==(pbits-1)) { /* last position of this pattern */
190 mask_[wordposition] |= (((bigword)1) << wordbitposition);
191 _patbits[patbitsposition].bit = wordbitposition;
192 _patbits[patbitsposition].it = it;
193 patbitsposition++;
194 }
195 bitposition++;
196 wordposition = bitposition/wordbits;
197 wordbitposition = bitposition%wordbits;
198 if (wordbitposition%wordbits==0) {
199 _patbitind[wordposition] = patbitsposition;
200 }
201 }
202 }
203 _patbitind[_wordcount] = patbitsposition;
204 /*
205 cerr << " s_ ";
206 for (int i=_wordcount-1;i>=0;i--) {
207 cerr << binary(s_[i]);
208 }
209 cerr << endl;
210 cerr << "mask_ ";
211 for (int i=_wordcount-1;i>=0;i--) {
212 cerr << binary(mask_[i]);
213 }
214 cerr << endl;
215 int p=0;
216 for (int i=0;i<_wordcount;i++) {
217 for (int j=_patbitind[i];j<_patbitind[i+1];j++) {
218 cerr << "Word " << i << " bit " << j << " pattern " << p << endl;
219 p++;
220 }
221 }
222 for (int ch=0;ch<cp.size();ch++) {
223 cerr << "u_[";
224 cerr << cp.ch(ch) << "] ";
225 for (int i=_wordcount-1;i>=0;i--) {
226 cerr << binary(u_[ch][i]);
227 }
228 cerr << endl;
229 }
230
231 for (unsigned int k=0;k<=k_;k++) {
232 cerr << "m_[";
233 cerr << k << "] ";
234 for (int i=_wordcount-1;i>=0;i--) {
235 cerr << binary(m_[k][i]);
236 }
237 cerr << endl;
238 }
239 */
240 }
241
242
243 void shift_and_inexact::reset() {
244 memset(m_,0,_wordcount*sizeof(bigword));
245 tinylist<pattern_list_element>::const_iterator it;
246 unsigned int bitposition=0;
247 unsigned int wordposition=0;
248 unsigned int wordbitposition=0;
249 unsigned int patbitsposition=0;
250 unsigned int wordbits = (sizeof(bigword)*8);
251 for (it=patterns().begin();it!=patterns().end();++it) {
252 std::string const & keyword = it->pattern();
253 unsigned int pbits=keyword.length();
254
255 for (unsigned int i=0;i<pbits;i++) {
256 for (unsigned int k=i+1;k<=k_;k++) {
257 m_[k][wordposition] |= (((bigword)1) << wordbitposition);
258 }
259 bitposition++;
260 wordposition = bitposition/wordbits;
261 wordbitposition = bitposition%wordbits;
262 if (wordbitposition%wordbits==0) {
263 _patbitind[wordposition] = patbitsposition;
264 }
265 }
266 }
267 }
268
269 bool shift_and_inexact::find_patterns(CharacterProducer & cp,
270 pattern_hit_vector & pas,
271 long unsigned minpa) {
272 // checkpoint;
273 unsigned ccount=0;
274 long unsigned pacount=0;
275 FILE_POSITION_TYPE lastpapos=0;
276 if (cp.eof()) return false;
277 static bigword *m0=0,*m1=0,*m2=0,*m3=0;
278 if (!m0) {
279 m0 = new bigword[4*_wordcount];
280 m1 = m0+1*_wordcount;
281 m2 = m0+2*_wordcount;
282 m3 = m0+3*_wordcount;
283 }
284 unsigned char ch=cp.getnch();
285 while (1) {
286 bigword * const & m0_ = m_[0];
287 if (_indels) {
288 memcpy(m0,m0_,sizeof(bigword)*_wordcount);
289 }
290 for (int i=_wordcount-1;i>=1;i--) {
291 m1[i] = ((m0_[i] << 1) | ((m0_[i-1]&_highbit)?1:0)) | s_[i] ;
292 }
293 m1[0] = ((m0_[0] << 1) | s_[0]);
294
295 bigword * const & uch = u_[ch];
296 for (unsigned int i=0;i<_wordcount;i++) {
297 m0_[i] = m1[i] & uch[i];
298 if (_indels) {
299 m1[i] |= m0[i];
300 }
301 }
302 for (unsigned int l=1;l<=k_;l++) {
303 bigword * const & ml_ = m_[l];
304 if (_indels) {
305 memcpy(m2,ml_,sizeof(bigword)*_wordcount);
306 }
307 for (int i=_wordcount-1;i>=1;i--) {
308 m3[i] = ((ml_[i] << 1) | ((ml_[i-1]&_highbit)?1:0) | s_[i]);
309 ml_[i] = m3[i] & uch[i];
310 }
311 m3[0] = ((ml_[0] << 1) | s_[0]);
312 ml_[0] = m3[0] & uch[0];
313 if (ch != eos_) {
314 for (int i=_wordcount-1;i>=1;i--) {
315 ml_[i] |= m1[i];
316 if (_indels) {
317 ml_[i] |= ((m_[l-1][i] << 1)|((m_[l-1][i-1]&_highbit)?1:0) | s_[i]);
318 ml_[i] |= m_[l-1][i];
319 }
320 m1[i] = m3[i];
321 if (_indels) {
322 m1[i] |= m2[i];
323 }
324 }
325 ml_[0] |= m1[0];
326 if (_indels) {
327 ml_[0] |= ((m_[l-1][0] << 1) | s_[0]);
328 ml_[0] |= (m_[l-1][0] | m1[0]);
329 }
330 m1[0] = m3[0];
331 if (_indels) {
332 m1[0] |= m2[0];
333 }
334 }
335 }
336 for (unsigned int i=0;i<_wordcount;i++) {
337 if (m_[k_][i] & mask_[i]) {
338 /* There is a hit somewhere in this word... */
339 for (unsigned int j=_patbitind[i];j<_patbitind[i+1];j++) {
340 if (m_[k_][i] & (((bigword)1)<<(_patbits[j].bit))) {
341 // checkpoint;
342 // cerr << _patbits[j].it->id() << " " << cp.pos() << " " << k_ << endl;
343 int k=k_-1;
344 while (k >= 0 && (m_[k][i] & (((bigword)1)<<(_patbits[j].bit)))) {
345 // checkpoint;
346 k--;
347 }
348 k++;
349 // cerr << _patbits[j].it->id() << " " << cp.pos() << " " << k << endl;
350 pas.push_back(cp.pos(),_patbits[j].it);
351 pacount++;
352 lastpapos=cp.pos();
353 }
354 }
355 }
356 }
357 if (pacount >= minpa && cp.pos() > lastpapos+1) {
358 return true;
359 }
360 if (ccount>1000) {
361 report_progress(cp);
362 ccount=0;
363 }
364 ccount++;
365 if (cp.eof()) break;
366 ch=cp.getnch();
367 }
368 if (pacount > 0) {
369 return true;
370 }
371 return false;
372 }