ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/shift_and.cc
Revision: 1.1.1.1 (vendor branch)
Committed: Wed Dec 22 21:37:18 2004 UTC (11 years, 5 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD, RELEASE-20041222, HEAD
Changes since 1.1: +0 -0 lines
Log Message:
Initial primer_match import

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include <assert.h>
23 #include <string.h>
24 #include "shift_and.h"
25 #include "util.h"
26
27 shift_and::shift_and(bool wc, bool tn) {
28 m_ = 0;
29 u_ = 0;
30 mask_=0;
31 _wc = wc;
32 _textn = tn;
33 }
34
35 shift_and::~shift_and() {
36 clearu();
37 }
38
39 bool shift_and::wildcards() const {
40 return _wc;
41 }
42
43 void shift_and::wildcards(bool wc) {
44 _wc = wc;
45 }
46
47 bool shift_and::wildcard_text_N() const {
48 return _textn;
49 }
50
51 void shift_and::wildcard_text_N(bool tn) {
52 _textn = tn;
53 }
54
55 void shift_and::clearu() {
56 if (u_) {
57 if (u_[0]) delete [] u_[0];
58 delete [] u_;
59 }
60 if (mask_) delete [] mask_;
61 if (m_) delete [] m_;
62 }
63
64 unsigned long shift_and::add_pattern(std::string const & pat, long unsigned id,
65 int esb, int eeb) {
66 add_pattern_(pat,id,esb,eeb);
67 return id;
68 }
69
70 void shift_and::computeu(CharacterProducer & cp) {
71 clearu();
72 long unsigned int patterns_length=0;
73 long unsigned int patterns_count=0;
74 tinylist<pattern_list_element>::const_iterator it;
75 for (it=patterns().begin();it!=patterns().end();++it) {
76 patterns_length+=it->pattern().length();
77 patterns_count++;
78 }
79 unsigned int wordbits = (sizeof(bigword)*8);
80 long unsigned int patterns_wordcount;
81 patterns_wordcount = patterns_length/wordbits +
82 ((patterns_length%wordbits)?1:0);
83 // assert(patterns_wordcount*wordbits >= patterns_length);
84 _wordcount = patterns_wordcount;
85 _highbit = (((bigword)1)<<(wordbits-1));
86 _wordbits_1 = wordbits-1;
87 // cerr << "# " << patterns_count << " L " << patterns_length
88 // << " W " << _wordcount << " w " << wordbits << endl;
89 // cerr << "_highbit " << binary(_highbit) << endl;
90
91 bigword *buffer = new bigword[_wordcount*cp.size()];
92 memset(buffer,0,_wordcount*cp.size()*sizeof(bigword));
93 u_ = new bigword*[cp.size()];
94 for (unsigned int i=0;i<cp.size();i++) {
95 u_[i] = buffer+_wordcount*i;
96 }
97 // cerr << cp.size() << endl;
98 mask_=new bigword[_wordcount];
99 memset(mask_,0,_wordcount*sizeof(bigword));
100
101 s_=new bigword[_wordcount];
102 memset(s_,0,_wordcount*sizeof(bigword));
103
104 m_=new bigword[_wordcount];
105 memset(m_,0,_wordcount*sizeof(bigword));
106
107 _patbits = new patbit[patterns_count];
108 memset(_patbits,0,patterns_count*sizeof(patbit));
109
110 _patbitind = new unsigned int[_wordcount+1];
111 memset(_patbitind,0,(_wordcount+1)*sizeof(unsigned int));
112
113 unsigned int bitposition=0;
114 unsigned int wordposition=0;
115 unsigned int wordbitposition=0;
116 unsigned int patbitsposition=0;
117 _patbitind[0] = 0;
118
119 for (it=patterns().begin();it!=patterns().end();++it) {
120 std::string const & keyword = it->pattern();
121 unsigned int pbits=keyword.length();
122
123 for (unsigned int i=0;i<pbits;i++) {
124 char *wccompat;
125 if (_wc && ((wccompat=iupac_compatible(keyword[i])) != 0)) {
126 unsigned int j=0;
127 while (wccompat[j]) {
128 int nch1 = cp.nch(wccompat[j]);
129 if (nch1 >= 0 && (wccompat[j]!='N' || _textn)) {
130 u_[nch1][wordposition]
131 |= ( ((bigword)1) << wordbitposition);
132 }
133 j++;
134 }
135 } else {
136 int nch = cp.nch(keyword[i]);
137 if (nch >=0 ) {
138 u_[nch][wordposition]
139 |= ( ((bigword)1) << wordbitposition);
140 }
141 }
142 if (i==0) { /* first position of this pattern */
143 s_[wordposition] |= (((bigword)1) << wordbitposition);
144 // cerr << wordposition << " " << binary(s_[wordposition]) << endl;
145 }
146 if (i==(pbits-1)) { /* last position of this pattern */
147 mask_[wordposition] |= (((bigword)1) << wordbitposition);
148 _patbits[patbitsposition].bit = wordbitposition;
149 _patbits[patbitsposition].it = it;
150 patbitsposition++;
151 }
152 bitposition++;
153 wordposition = bitposition/wordbits;
154 wordbitposition = bitposition%wordbits;
155 if (wordbitposition%wordbits==0) {
156 _patbitind[wordposition] = patbitsposition;
157 }
158 }
159 }
160 _patbitind[_wordcount] = patbitsposition;
161
162 /* cerr << " s_ ";
163 for (int i=_wordcount-1;i>=0;i--) {
164 cerr << binary(s_[i]);
165 }
166 cerr << endl;
167 cerr << "mask_ ";
168 for (int i=_wordcount-1;i>=0;i--) {
169 cerr << binary(mask_[i]);
170 }
171 cerr << endl;
172 int p=0;
173 for (int i=0;i<_wordcount;i++) {
174 for (int j=_patbitind[i];j<_patbitind[i+1];j++) {
175 cerr << "Word " << i << " bit " << _patbits[j].bit << " pattern " << p << endl;
176 p++;
177 }
178 }
179 checkpoint;
180 for (int ch=0;ch<cp.size();ch++) {
181 cerr << "u_[";
182 cerr << ch << "] ";
183 for (int i=_wordcount-1;i>=0;i--) {
184 cerr << binary(u_[ch][i]);
185 }
186 cerr << endl;
187 }
188
189 cerr << "m_ ";
190 for (int i=_wordcount-1;i>=0;i--) {
191 cerr << binary(m_[i]);
192 }
193 cerr << endl;
194 */
195 }
196
197 void shift_and::reset() {
198 memset(m_,0,_wordcount*sizeof(bigword));
199 }
200
201 bool shift_and::find_patterns(CharacterProducer & cp,
202 pattern_hit_vector & pas,
203 long unsigned minpa) {
204 unsigned ccount=0;
205 long unsigned pacount=0;
206 if (cp.eof()) return false;
207 unsigned char ch=cp.getnch();
208 // cerr << "m: " << binary<unsigned long>(m_) << endl;
209 // cerr << "$: " << binary<unsigned long>(mask_) << endl;
210 // cerr << "Read: " << ch << endl;
211 while (1) {
212 for (int i=_wordcount-1;i>=1;i--) {
213 m_[i] = ((m_[i] << 1) | (m_[i-1] >> _wordbits_1) | s_[i]) & u_[ch][i];
214 }
215 m_[0] = ((m_[0] << 1) | s_[0]) & u_[ch][0];
216
217 for (unsigned int i=0;i<_wordcount;i++) {
218 // checkpoint;
219 // cerr << "i: " << i << endl;
220 // cerr << "_patbitind[i]: " << _patbitind[i] << endl;
221 // cerr << "_patbitind[i+1]-1: " << _patbitind[i+1]-1 << endl;
222 if (m_[i] & mask_[i]) {
223 /* There is a hit somewhere in this word... */
224 for (unsigned int j=_patbitind[i];j<_patbitind[i+1];j++) {
225 // checkpoint;
226 // cerr << "j: " << j << endl;
227 if (m_[i] & (((bigword)1)<<(_patbits[j].bit))) {
228 pas.push_back(cp.pos(),_patbits[j].it);
229 pacount++;
230 }
231 }
232 }
233 }
234 if (pacount >= minpa) {
235 return true;
236 }
237 if (ccount>1000) {
238 report_progress(cp);
239 ccount=0;
240 }
241 ccount++;
242 if (cp.eof()) break;
243 ch=cp.getnch();
244 // cerr << "Read: " << ch << endl;
245 }
246 if (pacount > 0) return true;
247 return false;
248 }