ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/hash_table.cc
Revision: 1.2
Committed: Wed May 4 18:03:45 2005 UTC (11 years, 7 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +3 -2 lines
Log Message:
Small bug fixes, plus codon based edit distance for peptide searching.

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include "hash_table.h"
23
24 hash_table_elt::hash_table_elt() : patid_(0) {};
25
26 hash_table_elt::~hash_table_elt() {
27 if (patid_) {
28 delete patid_;
29 patid_ = 0;
30 }
31 }
32
33 void hash_table_elt::add_patid(pattern_list::const_iterator const & it,
34 unsigned int p) {
35 // checkpoint;
36 // cerr << (void*)patid_ << endl;
37 if (!patid_) patid_ = new tinylist<htele>;
38 // checkpoint;
39 patid_->push_front(htele(it,p));
40 // checkpoint;
41 }
42
43 tinylist<htele> * const & hash_table_elt::patids() const {
44 return patid_;
45 }
46
47 hash_table::hash_table(unsigned int ws, unsigned int k,
48 unsigned char eos, bool wc, bool tn,
49 bool indels, bool dm) {
50 ws_ = ws;
51 relchars_ = new bool[256];
52 for (int i=0;i<256;i++) {
53 relchars_[i] = false;
54 }
55 num_patterns_=0;
56 _mpl=0;
57 k_ = k;
58 eos_ = eos;
59 _wc = wc;
60 _textn = tn;
61 _indels = indels;
62 _dna_mut = dm;
63 }
64
65 hash_table::~hash_table() {
66 delete [] relchars_;
67 relchars_ = 0;
68 delete [] relcharmap_;
69 relcharmap_ = 0;
70 }
71
72 unsigned long hash_table::add_pattern(std::string const & pat,
73 long unsigned id,
74 int esb, int eeb) {
75 add_pattern_(pat,id,esb,eeb);
76 for (int i=0;i<pat.length();i++) {
77 relchars_[pat[i]] = true;
78 }
79 num_patterns_++;
80 return id;
81 }
82
83 void hash_table::init(CharacterProducer & cp) {
84
85 bool *tmp;
86 int size=cp.size();
87 // cerr << size << endl;
88 tmp = new bool[size];
89 relcharmap_ = new unsigned char[size];
90 int j=0;
91 for (int i=0;i<size;i++) {
92 if (relchars_[(unsigned char)cp.ch(i)]) {
93 tmp[i] = true;
94 relcharmap_[i] = j;
95 // cerr << i << " " << cp.ch(i) << ": "
96 // << ((int)cp.ch(i)) << ": " << j << endl;
97 j++;
98 } else {
99 tmp[i] = false;
100 }
101 }
102 delete [] relchars_;
103 relchars_ = tmp;
104
105 alphasize_=j;
106 // cerr << alphasize_ << endl;
107 int alphausize=1;
108 alphalog_ = 0;
109 while (alphasize_ > alphausize) {
110 alphausize <<= ((unsigned int)1);
111 alphalog_++;
112 }
113
114 // cerr << ws_ << " " << alphalog_ << " " << sizeof(bigword)*8 << endl;
115
116 assert(ws_ * alphalog_ <= sizeof(bigword)*8);
117 bigword tablesize = (1 << ((bigword)(alphalog_*ws_)));
118 wsmask_ = (tablesize-1);
119 table_.resize(tablesize);
120 // cerr << tablesize << endl;
121
122 // checkpoint;
123 // cerr << binary(wsmask_) << endl;
124
125 _mpl=0;
126 lastpos_.resize(num_patterns_+1);
127 fill(lastpos_.begin(),lastpos_.end(),0);
128 tinylist<pattern_list_element>::const_iterator it;
129 for (it=patterns().begin();it!=patterns().end();++it) {
130 std::string const & pat = it->pattern();
131 int nch;
132 h_ = 0;
133 if (pat.length() > _mpl) {
134 _mpl = pat.length();
135 }
136 // checkpoint;
137 // cerr << binary(h_) << endl;
138 for (int j=0, p=-ws_+1;j<pat.length();j++,p++) {
139 // cerr << j << " " << p << " " << pat[j] << endl;
140 if ((nch=cp.nch(pat[j])) == -1) {
141 // checkpoint;
142 p = -ws_;
143 nch = 0;
144 }
145 // cerr << binary(h_) << endl;
146 h_ = ((h_ << alphalog_) | relcharmap_[nch]) & wsmask_;
147 // cerr << binary(h_) << endl;
148 if (p >= 0) {
149 // checkpoint;
150 // cerr << h_ << " " << binary(h_) << ": " << j << endl;
151 // cerr << h_ << " " << j << " " << pat.substr(j+1-ws_,ws_) << endl;
152 table_[h_].add_patid(it,j);
153 // checkpoint;
154 }
155 }
156 }
157 // checkpoint;
158 h_ = 0;
159 p_ = -ws_+1;
160 }
161
162 bool hash_table::find_patterns(CharacterProducer & cp,
163 pattern_hit_vector & pas,
164 long unsigned minka) {
165 // checkpoint;
166 editdist_alignment pa(0,0,k_,eos_,_wc,_textn,_indels,_dna_mut,0,0,true);
167 pa.maxpatlen(_mpl);
168
169 long unsigned kacount=0;
170 unsigned char nch;
171 while (!cp.eof()) {
172 nch = cp.getnch();
173 // cerr << cp.ch(nch) << endl;
174 if (!relchars_[nch]) {
175 p_ = -ws_;
176 continue;
177 } else {
178 h_ = ((h_ << alphalog_) | relcharmap_[nch]) & wsmask_;
179 }
180 // cerr << binary(h_);
181 if (p_ >= 0) {
182 // cerr << endl;
183 tinylist<htele> * ptids;
184 if (ptids=table_[h_].patids()) {
185 FILE_POSITION_TYPE oldpos=cp.pos();
186 tinylist<htele>::iterator it=ptids->begin();
187 while (it!=ptids->end()) {
188 pattern_list::const_iterator const & plit = it->pattern_list_it();
189 if (k_==0) {
190 pas.push_back(cp.pos(),plit);
191 ++it;
192 continue;
193 }
194 std::string const & pat = plit->pattern();
195 unsigned int patlen = pat.length();
196 FILE_POSITION_TYPE patend = oldpos + patlen-(it->position())-1;
197 if (lastpos_[plit->id()]+((_indels)?k_:0) < patend) {
198 // checkpoint;
199 // cerr << "lastpos: " << lastpos_[plit->id()] << endl;
200 unsigned int esb = plit->exact_start_bases();
201 unsigned int eeb = plit->exact_end_bases();
202 pa.reset();
203 pa.poslb(patend-((_indels)?k_:0));
204 pa.posub(patend+((_indels)?k_:0));
205 pa.exact_start_bases(esb);
206 pa.exact_end_bases(eeb);
207 // checkpoint;
208 /// cerr << pat << " " << it->position() << " " << oldpos << endl;
209 if (pa.align(cp,pat)) {
210 // checkpoint;
211 // cerr << pa.end() << endl;
212 if (lastpos_[plit->id()]+((_indels)?k_:0) < pa.end()) {
213 lastpos_[plit->id()] = pa.end();
214 // cerr << pa.end() << " " << plit->id() << endl;
215 pas.push_back(pa.end(),plit);
216 ++kacount;
217 } else {
218 // checkpoint;
219 lastpos_[plit->id()] = patend;
220 }
221 } else {
222 // checkpoint;
223 lastpos_[plit->id()] = patend;
224 }
225 }
226 ++it;
227 }
228 cp.pos(oldpos);
229 }
230 } else {
231 // cerr << " skip..." << endl;
232 }
233 if (p_ < 0) p_++;
234 if (kacount>minka) {
235 report_progress(cp);
236 return true;
237 }
238 }
239 if (kacount>0) {
240 report_progress(cp);
241 return true;
242 }
243 return false;
244 }
245
246 void
247 hash_table::reset() {
248
249 }