1 |
#ifndef __GCDB_H |
2 |
#define __GCDB_H |
3 |
#include "GBase.h"
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
6 |
|
7 |
// Memory-mapping portability layer.
// When __WIN32__ is defined, <sys/mman.h> is not included; instead the
// protection/mapping/access constants used with mmap()/access() are defined
// here and mmap()/munmap() are only declared (their definitions live
// elsewhere). On every other platform the system header is used as-is.
#if defined(__WIN32__)

// memory protection flags
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_READWRITE 3

// mapping visibility flags
#define MAP_SHARED 1
#define MAP_PRIVATE 2

// file access-check modes
#define F_OK 0
#define R_OK 4
#define W_OK 2
#define RW_OK 6

// value reported by mmap() on failure
#ifndef MAP_FAILED
#define MAP_FAILED ((void *) -1)
#endif

void *mmap(char *,size_t,int,int,int,off_t);
int munmap(void *,size_t);

#else

#include <sys/mman.h>

#endif
26 |
|
27 |
//===================================================== |
28 |
//------------- buffer stuff ------------------- |
29 |
//===================================================== |
30 |
// Buffer sizes (8192 each) for the input and output directions.
#define GCDBUFFER_INSIZE 8192
#define GCDBUFFER_OUTSIZE 8192

// Pointer to the I/O routine stored in a GCDBuffer (its `op` member).
// NOTE(review): the (int, char*, size_t) signature looks read()/write()-like;
// confirm against the implementation.
typedef int (*opfunc)(int, char*, size_t);

// Seek position type; now off_t (an earlier version used unsigned long).
//typedef unsigned long gcdb_seek_pos;
typedef off_t gcdb_seek_pos;

// Conversion function pointer types: each takes a pointer to the raw value
// and returns it as the named type.
typedef unsigned int (*uint_conv_func)(void*);  // unsigned int conversion
typedef off_t (*offt_conv_func)(void*);         // off_t (file offset) conversion
typedef int16_t (*int16_conv_func)(void*);      // int16 conversion

// Conversion functions --> to platform independent values.
// These are defined elsewhere.
extern uint_conv_func gcvt_uint;
extern offt_conv_func gcvt_offt;
extern int16_conv_func gcvt_int16;

/* Disabled declarations, kept for reference:
unsigned int uint32_sun(void* x86int);
unsigned int uint32_x86(void* x86int);
//for file offsets: off_t runtime conversions:
off_t offt_sun(void* offt);
off_t offt_x86(void* offt);
int16_t int16_sun(void* i16);
int16_t int16_x86(void* i16);
*/

// Endianness setup routine; GCDBuffer's 4-argument constructor calls it
// before initializing the buffer.
void gcvt_endian_setup();
57 |
|
58 |
class GCDBuffer { |
59 |
public: |
60 |
char *x; |
61 |
unsigned int p; |
62 |
unsigned int n; |
63 |
int fd; |
64 |
opfunc op; |
65 |
//methods: |
66 |
GCDBuffer():x(NULL),p(0),n(0),fd(0),op(NULL) { |
67 |
} |
68 |
GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) { |
69 |
//check endianness |
70 |
gcvt_endian_setup(); |
71 |
init(aop, afd, buf, len); |
72 |
} |
73 |
void init(opfunc aop,int afd,char *buf,unsigned int len) { |
74 |
x=buf; |
75 |
fd=afd; |
76 |
op=aop; |
77 |
p=0; |
78 |
n=len; |
79 |
} |
80 |
int flush(); |
81 |
int write_all(char* buf, unsigned int pt); |
82 |
int put(char* buf,unsigned int len); |
83 |
int putalign(char* buf,unsigned int len); |
84 |
int putflush(char* buf,unsigned int len); |
85 |
int puts(char *buf); |
86 |
int putsalign(char *buf); |
87 |
int putsflush(char *buf); |
88 |
int oneRead(char* buf, unsigned int len); |
89 |
int getthis(char* buf,unsigned int len); |
90 |
int get(char* buf,unsigned int len); |
91 |
int bget(char* buf,unsigned int len); |
92 |
int feed(); |
93 |
char *peek(); |
94 |
void seek(unsigned int len); |
95 |
int copy(GCDBuffer* bin); |
96 |
}; |
97 |
|
98 |
|
99 |
//===================================================== |
100 |
//------------- cdb utils ------------------- |
101 |
//===================================================== |
102 |
// errno comes from <errno.h>, included at the top of this header.
// It must not be declared by hand ("extern int errno;"): on current C
// libraries errno is a macro (typically a per-thread lvalue), so a manual
// declaration can clash with the library's or fail to link.

// Error code variables, defined elsewhere.
extern int error_intr;
extern int error_nomem;
extern int error_proto;
108 |
|
109 |
//additional data to be appended to the cdb file: |
110 |
// Option bit masks (one distinct bit each, so they can be OR-ed together).
#define CDBMSK_OPT_MULTI    0x00000001
#define CDBMSK_OPT_C        0x00000002
#define CDBMSK_OPT_CADD     0x00000004
#define CDBMSK_OPT_COMPRESS 0x00000008
#define CDBMSK_OPT_GSEQ     0x00000010
115 |
//creates a compressed version of the database |
116 |
//uses plenty of unions for ensuring compatibility with |
117 |
// the old 'CIDX' info structure |
118 |
|
119 |
//trying to prevent [64bit] machines to align this to 64bit -- sizeof() gets it wrong! |
120 |
#pragma pack(4) |
121 |
// eek, gcc 2.95.3 alpha-decosf version does not |
122 |
// recognize this pragma directive |
123 |
|
124 |
|
125 |
struct cdbInfo { |
126 |
uint32 num_keys; |
127 |
union { |
128 |
uint32 num_records; |
129 |
char oldtag[4]; // 'CIDX' for old tag style |
130 |
}; |
131 |
// data file size -- used to be uint32, now it could be 64bit |
132 |
union { |
133 |
int64_t dbsize; |
134 |
uint32 oldnum[2]; //num_keys, num_records |
135 |
}; |
136 |
union { |
137 |
uint32 idxflags; |
138 |
uint32 old_dbsize; |
139 |
}; |
140 |
union { |
141 |
int dbnamelen; |
142 |
int old_idxflags; |
143 |
}; |
144 |
// -- the actual db name precedes this fixed-size record |
145 |
union { |
146 |
char tag[4]; //'CDBX' for new files with LFS |
147 |
uint32 old_dbnamelen; |
148 |
}; |
149 |
}; |
150 |
|
151 |
// for passing around index data: |
152 |
struct CIdxData32 { |
153 |
uint32 fpos; |
154 |
uint32 reclen; |
155 |
}; |
156 |
/* |
157 |
struct CIdxSeqData32 { //4+4+2+1 = 11 bytes |
158 |
uint32 fpos; |
159 |
uint32 reclen; |
160 |
uint16_t linelen; //line length for FASTA-formatted seq |
161 |
byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows) |
162 |
}; |
163 |
*/ |
164 |
struct CIdxData { |
165 |
off_t fpos; //64bit value on Linux |
166 |
uint32 reclen; |
167 |
}; |
168 |
/* |
169 |
struct CIdxSeqData { //8+4+2+1 = 15 bytes |
170 |
off_t fpos; //64bit value on Linux |
171 |
uint32 reclen; |
172 |
uint16_t linelen; //line length for FASTA-formatted seq |
173 |
byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows) |
174 |
}; |
175 |
*/ |
176 |
#pragma pack() |
177 |
|
178 |
extern int cdbInfoSIZE; |
179 |
extern int IdxDataSIZE; |
180 |
extern int IdxDataSIZE32; |
181 |
/* |
182 |
extern int IdxSeqDataSIZE; |
183 |
extern int IdxSeqDataSIZE32; |
184 |
*/ |
185 |
|
186 |
void uint32_pack(char *,uint32); |
187 |
void uint32_pack_big(char *,uint32); |
188 |
void uint32_unpack(char *,uint32 *); |
189 |
void uint32_unpack_big(char *,uint32 *); |
190 |
|
191 |
//===================================================== |
192 |
//------------- cdb index ------------------- |
193 |
//===================================================== |
194 |
|
195 |
#define CDB_HPLIST 1000 |
196 |
|
197 |
struct cdb_hp { uint32 h; uint32 p; } ; |
198 |
|
199 |
struct cdb_hplist { |
200 |
struct cdb_hp hp[CDB_HPLIST]; |
201 |
struct cdb_hplist *next; |
202 |
int num; |
203 |
}; |
204 |
|
205 |
//the index file should always be smaller than 4GB ! |
206 |
|
207 |
class GCdbWrite { |
208 |
GCDBuffer* cdbuf; |
209 |
char bspace[8192]; |
210 |
char fname[1024]; |
211 |
char final[2048]; |
212 |
uint32 count[256]; |
213 |
uint32 start[256]; |
214 |
struct cdb_hplist *head; |
215 |
struct cdb_hp *split; /* includes space for hash */ |
216 |
struct cdb_hp *hash; |
217 |
uint32 numentries; |
218 |
uint32 pos; //file position |
219 |
int posplus(uint32 len); |
220 |
int fd; //file descriptor |
221 |
public: |
222 |
//methods: |
223 |
GCdbWrite(int afd); //was: init |
224 |
GCdbWrite(char* fname); |
225 |
~GCdbWrite(); |
226 |
int addbegin(unsigned int keylen,unsigned int datalen); |
227 |
int addend(unsigned int keylen,unsigned int datalen,uint32 h); |
228 |
int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen); |
229 |
int add(const char *key, char *data, unsigned int datalen); |
230 |
int getNumEntries() { return numentries; } |
231 |
int finish(); |
232 |
int close(); |
233 |
int getfd() { return fd; } |
234 |
char* getfile() { return fname; } |
235 |
}; |
236 |
|
237 |
|
238 |
//===================================================== |
239 |
//------------- cdb ------------------- |
240 |
//===================================================== |
241 |
|
242 |
#define CDB_HASHSTART 5381 |
243 |
|
244 |
uint32 cdb_hashadd(uint32,unsigned char); |
245 |
uint32 cdb_hash(const char *,unsigned int); |
246 |
|
247 |
class GCdbRead { |
248 |
uint32 size; // initialized if map is nonzero |
249 |
uint32 loop; // number of hash slots searched under this key |
250 |
uint32 khash; // initialized if loop is nonzero |
251 |
uint32 kpos; // initialized if loop is nonzero |
252 |
uint32 hpos; // initialized if loop is nonzero |
253 |
uint32 hslots; // initialized if loop is nonzero |
254 |
uint32 dpos; // initialized if cdb_findnext() returns 1 |
255 |
uint32 dlen; // initialized if cdb_findnext() returns 1 |
256 |
char fname[1024]; |
257 |
char *map; // 0 if no map is available |
258 |
int fd; |
259 |
public: |
260 |
//methods: |
261 |
GCdbRead(int fd); //was cdb_init |
262 |
GCdbRead(char* afname); //was cdb_init |
263 |
~GCdbRead(); //was cdb_free |
264 |
int read(char *,unsigned int,uint32); |
265 |
int match(const char *key, unsigned int len, uint32 pos); |
266 |
void findstart() { loop =0; } |
267 |
int findnext(const char *key,unsigned int len); |
268 |
int find(const char *key); |
269 |
int datapos() { return dpos; } |
270 |
int datalen() { return dlen; } |
271 |
int getfd() { return fd; } |
272 |
char* getfile() { return fname; } |
273 |
}; |
274 |
|
275 |
class GReadBuf { |
276 |
protected: |
277 |
FILE* f; |
278 |
uchar* buf; |
279 |
int buflen; |
280 |
int bufused; // |
281 |
int bufpos; |
282 |
off_t fpos; |
283 |
bool eof; |
284 |
bool eob; |
285 |
|
286 |
int refill(bool repos=false) { |
287 |
//refill the buffer----------- |
288 |
if (repos && bufpos==0) return 0; //no need to repos |
289 |
if (eof) return 0; |
290 |
int fr=0; |
291 |
if (repos && bufpos<bufused) { |
292 |
int kept=bufused-bufpos; |
293 |
memmove((void*)buf, (void*)(buf+bufpos),kept); |
294 |
fr=(int)fread((void *)(buf+kept), 1, buflen-kept, f); |
295 |
if (fr<buflen-kept) eof=true; |
296 |
buf[kept+fr]='\0'; |
297 |
bufused=kept+fr; |
298 |
} |
299 |
else { |
300 |
fr=(int)fread((void *)buf, 1, buflen, f); |
301 |
if (fr<buflen) eof=true; |
302 |
buf[fr]='\0'; //only for text record parsers |
303 |
bufused=fr; |
304 |
} |
305 |
if (feof(f)) eof=true; |
306 |
if (ferror(f)) { |
307 |
GMessage("GReadBuf::refill - error at fread!\n"); |
308 |
eof=true; |
309 |
} |
310 |
bufpos=0; |
311 |
fpos+=fr; //bytes read from file so far |
312 |
return fr; |
313 |
} |
314 |
public: |
315 |
GReadBuf(FILE* fin, int bsize=4096) { |
316 |
f=fin; |
317 |
buflen=bsize; |
318 |
GMALLOC(buf,buflen+1); |
319 |
bufpos=0; //current pointer for get function |
320 |
bufused=0; |
321 |
fpos=0; |
322 |
eof=false; |
323 |
eob=false; |
324 |
refill(); |
325 |
} |
326 |
~GReadBuf() { GFREE(buf); } |
327 |
|
328 |
//reads len chars from stream into the outbuf |
329 |
//updates bufpos |
330 |
//->returns the number of bytes read |
331 |
int get(uchar *outbuf, int len) { |
332 |
if (eob) return 0; |
333 |
int rd=0; //bytes read |
334 |
while (!eob && rd<len) { |
335 |
int to_read=GMIN((bufused-bufpos),(len-rd)); |
336 |
memcpy((void*)(outbuf+rd),(void*)(buf+bufpos), to_read); |
337 |
bufpos+=to_read; |
338 |
rd+=to_read; |
339 |
if (bufpos>=bufused) { |
340 |
if (eof) eob=true; |
341 |
else refill(); |
342 |
} |
343 |
}//while |
344 |
return rd; |
345 |
} |
346 |
|
347 |
uchar* getStr(uchar *outbuf, int len) { |
348 |
int rd=get(outbuf,len); |
349 |
if (rd==0) return NULL; |
350 |
else { |
351 |
outbuf[rd]='\0'; |
352 |
return outbuf; |
353 |
} |
354 |
} |
355 |
|
356 |
// getc equivalent |
357 |
int getch() { |
358 |
if (eob) return -1; |
359 |
int ch=(int)(uchar)buf[bufpos]; |
360 |
bufpos++; |
361 |
if (bufpos>=bufused) { |
362 |
if (eof) eob=true; |
363 |
else refill(); |
364 |
} |
365 |
return ch; |
366 |
} |
367 |
|
368 |
//--- |
369 |
bool isEof() { return eob; } |
370 |
bool ended() { return eob; } |
371 |
off_t getPos() { |
372 |
//returns the virtual file position |
373 |
// = the actual file offset of the byte at bufpos |
374 |
return fpos-(bufused-bufpos); |
375 |
} |
376 |
//skip into the stream the specified number of bytes |
377 |
int skip(int skiplen) { |
378 |
if (eob) return 0; |
379 |
int r=0; //the actual number of bytes skipped |
380 |
while (skiplen && !eob) { |
381 |
int dif=GMIN(bufused-bufpos,skiplen); |
382 |
skiplen-=dif; |
383 |
bufpos+=dif; |
384 |
r+=dif; |
385 |
if (bufpos>=bufused) { |
386 |
if (eof) { eob=true; return r; } |
387 |
refill(); |
388 |
} |
389 |
} |
390 |
return r; |
391 |
} |
392 |
//look ahead without updating the read pointer (bufpos) |
393 |
//Cannot peek more than buflen! |
394 |
int peek(uchar* outbuf, int len) { |
395 |
if (eob) return -1; |
396 |
//if (eob || len>buflen) return -1; |
397 |
if (len>bufused-bufpos) refill(true); |
398 |
int mlen=GMIN((bufused-bufpos),len); |
399 |
memcpy((void*)outbuf, (void*)(buf+bufpos), mlen); |
400 |
return mlen; |
401 |
} |
402 |
char peekChar() { |
403 |
if (eob) return -1; |
404 |
//if (eob || len>buflen) return -1; |
405 |
if (1>bufused-bufpos) refill(true); |
406 |
return *(buf+bufpos); |
407 |
} |
408 |
uchar* peekStr(uchar* outbuf, int len) { |
409 |
int rd=peek(outbuf,len); |
410 |
if (rd>0) { outbuf[rd]='\0'; return outbuf; } |
411 |
else return NULL; |
412 |
} |
413 |
//looks ahead to check if what follows matches |
414 |
int peekCmp(char* cmpstr, int cmplen=-1) { |
415 |
if (cmplen==0) return 0; |
416 |
if (eob) //GError("GReadBuf::peekcmp error: eob!\n"); |
417 |
return -2; |
418 |
if (cmplen<0) cmplen=strlen(cmpstr); |
419 |
if (cmplen>bufused-bufpos) { |
420 |
refill(true); |
421 |
if (cmplen>bufused-bufpos) return -2; |
422 |
} |
423 |
//use memcmp |
424 |
return memcmp((void*)(buf+bufpos), cmpstr, cmplen); |
425 |
} |
426 |
|
427 |
}; |
428 |
|
429 |
//circular line buffer, with read-ahead (peeking) capability |
430 |
class GReadBufLine { |
431 |
protected: |
432 |
struct BufLine { |
433 |
off_t fpos; |
434 |
int len; |
435 |
char* chars; |
436 |
}; |
437 |
int bufcap; //total number of lines in the buf array |
438 |
int bufidx; // the "current line" index in buf array |
439 |
bool isEOF; |
440 |
int lno; |
441 |
FILE* file; |
442 |
off_t filepos; //current file/stream offset for the first char of buf[bufidx] |
443 |
BufLine* buf; //array of bufferred lines |
444 |
char* readline(int idx);//read line from file into the buffer |
445 |
int fillbuf(); |
446 |
bool isEOB; |
447 |
public: |
448 |
const char* line(); //gets current line and advances the "current line" pointer |
449 |
//use putLine() to revert/undo this advancement |
450 |
off_t fpos(); //gets current line's byte offset in the file |
451 |
// does NOT advance the "current line" pointer |
452 |
int len(); //gets current line's length |
453 |
// does NOT advance the "current line" pointer |
454 |
bool isEof() { return isEOB; } |
455 |
bool eof() { return isEOB; } |
456 |
off_t getfpos() { return fpos(); } |
457 |
const char* getline() { return line(); } |
458 |
const char* getLine() { return line(); } |
459 |
int getLen() { return len(); } |
460 |
int linenumber() { return lno; } |
461 |
int lineno() { return lno; } |
462 |
int getLineNo() { return lno; } |
463 |
void putLine(); |
464 |
GReadBufLine(FILE* stream, int bcap=20) { |
465 |
if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine() |
466 |
bufcap=bcap; |
467 |
bufidx=-1; |
468 |
isEOB=false; |
469 |
isEOF=false; |
470 |
lno=0; |
471 |
GMALLOC(buf, bufcap * sizeof(BufLine)); |
472 |
for (int i=0;i<bufcap;i++) { |
473 |
buf[i].chars=NULL; |
474 |
buf[i].fpos=-1; |
475 |
buf[i].len=0; |
476 |
} |
477 |
file=stream; |
478 |
fillbuf(); |
479 |
} |
480 |
~GReadBufLine() { |
481 |
for (int i=0;i<bufcap;i++) { |
482 |
GFREE(buf[i].chars); |
483 |
} |
484 |
GFREE(buf); |
485 |
} |
486 |
}; |
487 |
|
488 |
#endif |