ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gcdb.h
Revision: 264
Committed: Mon Aug 27 18:21:11 2012 UTC (6 years, 10 months ago) by gpertea
File size: 13127 byte(s)
Log Message:
minor refactoring etc

Line File contents
1 #ifndef __GCDB_H
2 #define __GCDB_H
3 #include "GBase.h"
4 #include <stddef.h>
5 #include <fcntl.h>
6
// Memory-mapping support.
// On Windows there is no <sys/mman.h>, so the constants and the two function
// prototypes used by the mapping code are declared here; the functions are
// only declared in this header, not defined.
// Everywhere else the system header provides all of this.
#ifdef __WIN32__

// protection flags
#define PROT_READ      1
#define PROT_WRITE     2
#define PROT_READWRITE 3

// mapping flags
#define MAP_SHARED     1
#define MAP_PRIVATE    2

// access-mode constants
#define F_OK           0
#define R_OK           4
#define W_OK           2
#define RW_OK          6

#ifndef MAP_FAILED
#define MAP_FAILED ((void *) -1)
#endif

void *mmap(char *addr, size_t len, int prot, int flags, int fd, off_t offset);
int munmap(void *addr, size_t len);

#else

#include <sys/mman.h>

#endif
26
27 //=====================================================
28 //------------- buffer stuff -------------------
29 //=====================================================
// Sizes (in bytes) named for the input and output buffers.
#define GCDBUFFER_INSIZE  8192
#define GCDBUFFER_OUTSIZE 8192

// Signature of the low-level I/O operation stored in GCDBuffer::op.
// NOTE(review): the arguments look like (file descriptor, buffer, byte count),
// i.e. read()/write()-shaped -- confirm against the implementation.
typedef int (*opfunc)(int, char*, size_t);

// File position type (an earlier revision used unsigned long here).
typedef off_t gcdb_seek_pos;

// Pointers to functions that read a value through an untyped pointer and
// return it as the named type.
typedef unsigned int (*uint_conv_func)(void*);  // -> unsigned int
typedef off_t        (*offt_conv_func)(void*);  // -> off_t
typedef int16_t      (*int16_conv_func)(void*); // -> int16_t

// The conversion functions currently in effect ("to platform independent
// uint", per the original note). They are defined outside this header.
extern uint_conv_func  gcvt_uint;
extern offt_conv_func  gcvt_offt;
extern int16_conv_func gcvt_int16;

/* Concrete converters, kept out of the public interface:
unsigned int uint32_sun(void* x86int);
unsigned int uint32_x86(void* x86int);
//for file offsets: off_t runtime conversions:
off_t offt_sun(void* offt);
off_t offt_x86(void* offt);
int16_t int16_sun(void* i16);
int16_t int16_x86(void* i16);
*/

// Called by GCDBuffer's 4-argument constructor as its endianness check.
// NOTE(review): presumably this is what assigns the gcvt_* pointers -- confirm.
void gcvt_endian_setup();
57
58 class GCDBuffer {
59 public:
60 char *x;
61 unsigned int p;
62 unsigned int n;
63 int fd;
64 opfunc op;
65 //methods:
66 GCDBuffer():x(NULL),p(0),n(0),fd(0),op(NULL) {
67 }
68 GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) {
69 //check endianness
70 gcvt_endian_setup();
71 init(aop, afd, buf, len);
72 }
73 void init(opfunc aop,int afd,char *buf,unsigned int len) {
74 x=buf;
75 fd=afd;
76 op=aop;
77 p=0;
78 n=len;
79 }
80 int flush();
81 int write_all(char* buf, unsigned int pt);
82 int put(char* buf,unsigned int len);
83 int putalign(char* buf,unsigned int len);
84 int putflush(char* buf,unsigned int len);
85 int puts(char *buf);
86 int putsalign(char *buf);
87 int putsflush(char *buf);
88 int oneRead(char* buf, unsigned int len);
89 int getthis(char* buf,unsigned int len);
90 int get(char* buf,unsigned int len);
91 int bget(char* buf,unsigned int len);
92 int feed();
93 char *peek();
94 void seek(unsigned int len);
95 int copy(GCDBuffer* bin);
96 };
97
98
99 //=====================================================
100 //------------- cdb utils -------------------
101 //=====================================================
// errno has to come from <errno.h>. On current C libraries it is a macro
// (typically expanding to a per-thread lvalue), so a hand-written
// `extern int errno;` either fails to build/link or names the wrong object.
#include <errno.h>

// Error codes used by the cdb code; defined outside this header.
extern int error_intr;
extern int error_nomem;
extern int error_proto;
108
109 //additional data to be appended to the cdb file:
110 #define CDBMSK_OPT_MULTI 0x00000001
111 #define CDBMSK_OPT_C 0x00000002
112 #define CDBMSK_OPT_CADD 0x00000004
113 #define CDBMSK_OPT_COMPRESS 0x00000008
114 #define CDBMSK_OPT_GSEQ 0x00000010
115 //creates a compressed version of the database
116 //uses plenty of unions for ensuring compatibility with
117 // the old 'CIDX' info structure
118
119 //trying to prevent [64bit] machines to align this to 64bit -- sizeof() gets it wrong!
120 #pragma pack(4)
121 // eek, gcc 2.95.3 alpha-decosf version does not
122 // recognize this pragma directive
123
124
125 struct cdbInfo {
126 uint32 num_keys;
127 union {
128 uint32 num_records;
129 char oldtag[4]; // 'CIDX' for old tag style
130 };
131 // data file size -- used to be uint32, now it could be 64bit
132 union {
133 int64_t dbsize;
134 uint32 oldnum[2]; //num_keys, num_records
135 };
136 union {
137 uint32 idxflags;
138 uint32 old_dbsize;
139 };
140 union {
141 int dbnamelen;
142 int old_idxflags;
143 };
144 // -- the actual db name precedes this fixed-size record
145 union {
146 char tag[4]; //'CDBX' for new files with LFS
147 uint32 old_dbnamelen;
148 };
149 };
150
151 // for passing around index data:
152 struct CIdxData32 {
153 uint32 fpos;
154 uint32 reclen;
155 };
156 /*
157 struct CIdxSeqData32 { //4+4+2+1 = 11 bytes
158 uint32 fpos;
159 uint32 reclen;
160 uint16_t linelen; //line length for FASTA-formatted seq
161 byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
162 };
163 */
164 struct CIdxData {
165 off_t fpos; //64bit value on Linux
166 uint32 reclen;
167 };
168 /*
169 struct CIdxSeqData { //8+4+2+1 = 15 bytes
170 off_t fpos; //64bit value on Linux
171 uint32 reclen;
172 uint16_t linelen; //line length for FASTA-formatted seq
173 byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
174 };
175 */
176 #pragma pack()
177
178 extern int cdbInfoSIZE;
179 extern int IdxDataSIZE;
180 extern int IdxDataSIZE32;
181 /*
182 extern int IdxSeqDataSIZE;
183 extern int IdxSeqDataSIZE32;
184 */
185
186 void uint32_pack(char *,uint32);
187 void uint32_pack_big(char *,uint32);
188 void uint32_unpack(char *,uint32 *);
189 void uint32_unpack_big(char *,uint32 *);
190
191 //=====================================================
192 //------------- cdb index -------------------
193 //=====================================================
194
195 #define CDB_HPLIST 1000
196
197 struct cdb_hp { uint32 h; uint32 p; } ;
198
199 struct cdb_hplist {
200 struct cdb_hp hp[CDB_HPLIST];
201 struct cdb_hplist *next;
202 int num;
203 };
204
205 //the index file should always be smaller than 4GB !
206
207 class GCdbWrite {
208 GCDBuffer* cdbuf;
209 char bspace[8192];
210 char fname[1024];
211 char final[2048];
212 uint32 count[256];
213 uint32 start[256];
214 struct cdb_hplist *head;
215 struct cdb_hp *split; /* includes space for hash */
216 struct cdb_hp *hash;
217 uint32 numentries;
218 uint32 pos; //file position
219 int posplus(uint32 len);
220 int fd; //file descriptor
221 public:
222 //methods:
223 GCdbWrite(int afd); //was: init
224 GCdbWrite(char* fname);
225 ~GCdbWrite();
226 int addbegin(unsigned int keylen,unsigned int datalen);
227 int addend(unsigned int keylen,unsigned int datalen,uint32 h);
228 int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen);
229 int add(const char *key, char *data, unsigned int datalen);
230 int getNumEntries() { return numentries; }
231 int finish();
232 int close();
233 int getfd() { return fd; }
234 char* getfile() { return fname; }
235 };
236
237
238 //=====================================================
239 //------------- cdb -------------------
240 //=====================================================
241
242 #define CDB_HASHSTART 5381
243
244 uint32 cdb_hashadd(uint32,unsigned char);
245 uint32 cdb_hash(const char *,unsigned int);
246
247 class GCdbRead {
248 uint32 size; // initialized if map is nonzero
249 uint32 loop; // number of hash slots searched under this key
250 uint32 khash; // initialized if loop is nonzero
251 uint32 kpos; // initialized if loop is nonzero
252 uint32 hpos; // initialized if loop is nonzero
253 uint32 hslots; // initialized if loop is nonzero
254 uint32 dpos; // initialized if cdb_findnext() returns 1
255 uint32 dlen; // initialized if cdb_findnext() returns 1
256 char fname[1024];
257 char *map; // 0 if no map is available
258 int fd;
259 public:
260 //methods:
261 GCdbRead(int fd); //was cdb_init
262 GCdbRead(char* afname); //was cdb_init
263 ~GCdbRead(); //was cdb_free
264 int read(char *,unsigned int,uint32);
265 int match(const char *key, unsigned int len, uint32 pos);
266 void findstart() { loop =0; }
267 int findnext(const char *key,unsigned int len);
268 int find(const char *key);
269 int datapos() { return dpos; }
270 int datalen() { return dlen; }
271 int getfd() { return fd; }
272 char* getfile() { return fname; }
273 };
274
275 class GReadBuf {
276 protected:
277 FILE* f;
278 uchar* buf;
279 int buflen;
280 int bufused; //
281 int bufpos;
282 off_t fpos;
283 bool eof;
284 bool eob;
285
286 int refill(bool repos=false) {
287 //refill the buffer-----------
288 if (repos && bufpos==0) return 0; //no need to repos
289 if (eof) return 0;
290 int fr=0;
291 if (repos && bufpos<bufused) {
292 int kept=bufused-bufpos;
293 memmove((void*)buf, (void*)(buf+bufpos),kept);
294 fr=(int)fread((void *)(buf+kept), 1, buflen-kept, f);
295 if (fr<buflen-kept) eof=true;
296 buf[kept+fr]='\0';
297 bufused=kept+fr;
298 }
299 else {
300 fr=(int)fread((void *)buf, 1, buflen, f);
301 if (fr<buflen) eof=true;
302 buf[fr]='\0'; //only for text record parsers
303 bufused=fr;
304 }
305 if (feof(f)) eof=true;
306 if (ferror(f)) {
307 GMessage("GReadBuf::refill - error at fread!\n");
308 eof=true;
309 }
310 bufpos=0;
311 fpos+=fr; //bytes read from file so far
312 return fr;
313 }
314 public:
315 GReadBuf(FILE* fin, int bsize=4096) {
316 f=fin;
317 buflen=bsize;
318 GMALLOC(buf,buflen+1);
319 bufpos=0; //current pointer for get function
320 bufused=0;
321 fpos=0;
322 eof=false;
323 eob=false;
324 refill();
325 }
326 ~GReadBuf() { GFREE(buf); }
327
328 //reads len chars from stream into the outbuf
329 //updates bufpos
330 //->returns the number of bytes read
331 int get(uchar *outbuf, int len) {
332 if (eob) return 0;
333 int rd=0; //bytes read
334 while (!eob && rd<len) {
335 int to_read=GMIN((bufused-bufpos),(len-rd));
336 memcpy((void*)(outbuf+rd),(void*)(buf+bufpos), to_read);
337 bufpos+=to_read;
338 rd+=to_read;
339 if (bufpos>=bufused) {
340 if (eof) eob=true;
341 else refill();
342 }
343 }//while
344 return rd;
345 }
346
347 uchar* getStr(uchar *outbuf, int len) {
348 int rd=get(outbuf,len);
349 if (rd==0) return NULL;
350 else {
351 outbuf[rd]='\0';
352 return outbuf;
353 }
354 }
355
356 // getc equivalent
357 int getch() {
358 if (eob) return -1;
359 int ch=(int)(uchar)buf[bufpos];
360 bufpos++;
361 if (bufpos>=bufused) {
362 if (eof) eob=true;
363 else refill();
364 }
365 return ch;
366 }
367
368 //---
369 bool isEof() { return eob; }
370 bool ended() { return eob; }
371 off_t getPos() {
372 //returns the virtual file position
373 // = the actual file offset of the byte at bufpos
374 return fpos-(bufused-bufpos);
375 }
376 //skip into the stream the specified number of bytes
377 int skip(int skiplen) {
378 if (eob) return 0;
379 int r=0; //the actual number of bytes skipped
380 while (skiplen && !eob) {
381 int dif=GMIN(bufused-bufpos,skiplen);
382 skiplen-=dif;
383 bufpos+=dif;
384 r+=dif;
385 if (bufpos>=bufused) {
386 if (eof) { eob=true; return r; }
387 refill();
388 }
389 }
390 return r;
391 }
392 //look ahead without updating the read pointer (bufpos)
393 //Cannot peek more than buflen!
394 int peek(uchar* outbuf, int len) {
395 if (eob) return -1;
396 //if (eob || len>buflen) return -1;
397 if (len>bufused-bufpos) refill(true);
398 int mlen=GMIN((bufused-bufpos),len);
399 memcpy((void*)outbuf, (void*)(buf+bufpos), mlen);
400 return mlen;
401 }
402 char peekChar() {
403 if (eob) return -1;
404 //if (eob || len>buflen) return -1;
405 if (1>bufused-bufpos) refill(true);
406 return *(buf+bufpos);
407 }
408 uchar* peekStr(uchar* outbuf, int len) {
409 int rd=peek(outbuf,len);
410 if (rd>0) { outbuf[rd]='\0'; return outbuf; }
411 else return NULL;
412 }
413 //looks ahead to check if what follows matches
414 int peekCmp(char* cmpstr, int cmplen=-1) {
415 if (cmplen==0) return 0;
416 if (eob) //GError("GReadBuf::peekcmp error: eob!\n");
417 return -2;
418 if (cmplen<0) cmplen=strlen(cmpstr);
419 if (cmplen>bufused-bufpos) {
420 refill(true);
421 if (cmplen>bufused-bufpos) return -2;
422 }
423 //use memcmp
424 return memcmp((void*)(buf+bufpos), cmpstr, cmplen);
425 }
426
427 };
428
429 //circular line buffer, with read-ahead (peeking) capability
430 class GReadBufLine {
431 protected:
432 struct BufLine {
433 off_t fpos;
434 int len;
435 char* chars;
436 };
437 int bufcap; //total number of lines in the buf array
438 int bufidx; // the "current line" index in buf array
439 bool isEOF;
440 int lno;
441 FILE* file;
442 off_t filepos; //current file/stream offset for the first char of buf[bufidx]
443 BufLine* buf; //array of bufferred lines
444 char* readline(int idx);//read line from file into the buffer
445 int fillbuf();
446 bool isEOB;
447 public:
448 const char* line(); //gets current line and advances the "current line" pointer
449 //use putLine() to revert/undo this advancement
450 off_t fpos(); //gets current line's byte offset in the file
451 // does NOT advance the "current line" pointer
452 int len(); //gets current line's length
453 // does NOT advance the "current line" pointer
454 bool isEof() { return isEOB; }
455 bool eof() { return isEOB; }
456 off_t getfpos() { return fpos(); }
457 const char* getline() { return line(); }
458 const char* getLine() { return line(); }
459 int getLen() { return len(); }
460 int linenumber() { return lno; }
461 int lineno() { return lno; }
462 int getLineNo() { return lno; }
463 void putLine();
464 GReadBufLine(FILE* stream, int bcap=20) {
465 if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine()
466 bufcap=bcap;
467 bufidx=-1;
468 isEOB=false;
469 isEOF=false;
470 lno=0;
471 GMALLOC(buf, bufcap * sizeof(BufLine));
472 for (int i=0;i<bufcap;i++) {
473 buf[i].chars=NULL;
474 buf[i].fpos=-1;
475 buf[i].len=0;
476 }
477 file=stream;
478 fillbuf();
479 }
480 ~GReadBufLine() {
481 for (int i=0;i<bufcap;i++) {
482 GFREE(buf[i].chars);
483 }
484 GFREE(buf);
485 }
486 };
487
488 #endif