ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gcdb.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (7 years, 11 months ago) by gpertea
File size: 13136 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 2 #ifndef __GCDB_H
2     #define __GCDB_H
#include "GBase.h"
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
6    
// Memory-mapping support.
// On Windows builds (__WIN32__) the POSIX constants and the mmap()/munmap()
// prototypes are declared here; the functions themselves are defined outside
// this header. Everywhere else <sys/mman.h> supplies them.
#ifdef __WIN32__
   // page protection flags
#  define PROT_READ      1
#  define PROT_WRITE     2
#  define PROT_READWRITE 3
   // mapping kinds
#  define MAP_SHARED     1
#  define MAP_PRIVATE    2
   // access() mode bits
#  define F_OK  0
#  define R_OK  4
#  define W_OK  2
#  define RW_OK 6

#  ifndef MAP_FAILED
#    define MAP_FAILED ((void *) -1)
#  endif

void *mmap(char *, size_t, int, int, int, off_t);
int munmap(void *, size_t);
#else
#  include <sys/mman.h>
#endif
26    
27     //=====================================================
28     //------------- buffer stuff -------------------
29     //=====================================================
// Default sizes, in bytes, of the input and output buffers.
#define GCDBUFFER_INSIZE  8192
#define GCDBUFFER_OUTSIZE 8192

// I/O callback type stored in GCDBuffer::op: takes a file descriptor,
// a byte buffer and a byte count.
typedef int (*opfunc)(int, char*, size_t);

// File offset type used for seeking (formerly: unsigned long).
typedef off_t gcdb_seek_pos;

// Function-pointer types for the value converters below. Each takes a pointer
// to the raw value and returns it as the named type.
typedef unsigned int (*uint_conv_func)(void*);  // -> unsigned int
typedef off_t        (*offt_conv_func)(void*);  // -> off_t (file offsets)
typedef int16_t      (*int16_conv_func)(void*); // -> int16_t

// Active converters ("to platform independent uint", per the original note).
// They are defined outside this header.
extern uint_conv_func  gcvt_uint;
extern offt_conv_func  gcvt_offt;
extern int16_conv_func gcvt_int16;

// Candidate implementations (not declared publicly), one pair per value type:
//   uint32_sun / uint32_x86   -> unsigned int
//   offt_sun   / offt_x86     -> off_t (runtime conversion of file offsets)
//   int16_sun  / int16_x86    -> int16_t

// Called by GCDBuffer's 4-argument constructor ("check endianness").
// Presumably selects the converters above; defined elsewhere.
void gcvt_endian_setup();
57 gpertea 2
58     class GCDBuffer {
59     public:
60     char *x;
61     unsigned int p;
62     unsigned int n;
63     int fd;
64     opfunc op;
65     //methods:
66     GCDBuffer() {
67     x=NULL;
68     fd=0;
69     op=NULL;
70     n=0;
71     }
72     GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) {
73     //check endianness
74 gpertea 16 gcvt_endian_setup();
75 gpertea 2 init(aop, afd, buf, len);
76     }
77     void init(opfunc aop,int afd,char *buf,unsigned int len) {
78     x=buf;
79     fd=afd;
80     op=aop;
81     p=0;
82     n=len;
83     }
84     int flush();
85     int write_all(char* buf, unsigned int pt);
86     int put(char* buf,unsigned int len);
87     int putalign(char* buf,unsigned int len);
88     int putflush(char* buf,unsigned int len);
89     int puts(char *buf);
90     int putsalign(char *buf);
91     int putsflush(char *buf);
92     int oneRead(char* buf, unsigned int len);
93     int getthis(char* buf,unsigned int len);
94     int get(char* buf,unsigned int len);
95     int bget(char* buf,unsigned int len);
96     int feed();
97     char *peek();
98     void seek(unsigned int len);
99     int copy(GCDBuffer* bin);
100     };
101    
102    
103     //=====================================================
104     //------------- cdb utils -------------------
105     //=====================================================
// errno is provided by <errno.h>, included at the top of this header.
// It must not be redeclared as a plain `extern int errno;`: the standard
// allows errno to be a macro, and on current C libraries it is thread-local,
// so such a declaration fails to compile or link.

// Error codes used by the cdb code; defined outside this header.
extern int error_intr;
extern int error_nomem;
extern int error_proto;
112    
113     //additional data to be appended to the cdb file:
// Option bit masks (one bit each, so they can be OR-ed together).
#define CDBMSK_OPT_MULTI    0x01
#define CDBMSK_OPT_C        0x02
#define CDBMSK_OPT_CADD     0x04
#define CDBMSK_OPT_COMPRESS 0x08  // creates a compressed version of the database
#define CDBMSK_OPT_GSEQ     0x10
120     //uses plenty of unions for ensuring compatibility with
121     // the old 'CIDX' info structure
122    
123 gpertea 16 //trying to prevent [64bit] machines to align this to 64bit -- sizeof() gets it wrong!
124 gpertea 2 #pragma pack(4)
125 gpertea 16 // eek, gcc 2.95.3 alpha-decosf version does not
126     // recognize this pragma directive
127    
128    
129 gpertea 2 struct cdbInfo {
130     uint32 num_keys;
131     union {
132     uint32 num_records;
133     char oldtag[4]; // 'CIDX' for old tag style
134     };
135     // data file size -- used to be uint32, now it could be 64bit
136     union {
137     off_t dbsize;
138     uint32 oldnum[2]; //num_keys, num_records
139     };
140     union {
141     uint32 idxflags;
142     uint32 old_dbsize;
143     };
144     union {
145     int dbnamelen;
146     int old_idxflags;
147     };
148     // -- the actual db name precedes this fixed-size record
149     union {
150     char tag[4]; //'CDBX' for new files with LFS
151     uint32 old_dbnamelen;
152     };
153     };
154 gpertea 16
155     // for passing around index data:
156     struct CIdxData32 {
157     uint32 fpos;
158     uint32 reclen;
159     };
160     /*
161     struct CIdxSeqData32 { //4+4+2+1 = 11 bytes
162     uint32 fpos;
163     uint32 reclen;
164     uint16_t linelen; //line length for FASTA-formatted seq
165     byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
166     };
167     */
168     struct CIdxData {
169     off_t fpos; //64bit value on Linux
170     uint32 reclen;
171     };
172     /*
173     struct CIdxSeqData { //8+4+2+1 = 15 bytes
174     off_t fpos; //64bit value on Linux
175     uint32 reclen;
176     uint16_t linelen; //line length for FASTA-formatted seq
177     byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
178     };
179     */
180 gpertea 2 #pragma pack()
181    
182     extern int cdbInfoSIZE;
183 gpertea 16 extern int IdxDataSIZE;
184     extern int IdxDataSIZE32;
185     /*
186     extern int IdxSeqDataSIZE;
187     extern int IdxSeqDataSIZE32;
188     */
189 gpertea 2
190     void uint32_pack(char *,uint32);
191     void uint32_pack_big(char *,uint32);
192     void uint32_unpack(char *,uint32 *);
193     void uint32_unpack_big(char *,uint32 *);
194    
195     //=====================================================
196     //------------- cdb index -------------------
197     //=====================================================
198    
199     #define CDB_HPLIST 1000
200    
201     struct cdb_hp { uint32 h; uint32 p; } ;
202    
203     struct cdb_hplist {
204     struct cdb_hp hp[CDB_HPLIST];
205     struct cdb_hplist *next;
206     int num;
207     };
208    
209     //the index file should always be smaller than 4GB !
210    
211     class GCdbWrite {
212     GCDBuffer* cdbuf;
213     char bspace[8192];
214     char fname[1024];
215     char final[2048];
216     uint32 count[256];
217     uint32 start[256];
218     struct cdb_hplist *head;
219     struct cdb_hp *split; /* includes space for hash */
220     struct cdb_hp *hash;
221     uint32 numentries;
222     uint32 pos; //file position
223     int posplus(uint32 len);
224     int fd; //file descriptor
225     public:
226     //methods:
227     GCdbWrite(int afd); //was: init
228     GCdbWrite(char* fname);
229     ~GCdbWrite();
230     int addbegin(unsigned int keylen,unsigned int datalen);
231     int addend(unsigned int keylen,unsigned int datalen,uint32 h);
232     int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen);
233     int add(const char *key, char *data, unsigned int datalen);
234     int getNumEntries() { return numentries; }
235     int finish();
236     int close();
237     int getfd() { return fd; }
238     char* getfile() { return fname; }
239     };
240    
241    
242     //=====================================================
243     //------------- cdb -------------------
244     //=====================================================
245    
246     #define CDB_HASHSTART 5381
247    
248     uint32 cdb_hashadd(uint32,unsigned char);
249     uint32 cdb_hash(const char *,unsigned int);
250    
251     class GCdbRead {
252     uint32 size; // initialized if map is nonzero
253     uint32 loop; // number of hash slots searched under this key
254     uint32 khash; // initialized if loop is nonzero
255     uint32 kpos; // initialized if loop is nonzero
256     uint32 hpos; // initialized if loop is nonzero
257     uint32 hslots; // initialized if loop is nonzero
258     uint32 dpos; // initialized if cdb_findnext() returns 1
259     uint32 dlen; // initialized if cdb_findnext() returns 1
260     char fname[1024];
261     char *map; // 0 if no map is available
262     int fd;
263     public:
264     //methods:
265     GCdbRead(int fd); //was cdb_init
266     GCdbRead(char* afname); //was cdb_init
267     ~GCdbRead(); //was cdb_free
268     int read(char *,unsigned int,uint32);
269     int match(const char *key, unsigned int len, uint32 pos);
270     void findstart() { loop =0; }
271     int findnext(const char *key,unsigned int len);
272     int find(const char *key);
273     int datapos() { return dpos; }
274     int datalen() { return dlen; }
275     int getfd() { return fd; }
276     char* getfile() { return fname; }
277     };
278    
279     class GReadBuf {
280     protected:
281     FILE* f;
282     uchar* buf;
283     int buflen;
284     int bufused; //
285     int bufpos;
286     off_t fpos;
287     bool eof;
288     bool eob;
289    
290     int refill(bool repos=false) {
291     //refill the buffer-----------
292     if (repos && bufpos==0) return 0; //no need to repos
293     if (eof) return 0;
294     int fr=0;
295     if (repos && bufpos<bufused) {
296     int kept=bufused-bufpos;
297     memmove((void*)buf, (void*)(buf+bufpos),kept);
298     fr=(int)fread((void *)(buf+kept), 1, buflen-kept, f);
299     if (fr<buflen-kept) eof=true;
300     buf[kept+fr]='\0';
301     bufused=kept+fr;
302     }
303     else {
304     fr=(int)fread((void *)buf, 1, buflen, f);
305     if (fr<buflen) eof=true;
306     buf[fr]='\0'; //only for text record parsers
307     bufused=fr;
308     }
309     if (feof(f)) eof=true;
310     if (ferror(f)) {
311     GMessage("GReadBuf::refill - error at fread!\n");
312     eof=true;
313     }
314     bufpos=0;
315     fpos+=fr; //bytes read from file so far
316     return fr;
317     }
318     public:
319     GReadBuf(FILE* fin, int bsize=4096) {
320     f=fin;
321     buflen=bsize;
322     GMALLOC(buf,buflen+1);
323     bufpos=0; //current pointer for get function
324     bufused=0;
325     fpos=0;
326     eof=false;
327     eob=false;
328     refill();
329     }
330     ~GReadBuf() { GFREE(buf); }
331    
332     //reads len chars from stream into the outbuf
333     //updates bufpos
334     //->returns the number of bytes read
335     int get(uchar *outbuf, int len) {
336     if (eob) return 0;
337     int rd=0; //bytes read
338     while (!eob && rd<len) {
339     int to_read=GMIN((bufused-bufpos),(len-rd));
340     memcpy((void*)(outbuf+rd),(void*)(buf+bufpos), to_read);
341     bufpos+=to_read;
342     rd+=to_read;
343     if (bufpos>=bufused) {
344     if (eof) eob=true;
345     else refill();
346     }
347     }//while
348     return rd;
349     }
350    
351     uchar* getStr(uchar *outbuf, int len) {
352     int rd=get(outbuf,len);
353     if (rd==0) return NULL;
354     else {
355     outbuf[rd]='\0';
356     return outbuf;
357     }
358     }
359    
360     // getc equivalent
361     int getch() {
362     if (eob) return -1;
363     int ch=(int)(uchar)buf[bufpos];
364     bufpos++;
365     if (bufpos>=bufused) {
366     if (eof) eob=true;
367     else refill();
368     }
369     return ch;
370     }
371    
372     //---
373     bool isEof() { return eob; }
374     bool ended() { return eob; }
375     off_t getPos() {
376     //returns the virtual file position
377     // = the actual file offset of the byte at bufpos
378     return fpos-(bufused-bufpos);
379     }
380     //skip into the stream the specified number of bytes
381     int skip(int skiplen) {
382     if (eob) return 0;
383     int r=0; //the actual number of bytes skipped
384     while (skiplen && !eob) {
385     int dif=GMIN(bufused-bufpos,skiplen);
386     skiplen-=dif;
387     bufpos+=dif;
388     r+=dif;
389     if (bufpos>=bufused) {
390     if (eof) { eob=true; return r; }
391     refill();
392     }
393     }
394     return r;
395     }
396     //look ahead without updating the read pointer (bufpos)
397     //Cannot peek more than buflen!
398     int peek(uchar* outbuf, int len) {
399     if (eob) return -1;
400     //if (eob || len>buflen) return -1;
401     if (len>bufused-bufpos) refill(true);
402     int mlen=GMIN((bufused-bufpos),len);
403     memcpy((void*)outbuf, (void*)(buf+bufpos), mlen);
404     return mlen;
405     }
406 gpertea 16 char peekChar() {
407     if (eob) return -1;
408     //if (eob || len>buflen) return -1;
409     if (1>bufused-bufpos) refill(true);
410     return *(buf+bufpos);
411     }
412 gpertea 2 uchar* peekStr(uchar* outbuf, int len) {
413     int rd=peek(outbuf,len);
414     if (rd>0) { outbuf[rd]='\0'; return outbuf; }
415     else return NULL;
416     }
417     //looks ahead to check if what follows matches
418 gpertea 16 int peekCmp(char* cmpstr, int cmplen=-1) {
419     if (cmplen==0) return 0;
420 gpertea 2 if (eob) //GError("GReadBuf::peekcmp error: eob!\n");
421     return -2;
422 gpertea 16 if (cmplen<0) cmplen=strlen(cmpstr);
423 gpertea 2 if (cmplen>bufused-bufpos) {
424     refill(true);
425     if (cmplen>bufused-bufpos) return -2;
426     }
427     //use memcmp
428     return memcmp((void*)(buf+bufpos), cmpstr, cmplen);
429     }
430    
431     };
432    
433     //circular line buffer, with read-ahead (peeking) capability
434     class GReadBufLine {
435     protected:
436     struct BufLine {
437     off_t fpos;
438     int len;
439     char* chars;
440     };
441     int bufcap; //total number of lines in the buf array
442     int bufidx; // the "current line" index in buf array
443     bool isEOF;
444     int lno;
445     FILE* file;
446     off_t filepos; //current file/stream offset for the first char of buf[bufidx]
447     BufLine* buf; //array of bufferred lines
448     char* readline(int idx);//read line from file into the buffer
449     int fillbuf();
450     bool isEOB;
451     public:
452     const char* line(); //gets current line and advances the "current line" pointer
453     //use putLine() to revert/undo this advancement
454     off_t fpos(); //gets current line's byte offset in the file
455     // does NOT advance the "current line" pointer
456     int len(); //gets current line's length
457     // does NOT advance the "current line" pointer
458     bool isEof() { return isEOB; }
459     bool eof() { return isEOB; }
460     off_t getfpos() { return fpos(); }
461     const char* getline() { return line(); }
462     const char* getLine() { return line(); }
463     int getLen() { return len(); }
464     int linenumber() { return lno; }
465     int lineno() { return lno; }
466     int getLineNo() { return lno; }
467     void putLine();
468     GReadBufLine(FILE* stream, int bcap=20) {
469     if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine()
470     bufcap=bcap;
471     bufidx=-1;
472     isEOB=false;
473     isEOF=false;
474     lno=0;
475     GMALLOC(buf, bufcap * sizeof(BufLine));
476     for (int i=0;i<bufcap;i++) {
477     buf[i].chars=NULL;
478     buf[i].fpos=-1;
479     buf[i].len=0;
480     }
481     file=stream;
482     fillbuf();
483     }
484     ~GReadBufLine() {
485     for (int i=0;i<bufcap;i++) {
486     GFREE(buf[i].chars);
487     }
488     GFREE(buf);
489     }
490     };
491    
492     #endif