ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gcdb.h
Revision: 264
Committed: Mon Aug 27 18:21:11 2012 UTC (7 years, 6 months ago) by gpertea
File size: 13127 byte(s)
Log Message:
minor refactoring etc

Line User Rev File contents
1 gpertea 2 #ifndef __GCDB_H
2     #define __GCDB_H
3 gpertea 16 #include "GBase.h"
4 gpertea 2 #include <stddef.h>
5     #include <fcntl.h>
6    
// Memory-mapping support.
// Everywhere except Windows the constants and prototypes come straight from
// <sys/mman.h>. Under __WIN32__ that header is not used, so the handful of
// names this library needs are declared here instead (the mmap()/munmap()
// replacements themselves are defined outside this header).
#ifndef __WIN32__
 #include <sys/mman.h>
#else
 // page protection values
 #define PROT_READ      1
 #define PROT_WRITE     2
 #define PROT_READWRITE 3
 // mapping type values
 #define MAP_SHARED     1
 #define MAP_PRIVATE    2
 // file access-check mode values
 #define F_OK           0
 #define R_OK           4
 #define W_OK           2
 #define RW_OK          6

 // value signalling a failed mapping, unless something already defined it
 #ifndef MAP_FAILED
  #define MAP_FAILED ((void *) -1)
 #endif

 void *mmap(char *,size_t,int,int,int,off_t);
 int munmap(void *,size_t);
#endif
26    
27     //=====================================================
28     //------------- buffer stuff -------------------
29     //=====================================================
// Sizes (8192 each) named for the input and output sides of the buffering
// code; the places that use them are outside this header.
#define GCDBUFFER_INSIZE  8192
#define GCDBUFFER_OUTSIZE 8192

// Signature of the I/O callback stored in GCDBuffer::op:
// takes (int, char*, size_t) and returns int.
typedef int (*opfunc)(int, char*, size_t);

// File position type (previously: typedef unsigned long gcdb_seek_pos;)
typedef off_t gcdb_seek_pos;

// Conversion function pointer types. Each one receives an untyped pointer
// to the stored value and returns it as the named type.
typedef unsigned int (*uint_conv_func)(void*);  // -> unsigned int
typedef off_t        (*offt_conv_func)(void*);  // -> off_t (file offsets)
typedef int16_t      (*int16_conv_func)(void*); // -> int16_t

// Active conversion functions --> to platform independent values.
// These pointers are defined outside this header.
extern uint_conv_func  gcvt_uint;
extern offt_conv_func  gcvt_offt;
extern int16_conv_func gcvt_int16;

/* Candidate implementations, not exported from this header:
unsigned int uint32_sun(void* x86int);
unsigned int uint32_x86(void* x86int);
//for file offsets: off_t runtime conversions:
off_t offt_sun(void* offt);
off_t offt_x86(void* offt);
int16_t int16_sun(void* i16);
int16_t int16_x86(void* i16);
*/

// Called by the GCDBuffer constructor as its endianness check; presumably
// this is what assigns the gcvt_* pointers above (defined elsewhere).
void gcvt_endian_setup();
57 gpertea 2
58     class GCDBuffer {
59     public:
60     char *x;
61     unsigned int p;
62     unsigned int n;
63     int fd;
64     opfunc op;
65     //methods:
66 gpertea 264 GCDBuffer():x(NULL),p(0),n(0),fd(0),op(NULL) {
67 gpertea 2 }
68     GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) {
69     //check endianness
70 gpertea 16 gcvt_endian_setup();
71 gpertea 2 init(aop, afd, buf, len);
72     }
73     void init(opfunc aop,int afd,char *buf,unsigned int len) {
74     x=buf;
75     fd=afd;
76     op=aop;
77     p=0;
78     n=len;
79     }
80     int flush();
81     int write_all(char* buf, unsigned int pt);
82     int put(char* buf,unsigned int len);
83     int putalign(char* buf,unsigned int len);
84     int putflush(char* buf,unsigned int len);
85     int puts(char *buf);
86     int putsalign(char *buf);
87     int putsflush(char *buf);
88     int oneRead(char* buf, unsigned int len);
89     int getthis(char* buf,unsigned int len);
90     int get(char* buf,unsigned int len);
91     int bget(char* buf,unsigned int len);
92     int feed();
93     char *peek();
94     void seek(unsigned int len);
95     int copy(GCDBuffer* bin);
96     };
97    
98    
99     //=====================================================
100     //------------- cdb utils -------------------
101     //=====================================================
// errno has to come from <errno.h>. A hand-written "extern int errno;" is
// wrong on any platform where errno is a macro or a thread-local object
// (e.g. current glibc), where it fails to link or refers to the wrong
// variable. The header is available on Windows too, so no platform guard
// is needed.
#include <errno.h>

// Error code variables defined outside this header.
extern int error_intr;
extern int error_nomem;
extern int error_proto;
108    
//additional data to be appended to the cdb file:
// option flags, one distinct bit per option
#define CDBMSK_OPT_MULTI    0x00000001
#define CDBMSK_OPT_C        0x00000002
#define CDBMSK_OPT_CADD     0x00000004
#define CDBMSK_OPT_COMPRESS 0x00000008 //creates a compressed version of the database
#define CDBMSK_OPT_GSEQ     0x00000010

// The info structure declared after this point uses plenty of unions for
// ensuring compatibility with the old 'CIDX' info structure.

// Force 4-byte packing for the on-disk structures that follow:
// trying to prevent [64bit] machines to align this to 64bit -- sizeof() gets it wrong!
#pragma pack(4)
// eek, gcc 2.95.3 alpha-decosf version does not
// recognize this pragma directive
123    
124    
125 gpertea 2 struct cdbInfo {
126     uint32 num_keys;
127     union {
128     uint32 num_records;
129     char oldtag[4]; // 'CIDX' for old tag style
130     };
131     // data file size -- used to be uint32, now it could be 64bit
132     union {
133 gpertea 258 int64_t dbsize;
134 gpertea 2 uint32 oldnum[2]; //num_keys, num_records
135     };
136     union {
137     uint32 idxflags;
138     uint32 old_dbsize;
139     };
140     union {
141     int dbnamelen;
142     int old_idxflags;
143     };
144     // -- the actual db name precedes this fixed-size record
145     union {
146     char tag[4]; //'CDBX' for new files with LFS
147     uint32 old_dbnamelen;
148     };
149     };
150 gpertea 16
151     // for passing around index data:
152     struct CIdxData32 {
153     uint32 fpos;
154     uint32 reclen;
155     };
156     /*
157     struct CIdxSeqData32 { //4+4+2+1 = 11 bytes
158     uint32 fpos;
159     uint32 reclen;
160     uint16_t linelen; //line length for FASTA-formatted seq
161     byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
162     };
163     */
164     struct CIdxData {
165     off_t fpos; //64bit value on Linux
166     uint32 reclen;
167     };
168     /*
169     struct CIdxSeqData { //8+4+2+1 = 15 bytes
170     off_t fpos; //64bit value on Linux
171     uint32 reclen;
172     uint16_t linelen; //line length for FASTA-formatted seq
173     byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
174     };
175     */
176 gpertea 2 #pragma pack()
177    
178     extern int cdbInfoSIZE;
179 gpertea 16 extern int IdxDataSIZE;
180     extern int IdxDataSIZE32;
181     /*
182     extern int IdxSeqDataSIZE;
183     extern int IdxSeqDataSIZE32;
184     */
185 gpertea 2
186     void uint32_pack(char *,uint32);
187     void uint32_pack_big(char *,uint32);
188     void uint32_unpack(char *,uint32 *);
189     void uint32_unpack_big(char *,uint32 *);
190    
191     //=====================================================
192     //------------- cdb index -------------------
193     //=====================================================
194    
195     #define CDB_HPLIST 1000
196    
197     struct cdb_hp { uint32 h; uint32 p; } ;
198    
199     struct cdb_hplist {
200     struct cdb_hp hp[CDB_HPLIST];
201     struct cdb_hplist *next;
202     int num;
203     };
204    
205     //the index file should always be smaller than 4GB !
206    
207     class GCdbWrite {
208     GCDBuffer* cdbuf;
209     char bspace[8192];
210     char fname[1024];
211     char final[2048];
212     uint32 count[256];
213     uint32 start[256];
214     struct cdb_hplist *head;
215     struct cdb_hp *split; /* includes space for hash */
216     struct cdb_hp *hash;
217     uint32 numentries;
218     uint32 pos; //file position
219     int posplus(uint32 len);
220     int fd; //file descriptor
221     public:
222     //methods:
223     GCdbWrite(int afd); //was: init
224     GCdbWrite(char* fname);
225     ~GCdbWrite();
226     int addbegin(unsigned int keylen,unsigned int datalen);
227     int addend(unsigned int keylen,unsigned int datalen,uint32 h);
228     int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen);
229     int add(const char *key, char *data, unsigned int datalen);
230     int getNumEntries() { return numentries; }
231     int finish();
232     int close();
233     int getfd() { return fd; }
234     char* getfile() { return fname; }
235     };
236    
237    
238     //=====================================================
239     //------------- cdb -------------------
240     //=====================================================
241    
242     #define CDB_HASHSTART 5381
243    
244     uint32 cdb_hashadd(uint32,unsigned char);
245     uint32 cdb_hash(const char *,unsigned int);
246    
247     class GCdbRead {
248     uint32 size; // initialized if map is nonzero
249     uint32 loop; // number of hash slots searched under this key
250     uint32 khash; // initialized if loop is nonzero
251     uint32 kpos; // initialized if loop is nonzero
252     uint32 hpos; // initialized if loop is nonzero
253     uint32 hslots; // initialized if loop is nonzero
254     uint32 dpos; // initialized if cdb_findnext() returns 1
255     uint32 dlen; // initialized if cdb_findnext() returns 1
256     char fname[1024];
257     char *map; // 0 if no map is available
258     int fd;
259     public:
260     //methods:
261     GCdbRead(int fd); //was cdb_init
262     GCdbRead(char* afname); //was cdb_init
263     ~GCdbRead(); //was cdb_free
264     int read(char *,unsigned int,uint32);
265     int match(const char *key, unsigned int len, uint32 pos);
266     void findstart() { loop =0; }
267     int findnext(const char *key,unsigned int len);
268     int find(const char *key);
269     int datapos() { return dpos; }
270     int datalen() { return dlen; }
271     int getfd() { return fd; }
272     char* getfile() { return fname; }
273     };
274    
275     class GReadBuf {
276     protected:
277     FILE* f;
278     uchar* buf;
279     int buflen;
280     int bufused; //
281     int bufpos;
282     off_t fpos;
283     bool eof;
284     bool eob;
285    
286     int refill(bool repos=false) {
287     //refill the buffer-----------
288     if (repos && bufpos==0) return 0; //no need to repos
289     if (eof) return 0;
290     int fr=0;
291     if (repos && bufpos<bufused) {
292     int kept=bufused-bufpos;
293     memmove((void*)buf, (void*)(buf+bufpos),kept);
294     fr=(int)fread((void *)(buf+kept), 1, buflen-kept, f);
295     if (fr<buflen-kept) eof=true;
296     buf[kept+fr]='\0';
297     bufused=kept+fr;
298     }
299     else {
300     fr=(int)fread((void *)buf, 1, buflen, f);
301     if (fr<buflen) eof=true;
302     buf[fr]='\0'; //only for text record parsers
303     bufused=fr;
304     }
305     if (feof(f)) eof=true;
306     if (ferror(f)) {
307     GMessage("GReadBuf::refill - error at fread!\n");
308     eof=true;
309     }
310     bufpos=0;
311     fpos+=fr; //bytes read from file so far
312     return fr;
313     }
314     public:
315     GReadBuf(FILE* fin, int bsize=4096) {
316     f=fin;
317     buflen=bsize;
318     GMALLOC(buf,buflen+1);
319     bufpos=0; //current pointer for get function
320     bufused=0;
321     fpos=0;
322     eof=false;
323     eob=false;
324     refill();
325     }
326     ~GReadBuf() { GFREE(buf); }
327    
328     //reads len chars from stream into the outbuf
329     //updates bufpos
330     //->returns the number of bytes read
331     int get(uchar *outbuf, int len) {
332     if (eob) return 0;
333     int rd=0; //bytes read
334     while (!eob && rd<len) {
335     int to_read=GMIN((bufused-bufpos),(len-rd));
336     memcpy((void*)(outbuf+rd),(void*)(buf+bufpos), to_read);
337     bufpos+=to_read;
338     rd+=to_read;
339     if (bufpos>=bufused) {
340     if (eof) eob=true;
341     else refill();
342     }
343     }//while
344     return rd;
345     }
346    
347     uchar* getStr(uchar *outbuf, int len) {
348     int rd=get(outbuf,len);
349     if (rd==0) return NULL;
350     else {
351     outbuf[rd]='\0';
352     return outbuf;
353     }
354     }
355    
356     // getc equivalent
357     int getch() {
358     if (eob) return -1;
359     int ch=(int)(uchar)buf[bufpos];
360     bufpos++;
361     if (bufpos>=bufused) {
362     if (eof) eob=true;
363     else refill();
364     }
365     return ch;
366     }
367    
368     //---
369     bool isEof() { return eob; }
370     bool ended() { return eob; }
371     off_t getPos() {
372     //returns the virtual file position
373     // = the actual file offset of the byte at bufpos
374     return fpos-(bufused-bufpos);
375     }
376     //skip into the stream the specified number of bytes
377     int skip(int skiplen) {
378     if (eob) return 0;
379     int r=0; //the actual number of bytes skipped
380     while (skiplen && !eob) {
381     int dif=GMIN(bufused-bufpos,skiplen);
382     skiplen-=dif;
383     bufpos+=dif;
384     r+=dif;
385     if (bufpos>=bufused) {
386     if (eof) { eob=true; return r; }
387     refill();
388     }
389     }
390     return r;
391     }
392     //look ahead without updating the read pointer (bufpos)
393     //Cannot peek more than buflen!
394     int peek(uchar* outbuf, int len) {
395     if (eob) return -1;
396     //if (eob || len>buflen) return -1;
397     if (len>bufused-bufpos) refill(true);
398     int mlen=GMIN((bufused-bufpos),len);
399     memcpy((void*)outbuf, (void*)(buf+bufpos), mlen);
400     return mlen;
401     }
402 gpertea 16 char peekChar() {
403     if (eob) return -1;
404     //if (eob || len>buflen) return -1;
405     if (1>bufused-bufpos) refill(true);
406     return *(buf+bufpos);
407     }
408 gpertea 2 uchar* peekStr(uchar* outbuf, int len) {
409     int rd=peek(outbuf,len);
410     if (rd>0) { outbuf[rd]='\0'; return outbuf; }
411     else return NULL;
412     }
413     //looks ahead to check if what follows matches
414 gpertea 16 int peekCmp(char* cmpstr, int cmplen=-1) {
415     if (cmplen==0) return 0;
416 gpertea 2 if (eob) //GError("GReadBuf::peekcmp error: eob!\n");
417     return -2;
418 gpertea 16 if (cmplen<0) cmplen=strlen(cmpstr);
419 gpertea 2 if (cmplen>bufused-bufpos) {
420     refill(true);
421     if (cmplen>bufused-bufpos) return -2;
422     }
423     //use memcmp
424     return memcmp((void*)(buf+bufpos), cmpstr, cmplen);
425     }
426    
427     };
428    
429     //circular line buffer, with read-ahead (peeking) capability
430     class GReadBufLine {
431     protected:
432     struct BufLine {
433     off_t fpos;
434     int len;
435     char* chars;
436     };
437     int bufcap; //total number of lines in the buf array
438     int bufidx; // the "current line" index in buf array
439     bool isEOF;
440     int lno;
441     FILE* file;
442     off_t filepos; //current file/stream offset for the first char of buf[bufidx]
443     BufLine* buf; //array of bufferred lines
444     char* readline(int idx);//read line from file into the buffer
445     int fillbuf();
446     bool isEOB;
447     public:
448     const char* line(); //gets current line and advances the "current line" pointer
449     //use putLine() to revert/undo this advancement
450     off_t fpos(); //gets current line's byte offset in the file
451     // does NOT advance the "current line" pointer
452     int len(); //gets current line's length
453     // does NOT advance the "current line" pointer
454     bool isEof() { return isEOB; }
455     bool eof() { return isEOB; }
456     off_t getfpos() { return fpos(); }
457     const char* getline() { return line(); }
458     const char* getLine() { return line(); }
459     int getLen() { return len(); }
460     int linenumber() { return lno; }
461     int lineno() { return lno; }
462     int getLineNo() { return lno; }
463     void putLine();
464     GReadBufLine(FILE* stream, int bcap=20) {
465     if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine()
466     bufcap=bcap;
467     bufidx=-1;
468     isEOB=false;
469     isEOF=false;
470     lno=0;
471     GMALLOC(buf, bufcap * sizeof(BufLine));
472     for (int i=0;i<bufcap;i++) {
473     buf[i].chars=NULL;
474     buf[i].fpos=-1;
475     buf[i].len=0;
476     }
477     file=stream;
478     fillbuf();
479     }
480     ~GReadBufLine() {
481     for (int i=0;i<bufcap;i++) {
482     GFREE(buf[i].chars);
483     }
484     GFREE(buf);
485     }
486     };
487    
488     #endif