ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cdbfasta/cdbyank.cpp
Revision: 8
Committed: Mon Mar 22 22:11:25 2010 UTC (12 years, 6 months ago) by gpertea
File size: 22888 byte(s)
Log Message:
added cdbfasta source files

Line File contents
#include "gcdb.h"
#include "GBase.h"
#include "GArgs.h"
#include "ctype.h"
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#ifdef ENABLE_COMPRESSION
#include "gcdbz.h"
#else
// Message reported when a compressed database is requested but this binary
// was built without ENABLE_COMPRESSION.
const char err_COMPRESSION[]="Error: compression requested but not compiled in!\n";
#endif
12
// Program version string printed by the -v option.
#ifdef __WIN32__
#define VERSION "cdbyank version 0.981w"
#else
#define VERSION "cdbyank version 0.981"
#endif

// Command line help text. It is always passed as a "%s" argument (never as a
// format string), so the literal '%' characters in it are safe.
// Written as adjacent string literals so that the indentation of this source
// file can never leak into the text that is printed.
#define USAGE \
  "Usage:\n" \
  "cdbyank <index_file> [-d <fasta_file>] [-a <key>|-n|-l|-s]\n" \
  "[-o <outfile>] [-q <char>|-Q][-F] [-R] [-P] [-x] [-w] \n" \
  "[-z <dbfasta.cdbz>]\n\n" \
  "<index_file> is the index file created previously with cdbfasta\n" \
  "(usually having a \".cidx\" suffix)\n" \
  "-a <key> the sequence name (accession) for a fasta record to be\n" \
  "retrieved; if not given, a list of accessions is expected\n" \
  "at stdin\n" \
  "-d <fasta_file> is the fasta file to pull records from; \n" \
  "if not specified, cdbyank will look in the same directory\n" \
  "where <index_file> resides, for a file with the same name\n" \
  "but without the \".cidx\" suffix\n" \
  "-o the records found are written to file <outfile> instead of stdout\n" \
  "-x allows retrieval of multiple records per key, if the indexed \n" \
  "database had records with the same key (non-unique keys);\n" \
  "(without -x only one record for a given key is retrieved)\n" \
  "-i case insensitive query (expects the <index_file> to have been \n" \
  "created with cdbfasta -i option)\n" \
  "-Q output the query key surrounded by character '%' before the\n" \
  "corresponding record\n" \
  "-q same as -Q but use character <char> instead of '%'\n" \
  "-w enable warnings (sent to stderr) when a key is not found\n" \
  "-F pulls only the defline for each record (discard the sequence)\n" \
  "-P only displays the position(s) (file offset) within the \n" \
  "database file, for the requested record(s)\n" \
  "-R sequence range extraction: expects the input <key(s)> to have \n" \
  "the format: '<seq_name> <start> <end>'\n" \
  "and pulls only the specified sequence range\n" \
  "-z decompress the entire file <dbfasta.cdbz>\n" \
  "(assumes it was built using cdbfasta with '-z' option)\n" \
  "-v show version number and exit\n" \
  "\n" \
  "Index file statistics (no database file needed):\n" \
  "-n display the number of records indexed\n" \
  "-l list all keys stored in <index_file>\n" \
  "-s display indexing summary info\n\n"
55
56 /*
57 -E same as -R but assumes FASTA records have a fixed line length\n\
58 (faster extraction of distant ranges for long records)\n\
59 */
// Diagnostic messages; the ones containing conversions are printf-style
// templates meant to be handed to GError() together with their arguments.
#define ERR_READ          "cdbyank: error reading from file.\n"
#define ERR_READFMT       "cdbyank read error: incorrect file format.\n"
#define ERR_RANGEFMT      "Sequence range parsing error for key '%s'\n"
#define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"

// Records of at most this many bytes (1MB) are loaded into memory in one
// piece; larger ones are streamed character by character.
#define MAX_MEM_RECSIZE (1024*1024)

// Fallback so that O_BINARY can always be OR-ed into the open() flags, even
// when <fcntl.h> does not provide it.
#ifndef O_BINARY
#define O_BINARY 0x0000
#endif
69
70
71 static char* idxfile;
72 static int warnings;
73 bool is_compressed=false;
74 bool defline_only=false;
75 bool rec_pos_only=false;
76 bool use_range=false;
77 bool fixed_linelen=false;
78 bool caseInsensitive=false;
79 bool showQuery=false;
80 char delimQuery='%';
81
82 off_t lastfpos=-1; //to avoid pulling the same record twice in a row..
83
84 FILE* fout=NULL;
85 GCdbRead* cdb=NULL;
86 #ifdef ENABLE_COMPRESSION
87 GCdbz* cdbz=NULL;
88 #endif
89 int fdb=-1;
90 FILE* fz=NULL;
91
// Converts the NUL-terminated string c to lower case, in place.
void inplace_Lower(char* c) {
  for (char* p=c; *p!='\0'; p++) {
    //tolower() is only defined for values representable as unsigned char
    //(or EOF), so a plain char that may be negative must be converted first
    *p=(char)tolower((unsigned char)*p);
  }
}
96
97 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
98 int r;
99 while (len > 0) {
100 r = b->get(buf,len);
101 if (r == -1) GError(ERR_READ);
102 if (r == 0)
103 GError(ERR_READFMT);
104 pos += r;
105 buf += r;
106 len -= r;
107 }
108 }
109
110 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
111 char buf[4];
112 buf_get(b, pos, buf, 4);
113 uint32_unpack(buf,num);
114 }
115
116
117 int fetch_record(char* key, char* dbname, int many, int r_start=0, int r_end=0) {
118 //assumes fdb is open, cdb was created on the index file
119 if (caseInsensitive) inplace_Lower(key);
120 int r=cdb->find(key);
121 if (r==0 && warnings) {
122 GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
123 return 0;
124 }
125 if (r==-1)
126 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
127 while (r>0) {
128 off_t pos = cdb->datapos(); //position of this key's record in the index file
129 unsigned int len=cdb->datalen(); // length of this key's record
130 char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
131 if (cdb->read(bytes,len,pos) == -1)
132 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
133
134 off_t fpos; //this will be the fastadb offset
135 uint32 reclen; //this will be the fasta record offset
136 if (len>8) { //64 bit file offset was used
137 fpos=gcvt_offt(bytes);
138 if (rec_pos_only) {
139 fprintf(fout, "%lld\n", fpos);
140 return 1;
141 }
142 reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
143 }
144 else { //32bit offset used
145 fpos=gcvt_uint(bytes);
146 if (rec_pos_only) {
147 fprintf(fout, "%lld\n", fpos);
148 return 1;
149 }
150 reclen=gcvt_uint(&bytes[sizeof(uint32)]);
151 }
152 //GMessage("reclen=%d\n", reclen);
153
154
155 if (fpos == lastfpos) {
156 if (many) r=cdb->findnext(key, strlen(key));
157 else r=0;
158 continue;
159 }
160 lastfpos=fpos;
161 if (showQuery)
162 fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);
163 if (is_compressed) {
164 #ifdef ENABLE_COMPRESSION
165 //for now: ignore special retrievals, just print the whole record
166 cdbz->decompress(fout, reclen, fpos);
167 if (many) r=cdb->findnext(key, strlen(key));
168 else r=0;
169 #endif
170 continue;
171 }
172 lseek(fdb, fpos, SEEK_SET);
173 if (reclen<=MAX_MEM_RECSIZE) {
174 char* p;
175 GMALLOC(p,reclen+1);
176 //errno=0;
177 r=read(fdb, p, reclen);
178 if (r<=0)
179 GError("cdbyank: Error reading from database file [%s] for %s (returned %d, offset %d) !\n",
180 dbname, idxfile, r, fpos);
181 p[reclen]='\0';
182 //--- now we have the whole record, check if some special options were given:
183 if (defline_only) {
184 char* q=strchr(p,'\n');
185 if (q!=NULL) *q='\0';
186 //skip '>' char
187 fprintf(fout, "%s\n",p+1);
188 }
189 else
190 if (use_range && r_start>0) { //range case
191 if (r_end<=0) r_end=reclen;
192 //extract only a substring of the sequence
193 char* r=strchr(p,'\n');
194 if (r!=NULL) *r='\0'; //now p only has the defline
195 fprintf(fout, "%s\n", p); //output the defline
196 r++;
197 unsigned int recpos=r-p; //p[recpos] MUST be a nucleotide or aminoacid now!
198 int seqpos=0;
199 char linebuf[61];
200 int linelen=0;
201 while (recpos<reclen) {
202 if (isspace(p[recpos])) recpos++; //skip newlines, etc. in the fasta sequence
203 else {
204 seqpos++;
205 if (seqpos>=r_start && seqpos<=r_end) {
206 linebuf[linelen]=p[recpos];
207 linelen++;
208 if (linelen==60 || seqpos==r_end) {
209 linebuf[linelen]='\0';
210 linelen=0;
211 fprintf(fout, "%s\n", linebuf);
212 if (seqpos==r_end) break;
213 }
214 }
215 recpos++;
216 }
217 }//while
218 if (linelen>0) {
219 linebuf[linelen]='\0';
220 linelen=0;
221 fprintf(fout, "%s\n", linebuf);
222 }
223 }
224 else { //not range display
225 fprintf(fout, "%s\n",p);
226 }
227 GFREE(p);
228 } //small record
229 else { //large record, read it char by char and return it as output
230 char c='\0';
231 if (defline_only) {
232 reclen--;
233 read(fdb, &c, 1);
234 }
235 while (reclen-- && read(fdb, &c, 1)==1) {
236 fprintf(fout, "%c", c);
237 if (c=='\n') break;
238 }
239 //defline written
240 if (!defline_only) {
241 int seqpos=1;
242 if (use_range) {
243 while (reclen-- && read(fdb, &c, 1)==1 && seqpos<=r_end) {
244 if (isspace(c)) continue;
245 if (seqpos>=r_start) {
246 int written=seqpos-r_start;
247 if (written && written%60 == 0)
248 fprintf(fout,"\n");
249 fprintf(fout, "%c", c);
250 }
251 seqpos++;
252 }//while
253 } //range case
254 else { //no range, just copy all chars to output
255 while (reclen-- && read(fdb, &c, 1)==1) {
256 fprintf(fout, "%c", c);
257 }
258 }
259 fprintf(fout, "\n");
260 }
261 }
262 if (many) r=cdb->findnext(key, strlen(key));
263 else r=0;
264 }
265 return 1;
266 }
267
268 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
269 //this is messy due to the need of compatibility with the
270 //old 32bit file-length
271 char* dbname=*fnameptr;
272 //read just the tag first: 4 bytes ID
273 lseek(fd, -cdbInfoSIZE, SEEK_END);
274 int r=read(fd, &dbstat, cdbInfoSIZE );
275 if (r!=cdbInfoSIZE) return 2;
276 //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
277 if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
278 //old dbstat structure -- convert it
279 dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
280 dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
281 dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
282 dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
283 //position on the dbnamelen entry
284 dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
285 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
286 lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
287 }
288 else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
289 GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
290 return 1;
291 }
292 else { // new CDBX type:
293 dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
294 dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
295 dbstat.num_records=gcvt_uint(&dbstat.num_records);
296 dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
297 //position on the dbnamelen entry
298 dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
299 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
300 lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
301 }
302
303 GMALLOC(dbname, dbstat.dbnamelen+1);
304 dbname[dbstat.dbnamelen]='\0';
305 r=read(fd, dbname, dbstat.dbnamelen);
306 *fnameptr=dbname;
307 if (r!=dbstat.dbnamelen)
308 return 2;
309 return 0;
310 }
311
// Parses the next whitespace delimited integer from stream f.
//
//   f    stream to read from
//   buf  scratch space receiving the token text; at most 31 characters plus
//        the terminating NUL are stored, so 32 bytes are always sufficient
//   key  key the number belongs to (not used here; kept for error reporting
//        by callers)
//   e    on entry the last character already read from f, on return the
//        character that ended the token (a whitespace character or EOF)
//
// Returns the parsed value (clamped to the range of int), or 0 when the end
// of the line or of the stream is reached before any token starts, or when
// the token is not a number.
int parse_int(FILE* f, char* buf, char* key, int& e) {
  enum { MAX_TOKEN=31 }; //far more than any int needs
  while (e!=EOF && isspace(e)) { //skip any spaces
    if (e=='\n') return 0; //the number must be on the same line
    e=fgetc(f);
  }
  if (e==EOF) return 0;
  //now e is the first non-space
  char* q=buf;
  int stored=0;
  while (e!=EOF && !isspace(e)) {
    //overlong tokens are consumed completely but only their start is kept
    if (stored<MAX_TOKEN) {
      *q++=(char)e;
      stored++;
    }
    e=fgetc(f);
  }
  *q='\0';
  //now the file pointer is on the first space after the parsed value
  long v=strtol(buf, NULL, 10); //unlike atoi(), well defined on overflow
  if (v>INT_MAX) return INT_MAX;
  if (v<INT_MIN) return INT_MIN;
  return (int)v;
}
331
// Parses the next whitespace delimited integer from the string at f.
//
//   f    current position in the string; advanced past the token
//   key  key the number belongs to (not used here; kept for error reporting
//        by callers)
//   e    on entry the character at *f, on return the character that ended
//        the token (a whitespace character or '\0')
//
// Returns the parsed value (clamped to the range of int), or 0 when a
// newline or the end of the string is reached before any token starts, or
// when the token is not a number.
int parse_int(char*& f, char* key, int& e) {
  char buf[16];
  //e may hold a negative char value; isspace() needs it as unsigned char
  while (e!='\0' && isspace((unsigned char)e)) { //skip any spaces
    if (e=='\n') return 0;
    f++;
    e=*f;
  }
  if (e=='\0') return 0;
  //now e is the first non-space char
  size_t stored=0;
  while (e!='\0' && !isspace((unsigned char)e)) {
    //overlong tokens are skipped completely but only what fits is kept
    if (stored<sizeof(buf)-1) buf[stored++]=(char)e;
    f++;
    e=*f;
  }
  buf[stored]='\0';
  //now f and e are on the first space after the parsed value (or '\0')
  long v=strtol(buf, NULL, 10); //unlike atoi(), well defined on overflow
  if (v>INT_MAX) return INT_MAX;
  if (v<INT_MIN) return INT_MIN;
  return (int)v;
}
356
357 #ifdef ENABLE_COMPRESSION
358
359 GCdbz* openCdbz(char* p) {
360 //in case this was not done before:
361 gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
362 FILE* zf=fopen(p, "rb");
363 if (zf==NULL) {
364 GMessage("Error: cannot open compressed file '%s'!\n",p);
365 return NULL;
366 }
367 //check if the file is valid and read the length of the first record
368 //
369 char ztag[5];
370 ztag[4]='\0';
371 if (fread(ztag, 1, 4, zf)<4) {
372 GMessage("Error reading header of compressed file '%s'\n",p);
373 return NULL;
374 }
375 if (strcmp(ztag, "CDBZ")!=0) {
376 GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
377 return NULL;
378 }
379 unsigned int zrecsize;
380 if (fread((void*) &zrecsize,1,4,zf)<4) {
381 GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
382 return NULL;
383 }
384 zrecsize=gcvt_uint(&zrecsize);
385 return new GCdbz(zf, true, zrecsize);
386 }
387 #endif
388
389 int main(int argc, char **argv) {
390 char namebuf[1024];
391 int r_start, r_end;
392 char* p;
393 char* dbname=NULL;
394 int result=0;
395 int r=0;
396 cdbInfo dbstat;
397 dbstat.dbsize=0;
398 GArgs args(argc, argv, "a:d:o:z:q:nlsxwvFREiPQ");
399 int e=args.isError();
400 if (e>0)
401 GError("%s Invalid argument: %s\n", USAGE, argv[e]);
402 if (args.getOpt('v')!=NULL) {
403 printf("%s\n",VERSION);
404 return 0;
405 }
406 char* outfile=(char*)args.getOpt('o');
407 if (outfile!=NULL) {
408 if ((fout=fopen(outfile, "wb"))==NULL)
409 GError("Cannot create file '%s'!", outfile);
410 }
411 else fout=stdout;
412
413 if ((p=(char*)args.getOpt('z'))!=NULL) { //simply stream-decompress cdbz
414 #ifndef ENABLE_COMPRESSION
415 GError(err_COMPRESSION);
416 #else
417 GCdbz* cdbz=openCdbz(p);
418 if (cdbz==NULL)
419 GError("Error opening the cdbz file '%s'\n");
420 FILE* zf=cdbz->getZFile();
421 int numrecs=0;
422 int xcode;
423 while ((xcode=cdbz->decompress(fout))>0) numrecs++;
424 delete cdbz;
425 fclose(zf);
426 #endif
427 return 0;
428 }
429 int numfiles = args.startNonOpt();
430 if (numfiles==0)
431 GError("%s Error: an index file must be provided !\n", USAGE);
432 idxfile=(char*)args.nextNonOpt(); //first fasta file given
433 char* key=(char*)args.getOpt('a');
434
435 defline_only=(args.getOpt('F')!=NULL);
436 rec_pos_only=(args.getOpt('P')!=NULL);
437 showQuery=(args.getOpt('Q')!=NULL);
438 const char* q;
439 if ((q=args.getOpt('q'))!=NULL) {
440 delimQuery=*q;
441 showQuery=true;
442 }
443 use_range=((args.getOpt('R')!=NULL) || (args.getOpt('E')!=NULL));
444 fixed_linelen=(args.getOpt('E')!=NULL);
445 caseInsensitive=(args.getOpt('i')!=NULL);
446 /*is_compressed=((args.getOpt('Z')!=NULL) ||
447 (strstr(idxfile,".cidxz")!=NULL));*/
448 int listQuery=(args.getOpt('l')!=NULL);
449 warnings=(args.getOpt('w')!=NULL);
450 int dataQuery=(!listQuery && args.getOpt('n')==NULL
451 && args.getOpt('l')==NULL &&args.getOpt('s')==NULL);
452 //exclude the possibility of index-only stats query
453 dbname=(char*)args.getOpt('d');
454 int fd;
455 cdb=new GCdbRead(idxfile);
456 fd=cdb->getfd();
457 char* info_dbname=NULL;
458 off_t db_size=0;
459 dbstat.dbsize=0;
460
461 r=read_dbinfo(fd, &info_dbname, dbstat);
462 lseek(fd, 0, SEEK_SET);
463 if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
464 else if (r==2)
465 GError("Error reading info chunk!\n");
466 if (dataQuery) {
467 //--------------- DB QUERY MODE: (always read the cdb stored info!)
468 /*try to find the database file
469 rules: if given, only the -d given filename is used
470 otherwise:
471 1) the same directory with the given index file(stripping the suffix)
472 2) the dbstat filepath/name stored by cdbfasta
473 */
474
475 if (!rec_pos_only && dbname==NULL) { // no -d database given, find it
476 // 1) try to rip the suffix:
477 p = rstrchr(idxfile, '.');
478 if (p!=NULL) {
479 /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
480 given, so the database file can be located in the same directory \n\
481 by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
482 int nlen=p-idxfile;
483 strncpy(namebuf, idxfile, nlen);
484 namebuf[nlen]='\0';
485 if (fileExists(namebuf))
486 dbname=namebuf;
487 }
488 // 2) try the stored dbstat name
489 if (dbname==NULL) {
490 if (fileExists(info_dbname)) dbname=info_dbname;
491 else GError("Cannot locate the database file for this index\n");
492 }
493 }
494 if (!rec_pos_only) {
495 if (!is_compressed) {
496 if (r==0 && (dbstat.idxflags & CDBMSK_OPT_COMPRESS))
497 is_compressed=true;
498 }
499 if (is_compressed) {
500 //try to open the dbname as a compressed file
501 #ifndef ENABLE_COMPRESSION
502 GError(err_COMPRESSION);
503 #endif
504 fz=fopen(dbname, "rb");
505 }
506 else fdb=open(dbname, O_RDONLY|O_BINARY);
507 if (fdb==-1 && fz==NULL)
508 GError("Error: cannot open database file %s\n",dbname);
509 if (is_compressed) {
510 fclose(fz);//just to start fresh here
511 if (use_range)
512 GError("Error: cannot use range extraction with compressed records, sorry.\n");
513 if (defline_only)
514 GError("Error: cannot use defline-only retrieval with compressed records (sorry).\n");
515 //determine size:
516 int ftmp = open(dbname, O_RDONLY|O_BINARY);
517 if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
518 struct stat fdbstat;
519 fstat(ftmp, &fdbstat);
520 db_size=fdbstat.st_size;
521 close(ftmp);
522 //-------- reopen here
523 #ifdef ENABLE_COMPRESSION
524 cdbz=openCdbz(dbname);
525 if (cdbz==NULL)
526 GError("Error opening the cdbz file '%s'\n");
527 fz=cdbz->getZFile();
528 #endif
529 }
530 else {
531 struct stat fdbstat;
532 if (stat(dbname, &fdbstat)!=0) {
533 perror("stat()");
534 exit(1);
535 }
536 db_size=fdbstat.st_size;
537 }
538 //abort if the database size was read and it doesn't match the cdbfasta stored size
539 if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
540 GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
541 fdb, dbstat.dbsize, db_size, dbname);
542 }
543 int many=(args.getOpt('x')!=NULL);
544 int keypos=0;
545 if (key==NULL) { //key not given
546 GMALLOC(key, 2048);
547 //get the keys at stdin
548 if (use_range) {
549 //expects the key and its sequence range on a single line!
550 while ((e=fgetc(stdin)) != EOF) {
551 if (isspace(e)) { //word end, close it
552 key[keypos]='\0';
553 if (keypos==0) continue;
554 r_start=parse_int(stdin, &key[keypos+1], key, e);
555 if (r_start<=0) GError(ERR_RANGEFMT, key);
556 //if (e==EOF || e=='\n') GError(ERR_RANGEFMT, key);
557 r_end=0;
558 r_end=parse_int(stdin, &key[keypos+1], key, e);
559 //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key);
560 fetch_record(key, dbname, many, r_start, r_end);
561 //if (rec_pos_only) break;
562 if (e==EOF) break;
563 keypos=0;
564 }
565 else { //extend the key string
566 key[keypos]=e;
567 keypos++;
568 }
569 } //while
570 } //range case
571 else { //no range, accept any space delimiter
572 while ((e=fgetc(stdin)) != EOF) {
573 if (isspace(e)) { //word end, close it
574 key[keypos]='\0';
575 fetch_record(key, dbname, many);
576 //if (rec_pos_only) break;
577 keypos=0;
578 }
579 else { //extend the key string
580 key[keypos]=e;
581 keypos++;
582 }
583 } //while
584 }
585 GFREE(key);
586 } //stdin case
587 else { //key given already on command line
588 //get only the first word of it:
589 size_t keylen=strlen(key);
590 p=key; while (!isspace(*p) && *p!='\0') p++;
591 if (*p!='\0') *p='\0';
592 if (use_range) {
593 //parse the range from the query string
594 if (keylen==strlen(p)) GError(ERR_RANGEFMT, key);
595 p++;e=*p;
596 r_start=parse_int(p, key, e);
597 if (r_start<=0) GError(ERR_RANGEFMT, key);
598 //if (e=='\0' || e=='\n') GError(ERR_RANGEFMT, key);
599 r_end=parse_int(p, key, e);
600 //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key);
601 }
602 else {
603 r_start=0;
604 r_end=0;
605 }
606 if (fetch_record(key, dbname, many, r_start, r_end)==0)
607 result=1; //the only key given not found
608 }
609 //end data query:
610 if (!rec_pos_only) {
611 if (is_compressed) {
612 fclose(fz);
613 #ifdef ENABLE_COMPRESSION
614 delete cdbz;
615 #endif
616 }
617 else close(fdb);
618 }
619 if (fout!=NULL) fclose(fout);
620 }
621 //--------------- INDEX ONLY QUERY MODE:
622 else { //index query mode: just retrieve some statistics or key names
623 if (listQuery) { //request for list keys
624 uint32 eod;
625 uint32 pos=0;
626 uint32 klen;
627 uint32 dlen;
628 char* bufspace;
629 GMALLOC(bufspace, GCDBUFFER_INSIZE);
630 GCDBuffer* readbuf=new GCDBuffer((opfunc)&read,
631 fd, bufspace, GCDBUFFER_INSIZE);
632
633 buf_getnum(readbuf, pos, &eod);
634 GMALLOC(key, 1024); //!!! hopefully we don't have keys larger than that
635 while (pos < 2048)
636 buf_getnum(readbuf, pos, &dlen);
637 while (pos < eod) {
638 buf_getnum(readbuf, pos,&klen);
639 buf_getnum(readbuf, pos,&dlen);
640 //read key:
641 buf_get(readbuf, pos, key, klen);
642 key[klen]='\0';
643 printf("%s\n", key);
644 //read data (and ignore it)
645 //assume that data is always shorter than 1K (should be just 4 bytes)
646 buf_get(readbuf, pos, key, dlen);
647 }
648 GFREE(key);
649 GFREE(bufspace);
650 delete readbuf;
651 }
652 else { //dig up the info written at the end of the database file
653 if (args.getOpt('n')!=NULL) {
654 printf("%d\n",dbstat.num_records);
655 }
656 else {//must be -s
657 printf("-= Indexing information: =-\n");
658 printf("Number of records:%12d\n", dbstat.num_records);
659 printf("Number of keys :%12d\n", dbstat.num_keys);
660 if (dbstat.idxflags & CDBMSK_OPT_COMPRESS)
661 printf("Database records are compressed.\n");
662 if (dbstat.idxflags & CDBMSK_OPT_MULTI)
663 printf("Index was built with \"multi-key\" option enabled.\n");
664 if (dbstat.idxflags & CDBMSK_OPT_C)
665 printf("Index was built with \"shortcut keys\" only.\n");
666 else if (dbstat.idxflags & CDBMSK_OPT_CADD)
667 printf("The index was built with full keys and \"shortcut keys\".\n");
668 printf("Database file: %s\n", info_dbname);
669 printf("Database size: %lld bytes\n", dbstat.dbsize);
670 }
671 }
672 }
673 GFREE(info_dbname);
674 delete cdb;
675 close(fd);
676 //getc(stdin);
677 return result;
678 }