ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GCdbYank.cpp
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (7 years, 8 months ago) by gpertea
File size: 15452 byte(s)
Log Message:
sync with local source

Line File contents
1 #include "GCdbYank.h"
2 #include "GBase.h"
3 #include <ctype.h>
4
5 #define ERR_READ "GCdbYank: error reading from file.\n"
6 #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7 #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8 #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9 // 1MB memory buffer:
10 #define MAX_MEM_RECSIZE 1048576
11 #ifndef O_BINARY
12 #define O_BINARY 0x0000
13 #endif
14
15 //default size of the index record for records stored with 32bit offsets
16 uint32 irec_size32=8;
17
18 #ifdef ENABLE_COMPRESSION
19
20 GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
21 zrecsize=-1;
22 zpos=0;
23 recdelim=Gstrdup(r_delim);
24 zf=azf;
25 decomp_start(zrsize);
26 chrhandler=new GFastaCharHandler(recdelim);
27 }
28
29 GCdbZFasta::~GCdbZFasta() {
30 //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
31 // FULL_FLUSH method instead of finish
32 delete chrhandler;
33 GFREE(recdelim);
34 decomp_end();
35 }
36
37 void GCdbZFasta::decomp_start(int zrsize) {
38 zstream.zalloc = (alloc_func)0;
39 zstream.zfree = (free_func)0;
40 zstream.opaque = (voidpf)0;
41 zstream.next_in = (Bytef*)sbuf;
42 zstream.avail_in = 0;
43 zstream.next_out = (Bytef*)lbuf;
44 int err = inflateInit(&zstream);
45 if (err!=Z_OK)
46 GMessage("Error at inflateInit()\n");
47 //-- now read and discard the first record, so we can use random access later
48 // (needed by zlib)
49 int bytes_read=fread(sbuf, 1, zrsize, zf);
50 if (bytes_read<zrsize)
51 GError("Error reading 1st record from zrec file\n");
52 zstream.next_in = (Bytef*)sbuf;
53 zstream.avail_in = bytes_read;
54 //decompress first chunk
55 zstream.next_out = (Bytef*)lbuf;
56 zstream.avail_out = GCDBZ_LBUF_LEN;
57 err = inflate(&zstream, Z_SYNC_FLUSH);
58 if (err !=Z_OK && err!=Z_STREAM_END)
59 GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
60 }
61
62 void GCdbZFasta::decomp_end() {
63 int err = inflateEnd(&zstream);
64 if (err!=Z_OK)
65 GError("Error at inflateEnd() (err=%d)\n", err);
66 }
67
68
69 //fasta record decompress
70 //returns: the number of bytes decompressed
71 int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
72 if (zfofs>=0) {
73 if (fseeko(zf, zfofs, 0))
74 GError("GCdbZFasta::decompress: error fseeko() to %d\n", zfofs);
75 }
76 else
77 if (feof(zf)) return 0;
78 bool in_rec=true;
79 int err=0;
80 int total_read=0;
81 int total_written=0;
82 chrhandler->init(&rec, seqCallBack);
83 while (in_rec) { // read loop
84 int to_read=0;
85 int bytes_read=0;
86 if (csize<=0) { //read one byte at a time
87 to_read=1;
88 int c;
89 if ((c =fgetc(zf))!=EOF) {
90 bytes_read = 1;
91 sbuf[0]=c;
92 }
93 else {
94 //bytes_read=0;
95 return 0; //eof
96 }
97 total_read+=bytes_read;
98 }
99 else {
100 to_read = csize-total_read>GCDBZ_SBUF_LEN ?
101 GCDBZ_SBUF_LEN : csize-total_read;
102 // check for csize vs bytes_read match:
103 if (to_read==0) return 0;
104 bytes_read=fread(sbuf, 1, to_read, zf);
105 if (bytes_read!=to_read)
106 GError("Error reading from zrec file\n");
107 total_read+=bytes_read;
108 in_rec=(total_read<csize);
109 }
110 if (bytes_read==0) {
111 //GMessage("bytes_read = 0\n");
112 return 0;
113 }
114 if (in_rec && bytes_read<to_read) in_rec=false;
115 zstream.next_in = (Bytef*)sbuf;
116 zstream.avail_in = bytes_read;
117
118 do { //decompression loop
119 zstream.next_out = (Bytef*)lbuf;
120 zstream.avail_out = GCDBZ_LBUF_LEN;
121 uLong t_out=zstream.total_out;
122 err = inflate(&zstream, Z_SYNC_FLUSH);
123 uLong toWrite=zstream.total_out-t_out;
124 if (toWrite>0) {
125 /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
126 GError("Error writing inflated chunk!\n");
127 } */
128 for (unsigned int i=0;i<toWrite;i++)
129 chrhandler->processChar(lbuf[i]);
130
131 total_written+=toWrite;
132 }
133 if (err==Z_STREAM_END) {
134 in_rec=false;
135 if (total_written==0) {
136 GMessage("Z_STREAM_END found but total_written=0!\n");
137 }
138 break;
139 }
140 else if (err !=Z_OK)
141 GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
142 } while (zstream.avail_in!=0); //decompression loop
143 } //read loop
144 chrhandler->done();
145 /*if (err!=Z_STREAM_END) {
146 GError("decompress: Z_STREAM_END not found!\n");
147 }*/
148 return total_written;
149 }
150
151 GCdbZFasta* GCdbYank::openCdbz(char* p) {
152 //in case this was not done before:
153 gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
154 FILE* zf=fopen(p, "rb");
155 if (zf==NULL) {
156 GMessage("Error: cannot open compressed file '%s'!\n",p);
157 return NULL;
158 }
159 //check if the file is valid and read the length of the first record
160 //
161 char ztag[5];
162 ztag[4]='\0';
163 if (fread(ztag, 1, 4, zf)<4) {
164 GMessage("Error reading header of compressed file '%s'\n",p);
165 return NULL;
166 }
167 if (strcmp(ztag, "CDBZ")!=0) {
168 GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
169 return NULL;
170 }
171 unsigned int zrecsize;
172 if (fread((void*) &zrecsize,1,4,zf)<4) {
173 GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
174 return NULL;
175 }
176 zrecsize=gcvt_uint(&zrecsize);
177 return new GCdbZFasta(zf, zrecsize, recdelim);
178 }
179 #else
180 static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
181 #endif
182 //decompression stuff
183
184
185 void inplace_Lower(char* c) {
186 char *p=c;
187 while (*p!='\0') { *p=tolower(*p);p++; }
188 }
189
190 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
191 int r;
192 while (len > 0) {
193 r = b->get(buf,len);
194 if (r == -1) GError(ERR_READ);
195 if (r == 0)
196 GError(ERR_READFMT);
197 pos += r;
198 buf += r;
199 len -= r;
200 }
201 }
202
203 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
204 char buf[4];
205 buf_get(b, pos, buf, 4);
206 uint32_unpack(buf,num);
207 }
208
209
210 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
211 //this is messy due to the need of compatibility with the
212 //old 32bit file-length
213 char* dbname=*fnameptr;
214 //read just the tag first: 4 bytes ID
215 lseek(fd, -cdbInfoSIZE, SEEK_END);
216 int r=read(fd, &dbstat, cdbInfoSIZE );
217 if (r!=cdbInfoSIZE) return 2;
218 //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
219 if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
220 //old dbstat structure -- convert it
221 dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
222 dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
223 dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
224 dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
225 //position on the dbnamelen entry
226 dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
227 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
228 lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
229 }
230 else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
231 GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
232 return 1;
233 }
234 else { // new CDBX type:
235 dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
236 dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
237 dbstat.num_records=gcvt_uint(&dbstat.num_records);
238 dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
239 //position on the dbnamelen entry
240 dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
241 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
242 lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
243 }
244
245 GMALLOC(dbname, dbstat.dbnamelen+1);
246 dbname[dbstat.dbnamelen]='\0';
247 r=read(fd, dbname, dbstat.dbnamelen);
248 *fnameptr=dbname;
249 if (r!=dbstat.dbnamelen)
250 return 2;
251 return 0;
252 }
253
254 int parse_int(char* buf, const char* key, int& e) {
255 char* p, *q;
256 while (e!=EOF && isspace(e)) { //skip any spaces
257 if (e=='\n') GError(ERR_RANGEFMT, key);
258 e=fgetc(stdin);
259 }
260 if (e==EOF) GError(ERR_RANGEFMT, key);
261 //now e is the first non-space
262 p=buf;
263 q=p;
264 while (e!=EOF && !isspace(e)) {
265 *q=e;
266 q++;
267 e=fgetc(stdin);
268 }
269 *q='\0'; //now p is the starting coordinate string
270 return atoi(p);
271 //now the file pointer should be on the first space after the parsed value
272 }
273
274 int parse_int(char*& f, const char* key, int& e) {
275 char* p, *q;
276 char buf[16];
277 while (e!='\0' && isspace(e)) { //skip any spaces
278 if (e=='\n') GError(ERR_RANGEFMT, key);
279 f++;
280 e=*f;
281 }
282 if (e=='\0') GError(ERR_RANGEFMT, key);
283 //now e is the first non-space char
284 p=buf;
285 q=p;
286 while (e!='\0' && !isspace(e)) {
287 *q=e;
288 q++;
289 f++;
290 e=*f;
291 }
292 *q='\0';
293 return atoi(p);
294 //now f and e should be on the first space after the parsed value (or '\0')
295 }
296
297 GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
298 is_compressed=false;
299 fd=-1;
300 cdb=NULL;
301 #ifdef ENABLE_COMPRESSION
302 cdbz=NULL;
303 #endif
304 fdb=-1;
305 fz=NULL;
306 dbname=NULL;
307 recdelim=Gstrdup(recsep);
308 if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
309 idxfile=Gstrdup(fidx);
310 cdb=new GCdbRead(idxfile);
311 fd=cdb->getfd();
312 db_size=0;
313 dbstat.dbsize=0;
314 info_dbname=NULL;
315 int r=read_dbinfo(fd, &info_dbname, dbstat);
316 lseek(fd, 0, SEEK_SET);
317 if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
318 else if (r==2)
319 GError("Error reading info chunk!\n");
320 /*try to find the database file
321 rules: if given, only the -d given filename is used
322 otherwise:
323 1) the same directory with the given index file(stripping the suffix)
324 2) the dbstat filepath/name stored by cdbfasta
325 */
326 if (dbname==NULL) {
327 char* p = rstrchr(idxfile, '.');
328 if (p!=NULL) {
329 /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
330 given, so the database file can be located in the same directory \n\
331 by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
332 int nlen=p-idxfile;
333 char* namebuf=NULL;
334 GMALLOC(namebuf, nlen+1);
335 strncpy(namebuf, idxfile, nlen);
336 namebuf[nlen]='\0';
337 if (fileExists(namebuf))
338 dbname=namebuf;
339
340 } // strip the index file extenstion
341 // 2) try the stored dbstat name
342 if (dbname==NULL) {
343 if (fileExists(info_dbname)) dbname=info_dbname;
344 else GError("Cannot locate the database file for this index\n");
345 }
346 }// database name not given
347 is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
348 if (is_compressed)
349 //try to open the dbname as a compressed file
350 fz=fopen(dbname, "rb");
351 else fdb=open(dbname, O_RDONLY|O_BINARY);
352 if (fdb==-1 && fz==NULL)
353 GError("Error: cannot open database file %s\n",dbname);
354 if (is_compressed) {
355 #ifndef ENABLE_COMPRESSION
356 GError(err_COMPRESSED);
357 #else
358 fclose(fz);//just to start fresh here
359 //determine size:
360 int ftmp = open(dbname, O_RDONLY|O_BINARY);
361 if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
362 struct stat fdbstat;
363 fstat(ftmp, &fdbstat);
364 db_size=fdbstat.st_size;
365 close(ftmp);
366 //-------- reopen here
367 cdbz=openCdbz(dbname);
368 if (cdbz==NULL)
369 GError("Error opening the cdbz file '%s'\n");
370 fz=cdbz->getZFile();
371 #endif
372 }
373 else {
374 struct stat fdbstat;
375 if (stat(dbname, &fdbstat)!=0) {
376 perror("stat()");
377 exit(1);
378 }
379 db_size=fdbstat.st_size;
380 }
381 //abort if the database size was read and it doesn't match the cdbfasta stored size
382 if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
383 GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
384 fdb, dbstat.dbsize, db_size, dbname);
385 fastahandler=new GFastaCharHandler(recdelim);
386 } //* GCdbYank constructor *//
387
388 GCdbYank::~GCdbYank() {
389 if (is_compressed) {
390 fclose(fz);
391 #ifdef ENABLE_COMPRESSION
392 delete cdbz;
393 #endif
394 }
395 else close(fdb);
396 GFREE(info_dbname);
397 delete fastahandler;
398 GFREE(recdelim);
399 GFREE(dbname);
400 GFREE(idxfile);
401 delete cdb;
402 close(fd);
403 }
404
405
406 int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
407 //assumes fdb is open, cdb was created on the index file
408 int r=cdb->find(key);
409 if (r==0) return 0;
410 if (r==-1)
411 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
412 /* while (r>0) { */
413 off_t pos = cdb->datapos(); //position of this key's record in the index file
414 unsigned int len=cdb->datalen(); // length of this key's record
415 char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
416 if (cdb->read(bytes,len,pos) == -1)
417 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
418
419 off_t fpos; //this will be the fastadb offset
420 uint32 reclen; //this will be the fasta record offset
421 if (len>irec_size32) { //64 bit file offset was used
422 fpos=gcvt_offt(bytes);
423 reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
424 }
425 else { //32bit offset used
426 fpos=gcvt_uint(bytes);
427 reclen=gcvt_uint(&bytes[sizeof(uint32)]);
428 }
429 //GMessage("reclen=%d\n", reclen);
430
431 /* if (showQuery)
432 fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
433 /*========= FETCHING RECORD CONTENT ======= */
434 if (is_compressed) {
435 //for now: ignore special retrievals, just print the whole record
436 #ifdef ENABLE_COMPRESSION
437 return cdbz->decompress(rec, reclen, fpos, seqCallBack);
438 #else
439 GError(err_COMPRESSED);
440 #endif
441 }
442 else { // not compressed -- position into the file and build an ad hoc GFastaFile
443 lseek(fdb, fpos, SEEK_SET);
444 // read it char by char and return it as output
445 char c='\0';
446 int charsread=0;
447 fastahandler->init(&rec, seqCallBack);
448 while (reclen-- && read(fdb, &c, 1)==1) {
449 fastahandler->processChar(c);
450 charsread++;
451 }
452 fastahandler->done();
453 return charsread;
454 } // not compressed
455 /* if (many) r=cdb->findnext(key, strlen(key));
456 else r=0;
457 } */
458 return 0;
459 }
460
461 off_t GCdbYank::getRecordPos(const char* key, uint32* record_len) {
462 //assumes fdb is open, cdb was created on the index file
463 int r=cdb->find(key);
464 if (r==0 && warnings) {
465 GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
466 return -1;
467 }
468 if (r==-1)
469 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
470 off_t pos = cdb->datapos(); //position of this key's record in the index file
471 unsigned int len=cdb->datalen(); // length of this key's record
472 char bytes[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
473 if (cdb->read(bytes,len,pos) == -1)
474 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
475
476 off_t fpos; //this will be the fastadb offset
477 uint32 rlen; //this will be the fasta record length
478 if (len>irec_size32) { //64 bit file offset was used
479 fpos=gcvt_offt(bytes);
480 rlen=gcvt_uint(&bytes[offsetof(CIdxData, reclen)]);
481 }
482 else { //32bit offset used
483 fpos=gcvt_uint(bytes);
484 rlen=gcvt_uint(&bytes[offsetof(CIdxData32, reclen)]);
485 }
486 if (record_len!=NULL) *record_len=rlen;
487 return fpos;
488 }