ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GCdbYank.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 3 months ago) by gpertea
File size: 15174 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #include "GCdbYank.h"
2 #include "GBase.h"
3 #include <ctype.h>
4
5 #define ERR_READ "GCdbYank: error reading from file.\n"
6 #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7 #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8 #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9 // 1MB memory buffer:
10 #define MAX_MEM_RECSIZE 1048576
11 #ifndef O_BINARY
12 #define O_BINARY 0x0000
13 #endif
14
15
16 #ifdef ENABLE_COMPRESSION
17
18 GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
19 zrecsize=-1;
20 zpos=0;
21 recdelim=Gstrdup(r_delim);
22 zf=azf;
23 decomp_start(zrsize);
24 chrhandler=new GFastaCharHandler(recdelim);
25 }
26
27 GCdbZFasta::~GCdbZFasta() {
28 //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
29 // FULL_FLUSH method instead of finish
30 delete chrhandler;
31 GFREE(recdelim);
32 decomp_end();
33 }
34
35 void GCdbZFasta::decomp_start(int zrsize) {
36 zstream.zalloc = (alloc_func)0;
37 zstream.zfree = (free_func)0;
38 zstream.opaque = (voidpf)0;
39 zstream.next_in = (Bytef*)sbuf;
40 zstream.avail_in = 0;
41 zstream.next_out = (Bytef*)lbuf;
42 int err = inflateInit(&zstream);
43 if (err!=Z_OK)
44 GMessage("Error at inflateInit()\n");
45 //-- now read and discard the first record, so we can use random access later
46 // (needed by zlib)
47 int bytes_read=fread(sbuf, 1, zrsize, zf);
48 if (bytes_read<zrsize)
49 GError("Error reading 1st record from zrec file\n");
50 zstream.next_in = (Bytef*)sbuf;
51 zstream.avail_in = bytes_read;
52 //decompress first chunk
53 zstream.next_out = (Bytef*)lbuf;
54 zstream.avail_out = GCDBZ_LBUF_LEN;
55 err = inflate(&zstream, Z_SYNC_FLUSH);
56 if (err !=Z_OK && err!=Z_STREAM_END)
57 GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
58 }
59
60 void GCdbZFasta::decomp_end() {
61 int err = inflateEnd(&zstream);
62 if (err!=Z_OK)
63 GError("Error at inflateEnd() (err=%d)\n", err);
64 }
65
66
67 //fasta record decompress
68 //returns: the number of bytes decompressed
69 int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
70 if (zfofs>=0) {
71 if (fseek(zf, zfofs, 0))
72 GError("GCdbZFasta::decompress: error fseek() to %d\n", zfofs);
73 }
74 else
75 if (feof(zf)) return 0;
76 bool in_rec=true;
77 int err=0;
78 int total_read=0;
79 int total_written=0;
80 chrhandler->init(&rec, seqCallBack);
81 while (in_rec) { // read loop
82 int to_read=0;
83 int bytes_read=0;
84 if (csize<=0) { //read one byte at a time
85 to_read=1;
86 int c;
87 if ((c =fgetc(zf))!=EOF) {
88 bytes_read = 1;
89 sbuf[0]=c;
90 }
91 else {
92 //bytes_read=0;
93 return 0; //eof
94 }
95 total_read+=bytes_read;
96 }
97 else {
98 to_read = csize-total_read>GCDBZ_SBUF_LEN ?
99 GCDBZ_SBUF_LEN : csize-total_read;
100 // check for csize vs bytes_read match:
101 if (to_read==0) return 0;
102 bytes_read=fread(sbuf, 1, to_read, zf);
103 if (bytes_read!=to_read)
104 GError("Error reading from zrec file\n");
105 total_read+=bytes_read;
106 in_rec=(total_read<csize);
107 }
108 if (bytes_read==0) {
109 //GMessage("bytes_read = 0\n");
110 return 0;
111 }
112 if (in_rec && bytes_read<to_read) in_rec=false;
113 zstream.next_in = (Bytef*)sbuf;
114 zstream.avail_in = bytes_read;
115
116 do { //decompression loop
117 zstream.next_out = (Bytef*)lbuf;
118 zstream.avail_out = GCDBZ_LBUF_LEN;
119 uLong t_out=zstream.total_out;
120 err = inflate(&zstream, Z_SYNC_FLUSH);
121 uLong toWrite=zstream.total_out-t_out;
122 if (toWrite>0) {
123 /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
124 GError("Error writing inflated chunk!\n");
125 } */
126 for (unsigned int i=0;i<toWrite;i++)
127 chrhandler->processChar(lbuf[i]);
128
129 total_written+=toWrite;
130 }
131 if (err==Z_STREAM_END) {
132 in_rec=false;
133 if (total_written==0) {
134 GMessage("Z_STREAM_END found but total_written=0!\n");
135 }
136 break;
137 }
138 else if (err !=Z_OK)
139 GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
140 } while (zstream.avail_in!=0); //decompression loop
141 } //read loop
142 chrhandler->done();
143 /*if (err!=Z_STREAM_END) {
144 GError("decompress: Z_STREAM_END not found!\n");
145 }*/
146 return total_written;
147 }
148
149 GCdbZFasta* GCdbYank::openCdbz(char* p) {
150 //in case this was not done before:
151 gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
152 FILE* zf=fopen(p, "rb");
153 if (zf==NULL) {
154 GMessage("Error: cannot open compressed file '%s'!\n",p);
155 return NULL;
156 }
157 //check if the file is valid and read the length of the first record
158 //
159 char ztag[5];
160 ztag[4]='\0';
161 if (fread(ztag, 1, 4, zf)<4) {
162 GMessage("Error reading header of compressed file '%s'\n",p);
163 return NULL;
164 }
165 if (strcmp(ztag, "CDBZ")!=0) {
166 GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
167 return NULL;
168 }
169 unsigned int zrecsize;
170 if (fread((void*) &zrecsize,1,4,zf)<4) {
171 GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
172 return NULL;
173 }
174 zrecsize=gcvt_uint(&zrecsize);
175 return new GCdbZFasta(zf, zrecsize, recdelim);
176 }
177 #else
178 static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
179 #endif
180 //decompression stuff
181
182
183 void inplace_Lower(char* c) {
184 char *p=c;
185 while (*p!='\0') { *p=tolower(*p);p++; }
186 }
187
188 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
189 int r;
190 while (len > 0) {
191 r = b->get(buf,len);
192 if (r == -1) GError(ERR_READ);
193 if (r == 0)
194 GError(ERR_READFMT);
195 pos += r;
196 buf += r;
197 len -= r;
198 }
199 }
200
201 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
202 char buf[4];
203 buf_get(b, pos, buf, 4);
204 uint32_unpack(buf,num);
205 }
206
207
208 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
209 //this is messy due to the need of compatibility with the
210 //old 32bit file-length
211 char* dbname=*fnameptr;
212 //read just the tag first: 4 bytes ID
213 lseek(fd, -cdbInfoSIZE, SEEK_END);
214 int r=read(fd, &dbstat, cdbInfoSIZE );
215 if (r!=cdbInfoSIZE) return 2;
216 //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
217 if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
218 //old dbstat structure -- convert it
219 dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
220 dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
221 dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
222 dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
223 //position on the dbnamelen entry
224 dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
225 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
226 lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
227 }
228 else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
229 GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
230 return 1;
231 }
232 else { // new CDBX type:
233 dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
234 dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
235 dbstat.num_records=gcvt_uint(&dbstat.num_records);
236 dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
237 //position on the dbnamelen entry
238 dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
239 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
240 lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
241 }
242
243 GMALLOC(dbname, dbstat.dbnamelen+1);
244 dbname[dbstat.dbnamelen]='\0';
245 r=read(fd, dbname, dbstat.dbnamelen);
246 *fnameptr=dbname;
247 if (r!=dbstat.dbnamelen)
248 return 2;
249 return 0;
250 }
251
252 int parse_int(FILE* f, char* buf, const char* key, int& e) {
253 char* p, *q;
254 while (e!=EOF && isspace(e)) { //skip any spaces
255 if (e=='\n') GError(ERR_RANGEFMT, key);
256 e=fgetc(stdin);
257 }
258 if (e==EOF) GError(ERR_RANGEFMT, key);
259 //now e is the first non-space
260 p=buf;
261 q=p;
262 while (e!=EOF && !isspace(e)) {
263 *q=e;
264 q++;
265 e=fgetc(stdin);
266 }
267 *q='\0'; //now p is the starting coordinate string
268 return atoi(p);
269 //now the file pointer should be on the first space after the parsed value
270 }
271
272 int parse_int(char*& f, const char* key, int& e) {
273 char* p, *q;
274 char buf[16];
275 while (e!='\0' && isspace(e)) { //skip any spaces
276 if (e=='\n') GError(ERR_RANGEFMT, key);
277 f++;
278 e=*f;
279 }
280 if (e=='\0') GError(ERR_RANGEFMT, key);
281 //now e is the first non-space char
282 p=buf;
283 q=p;
284 while (e!='\0' && !isspace(e)) {
285 *q=e;
286 q++;
287 f++;
288 e=*f;
289 }
290 *q='\0';
291 return atoi(p);
292 //now f and e should be on the first space after the parsed value (or '\0')
293 }
294
295 GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
296 is_compressed=false;
297 fd=-1;
298 cdb=NULL;
299 #ifdef ENABLE_COMPRESSION
300 cdbz=NULL;
301 #endif
302 fdb=-1;
303 fz=NULL;
304 dbname=NULL;
305 recdelim=Gstrdup(recsep);
306 if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
307 idxfile=Gstrdup(fidx);
308 cdb=new GCdbRead(idxfile);
309 fd=cdb->getfd();
310 db_size=0;
311 dbstat.dbsize=0;
312 info_dbname=NULL;
313 int r=read_dbinfo(fd, &info_dbname, dbstat);
314 lseek(fd, 0, SEEK_SET);
315 if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
316 else if (r==2)
317 GError("Error reading info chunk!\n");
318 /*try to find the database file
319 rules: if given, only the -d given filename is used
320 otherwise:
321 1) the same directory with the given index file(stripping the suffix)
322 2) the dbstat filepath/name stored by cdbfasta
323 */
324 if (dbname==NULL) {
325 char* p = rstrchr(idxfile, '.');
326 if (p!=NULL) {
327 /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
328 given, so the database file can be located in the same directory \n\
329 by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
330 int nlen=p-idxfile;
331 char* namebuf=NULL;
332 GMALLOC(namebuf, nlen+1);
333 strncpy(namebuf, idxfile, nlen);
334 namebuf[nlen]='\0';
335 if (fileExists(namebuf))
336 dbname=namebuf;
337
338 } // strip the index file extenstion
339 // 2) try the stored dbstat name
340 if (dbname==NULL) {
341 if (fileExists(info_dbname)) dbname=info_dbname;
342 else GError("Cannot locate the database file for this index\n");
343 }
344 }// database name not given
345 is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
346 if (is_compressed)
347 //try to open the dbname as a compressed file
348 fz=fopen(dbname, "rb");
349 else fdb=open(dbname, O_RDONLY|O_BINARY);
350 if (fdb==-1 && fz==NULL)
351 GError("Error: cannot open database file %s\n",dbname);
352 if (is_compressed) {
353 #ifndef ENABLE_COMPRESSION
354 GError(err_COMPRESSED);
355 #else
356 fclose(fz);//just to start fresh here
357 //determine size:
358 int ftmp = open(dbname, O_RDONLY|O_BINARY);
359 if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
360 struct stat fdbstat;
361 fstat(ftmp, &fdbstat);
362 db_size=fdbstat.st_size;
363 close(ftmp);
364 //-------- reopen here
365 cdbz=openCdbz(dbname);
366 if (cdbz==NULL)
367 GError("Error opening the cdbz file '%s'\n");
368 fz=cdbz->getZFile();
369 #endif
370 }
371 else {
372 struct stat fdbstat;
373 if (stat(dbname, &fdbstat)!=0) {
374 perror("stat()");
375 exit(1);
376 }
377 db_size=fdbstat.st_size;
378 }
379 //abort if the database size was read and it doesn't match the cdbfasta stored size
380 if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
381 GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
382 fdb, dbstat.dbsize, db_size, dbname);
383 fastahandler=new GFastaCharHandler(recdelim);
384 } //* GCdbYank constructor *//
385
386 GCdbYank::~GCdbYank() {
387 if (is_compressed) {
388 fclose(fz);
389 #ifdef ENABLE_COMPRESSION
390 delete cdbz;
391 #endif
392 }
393 else close(fdb);
394 GFREE(info_dbname);
395 delete fastahandler;
396 GFREE(recdelim);
397 GFREE(dbname);
398 GFREE(idxfile);
399 delete cdb;
400 close(fd);
401 }
402
403
404 int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
405 //assumes fdb is open, cdb was created on the index file
406 int r=cdb->find(key);
407 if (r==0) return 0;
408 if (r==-1)
409 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
410 /* while (r>0) { */
411 off_t pos = cdb->datapos(); //position of this key's record in the index file
412 unsigned int len=cdb->datalen(); // length of this key's record
413 char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
414 if (cdb->read(bytes,len,pos) == -1)
415 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
416
417 off_t fpos; //this will be the fastadb offset
418 uint32 reclen; //this will be the fasta record offset
419 if (len>8) { //64 bit file offset was used
420 fpos=gcvt_offt(bytes);
421 reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
422 }
423 else { //32bit offset used
424 fpos=gcvt_uint(bytes);
425 reclen=gcvt_uint(&bytes[sizeof(uint32)]);
426 }
427 //GMessage("reclen=%d\n", reclen);
428
429 /* if (showQuery)
430 fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
431 /*========= FETCHING RECORD CONTENT ======= */
432 if (is_compressed) {
433 //for now: ignore special retrievals, just print the whole record
434 #ifdef ENABLE_COMPRESSION
435 return cdbz->decompress(rec, reclen, fpos, seqCallBack);
436 #else
437 GError(err_COMPRESSED);
438 #endif
439 }
440 else { // not compressed -- position into the file and build an ad hoc GFastaFile
441 lseek(fdb, fpos, SEEK_SET);
442 // read it char by char and return it as output
443 char c='\0';
444 int charsread=0;
445 fastahandler->init(&rec, seqCallBack);
446 while (reclen-- && read(fdb, &c, 1)==1) {
447 fastahandler->processChar(c);
448 charsread++;
449 }
450 fastahandler->done();
451 return charsread;
452 } // not compressed
453 /* if (many) r=cdb->findnext(key, strlen(key));
454 else r=0;
455 } */
456 return 0;
457 }
458
459 off_t GCdbYank::getRecordPos(const char* key) {
460 //assumes fdb is open, cdb was created on the index file
461 int r=cdb->find(key);
462 if (r==0 && warnings) {
463 GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
464 return -1;
465 }
466 if (r==-1)
467 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
468 off_t pos = cdb->datapos(); //position of this key's record in the index file
469 unsigned int len=cdb->datalen(); // length of this key's record
470 char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
471 if (cdb->read(bytes,len,pos) == -1)
472 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
473
474 off_t fpos; //this will be the fastadb offset
475 //uint32 reclen; this will be the fasta record length
476 if (len>8) //64 bit file offset was used
477 fpos=gcvt_offt(bytes);
478 else //32bit offset used
479 fpos=gcvt_uint(bytes);
480 return fpos;
481 }