ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GCdbYank.cpp
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (8 years, 3 months ago) by gpertea
File size: 15452 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 2 #include "GCdbYank.h"
2     #include "GBase.h"
3     #include <ctype.h>
4    
5     #define ERR_READ "GCdbYank: error reading from file.\n"
6     #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7     #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8     #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9     // 1MB memory buffer:
10     #define MAX_MEM_RECSIZE 1048576
11     #ifndef O_BINARY
12     #define O_BINARY 0x0000
13     #endif
14    
15 gpertea 16 //default size of the index record for records stored with 32bit offsets
16     uint32 irec_size32=8;
17 gpertea 2
18     #ifdef ENABLE_COMPRESSION
19    
20     GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
21     zrecsize=-1;
22     zpos=0;
23     recdelim=Gstrdup(r_delim);
24     zf=azf;
25     decomp_start(zrsize);
26     chrhandler=new GFastaCharHandler(recdelim);
27     }
28    
29     GCdbZFasta::~GCdbZFasta() {
30     //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
31     // FULL_FLUSH method instead of finish
32     delete chrhandler;
33     GFREE(recdelim);
34     decomp_end();
35     }
36    
37     void GCdbZFasta::decomp_start(int zrsize) {
38     zstream.zalloc = (alloc_func)0;
39     zstream.zfree = (free_func)0;
40     zstream.opaque = (voidpf)0;
41     zstream.next_in = (Bytef*)sbuf;
42     zstream.avail_in = 0;
43     zstream.next_out = (Bytef*)lbuf;
44     int err = inflateInit(&zstream);
45     if (err!=Z_OK)
46     GMessage("Error at inflateInit()\n");
47     //-- now read and discard the first record, so we can use random access later
48     // (needed by zlib)
49     int bytes_read=fread(sbuf, 1, zrsize, zf);
50     if (bytes_read<zrsize)
51     GError("Error reading 1st record from zrec file\n");
52     zstream.next_in = (Bytef*)sbuf;
53     zstream.avail_in = bytes_read;
54     //decompress first chunk
55     zstream.next_out = (Bytef*)lbuf;
56     zstream.avail_out = GCDBZ_LBUF_LEN;
57     err = inflate(&zstream, Z_SYNC_FLUSH);
58     if (err !=Z_OK && err!=Z_STREAM_END)
59     GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
60     }
61    
62     void GCdbZFasta::decomp_end() {
63     int err = inflateEnd(&zstream);
64     if (err!=Z_OK)
65     GError("Error at inflateEnd() (err=%d)\n", err);
66     }
67    
68    
69     //fasta record decompress
70     //returns: the number of bytes decompressed
71     int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
72     if (zfofs>=0) {
73 gpertea 16 if (fseeko(zf, zfofs, 0))
74     GError("GCdbZFasta::decompress: error fseeko() to %d\n", zfofs);
75 gpertea 2 }
76     else
77     if (feof(zf)) return 0;
78     bool in_rec=true;
79     int err=0;
80     int total_read=0;
81     int total_written=0;
82     chrhandler->init(&rec, seqCallBack);
83     while (in_rec) { // read loop
84     int to_read=0;
85     int bytes_read=0;
86     if (csize<=0) { //read one byte at a time
87     to_read=1;
88     int c;
89     if ((c =fgetc(zf))!=EOF) {
90     bytes_read = 1;
91     sbuf[0]=c;
92     }
93     else {
94     //bytes_read=0;
95     return 0; //eof
96     }
97     total_read+=bytes_read;
98     }
99     else {
100     to_read = csize-total_read>GCDBZ_SBUF_LEN ?
101     GCDBZ_SBUF_LEN : csize-total_read;
102     // check for csize vs bytes_read match:
103     if (to_read==0) return 0;
104     bytes_read=fread(sbuf, 1, to_read, zf);
105     if (bytes_read!=to_read)
106     GError("Error reading from zrec file\n");
107     total_read+=bytes_read;
108     in_rec=(total_read<csize);
109     }
110     if (bytes_read==0) {
111     //GMessage("bytes_read = 0\n");
112     return 0;
113     }
114     if (in_rec && bytes_read<to_read) in_rec=false;
115     zstream.next_in = (Bytef*)sbuf;
116     zstream.avail_in = bytes_read;
117    
118     do { //decompression loop
119     zstream.next_out = (Bytef*)lbuf;
120     zstream.avail_out = GCDBZ_LBUF_LEN;
121     uLong t_out=zstream.total_out;
122     err = inflate(&zstream, Z_SYNC_FLUSH);
123     uLong toWrite=zstream.total_out-t_out;
124     if (toWrite>0) {
125     /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
126     GError("Error writing inflated chunk!\n");
127     } */
128     for (unsigned int i=0;i<toWrite;i++)
129     chrhandler->processChar(lbuf[i]);
130    
131     total_written+=toWrite;
132     }
133     if (err==Z_STREAM_END) {
134     in_rec=false;
135     if (total_written==0) {
136     GMessage("Z_STREAM_END found but total_written=0!\n");
137     }
138     break;
139     }
140     else if (err !=Z_OK)
141     GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
142     } while (zstream.avail_in!=0); //decompression loop
143     } //read loop
144     chrhandler->done();
145     /*if (err!=Z_STREAM_END) {
146     GError("decompress: Z_STREAM_END not found!\n");
147     }*/
148     return total_written;
149     }
150    
151     GCdbZFasta* GCdbYank::openCdbz(char* p) {
152     //in case this was not done before:
153     gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
154     FILE* zf=fopen(p, "rb");
155     if (zf==NULL) {
156     GMessage("Error: cannot open compressed file '%s'!\n",p);
157     return NULL;
158     }
159     //check if the file is valid and read the length of the first record
160     //
161     char ztag[5];
162     ztag[4]='\0';
163     if (fread(ztag, 1, 4, zf)<4) {
164     GMessage("Error reading header of compressed file '%s'\n",p);
165     return NULL;
166     }
167     if (strcmp(ztag, "CDBZ")!=0) {
168     GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
169     return NULL;
170     }
171     unsigned int zrecsize;
172     if (fread((void*) &zrecsize,1,4,zf)<4) {
173     GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
174     return NULL;
175     }
176     zrecsize=gcvt_uint(&zrecsize);
177     return new GCdbZFasta(zf, zrecsize, recdelim);
178     }
179     #else
180     static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
181     #endif
182     //decompression stuff
183    
184    
185     void inplace_Lower(char* c) {
186     char *p=c;
187     while (*p!='\0') { *p=tolower(*p);p++; }
188     }
189    
190     void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
191     int r;
192     while (len > 0) {
193     r = b->get(buf,len);
194     if (r == -1) GError(ERR_READ);
195     if (r == 0)
196     GError(ERR_READFMT);
197     pos += r;
198     buf += r;
199     len -= r;
200     }
201     }
202    
203     void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
204     char buf[4];
205     buf_get(b, pos, buf, 4);
206     uint32_unpack(buf,num);
207     }
208    
209    
210     int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
211     //this is messy due to the need of compatibility with the
212     //old 32bit file-length
213     char* dbname=*fnameptr;
214     //read just the tag first: 4 bytes ID
215     lseek(fd, -cdbInfoSIZE, SEEK_END);
216     int r=read(fd, &dbstat, cdbInfoSIZE );
217     if (r!=cdbInfoSIZE) return 2;
218     //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
219     if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
220     //old dbstat structure -- convert it
221     dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
222     dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
223     dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
224     dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
225     //position on the dbnamelen entry
226     dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
227     //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
228     lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
229     }
230     else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
231     GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
232     return 1;
233     }
234     else { // new CDBX type:
235     dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
236     dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
237     dbstat.num_records=gcvt_uint(&dbstat.num_records);
238     dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
239     //position on the dbnamelen entry
240     dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
241     //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
242     lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
243     }
244    
245     GMALLOC(dbname, dbstat.dbnamelen+1);
246     dbname[dbstat.dbnamelen]='\0';
247     r=read(fd, dbname, dbstat.dbnamelen);
248     *fnameptr=dbname;
249     if (r!=dbstat.dbnamelen)
250     return 2;
251     return 0;
252     }
253    
254 gpertea 16 int parse_int(char* buf, const char* key, int& e) {
255 gpertea 2 char* p, *q;
256     while (e!=EOF && isspace(e)) { //skip any spaces
257     if (e=='\n') GError(ERR_RANGEFMT, key);
258     e=fgetc(stdin);
259     }
260     if (e==EOF) GError(ERR_RANGEFMT, key);
261     //now e is the first non-space
262     p=buf;
263     q=p;
264     while (e!=EOF && !isspace(e)) {
265     *q=e;
266     q++;
267     e=fgetc(stdin);
268     }
269     *q='\0'; //now p is the starting coordinate string
270     return atoi(p);
271     //now the file pointer should be on the first space after the parsed value
272     }
273    
274     int parse_int(char*& f, const char* key, int& e) {
275     char* p, *q;
276     char buf[16];
277     while (e!='\0' && isspace(e)) { //skip any spaces
278     if (e=='\n') GError(ERR_RANGEFMT, key);
279     f++;
280     e=*f;
281     }
282     if (e=='\0') GError(ERR_RANGEFMT, key);
283     //now e is the first non-space char
284     p=buf;
285     q=p;
286     while (e!='\0' && !isspace(e)) {
287     *q=e;
288     q++;
289     f++;
290     e=*f;
291     }
292     *q='\0';
293     return atoi(p);
294     //now f and e should be on the first space after the parsed value (or '\0')
295     }
296    
297     GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
298     is_compressed=false;
299     fd=-1;
300     cdb=NULL;
301     #ifdef ENABLE_COMPRESSION
302     cdbz=NULL;
303     #endif
304     fdb=-1;
305     fz=NULL;
306     dbname=NULL;
307     recdelim=Gstrdup(recsep);
308     if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
309     idxfile=Gstrdup(fidx);
310     cdb=new GCdbRead(idxfile);
311     fd=cdb->getfd();
312     db_size=0;
313     dbstat.dbsize=0;
314     info_dbname=NULL;
315     int r=read_dbinfo(fd, &info_dbname, dbstat);
316     lseek(fd, 0, SEEK_SET);
317     if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
318     else if (r==2)
319     GError("Error reading info chunk!\n");
320     /*try to find the database file
321     rules: if given, only the -d given filename is used
322     otherwise:
323     1) the same directory with the given index file(stripping the suffix)
324     2) the dbstat filepath/name stored by cdbfasta
325     */
326     if (dbname==NULL) {
327     char* p = rstrchr(idxfile, '.');
328     if (p!=NULL) {
329     /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
330     given, so the database file can be located in the same directory \n\
331     by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
332     int nlen=p-idxfile;
333     char* namebuf=NULL;
334     GMALLOC(namebuf, nlen+1);
335     strncpy(namebuf, idxfile, nlen);
336     namebuf[nlen]='\0';
337     if (fileExists(namebuf))
338     dbname=namebuf;
339    
340     } // strip the index file extenstion
341     // 2) try the stored dbstat name
342     if (dbname==NULL) {
343     if (fileExists(info_dbname)) dbname=info_dbname;
344     else GError("Cannot locate the database file for this index\n");
345     }
346     }// database name not given
347     is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
348     if (is_compressed)
349     //try to open the dbname as a compressed file
350     fz=fopen(dbname, "rb");
351     else fdb=open(dbname, O_RDONLY|O_BINARY);
352     if (fdb==-1 && fz==NULL)
353     GError("Error: cannot open database file %s\n",dbname);
354     if (is_compressed) {
355     #ifndef ENABLE_COMPRESSION
356     GError(err_COMPRESSED);
357     #else
358     fclose(fz);//just to start fresh here
359     //determine size:
360     int ftmp = open(dbname, O_RDONLY|O_BINARY);
361     if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
362     struct stat fdbstat;
363     fstat(ftmp, &fdbstat);
364     db_size=fdbstat.st_size;
365     close(ftmp);
366     //-------- reopen here
367     cdbz=openCdbz(dbname);
368     if (cdbz==NULL)
369     GError("Error opening the cdbz file '%s'\n");
370     fz=cdbz->getZFile();
371     #endif
372     }
373     else {
374     struct stat fdbstat;
375     if (stat(dbname, &fdbstat)!=0) {
376     perror("stat()");
377     exit(1);
378     }
379     db_size=fdbstat.st_size;
380     }
381     //abort if the database size was read and it doesn't match the cdbfasta stored size
382     if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
383     GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
384     fdb, dbstat.dbsize, db_size, dbname);
385     fastahandler=new GFastaCharHandler(recdelim);
386     } //* GCdbYank constructor *//
387    
388     GCdbYank::~GCdbYank() {
389     if (is_compressed) {
390     fclose(fz);
391     #ifdef ENABLE_COMPRESSION
392     delete cdbz;
393     #endif
394     }
395     else close(fdb);
396     GFREE(info_dbname);
397     delete fastahandler;
398     GFREE(recdelim);
399     GFREE(dbname);
400     GFREE(idxfile);
401     delete cdb;
402     close(fd);
403     }
404    
405    
406     int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
407     //assumes fdb is open, cdb was created on the index file
408     int r=cdb->find(key);
409     if (r==0) return 0;
410     if (r==-1)
411     GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
412     /* while (r>0) { */
413     off_t pos = cdb->datapos(); //position of this key's record in the index file
414     unsigned int len=cdb->datalen(); // length of this key's record
415     char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
416     if (cdb->read(bytes,len,pos) == -1)
417     GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
418    
419     off_t fpos; //this will be the fastadb offset
420     uint32 reclen; //this will be the fasta record offset
421 gpertea 16 if (len>irec_size32) { //64 bit file offset was used
422 gpertea 2 fpos=gcvt_offt(bytes);
423     reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
424     }
425     else { //32bit offset used
426     fpos=gcvt_uint(bytes);
427     reclen=gcvt_uint(&bytes[sizeof(uint32)]);
428     }
429     //GMessage("reclen=%d\n", reclen);
430    
431     /* if (showQuery)
432     fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
433     /*========= FETCHING RECORD CONTENT ======= */
434     if (is_compressed) {
435     //for now: ignore special retrievals, just print the whole record
436     #ifdef ENABLE_COMPRESSION
437     return cdbz->decompress(rec, reclen, fpos, seqCallBack);
438     #else
439     GError(err_COMPRESSED);
440     #endif
441     }
442     else { // not compressed -- position into the file and build an ad hoc GFastaFile
443     lseek(fdb, fpos, SEEK_SET);
444     // read it char by char and return it as output
445     char c='\0';
446     int charsread=0;
447     fastahandler->init(&rec, seqCallBack);
448     while (reclen-- && read(fdb, &c, 1)==1) {
449     fastahandler->processChar(c);
450     charsread++;
451     }
452     fastahandler->done();
453     return charsread;
454     } // not compressed
455     /* if (many) r=cdb->findnext(key, strlen(key));
456     else r=0;
457     } */
458     return 0;
459     }
460    
461 gpertea 16 off_t GCdbYank::getRecordPos(const char* key, uint32* record_len) {
462 gpertea 2 //assumes fdb is open, cdb was created on the index file
463     int r=cdb->find(key);
464     if (r==0 && warnings) {
465     GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
466     return -1;
467     }
468     if (r==-1)
469     GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
470     off_t pos = cdb->datapos(); //position of this key's record in the index file
471     unsigned int len=cdb->datalen(); // length of this key's record
472 gpertea 16 char bytes[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
473 gpertea 2 if (cdb->read(bytes,len,pos) == -1)
474     GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
475    
476     off_t fpos; //this will be the fastadb offset
477 gpertea 16 uint32 rlen; //this will be the fasta record length
478     if (len>irec_size32) { //64 bit file offset was used
479     fpos=gcvt_offt(bytes);
480     rlen=gcvt_uint(&bytes[offsetof(CIdxData, reclen)]);
481     }
482     else { //32bit offset used
483     fpos=gcvt_uint(bytes);
484     rlen=gcvt_uint(&bytes[offsetof(CIdxData32, reclen)]);
485     }
486     if (record_len!=NULL) *record_len=rlen;
487 gpertea 2 return fpos;
488     }