ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GCdbYank.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 8 months ago) by gpertea
File size: 15174 byte(s)
Log Message:
added my gclib source files

Line User Rev File contents
1 gpertea 2 #include "GCdbYank.h"
2     #include "GBase.h"
3     #include <ctype.h>
4    
5     #define ERR_READ "GCdbYank: error reading from file.\n"
6     #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7     #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8     #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9     // 1MB memory buffer:
10     #define MAX_MEM_RECSIZE 1048576
11     #ifndef O_BINARY
12     #define O_BINARY 0x0000
13     #endif
14    
15    
16     #ifdef ENABLE_COMPRESSION
17    
18     GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
19     zrecsize=-1;
20     zpos=0;
21     recdelim=Gstrdup(r_delim);
22     zf=azf;
23     decomp_start(zrsize);
24     chrhandler=new GFastaCharHandler(recdelim);
25     }
26    
27     GCdbZFasta::~GCdbZFasta() {
28     //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
29     // FULL_FLUSH method instead of finish
30     delete chrhandler;
31     GFREE(recdelim);
32     decomp_end();
33     }
34    
35     void GCdbZFasta::decomp_start(int zrsize) {
36     zstream.zalloc = (alloc_func)0;
37     zstream.zfree = (free_func)0;
38     zstream.opaque = (voidpf)0;
39     zstream.next_in = (Bytef*)sbuf;
40     zstream.avail_in = 0;
41     zstream.next_out = (Bytef*)lbuf;
42     int err = inflateInit(&zstream);
43     if (err!=Z_OK)
44     GMessage("Error at inflateInit()\n");
45     //-- now read and discard the first record, so we can use random access later
46     // (needed by zlib)
47     int bytes_read=fread(sbuf, 1, zrsize, zf);
48     if (bytes_read<zrsize)
49     GError("Error reading 1st record from zrec file\n");
50     zstream.next_in = (Bytef*)sbuf;
51     zstream.avail_in = bytes_read;
52     //decompress first chunk
53     zstream.next_out = (Bytef*)lbuf;
54     zstream.avail_out = GCDBZ_LBUF_LEN;
55     err = inflate(&zstream, Z_SYNC_FLUSH);
56     if (err !=Z_OK && err!=Z_STREAM_END)
57     GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
58     }
59    
60     void GCdbZFasta::decomp_end() {
61     int err = inflateEnd(&zstream);
62     if (err!=Z_OK)
63     GError("Error at inflateEnd() (err=%d)\n", err);
64     }
65    
66    
67     //fasta record decompress
68     //returns: the number of bytes decompressed
69     int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
70     if (zfofs>=0) {
71     if (fseek(zf, zfofs, 0))
72     GError("GCdbZFasta::decompress: error fseek() to %d\n", zfofs);
73     }
74     else
75     if (feof(zf)) return 0;
76     bool in_rec=true;
77     int err=0;
78     int total_read=0;
79     int total_written=0;
80     chrhandler->init(&rec, seqCallBack);
81     while (in_rec) { // read loop
82     int to_read=0;
83     int bytes_read=0;
84     if (csize<=0) { //read one byte at a time
85     to_read=1;
86     int c;
87     if ((c =fgetc(zf))!=EOF) {
88     bytes_read = 1;
89     sbuf[0]=c;
90     }
91     else {
92     //bytes_read=0;
93     return 0; //eof
94     }
95     total_read+=bytes_read;
96     }
97     else {
98     to_read = csize-total_read>GCDBZ_SBUF_LEN ?
99     GCDBZ_SBUF_LEN : csize-total_read;
100     // check for csize vs bytes_read match:
101     if (to_read==0) return 0;
102     bytes_read=fread(sbuf, 1, to_read, zf);
103     if (bytes_read!=to_read)
104     GError("Error reading from zrec file\n");
105     total_read+=bytes_read;
106     in_rec=(total_read<csize);
107     }
108     if (bytes_read==0) {
109     //GMessage("bytes_read = 0\n");
110     return 0;
111     }
112     if (in_rec && bytes_read<to_read) in_rec=false;
113     zstream.next_in = (Bytef*)sbuf;
114     zstream.avail_in = bytes_read;
115    
116     do { //decompression loop
117     zstream.next_out = (Bytef*)lbuf;
118     zstream.avail_out = GCDBZ_LBUF_LEN;
119     uLong t_out=zstream.total_out;
120     err = inflate(&zstream, Z_SYNC_FLUSH);
121     uLong toWrite=zstream.total_out-t_out;
122     if (toWrite>0) {
123     /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
124     GError("Error writing inflated chunk!\n");
125     } */
126     for (unsigned int i=0;i<toWrite;i++)
127     chrhandler->processChar(lbuf[i]);
128    
129     total_written+=toWrite;
130     }
131     if (err==Z_STREAM_END) {
132     in_rec=false;
133     if (total_written==0) {
134     GMessage("Z_STREAM_END found but total_written=0!\n");
135     }
136     break;
137     }
138     else if (err !=Z_OK)
139     GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
140     } while (zstream.avail_in!=0); //decompression loop
141     } //read loop
142     chrhandler->done();
143     /*if (err!=Z_STREAM_END) {
144     GError("decompress: Z_STREAM_END not found!\n");
145     }*/
146     return total_written;
147     }
148    
149     GCdbZFasta* GCdbYank::openCdbz(char* p) {
150     //in case this was not done before:
151     gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
152     FILE* zf=fopen(p, "rb");
153     if (zf==NULL) {
154     GMessage("Error: cannot open compressed file '%s'!\n",p);
155     return NULL;
156     }
157     //check if the file is valid and read the length of the first record
158     //
159     char ztag[5];
160     ztag[4]='\0';
161     if (fread(ztag, 1, 4, zf)<4) {
162     GMessage("Error reading header of compressed file '%s'\n",p);
163     return NULL;
164     }
165     if (strcmp(ztag, "CDBZ")!=0) {
166     GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
167     return NULL;
168     }
169     unsigned int zrecsize;
170     if (fread((void*) &zrecsize,1,4,zf)<4) {
171     GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
172     return NULL;
173     }
174     zrecsize=gcvt_uint(&zrecsize);
175     return new GCdbZFasta(zf, zrecsize, recdelim);
176     }
177     #else
178     static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
179     #endif
180     //decompression stuff
181    
182    
183     void inplace_Lower(char* c) {
184     char *p=c;
185     while (*p!='\0') { *p=tolower(*p);p++; }
186     }
187    
188     void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
189     int r;
190     while (len > 0) {
191     r = b->get(buf,len);
192     if (r == -1) GError(ERR_READ);
193     if (r == 0)
194     GError(ERR_READFMT);
195     pos += r;
196     buf += r;
197     len -= r;
198     }
199     }
200    
201     void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
202     char buf[4];
203     buf_get(b, pos, buf, 4);
204     uint32_unpack(buf,num);
205     }
206    
207    
208     int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
209     //this is messy due to the need of compatibility with the
210     //old 32bit file-length
211     char* dbname=*fnameptr;
212     //read just the tag first: 4 bytes ID
213     lseek(fd, -cdbInfoSIZE, SEEK_END);
214     int r=read(fd, &dbstat, cdbInfoSIZE );
215     if (r!=cdbInfoSIZE) return 2;
216     //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
217     if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
218     //old dbstat structure -- convert it
219     dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
220     dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
221     dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
222     dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
223     //position on the dbnamelen entry
224     dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
225     //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
226     lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
227     }
228     else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
229     GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
230     return 1;
231     }
232     else { // new CDBX type:
233     dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
234     dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
235     dbstat.num_records=gcvt_uint(&dbstat.num_records);
236     dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
237     //position on the dbnamelen entry
238     dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
239     //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
240     lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
241     }
242    
243     GMALLOC(dbname, dbstat.dbnamelen+1);
244     dbname[dbstat.dbnamelen]='\0';
245     r=read(fd, dbname, dbstat.dbnamelen);
246     *fnameptr=dbname;
247     if (r!=dbstat.dbnamelen)
248     return 2;
249     return 0;
250     }
251    
252     int parse_int(FILE* f, char* buf, const char* key, int& e) {
253     char* p, *q;
254     while (e!=EOF && isspace(e)) { //skip any spaces
255     if (e=='\n') GError(ERR_RANGEFMT, key);
256     e=fgetc(stdin);
257     }
258     if (e==EOF) GError(ERR_RANGEFMT, key);
259     //now e is the first non-space
260     p=buf;
261     q=p;
262     while (e!=EOF && !isspace(e)) {
263     *q=e;
264     q++;
265     e=fgetc(stdin);
266     }
267     *q='\0'; //now p is the starting coordinate string
268     return atoi(p);
269     //now the file pointer should be on the first space after the parsed value
270     }
271    
272     int parse_int(char*& f, const char* key, int& e) {
273     char* p, *q;
274     char buf[16];
275     while (e!='\0' && isspace(e)) { //skip any spaces
276     if (e=='\n') GError(ERR_RANGEFMT, key);
277     f++;
278     e=*f;
279     }
280     if (e=='\0') GError(ERR_RANGEFMT, key);
281     //now e is the first non-space char
282     p=buf;
283     q=p;
284     while (e!='\0' && !isspace(e)) {
285     *q=e;
286     q++;
287     f++;
288     e=*f;
289     }
290     *q='\0';
291     return atoi(p);
292     //now f and e should be on the first space after the parsed value (or '\0')
293     }
294    
295     GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
296     is_compressed=false;
297     fd=-1;
298     cdb=NULL;
299     #ifdef ENABLE_COMPRESSION
300     cdbz=NULL;
301     #endif
302     fdb=-1;
303     fz=NULL;
304     dbname=NULL;
305     recdelim=Gstrdup(recsep);
306     if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
307     idxfile=Gstrdup(fidx);
308     cdb=new GCdbRead(idxfile);
309     fd=cdb->getfd();
310     db_size=0;
311     dbstat.dbsize=0;
312     info_dbname=NULL;
313     int r=read_dbinfo(fd, &info_dbname, dbstat);
314     lseek(fd, 0, SEEK_SET);
315     if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
316     else if (r==2)
317     GError("Error reading info chunk!\n");
318     /*try to find the database file
319     rules: if given, only the -d given filename is used
320     otherwise:
321     1) the same directory with the given index file(stripping the suffix)
322     2) the dbstat filepath/name stored by cdbfasta
323     */
324     if (dbname==NULL) {
325     char* p = rstrchr(idxfile, '.');
326     if (p!=NULL) {
327     /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
328     given, so the database file can be located in the same directory \n\
329     by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
330     int nlen=p-idxfile;
331     char* namebuf=NULL;
332     GMALLOC(namebuf, nlen+1);
333     strncpy(namebuf, idxfile, nlen);
334     namebuf[nlen]='\0';
335     if (fileExists(namebuf))
336     dbname=namebuf;
337    
338     } // strip the index file extenstion
339     // 2) try the stored dbstat name
340     if (dbname==NULL) {
341     if (fileExists(info_dbname)) dbname=info_dbname;
342     else GError("Cannot locate the database file for this index\n");
343     }
344     }// database name not given
345     is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
346     if (is_compressed)
347     //try to open the dbname as a compressed file
348     fz=fopen(dbname, "rb");
349     else fdb=open(dbname, O_RDONLY|O_BINARY);
350     if (fdb==-1 && fz==NULL)
351     GError("Error: cannot open database file %s\n",dbname);
352     if (is_compressed) {
353     #ifndef ENABLE_COMPRESSION
354     GError(err_COMPRESSED);
355     #else
356     fclose(fz);//just to start fresh here
357     //determine size:
358     int ftmp = open(dbname, O_RDONLY|O_BINARY);
359     if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
360     struct stat fdbstat;
361     fstat(ftmp, &fdbstat);
362     db_size=fdbstat.st_size;
363     close(ftmp);
364     //-------- reopen here
365     cdbz=openCdbz(dbname);
366     if (cdbz==NULL)
367     GError("Error opening the cdbz file '%s'\n");
368     fz=cdbz->getZFile();
369     #endif
370     }
371     else {
372     struct stat fdbstat;
373     if (stat(dbname, &fdbstat)!=0) {
374     perror("stat()");
375     exit(1);
376     }
377     db_size=fdbstat.st_size;
378     }
379     //abort if the database size was read and it doesn't match the cdbfasta stored size
380     if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
381     GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
382     fdb, dbstat.dbsize, db_size, dbname);
383     fastahandler=new GFastaCharHandler(recdelim);
384     } //* GCdbYank constructor *//
385    
386     GCdbYank::~GCdbYank() {
387     if (is_compressed) {
388     fclose(fz);
389     #ifdef ENABLE_COMPRESSION
390     delete cdbz;
391     #endif
392     }
393     else close(fdb);
394     GFREE(info_dbname);
395     delete fastahandler;
396     GFREE(recdelim);
397     GFREE(dbname);
398     GFREE(idxfile);
399     delete cdb;
400     close(fd);
401     }
402    
403    
404     int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
405     //assumes fdb is open, cdb was created on the index file
406     int r=cdb->find(key);
407     if (r==0) return 0;
408     if (r==-1)
409     GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
410     /* while (r>0) { */
411     off_t pos = cdb->datapos(); //position of this key's record in the index file
412     unsigned int len=cdb->datalen(); // length of this key's record
413     char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
414     if (cdb->read(bytes,len,pos) == -1)
415     GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
416    
417     off_t fpos; //this will be the fastadb offset
418     uint32 reclen; //this will be the fasta record offset
419     if (len>8) { //64 bit file offset was used
420     fpos=gcvt_offt(bytes);
421     reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
422     }
423     else { //32bit offset used
424     fpos=gcvt_uint(bytes);
425     reclen=gcvt_uint(&bytes[sizeof(uint32)]);
426     }
427     //GMessage("reclen=%d\n", reclen);
428    
429     /* if (showQuery)
430     fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
431     /*========= FETCHING RECORD CONTENT ======= */
432     if (is_compressed) {
433     //for now: ignore special retrievals, just print the whole record
434     #ifdef ENABLE_COMPRESSION
435     return cdbz->decompress(rec, reclen, fpos, seqCallBack);
436     #else
437     GError(err_COMPRESSED);
438     #endif
439     }
440     else { // not compressed -- position into the file and build an ad hoc GFastaFile
441     lseek(fdb, fpos, SEEK_SET);
442     // read it char by char and return it as output
443     char c='\0';
444     int charsread=0;
445     fastahandler->init(&rec, seqCallBack);
446     while (reclen-- && read(fdb, &c, 1)==1) {
447     fastahandler->processChar(c);
448     charsread++;
449     }
450     fastahandler->done();
451     return charsread;
452     } // not compressed
453     /* if (many) r=cdb->findnext(key, strlen(key));
454     else r=0;
455     } */
456     return 0;
457     }
458    
459     off_t GCdbYank::getRecordPos(const char* key) {
460     //assumes fdb is open, cdb was created on the index file
461     int r=cdb->find(key);
462     if (r==0 && warnings) {
463     GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
464     return -1;
465     }
466     if (r==-1)
467     GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
468     off_t pos = cdb->datapos(); //position of this key's record in the index file
469     unsigned int len=cdb->datalen(); // length of this key's record
470     char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
471     if (cdb->read(bytes,len,pos) == -1)
472     GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
473    
474     off_t fpos; //this will be the fastadb offset
475     //uint32 reclen; this will be the fasta record length
476     if (len>8) //64 bit file offset was used
477     fpos=gcvt_offt(bytes);
478     else //32bit offset used
479     fpos=gcvt_uint(bytes);
480     return fpos;
481     }