1 |
#include "GCdbYank.h" |
2 |
#include "GBase.h" |
3 |
#include <ctype.h> |
4 |
|
5 |
#define ERR_READ "GCdbYank: error reading from file.\n" |
6 |
#define ERR_READFMT "GCdbYank read error: incorrect file format.\n" |
7 |
#define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n" |
8 |
#define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n" |
9 |
// 1MB memory buffer: |
10 |
#define MAX_MEM_RECSIZE 1048576 |
11 |
#ifndef O_BINARY |
12 |
#define O_BINARY 0x0000 |
13 |
#endif |
14 |
|
15 |
|
16 |
#ifdef ENABLE_COMPRESSION |
17 |
|
18 |
GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) { |
19 |
zrecsize=-1; |
20 |
zpos=0; |
21 |
recdelim=Gstrdup(r_delim); |
22 |
zf=azf; |
23 |
decomp_start(zrsize); |
24 |
chrhandler=new GFastaCharHandler(recdelim); |
25 |
} |
26 |
|
27 |
GCdbZFasta::~GCdbZFasta() { |
28 |
//if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf); |
29 |
// FULL_FLUSH method instead of finish |
30 |
delete chrhandler; |
31 |
GFREE(recdelim); |
32 |
decomp_end(); |
33 |
} |
34 |
|
35 |
void GCdbZFasta::decomp_start(int zrsize) { |
36 |
zstream.zalloc = (alloc_func)0; |
37 |
zstream.zfree = (free_func)0; |
38 |
zstream.opaque = (voidpf)0; |
39 |
zstream.next_in = (Bytef*)sbuf; |
40 |
zstream.avail_in = 0; |
41 |
zstream.next_out = (Bytef*)lbuf; |
42 |
int err = inflateInit(&zstream); |
43 |
if (err!=Z_OK) |
44 |
GMessage("Error at inflateInit()\n"); |
45 |
//-- now read and discard the first record, so we can use random access later |
46 |
// (needed by zlib) |
47 |
int bytes_read=fread(sbuf, 1, zrsize, zf); |
48 |
if (bytes_read<zrsize) |
49 |
GError("Error reading 1st record from zrec file\n"); |
50 |
zstream.next_in = (Bytef*)sbuf; |
51 |
zstream.avail_in = bytes_read; |
52 |
//decompress first chunk |
53 |
zstream.next_out = (Bytef*)lbuf; |
54 |
zstream.avail_out = GCDBZ_LBUF_LEN; |
55 |
err = inflate(&zstream, Z_SYNC_FLUSH); |
56 |
if (err !=Z_OK && err!=Z_STREAM_END) |
57 |
GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err); |
58 |
} |
59 |
|
60 |
void GCdbZFasta::decomp_end() { |
61 |
int err = inflateEnd(&zstream); |
62 |
if (err!=Z_OK) |
63 |
GError("Error at inflateEnd() (err=%d)\n", err); |
64 |
} |
65 |
|
66 |
|
67 |
//fasta record decompress |
68 |
//returns: the number of bytes decompressed |
69 |
int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) { |
70 |
if (zfofs>=0) { |
71 |
if (fseek(zf, zfofs, 0)) |
72 |
GError("GCdbZFasta::decompress: error fseek() to %d\n", zfofs); |
73 |
} |
74 |
else |
75 |
if (feof(zf)) return 0; |
76 |
bool in_rec=true; |
77 |
int err=0; |
78 |
int total_read=0; |
79 |
int total_written=0; |
80 |
chrhandler->init(&rec, seqCallBack); |
81 |
while (in_rec) { // read loop |
82 |
int to_read=0; |
83 |
int bytes_read=0; |
84 |
if (csize<=0) { //read one byte at a time |
85 |
to_read=1; |
86 |
int c; |
87 |
if ((c =fgetc(zf))!=EOF) { |
88 |
bytes_read = 1; |
89 |
sbuf[0]=c; |
90 |
} |
91 |
else { |
92 |
//bytes_read=0; |
93 |
return 0; //eof |
94 |
} |
95 |
total_read+=bytes_read; |
96 |
} |
97 |
else { |
98 |
to_read = csize-total_read>GCDBZ_SBUF_LEN ? |
99 |
GCDBZ_SBUF_LEN : csize-total_read; |
100 |
// check for csize vs bytes_read match: |
101 |
if (to_read==0) return 0; |
102 |
bytes_read=fread(sbuf, 1, to_read, zf); |
103 |
if (bytes_read!=to_read) |
104 |
GError("Error reading from zrec file\n"); |
105 |
total_read+=bytes_read; |
106 |
in_rec=(total_read<csize); |
107 |
} |
108 |
if (bytes_read==0) { |
109 |
//GMessage("bytes_read = 0\n"); |
110 |
return 0; |
111 |
} |
112 |
if (in_rec && bytes_read<to_read) in_rec=false; |
113 |
zstream.next_in = (Bytef*)sbuf; |
114 |
zstream.avail_in = bytes_read; |
115 |
|
116 |
do { //decompression loop |
117 |
zstream.next_out = (Bytef*)lbuf; |
118 |
zstream.avail_out = GCDBZ_LBUF_LEN; |
119 |
uLong t_out=zstream.total_out; |
120 |
err = inflate(&zstream, Z_SYNC_FLUSH); |
121 |
uLong toWrite=zstream.total_out-t_out; |
122 |
if (toWrite>0) { |
123 |
/* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) { |
124 |
GError("Error writing inflated chunk!\n"); |
125 |
} */ |
126 |
for (unsigned int i=0;i<toWrite;i++) |
127 |
chrhandler->processChar(lbuf[i]); |
128 |
|
129 |
total_written+=toWrite; |
130 |
} |
131 |
if (err==Z_STREAM_END) { |
132 |
in_rec=false; |
133 |
if (total_written==0) { |
134 |
GMessage("Z_STREAM_END found but total_written=0!\n"); |
135 |
} |
136 |
break; |
137 |
} |
138 |
else if (err !=Z_OK) |
139 |
GError("GCdbZFasta error: inflate failed! (err=%d)\n",err); |
140 |
} while (zstream.avail_in!=0); //decompression loop |
141 |
} //read loop |
142 |
chrhandler->done(); |
143 |
/*if (err!=Z_STREAM_END) { |
144 |
GError("decompress: Z_STREAM_END not found!\n"); |
145 |
}*/ |
146 |
return total_written; |
147 |
} |
148 |
|
149 |
GCdbZFasta* GCdbYank::openCdbz(char* p) { |
150 |
//in case this was not done before: |
151 |
gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86; |
152 |
FILE* zf=fopen(p, "rb"); |
153 |
if (zf==NULL) { |
154 |
GMessage("Error: cannot open compressed file '%s'!\n",p); |
155 |
return NULL; |
156 |
} |
157 |
//check if the file is valid and read the length of the first record |
158 |
// |
159 |
char ztag[5]; |
160 |
ztag[4]='\0'; |
161 |
if (fread(ztag, 1, 4, zf)<4) { |
162 |
GMessage("Error reading header of compressed file '%s'\n",p); |
163 |
return NULL; |
164 |
} |
165 |
if (strcmp(ztag, "CDBZ")!=0) { |
166 |
GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p); |
167 |
return NULL; |
168 |
} |
169 |
unsigned int zrecsize; |
170 |
if (fread((void*) &zrecsize,1,4,zf)<4) { |
171 |
GMessage("Error reading 1st compressed record size for file '%s'!\n",p); |
172 |
return NULL; |
173 |
} |
174 |
zrecsize=gcvt_uint(&zrecsize); |
175 |
return new GCdbZFasta(zf, zrecsize, recdelim); |
176 |
} |
177 |
#else |
178 |
static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n"; |
179 |
#endif |
180 |
//decompression stuff |
181 |
|
182 |
|
183 |
void inplace_Lower(char* c) { |
184 |
char *p=c; |
185 |
while (*p!='\0') { *p=tolower(*p);p++; } |
186 |
} |
187 |
|
188 |
void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) { |
189 |
int r; |
190 |
while (len > 0) { |
191 |
r = b->get(buf,len); |
192 |
if (r == -1) GError(ERR_READ); |
193 |
if (r == 0) |
194 |
GError(ERR_READFMT); |
195 |
pos += r; |
196 |
buf += r; |
197 |
len -= r; |
198 |
} |
199 |
} |
200 |
|
201 |
void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) { |
202 |
char buf[4]; |
203 |
buf_get(b, pos, buf, 4); |
204 |
uint32_unpack(buf,num); |
205 |
} |
206 |
|
207 |
|
208 |
int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) { |
209 |
//this is messy due to the need of compatibility with the |
210 |
//old 32bit file-length |
211 |
char* dbname=*fnameptr; |
212 |
//read just the tag first: 4 bytes ID |
213 |
lseek(fd, -cdbInfoSIZE, SEEK_END); |
214 |
int r=read(fd, &dbstat, cdbInfoSIZE ); |
215 |
if (r!=cdbInfoSIZE) return 2; |
216 |
//GMessage("Size of dbstat=%d\n", cdbInfoSIZE); |
217 |
if (strncmp(dbstat.oldtag, "CIDX", 4)==0) { |
218 |
//old dbstat structure -- convert it |
219 |
dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]); |
220 |
dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]); |
221 |
dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize); |
222 |
dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags); |
223 |
//position on the dbnamelen entry |
224 |
dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen); |
225 |
//GMessage("dbnamelen=%d\n", dbstat.dbnamelen); |
226 |
lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END); |
227 |
} |
228 |
else if (strncmp(dbstat.tag, "CDBX", 4)!=0) { |
229 |
GMessage("Error: this doesn't appear to be a cdbfasta created file!\n"); |
230 |
return 1; |
231 |
} |
232 |
else { // new CDBX type: |
233 |
dbstat.dbsize = gcvt_offt(&dbstat.dbsize); |
234 |
dbstat.num_keys=gcvt_uint(&dbstat.num_keys); |
235 |
dbstat.num_records=gcvt_uint(&dbstat.num_records); |
236 |
dbstat.idxflags = gcvt_uint(&dbstat.idxflags); |
237 |
//position on the dbnamelen entry |
238 |
dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen); |
239 |
//GMessage("dbnamelen=%d\n", dbstat.dbnamelen); |
240 |
lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END); |
241 |
} |
242 |
|
243 |
GMALLOC(dbname, dbstat.dbnamelen+1); |
244 |
dbname[dbstat.dbnamelen]='\0'; |
245 |
r=read(fd, dbname, dbstat.dbnamelen); |
246 |
*fnameptr=dbname; |
247 |
if (r!=dbstat.dbnamelen) |
248 |
return 2; |
249 |
return 0; |
250 |
} |
251 |
|
252 |
int parse_int(FILE* f, char* buf, const char* key, int& e) { |
253 |
char* p, *q; |
254 |
while (e!=EOF && isspace(e)) { //skip any spaces |
255 |
if (e=='\n') GError(ERR_RANGEFMT, key); |
256 |
e=fgetc(stdin); |
257 |
} |
258 |
if (e==EOF) GError(ERR_RANGEFMT, key); |
259 |
//now e is the first non-space |
260 |
p=buf; |
261 |
q=p; |
262 |
while (e!=EOF && !isspace(e)) { |
263 |
*q=e; |
264 |
q++; |
265 |
e=fgetc(stdin); |
266 |
} |
267 |
*q='\0'; //now p is the starting coordinate string |
268 |
return atoi(p); |
269 |
//now the file pointer should be on the first space after the parsed value |
270 |
} |
271 |
|
272 |
int parse_int(char*& f, const char* key, int& e) { |
273 |
char* p, *q; |
274 |
char buf[16]; |
275 |
while (e!='\0' && isspace(e)) { //skip any spaces |
276 |
if (e=='\n') GError(ERR_RANGEFMT, key); |
277 |
f++; |
278 |
e=*f; |
279 |
} |
280 |
if (e=='\0') GError(ERR_RANGEFMT, key); |
281 |
//now e is the first non-space char |
282 |
p=buf; |
283 |
q=p; |
284 |
while (e!='\0' && !isspace(e)) { |
285 |
*q=e; |
286 |
q++; |
287 |
f++; |
288 |
e=*f; |
289 |
} |
290 |
*q='\0'; |
291 |
return atoi(p); |
292 |
//now f and e should be on the first space after the parsed value (or '\0') |
293 |
} |
294 |
|
295 |
GCdbYank::GCdbYank(const char* fidx, const char* recsep) { |
296 |
is_compressed=false; |
297 |
fd=-1; |
298 |
cdb=NULL; |
299 |
#ifdef ENABLE_COMPRESSION |
300 |
cdbz=NULL; |
301 |
#endif |
302 |
fdb=-1; |
303 |
fz=NULL; |
304 |
dbname=NULL; |
305 |
recdelim=Gstrdup(recsep); |
306 |
if (fidx==NULL) GError("GCdbYank Error: NULL index file name!"); |
307 |
idxfile=Gstrdup(fidx); |
308 |
cdb=new GCdbRead(idxfile); |
309 |
fd=cdb->getfd(); |
310 |
db_size=0; |
311 |
dbstat.dbsize=0; |
312 |
info_dbname=NULL; |
313 |
int r=read_dbinfo(fd, &info_dbname, dbstat); |
314 |
lseek(fd, 0, SEEK_SET); |
315 |
if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n"); |
316 |
else if (r==2) |
317 |
GError("Error reading info chunk!\n"); |
318 |
/*try to find the database file |
319 |
rules: if given, only the -d given filename is used |
320 |
otherwise: |
321 |
1) the same directory with the given index file(stripping the suffix) |
322 |
2) the dbstat filepath/name stored by cdbfasta |
323 |
*/ |
324 |
if (dbname==NULL) { |
325 |
char* p = rstrchr(idxfile, '.'); |
326 |
if (p!=NULL) { |
327 |
/*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\ |
328 |
given, so the database file can be located in the same directory \n\ |
329 |
by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/ |
330 |
int nlen=p-idxfile; |
331 |
char* namebuf=NULL; |
332 |
GMALLOC(namebuf, nlen+1); |
333 |
strncpy(namebuf, idxfile, nlen); |
334 |
namebuf[nlen]='\0'; |
335 |
if (fileExists(namebuf)) |
336 |
dbname=namebuf; |
337 |
|
338 |
} // strip the index file extenstion |
339 |
// 2) try the stored dbstat name |
340 |
if (dbname==NULL) { |
341 |
if (fileExists(info_dbname)) dbname=info_dbname; |
342 |
else GError("Cannot locate the database file for this index\n"); |
343 |
} |
344 |
}// database name not given |
345 |
is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS); |
346 |
if (is_compressed) |
347 |
//try to open the dbname as a compressed file |
348 |
fz=fopen(dbname, "rb"); |
349 |
else fdb=open(dbname, O_RDONLY|O_BINARY); |
350 |
if (fdb==-1 && fz==NULL) |
351 |
GError("Error: cannot open database file %s\n",dbname); |
352 |
if (is_compressed) { |
353 |
#ifndef ENABLE_COMPRESSION |
354 |
GError(err_COMPRESSED); |
355 |
#else |
356 |
fclose(fz);//just to start fresh here |
357 |
//determine size: |
358 |
int ftmp = open(dbname, O_RDONLY|O_BINARY); |
359 |
if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname); |
360 |
struct stat fdbstat; |
361 |
fstat(ftmp, &fdbstat); |
362 |
db_size=fdbstat.st_size; |
363 |
close(ftmp); |
364 |
//-------- reopen here |
365 |
cdbz=openCdbz(dbname); |
366 |
if (cdbz==NULL) |
367 |
GError("Error opening the cdbz file '%s'\n"); |
368 |
fz=cdbz->getZFile(); |
369 |
#endif |
370 |
} |
371 |
else { |
372 |
struct stat fdbstat; |
373 |
if (stat(dbname, &fdbstat)!=0) { |
374 |
perror("stat()"); |
375 |
exit(1); |
376 |
} |
377 |
db_size=fdbstat.st_size; |
378 |
} |
379 |
//abort if the database size was read and it doesn't match the cdbfasta stored size |
380 |
if (dbstat.dbsize>0 && dbstat.dbsize!=db_size) |
381 |
GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n", |
382 |
fdb, dbstat.dbsize, db_size, dbname); |
383 |
fastahandler=new GFastaCharHandler(recdelim); |
384 |
} //* GCdbYank constructor *// |
385 |
|
386 |
GCdbYank::~GCdbYank() { |
387 |
if (is_compressed) { |
388 |
fclose(fz); |
389 |
#ifdef ENABLE_COMPRESSION |
390 |
delete cdbz; |
391 |
#endif |
392 |
} |
393 |
else close(fdb); |
394 |
GFREE(info_dbname); |
395 |
delete fastahandler; |
396 |
GFREE(recdelim); |
397 |
GFREE(dbname); |
398 |
GFREE(idxfile); |
399 |
delete cdb; |
400 |
close(fd); |
401 |
} |
402 |
|
403 |
|
404 |
int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) { |
405 |
//assumes fdb is open, cdb was created on the index file |
406 |
int r=cdb->find(key); |
407 |
if (r==0) return 0; |
408 |
if (r==-1) |
409 |
GError("cdbyank: error searching for key %s in %s\n", key, idxfile); |
410 |
/* while (r>0) { */ |
411 |
off_t pos = cdb->datapos(); //position of this key's record in the index file |
412 |
unsigned int len=cdb->datalen(); // length of this key's record |
413 |
char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length |
414 |
if (cdb->read(bytes,len,pos) == -1) |
415 |
GError("cdbyank: error at GCbd::read (%s)!\n", idxfile); |
416 |
|
417 |
off_t fpos; //this will be the fastadb offset |
418 |
uint32 reclen; //this will be the fasta record offset |
419 |
if (len>8) { //64 bit file offset was used |
420 |
fpos=gcvt_offt(bytes); |
421 |
reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]); |
422 |
} |
423 |
else { //32bit offset used |
424 |
fpos=gcvt_uint(bytes); |
425 |
reclen=gcvt_uint(&bytes[sizeof(uint32)]); |
426 |
} |
427 |
//GMessage("reclen=%d\n", reclen); |
428 |
|
429 |
/* if (showQuery) |
430 |
fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/ |
431 |
/*========= FETCHING RECORD CONTENT ======= */ |
432 |
if (is_compressed) { |
433 |
//for now: ignore special retrievals, just print the whole record |
434 |
#ifdef ENABLE_COMPRESSION |
435 |
return cdbz->decompress(rec, reclen, fpos, seqCallBack); |
436 |
#else |
437 |
GError(err_COMPRESSED); |
438 |
#endif |
439 |
} |
440 |
else { // not compressed -- position into the file and build an ad hoc GFastaFile |
441 |
lseek(fdb, fpos, SEEK_SET); |
442 |
// read it char by char and return it as output |
443 |
char c='\0'; |
444 |
int charsread=0; |
445 |
fastahandler->init(&rec, seqCallBack); |
446 |
while (reclen-- && read(fdb, &c, 1)==1) { |
447 |
fastahandler->processChar(c); |
448 |
charsread++; |
449 |
} |
450 |
fastahandler->done(); |
451 |
return charsread; |
452 |
} // not compressed |
453 |
/* if (many) r=cdb->findnext(key, strlen(key)); |
454 |
else r=0; |
455 |
} */ |
456 |
return 0; |
457 |
} |
458 |
|
459 |
off_t GCdbYank::getRecordPos(const char* key) { |
460 |
//assumes fdb is open, cdb was created on the index file |
461 |
int r=cdb->find(key); |
462 |
if (r==0 && warnings) { |
463 |
GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile); |
464 |
return -1; |
465 |
} |
466 |
if (r==-1) |
467 |
GError("cdbyank: error searching for key %s in %s\n", key, idxfile); |
468 |
off_t pos = cdb->datapos(); //position of this key's record in the index file |
469 |
unsigned int len=cdb->datalen(); // length of this key's record |
470 |
char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length |
471 |
if (cdb->read(bytes,len,pos) == -1) |
472 |
GError("cdbyank: error at GCbd::read (%s)!\n", idxfile); |
473 |
|
474 |
off_t fpos; //this will be the fastadb offset |
475 |
//uint32 reclen; this will be the fasta record length |
476 |
if (len>8) //64 bit file offset was used |
477 |
fpos=gcvt_offt(bytes); |
478 |
else //32bit offset used |
479 |
fpos=gcvt_uint(bytes); |
480 |
return fpos; |
481 |
} |