ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cdbfasta/cdbfasta.cpp
Revision: 8
Committed: Mon Mar 22 22:11:25 2010 UTC (12 years, 6 months ago) by gpertea
File size: 25748 byte(s)
Log Message:
added cdbfasta source files

Line File contents
1 #include <ctype.h>
2 #include <fcntl.h>
3 #include <string.h>
4 #include <sys/stat.h>
5 #include "GBase.h"
6 #include "GArgs.h"
7 #include "GHash.hh"
8 #include "gcdb.h"
9 #ifdef ENABLE_COMPRESSION
10 #include "gcdbz.h"
11 #endif
12 #ifdef __WIN32__
13 #define VERSION "cdbfasta version 0.993w"
14 #else
15 #define VERSION "cdbfasta version 0.993"
16 #endif
17
// Help text printed ahead of every command-line error message.
// The option list mirrors the GArgs spec string used in main():
// "mn:o:r:z:w:f:s:icvQCaA".
#define USAGE "Usage:\n\
  cdbfasta <fastafile> [-o <index_file>] [-r <record_delimiter>]\n\
    [-z <compressed_db>] [-i] [-m|-n <numkeys>|-f <LIST>|-c|-C|-a|-A]\n\
    [-w <stopwords_list>] [-s <stripendchars>] [-Q] [-v]\n\
 \n\
   Creates an index file for records from a multi-fasta file.\n\
   By default (without -m/-n/-c/-C option), only the first \n\
   space-delimited token from the defline is used as a key.\n\
 \n\
  <fastafile> is the multi-fasta file to index; \n\
  -o the index file will be named <index_file>; if not given,\n\
     the index filename is database name plus the suffix '.cidx'\n\
  -r <record_delimiter> a string of characters at the beginning of line\n\
     marking the start of a record (default: '>')\n\
  -Q treat input as fastq format, i.e. with '@' as record delimiter\n\
     and with records expected to have at least 4 lines\n\
  -z database is compressed into the file <compressed_db>\n\
     before indexing (<fastafile> can be \"-\" or \"stdin\" \n\
     in order to get the input records from stdin)\n\
  -s strip extraneous characters from *around* the space delimited\n\
     tokens, for the multikey options below (-m,-n,-f);\n\
     Default <stripendchars> set is: '\",`.(){}/[]!:;~|><+-\n\
  -m (\"multi-key\" option) create hash entries pointing to \n\
     the same record for all tokens found in\n\
     the defline\n\
  -n <numkeys> same as -m, but only takes the first <numkeys>\n\
     tokens from the defline\n\
  -f indexes *space* delimited tokens (fields) in the defline as given\n\
     by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\
  -w <stopwordslist> exclude from indexing all the words found\n\
     in the file <stopwordslist> (for options -m, -n and -f)\n\
  -i do case insensitive indexing (i.e. create additional keys for \n\
     all-lowercase tokens used for indexing from the defline)\n\
  -c for deflines in the format: db1|accession1|db2|accession2|...,\n\
     only the first db-accession pair ('db1|accession1') is taken as key\n\
  -C like -c, but also subsequent db|accession constructs are indexed,\n\
     along with the full (default) token; additionally,\n\
     all nrdb concatenated accessions found in the defline \n\
     are parsed and stored (assuming 0x01 or '^|^' as separators)\n\
  -a accession mode: like -C option, but indexes the 'accession'\n\
     part for all 'db|accession' constructs found\n\
  -A like -a and -C together (both accessions and 'db|accession'\n\
     constructs are used as keys)\n\
  -v show program version and exit\n"
62
63
// Message reported when the -n/-f field selection does not fit in fields[].
#define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n"

// Initial size of the defline buffer (16K); also the step by which it grows.
#define KBUFSIZE 0x4000

// O_BINARY is OR-ed into open() flags; define it as a no-op where it is missing.
#ifndef O_BINARY
#define O_BINARY 0x0000
#endif

// Capacity of lastKey[], the saved copy of the last key written.
#define MAX_KEYLEN 1024

// Size of the input read buffer (64K).
#define GREADBUF_SIZE 0x010000
73
74
75 typedef void (*addFuncType)(char*, off_t, uint32);
76
77 // for passing around index data:
78 struct IdxData32 {
79 uint32 fpos;
80 uint32 reclen;
81 };
82
83 struct IdxData {
84 off_t fpos; //64bit value on Linux
85 uint32 reclen;
86 };
87
88 int IdxDataSIZE=offsetof(IdxData, reclen)+sizeof(uint32);
89 int IdxDataSIZE32=offsetof(IdxData32, reclen)+sizeof(uint32);
90
91 char ftmp[365];
92 char fztmp[365];
93 char record_marker[127]; //record delimiter
94 int record_marker_len=1;
95 int num_recs;
96 int num_keys;
97
98 off_t last_cdbfpos=0;
99
100 int compact_plus; //shortcut key and
101 bool acc_mode=false;
102 bool acc_only=false;
103 bool do_compress=false; // compression used
104 bool fastq=false;
105
106 FILE* zf=NULL; //compressed file handle
107
108 //store just offset and record length
109 const char* defWordJunk="'\",`.(){}/[]!:;~|><+-";
110 char* wordJunk=NULL;
111 bool caseInsensitive=false; //case insensitive storage
112 bool useStopWords=false;
113 unsigned int numFields=0;
114 // we have fields[numFields-1]=MAX_UINT as defined in gcdb.h --
115 // as an indicator of taking every single token in the defline,
116 // or for open ended ranges (e.g. -f5- )
117 unsigned int fields[255]; //array of numFields field indices (1-based)
118 GHash<int> stopList;
119 //static int datalen=sizeof(uint32)+sizeof(off_t);
120
121 char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key
122
123 GCdbWrite* cdbidx;
124 addFuncType addKeyFunc;
125
// Message reported when the statistics trailer cannot be appended to the index.
#define ERR_W_DBSTAT "Error writing the database statistics!\n"
127
128
129 void die_write(const char* fname) {
130 GError("Error: cdbhash was unable to write into file %s\n",
131 fname);
132
133 }
134
135 void die_read(const char* infile) {
136 GError("Error: cdbhash was unable to read the input file %s\n",infile);
137 }
138
139
140 void die_readformat(const char* infile) {
141 GError("Error: bad format for file %s; is it a fastA file?\n",
142 infile);
143 }
144
145
146 bool add_cdbkey(char* key, off_t fpos, uint32 reclen) {
147
148 unsigned int klen=strlen(key);
149 if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true;
150 if (klen<1) {
151 GMessage("Warning: zero length key found following key '%s'\n",
152 lastKey);
153 return false;
154 }
155 //------------ adding record -----------------
156 num_keys++;
157 strncpy(lastKey, key, MAX_KEYLEN-1);
158 lastKey[MAX_KEYLEN-1]='\0';
159 if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset
160 IdxData recdata;
161 uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32
162 recdata.fpos=gcvt_offt(&v);
163 recdata.reclen=gcvt_uint(&reclen);
164 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1)
165 GError("Error adding cdb record with key '%s'\n",key);
166 }
167 else {//32 bit file offset
168 IdxData32 recdata;
169 uint32 v=(uint32) fpos;
170 recdata.fpos=gcvt_uint(&v);
171 recdata.reclen=gcvt_uint(&reclen);
172 //GMessage("Adding 32bit: '%s' reclen=%d\n", key, recdata.reclen);
173 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
174 GError("Error adding cdb record with key '%s'\n",key);
175 }
176 last_cdbfpos=fpos;
177 return true;
178 }
179
180 //default indexing: key directly passed --
181 // as the first space delimited token
182 void addKey(char* key, off_t fpos,
183 uint32 reclen) {
184 num_recs++;
185 add_cdbkey(key, fpos, reclen);
186 if (caseInsensitive) {
187 char* lckey=loCase(key);
188 if (strcmp(lckey, key)!=0)
189 add_cdbkey(lckey, fpos, reclen);
190 GFREE(lckey);
191 }
192 }
193
194 //the whole defline is passed
195 void addKeyMulti(char* defline,
196 off_t fpos, uint32 reclen) {
197 char* p=defline;
198 unsigned int fieldno=0;
199 char* pn;
200 num_recs++;
201 bool stillParsing=true;
202 unsigned int fidx=0; //index in fields[] array
203 while (stillParsing) {
204 while ((*p)==' ' || (*p)=='\t') p++;
205 if (*p == '\0') break;
206 //skip any extraneous characters at the beginning of the token
207 while (chrInStr(*p, wordJunk)) p++;
208 //skip any padding spaces or other extraneous characters
209 //at the beginning of the word
210 if (*p == '\0') break;
211 pn=p;
212 while (*pn!='\0' && !isspace(*pn)) pn++;
213 //found next space or end of string
214 fieldno++;
215 while (fields[fidx]<fieldno && fidx<numFields-1) fidx++;
216 //GMessage("> p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields,
217 // fieldno, fields[numFields-1], pn+1);
218 stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1]));
219 char* pend = pn-1; //pend is on the last non-space in the current token
220 //--strip the ending junk, if any
221 while (chrInStr(*pend, wordJunk) && pend>p) pend--;
222 if (pend<pn-1) *(pend+1)='\0';
223 else *pn='\0';
224
225 if (strlen(p)>0) {
226 if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) {
227 if (useStopWords && stopList.hasKey(p)) {
228 p=pn+1;
229 continue;
230 }
231 //--- store this key with the same current record data:
232 add_cdbkey(p, fpos, reclen);
233 //---storage code ends here
234 if (caseInsensitive) {
235 char* lcp=loCase(p);
236 if (strcmp(lcp,p)!=0)
237 add_cdbkey(lcp, fpos, reclen);
238 GFREE(lcp);
239 }
240 }
241 //if (isEnd) break; //if all the token were stored
242 }
243 p=pn+1;//p is now on the next token's start
244 } //token parsing loop
245 }
246
247
// qsort() comparator ordering unsigned int values ascending.
// Returns -1, 0 or 1 when *p1 is below, equal to or above *p2.
int qcmpInt(const void *p1, const void *p2) {
  const unsigned int lhs = *(const unsigned int *) p1;
  const unsigned int rhs = *(const unsigned int *) p2;
  if (lhs == rhs) return 0;
  return (lhs < rhs) ? -1 : 1;
}
257
258
// Scans one 'db|accession' style construct at the start of pstart.
//   pstart  : text to scan; leading '|' characters are skipped
//   end_acc : out, position where the scan stopped
//   accst   : out, start of the accession part: the character after the
//             first '|' that did not end the scan, or the (pipe-skipped)
//             start when the very first segment ended it
// A segment boundary ('|' or end of string) ends the scan when it is the end
// of the string, when an accession start was already recorded, when more
// than 7 characters were consumed, or when more than 3 characters were
// consumed and the current segment contained a digit.
// Returns the stop position, or NULL for NULL/empty input (outputs untouched
// in that case) and when nothing but '|' characters were present.
char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) {
  if (pstart==NULL || *pstart=='\0') return NULL;
  char* stop=pstart;
  end_acc=NULL; //end of first accession
  accst=NULL;
  while (*pstart=='|') pstart++;
  bool sawDigit=false;
  char* cur=pstart;
  while (true) {
    const char c=*cur;
    if (!sawDigit && c>='0' && c<='9') sawDigit=true;
    if (c=='|' || c=='\0') {
      const int consumed=cur-pstart;
      const bool closes = (c=='\0') || (sawDigit && consumed>3) ||
                          consumed>7 || accst!=NULL;
      if (closes) { //previous token was an accession
        stop=cur;
        if (end_acc==NULL) end_acc=cur;
        if (accst==NULL) accst=pstart;
        break;
      }
      //first pipe char or no digits
      accst=cur+1;
      sawDigit=false; //reset flag
    }
    cur++;
  }
  return (stop!=pstart) ? stop : NULL;
}
289
290
291
292
// Cuts `str` at its first space, tab or vertical tab by writing a NUL there.
// Returns a pointer to that terminator (the existing end of string when no
// such character is present), or NULL when str is NULL.
char* parseSpToken(char* str) {
  if (str==NULL) return NULL;
  char* cut=str;
  for (;;) {
    const char c=*cut;
    if (c==' ' || c=='\t' || c=='\v' || c==0) break;
    cut++;
  }
  *cut=0;
  return cut;
}
300
// Separators between concatenated nrdb deflines: any single control
// character 0x01..0x04, or the three-character string "^|^".
#define NRDB_CHARSEP "\1\2\3\4"
#define NRDB_STRSEP "^|^"

// Cuts `defline` at its first nrdb separator by writing a NUL there.
// On return nrdbp is NULL when no separator exists; otherwise nrdbp+1 is the
// start of the next concatenated defline (for "^|^" nrdbp is left on the
// last character of the separator).
inline void NRDB_Rec(char* &nrdbp, char* defline) {
  nrdbp=strpbrk(defline, NRDB_CHARSEP);
  if (nrdbp!=NULL) {
    *nrdbp='\0';
    return;
  }
  nrdbp=strstr(defline, NRDB_STRSEP);
  if (nrdbp==NULL) return;
  *nrdbp='\0';
  nrdbp+=2;
}
317
318 //-c/-C/-a/-A indexing: key up to the first space or second '|'
319 //receives the full defline
320 void addKeyCompact(char* defline,
321 off_t fpos, uint32 reclen) {
322 //we got the first token found on the defline
323 num_recs++;
324 char* nrdb_end;
325 //breaks defline at the next nrdb concatenation point
326 NRDB_Rec(nrdb_end, defline);
327 //isolate the first token in this nrdb concatenation
328 char* token_end=parseSpToken(defline);
329 if (!compact_plus) { //shortcut key
330 //only the first db|accession construct will be indexed, if found
331 char* end_acc1=NULL; //end of first accession
332 char* acc1st=NULL;
333 char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st);
334 if (end_acc1!=NULL) { //has acceptable shortcut
335 *end_acc1=0;
336 add_cdbkey(defline, fpos, reclen);
337 return;
338 }
339 if (dbacc_end!=NULL) {
340 *dbacc_end=0;
341 add_cdbkey(defline, fpos, reclen);
342 return;
343 }
344 //store this whole non-space token as key:
345 add_cdbkey(defline, fpos, reclen);
346 return;
347 }
348 //from now on only -C/-a/-A treatment:
349 for(;;) {
350 //defline is on the first token
351 if (strlen(defline)>0) //add whole non-space token as the "full key"
352 add_cdbkey(defline, fpos, reclen);
353 //add the db|accession constructs as keys
354 char* dbacc_start=defline;
355 char* firstacc_end=NULL;
356 char* accst=NULL;
357 char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
358 while (dbacc_end!=NULL) {
359 if (firstacc_end!=NULL && firstacc_end<dbacc_end) {
360 char c=*firstacc_end;
361 *firstacc_end=0;
362 if (!acc_only)
363 add_cdbkey(dbacc_start, fpos, reclen);
364 if (acc_mode) {
365 add_cdbkey(accst, fpos, reclen);
366 }
367 *firstacc_end=c;
368 }
369 if (dbacc_start==defline && dbacc_end==token_end) {
370 if (acc_mode && accst!=NULL)
371 add_cdbkey(accst, fpos, reclen);
372 break; //the whole seq_name was only one db entry
373 }
374 *dbacc_end=0; //end key here
375 if (!acc_only)
376 add_cdbkey(dbacc_start, fpos, reclen);
377 if (acc_mode)
378 add_cdbkey(accst, fpos, reclen);
379 if (dbacc_end==token_end)
380 break; //reached the end of this whole seq_name (non-space token)
381 dbacc_start=dbacc_end+1;
382 firstacc_end=NULL;
383 dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
384 }
385 // -- get to next concatenated defline, if any:
386 if (compact_plus && nrdb_end!=NULL) {
387 defline=nrdb_end+1; //look for the next nrdb concatenation
388 NRDB_Rec(nrdb_end, defline);
389 //isolate the first token in this nrdb record
390 token_end=parseSpToken(defline);
391 }
392 else break;
393 } //for
394 }
395
396 int readWords(FILE* f, GHash<int>& xhash) {
397 int c;
398 int count=0;
399 char name[256];
400 int len=0;
401 while ((c=getc(f))!=EOF) {
402 if (isspace(c) || c==',' || c==';') {
403 if (len>0) {
404 name[len]='\0';
405 xhash.Add(name, new int(1));
406 count++;
407 len=0;
408 }
409 continue;
410 }
411 //a non-space
412 name[len]=(char) c;
413 len++;
414 if (len>255) {
415 name[len-1]='\0';
416 GError("Error reading words file: token too long ('%s') !\n",name);
417 }
418 }
419 if (len>0) {
420 name[len]='\0';
421 xhash.Add(name, new int(1));
422 count++;
423 }
424 return count;
425 }
426
427
428
429 //========================== MAIN ===============================
430 int main(int argc, char **argv) {
431 FILE* f_read=NULL;
432 off_t fdbsize;
433 int ch;
434 char* zfilename;
435 char* fname;
436 char* marker; //record marker
437 int maxkeys=0;
438 int multikey;
439 record_marker[0]='>';
440 record_marker[1]=0;
441 GArgs args(argc, argv, "mn:o:r:z:w:f:s:icvQCaA");
442 int e=args.isError();
443 if (e>0)
444 GError("%s Invalid argument: %s\n", USAGE, argv[e] );
445 if (args.getOpt('v')!=NULL) {
446 printf("%s\n",VERSION);
447 return 0;
448 }
449 fastq = (args.getOpt('Q')!=NULL);
450
451 multikey = (args.getOpt('m')!=NULL);
452 if (multikey) {
453 fields[numFields]=1;
454 numFields++;
455 fields[numFields]=MAX_UINT;
456 numFields++;
457 }
458 caseInsensitive = (args.getOpt('i')!=NULL);
459 acc_only=(args.getOpt('a')!=NULL);
460 acc_mode=(acc_only || args.getOpt('A')!=NULL);
461 compact_plus=(args.getOpt('C')!=NULL || acc_mode);
462 wordJunk = (char *)args.getOpt('s');
463 if (wordJunk==NULL) wordJunk=(char*)defWordJunk;
464 int compact=(args.getOpt('c')!=NULL || compact_plus);
465 if (compact && multikey) {
466 GError("%s Error: invalid flags combination.\n", USAGE);
467 }
468 char* s = (char*)args.getOpt('n');
469 if (s!=NULL) {
470 maxkeys = atoi(s);
471 if (maxkeys<=1 || compact || multikey)
472 GError("%s Error: invalid options (-m, -c/C, -n and -f are exclusive)\n", USAGE);
473 multikey=1;
474 numFields=maxkeys;
475 if (numFields>254) GError(ERR_TOOMANYFIELDS);
476 for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i;
477 }
478 char* argfields = (char*)args.getOpt('f');
479 if (argfields!=NULL) { //parse all the field #s
480 char* pbrk;
481 int prevnum=0;
482 char prevsep='\0';
483 numFields=0;
484 char sep;
485 char *p=argfields;
486 do {
487 pbrk=strpbrk(p,",-");
488 if (pbrk==NULL) {
489 sep='\0';
490 pbrk=p+strlen(p);
491 if (prevsep == '-' && *p=='\0' && prevnum>0) {
492 //open ended range -- ending with '-'
493 fields[numFields]=prevnum;
494 numFields++;
495 if (numFields>253) GError(ERR_TOOMANYFIELDS);
496 fields[numFields]=MAX_UINT;
497 numFields++;
498 //GMessage("--- stored %d, %d\n",prevnum, MAX_UINT);
499 break;
500 }// ending with '-'
501 } // '\0'
502 else { sep=*pbrk; *pbrk = '\0'; }
503 int num = atoi(p);
504 if (num<=0 || num>254 )
505 GError("%s Error: invalid syntax for -f option.\n", USAGE);
506 if (prevsep == '-') { //store a range
507 for (int i=prevnum;i<=num;i++) {
508 fields[numFields]=i;
509 numFields++;
510 if (numFields>254) GError(ERR_TOOMANYFIELDS);
511 }
512 }
513 else if (sep!='-') {
514 fields[numFields]=num;
515 numFields++;
516 if (numFields>254) GError(ERR_TOOMANYFIELDS);
517 }
518
519 prevsep=sep;
520 prevnum=num;
521 p=pbrk+1;
522 } while (sep != '\0'); //range parsing loop
523 if (numFields<=0 || numFields>254 )
524 GError("%s Error at parsing -f option.\n", USAGE);
525 //GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields);
526 qsort(fields, numFields, sizeof(fields[0]), &qcmpInt);
527 multikey=1;
528 /*-- --------debug:
529 for (unsigned int i=0;i<numFields-1;i++) {
530 GMessage("%d,", fields[i]);
531 }
532 GMessage("%d\n",fields[numFields-1]);
533 exit(0); */
534 } //fields
535 if (fastq) {
536 record_marker[0]='@';
537 record_marker_len=1;
538 }
539 if (args.getOpt('r')!=NULL) {//non-FASTA record delimiter?
540 if (fastq) {
541 GMessage("Option -r ignored because -Q was given (-Q sets delimiter to '@')\n");
542 }
543 else {
544 marker=(char*)args.getOpt('r'); //
545 int v=0;
546 if (strlen(marker)>126)
547 GError("Error: the specified record delimiter is too long. "
548 "Maximum accepted is 126\n");
549 //special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves
550 if (strlen(marker)==4 && marker[0]=='\\' || (marker[0]=='0' && (toupper(marker[1])=='X'))) {
551 if (marker[0]=='\\') {
552 marker++;
553 v=strtol(marker, NULL, 8);
554 }
555 else v=strtol(marker, NULL, 16);
556 if (v==0 || v>255)
557 GError("Invalid record delimiter: should be only one character,\n"
558 "'\\NNN' (octal value), '0xNN' (hexadecimal value)");
559 record_marker[0]=v;
560 record_marker_len=1;
561 }
562 else {
563 strcpy(record_marker, marker);
564 record_marker_len=strlen(record_marker);
565 }
566 }
567 }
568 char* stopwords=(char*)args.getOpt('w'); //stop words filename given?
569 if (stopwords!=NULL) {
570 FILE* fstopwords=NULL;
571 if ((fstopwords=fopen(stopwords, "r"))==NULL)
572 GError("Cannot open stop words file '%s'!\n", stopwords);
573 int c=readWords(fstopwords, stopList);
574 GMessage("Loaded %d stop words.\n", c);
575 fclose(fstopwords);
576 useStopWords=(c>0);
577 }
578 if ((zfilename=(char*)args.getOpt('z')) !=NULL) {
579 do_compress=true;
580 #ifndef ENABLE_COMPRESSION
581 GError("Error: compression requested but not enabled when cdbfasta was compiled\n")
582 #endif
583 strcpy(fztmp,zfilename);
584 strcat(fztmp,"_ztmp");
585 zf=fopen(fztmp,"wb");
586 if (zf==NULL)
587 GError("Error creating file '%s'\n'", fztmp);
588 }
589 char* outfile=(char*) args.getOpt('o');
590 int numfiles = args.startNonOpt();
591 if (numfiles==0)
592 GError("%sError: no fasta file given.\n", USAGE);
593 fname=(char*) args.nextNonOpt(); //first fasta file given
594 if (do_compress) { //-------- compression case -------------------
595 if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0)
596 f_read=stdin;
597 else f_read= fopen(fname, "rb");
598 if (f_read == NULL) die_read(fname);
599 fname=zfilename; //forget the input file name, keep the output
600 }
601 else {//
602 int fdread= open(fname, O_RDONLY|O_BINARY);
603 if (fdread == -1) die_read(fname);
604 struct stat dbstat;
605 fstat(fdread, &dbstat);
606 fdbsize=dbstat.st_size;
607 close(fdread);
608 f_read= fopen(fname, "rb");
609 if (f_read == NULL) die_read(fname);
610 }
611
612 char idxfile[365];
613 if (outfile==NULL) {
614 if (do_compress) {
615 strcpy(ftmp, zfilename);
616 strcat(ftmp, ".cidx");
617 strcpy(idxfile, ftmp);
618 strcat(ftmp, "_tmp");
619 }
620 else {
621 strcpy(ftmp, fname);
622 strcat(ftmp, ".cidx");
623 strcpy(idxfile, ftmp);
624 strcat(ftmp, "_tmp");
625 }
626 //should add the process ID, time and user to make this unique?
627 }
628 else {
629 strcpy(ftmp, outfile);
630 strcpy(idxfile, outfile);
631 strcat(ftmp, "_tmp");
632 }
633
634 cdbidx=new GCdbWrite(ftmp); //test if this was successful?
635
636 if (compact)
637 addKeyFunc=&addKeyCompact;
638 else if (multikey)
639 addKeyFunc = &addKeyMulti;
640 else addKeyFunc = &addKey;
641
642 off_t recpos=0;
643 off_t r=0;
644 unsigned int recsize=0;
645 char* key=NULL;
646 bool fullDefline=(multikey || compact_plus);
647 GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE);
648 if (do_compress) { //---------------- compression case -------------
649 if (fastq) GError("Error: sorry, compression is not supported with fastq format\n");
650 fdbsize=0;
651 GCdbz cdbz(zf); // zlib interface
652 recpos=cdbz.getZRecPos();
653 while ((key=cdbz.compress(readbuf,record_marker))!=NULL) {
654 recsize=cdbz.getZRecSize();
655 if (!fullDefline) {
656 //find first space after the record_marker and place a '\0' there
657 for (int i=record_marker_len; key[i]!='\0';i++) {
658 if (isspace(key[i])) { key[i]='\0';break; }
659 }
660 }
661 addKeyFunc(key, recpos, recsize);
662 recpos = cdbz.getZRecPos();
663 }
664 remove(zfilename);
665 cdbz.compress_end();
666 fclose(zf);
667 //determine the size of this file:
668 int ftmp= open(fztmp, O_RDONLY|O_BINARY);
669 if (ftmp == -1) die_read(fztmp);
670 struct stat dbstat;
671 fstat(ftmp, &dbstat);
672 fdbsize=dbstat.st_size;
673 //rename it to the intended file name
674 if (rename(fztmp,zfilename) != 0) {
675 GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename);
676 perror("rename");
677 }
678 } //compression requested
679 else { // not compressed -- buffered file access
680 bool defline=false;
681 int kbufsize=KBUFSIZE;
682 if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case
683 else { GMALLOC(key, 1024); }
684 int kidx=-1;
685 num_recs=0;
686 num_keys=0;
687 char lastchar=0;
688 //first iteration -- for the beginning of file case
689 if (readbuf->peekCmp(record_marker, record_marker_len)==0) {
690 //new record start found (defline)
691 recpos=readbuf->getPos(); //new record pos
692 defline=true; //we're in defline
693 readbuf->skip(record_marker_len);
694 kidx=0;
695 }//new record start
696 int linecounter=0;
697 bool checkNewRec=true;
698 while ((ch=readbuf->getch())>0) {
699 if (defline && kidx>=0) { //on the defline here, still parsing possible keys
700 key[kidx]=(char)ch;
701 kidx++;
702 if (kidx>=kbufsize) {
703 kbufsize+=KBUFSIZE;
704 GREALLOC(key, kbufsize);
705 }
706 if (((isspace(ch) || ch<31) && fullDefline==false) || ch=='\n' || ch=='\r') {
707 //end key here, don't care about the rest
708
709 key[kidx-1]='\0';
710 kidx=-1;
711 }
712 }
713 if (ch=='\n') { // newline!
714 //TODO: should this be '\r' on MacOS ?!
715 linecounter++;
716 //check ahead if a record delimiter follows
717 checkNewRec = (!fastq || linecounter>3);
718 if (checkNewRec && readbuf->peekCmp(record_marker, record_marker_len)==0) {
719 //new record start (defline)
720 recsize = readbuf->getPos()-recpos-1; //previous recsize
721 if (recsize>off_t(record_marker_len+1) && key[0]!='\0') {
722 //add previous record, if there
723 addKeyFunc(key, recpos, recsize);
724 linecounter=0;
725 //GMessage("adding key=%s\n",key);
726 }
727 recpos=readbuf->getPos(); //new record pos
728 defline=true; //we're in defline
729 readbuf->skip(record_marker_len);
730 //if (r<0) die_readformat(fname);
731 kidx=0;
732 } //new record start
733 else { //after newline but not a new record start
734 if (defline) { //we just finished a defline
735 if (fullDefline) { //close the defline string
736 if (kidx>0) key[kidx-1]='\0';
737 kidx=-1;
738 }
739 defline=false;
740 }
741 } //after newline but not a new record
742 } // was newline
743 lastchar=ch;
744 }//while char
745 recsize=readbuf->getPos()-recpos;
746 if (recsize>0) {//add last record, if there
747 if (lastchar=='\n') recsize--;
748 if (fullDefline && kidx>0) {//close the defline string
749 if (lastchar!='\n') kidx++;
750 key[kidx-1]='\0';
751 }
752 addKeyFunc(key, recpos, recsize);
753 linecounter=0;
754 //GMessage("adding key=%s\n",key);
755 }
756 delete readbuf;
757 }
758 if (f_read!=stdin) fclose(f_read);
759 if (cdbidx->finish() == -1) die_write("");
760
761 // === add some statistics at the end of the cdb index file!
762 r=lseek(cdbidx->getfd(), 0, SEEK_END);
763 cdbInfo info;
764 memcpy((void*)info.tag, (void*)"CDBX", 4);
765 info.idxflags=0;
766 if (multikey) info.idxflags |= CDBMSK_OPT_MULTI;
767 if (do_compress) {
768 info.idxflags |= CDBMSK_OPT_COMPRESS;
769 GMessage("Input data were compressed into file '%s'\n",fname);
770 }
771 if (compact) {
772 if (compact_plus)
773 info.idxflags |= CDBMSK_OPT_CADD;
774 else
775 info.idxflags |= CDBMSK_OPT_C;
776 }
777 info.num_records=gcvt_uint(&num_recs);
778 info.num_keys=gcvt_uint(&num_keys);
779 info.dbsize=gcvt_offt(&fdbsize);
780 info.idxflags=gcvt_uint(&info.idxflags);
781 int nlen=strlen(fname);
782 info.dbnamelen=gcvt_uint(&nlen);
783 r=write(cdbidx->getfd(), fname, nlen);
784 if (r!=nlen)
785 GError(ERR_W_DBSTAT);
786 r=write(cdbidx->getfd(), &info, cdbInfoSIZE);
787 if (r!=cdbInfoSIZE)
788 GError(ERR_W_DBSTAT);
789 delete cdbidx;
790 GFREE(key);
791 remove(idxfile);
792 if (rename(ftmp,idxfile) == -1)
793 GError("Error: unable to rename %s to %s",ftmp,idxfile);
794 GMessage("%d entries from file %s were indexed in file %s\n",
795 num_recs, fname, idxfile);
796 return 0;
797 }