ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cdbfasta/cdbfasta.cpp
Revision: 15
Committed: Mon Jul 18 20:53:45 2011 UTC (11 years, 2 months ago) by gpertea
File size: 33280 byte(s)
Log Message:
sync with local src

Line File contents
1 #include <ctype.h>
2 #include <fcntl.h>
3 #include <string.h>
4 #include <sys/stat.h>
5 #include "GBase.h"
6 #include "GArgs.h"
7 #include "GHash.hh"
8 #include "gcdb.h"
9 #ifdef ENABLE_COMPRESSION
10 #include "gcdbz.h"
11 #endif
12 #ifdef __WIN32__
13 #define VERSION "cdbfasta version 0.995w"
14 #else
15 #define VERSION "cdbfasta version 0.995"
16 #endif
17
18 #define USAGE "Usage:\n\
19 cdbfasta <fastafile> [-o <index_file>] [-r <record_delimiter>]\n\
20 [-z <compressed_db>] [-i] [-m|-n <numkeys>|-f<LIST>]|-c|-C]\n\
21 [-w <stopwords_list>] [-s <stripendchars>] [{-Q|-G}] [-v]\n\
22 \n\
23 Creates an index file for records from a multi-fasta file.\n\
24 By default (without -m/-n/-c/-C option), only the first \n\
25 space-delimited token from the defline is used as a key.\n\
26 \n\
27 <fastafile> is the multi-fasta file to index; \n\
28 -o the index file will be named <index_file>; if not given,\n\
29 the index filename is database name plus the suffix '.cidx'\n\
30 -r <record_delimiter> a string of characters at the beginning of line\n\
31 marking the start of a record (default: '>')\n\
32 -Q treat input as fastq format, i.e. with '@' as record delimiter\n\
33 and with records expected to have at least 4 lines\n\
34 -z database is compressed into the file <compressed_db>\n\
35 before indexing (<fastafile> can be \"-\" or \"stdin\" \n\
36 in order to get the input records from stdin)\n\
37 -s strip extraneous characters from *around* the space delimited\n\
38 tokens, for the multikey options below (-m,-n,-f);\n\
39 Default <stripendchars> set is: '\",`.(){}/[]!:;~|><+-\n\
40 -m (\"multi-key\" option) create hash entries pointing to \n\
41 the same record for all tokens found in\n\
42 the defline\n\
43 -n <numkeys> same as -m, but only takes the first <numkeys>\n\
44 tokens from the defline\n\
45 -f indexes *space* delimited tokens (fields) in the defline as given\n\
46 by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\
47 -w <stopwordslist> exclude from indexing all the words found\n\
48 in the file <stopwordslist> (for options -m, -n and -k)\n\
49 -i do case insensitive indexing (i.e. create additional keys for \n\
50 all-lowercase tokens used for indexing from the defline \n\
51 -c for deflines in the format: db1|accession1|db2|accession2|...,\n\
52 only the first db-accession pair ('db1|accession1') is taken as key\n\
53 -C like -c, but also subsequent db|accession constructs are indexed,\n\
54 along with the full (default) token; additionally,\n\
55 all nrdb concatenated accessions found in the defline \n\
56 are parsed and stored (assuming 0x01 or '^|^' as separators)\n\
57 -a accession mode: like -C option, but indexes the 'accession'\n\
58 part for all 'db|accession' constructs found\n\
59 -A like -a and -C together (both accessions and 'db|accession'\n\
60 constructs are used as keys\n\
61 -D index each pipe ('|') delimited token found in the record identifier\n\
62 (e.g. >key1|key2|key3|.. )\n\
63 -d same as -D but using a custom key delimiter <kdelim> instead of the pipe\n\
64 character '|'\n\
65 -G FASTA records are treated as large genomic sequences (e.g. full \n\
66 chromosomes/contigs) and their formatting is checked for suitability\n\
67 for fast range queries (i.e. uniform line length within each record)\n\
68 -v show program version and exit\n"
69
70 /*
71 */
72
73 #define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n"
74 //16K initial defline buffer
75 #define KBUFSIZE 0x4000
76 #ifndef O_BINARY
77 #define O_BINARY 0x0000
78 #endif
79 #define MAX_KEYLEN 1024
80 //64K input buffer
81 #define GREADBUF_SIZE 0x010000
82
83
84 typedef void (*addFuncType)(char*, off_t, uint32);
85
86 char ftmp[365];
87 char fztmp[365];
88 char record_marker[127]; //record delimiter
89 int record_marker_len=1;
90 int num_recs;
91 int num_keys;
92
93 off_t last_cdbfpos=0;
94
95 int compact_plus; //shortcut key and
96 bool acc_mode=false;
97 bool acc_only=false;
98 bool do_compress=false; // compression used
99 bool fastq=false;
100 bool gFastaSeq=false;
101 char keyDelim=0;
102
103 FILE* zf=NULL; //compressed file handle
104
105 //store just offset and record length
106 const char* defWordJunk="'\",`.(){}/[]!:;~|><+-";
107 char* wordJunk=NULL;
108 bool caseInsensitive=false; //case insensitive storage
109 bool useStopWords=false;
110 unsigned int numFields=0;
111 // we have fields[numFields-1]=MAX_UINT as defined in gcdb.h --
112 // as an indicator of taking every single token in the defline,
113 // or for open ended ranges (e.g. -f5- )
114 unsigned int fields[255]; //array of numFields field indices (1-based)
115 GHash<int> stopList;
116 //static int datalen=sizeof(uint32)+sizeof(off_t);
117
118 char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key
119
120 GCdbWrite* cdbidx;
121 addFuncType addKeyFunc;
122
123 #define ERR_W_DBSTAT "Error writing the database statististics!\n"
124
125 void die_write(const char* fname) {
126 GError("Error: cdbhash was unable to write into file %s\n",
127 fname);
128
129 }
130
131 void die_read(const char* infile) {
132 GError("Error: cdbhash was unable to read the input file %s\n",infile);
133 }
134
135
136 void die_readformat(const char* infile) {
137 GError("Error: bad format for file %s; is it a fastA file?\n",
138 infile);
139 }
140
141 void die_gseqformat(const char* seqname) {
142 GError("Error: invalid FASTA sequence format for %s (not uniform line length)\n",
143 seqname);
144 }
145
146 void die_fastqformat(const char* seqid, int seqlen, int qvlen) {
147 GError("Error: invalid FASTQ sequence format for %s (seqlen=%d, qvlen=%d)\n",
148 seqid);
149 }
150
151
152 bool add_cdbkey(char* key, off_t fpos, uint32 reclen, int16_t linelen=0, byte elen=0) {
153
154 unsigned int klen=strlen(key);
155 if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true;
156 if (klen<1) {
157 /* GMessage("Warning: zero length key found following key '%s'\n",
158 lastKey);
159 */
160 return false;
161 }
162 //------------ adding record -----------------
163 num_keys++;
164 strncpy(lastKey, key, MAX_KEYLEN-1);
165 lastKey[MAX_KEYLEN-1]='\0';
166 if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset
167 uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32
168 /*
169 if (linelen>0) {
170 //record with line len
171 CIdxSeqData recdata;
172 recdata.fpos=gcvt_offt(&v);
173 recdata.reclen=gcvt_uint(&reclen);
174 recdata.linelen=gcvt_int16(&linelen);
175 recdata.elen=elen;
176 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxSeqDataSIZE)==-1)
177 GError("Error adding cdb record with key '%s'\n",key);
178 }
179
180 else { //plain record */
181 CIdxData recdata;
182 recdata.fpos=gcvt_offt(&v);
183 recdata.reclen=gcvt_uint(&reclen);
184 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1)
185 GError("Error adding cdb record with key '%s'\n",key);
186 //}
187 }
188 else {//32 bit file offset is enough
189 /*
190 if (linelen>0) {
191 CIdxSeqData32 recdata;
192 uint32 v=(uint32) fpos;
193 recdata.fpos=gcvt_uint(&v);
194 recdata.reclen=gcvt_uint(&reclen);
195 recdata.linelen=gcvt_int16(&linelen);
196 recdata.elen=elen;
197 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
198 GError("Error adding cdb record with key '%s'\n",key);
199 }
200 else { */
201 CIdxData32 recdata;
202 uint32 v=(uint32) fpos;
203 recdata.fpos=gcvt_uint(&v);
204 recdata.reclen=gcvt_uint(&reclen);
205 if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
206 GError("Error adding cdb record with key '%s'\n",key);
207 //}
208 }
209 last_cdbfpos=fpos;
210 return true;
211 }
212
213 //default indexing: key directly passed --
214 // as the first space delimited token
215 void addKey(char* key, off_t fpos,
216 uint32 reclen) {
217 num_recs++;
218 add_cdbkey(key, fpos, reclen);
219 if (caseInsensitive) {
220 char* lckey=loCase(key);
221 if (strcmp(lckey, key)!=0)
222 add_cdbkey(lckey, fpos, reclen);
223 GFREE(lckey);
224 }
225 }
226
227 //the whole defline is passed
228 void addKeyMulti(char* defline,
229 off_t fpos, uint32 reclen) {
230 char* p=defline;
231 unsigned int fieldno=0;
232 char* pn;
233 num_recs++;
234 bool stillParsing=true;
235 unsigned int fidx=0; //index in fields[] array
236 while (stillParsing) {
237 while ((*p)==' ' || (*p)=='\t') p++;
238 if (*p == '\0') break;
239 //skip any extraneous characters at the beginning of the token
240 while (chrInStr(*p, wordJunk)) p++;
241 //skip any padding spaces or other extraneous characters
242 //at the beginning of the word
243 if (*p == '\0') break;
244 pn=p;
245 while (*pn!='\0' && !isspace(*pn)) pn++;
246 //found next space or end of string
247 fieldno++;
248 while (fields[fidx]<fieldno && fidx<numFields-1) fidx++;
249 //GMessage("> p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields,
250 // fieldno, fields[numFields-1], pn+1);
251 stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1]));
252 char* pend = pn-1; //pend is on the last non-space in the current token
253 //--strip the ending junk, if any
254 while (chrInStr(*pend, wordJunk) && pend>p) pend--;
255 if (pend<pn-1) *(pend+1)='\0';
256 else *pn='\0';
257
258 if (strlen(p)>0) {
259 if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) {
260 if (useStopWords && stopList.hasKey(p)) {
261 p=pn+1;
262 continue;
263 }
264 //--- store this key with the same current record data:
265 add_cdbkey(p, fpos, reclen);
266 //---storage code ends here
267 if (caseInsensitive) {
268 char* lcp=loCase(p);
269 if (strcmp(lcp,p)!=0)
270 add_cdbkey(lcp, fpos, reclen);
271 GFREE(lcp);
272 }
273 }
274 //if (isEnd) break; //if all the token were stored
275 }
276 p=pn+1;//p is now on the next token's start
277 } //token parsing loop
278 }
279
280
281 int qcmpInt(const void *p1, const void *p2) {
282 //int n1=*((int*)p1);
283 //int n2=*((int*)p2);
284 const unsigned int *a = (unsigned int *) p1;
285 const unsigned int *b = (unsigned int *) p2;
286 if (*a < *b) return -1;
287 else if (*a > *b) return 1;
288 else return 0;
289 }
290
291
292 char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) {
293 if (pstart==NULL || *pstart==0) return NULL;
294 bool hasDigits=false;
295 char* pend=pstart;
296 end_acc=NULL; //end of first accession
297 accst=NULL;
298 while (*pstart=='|') pstart++;
299 for(char* p=pstart;;p++) {
300 if (hasDigits==false && *p>='0' && *p<='9') hasDigits=true;
301 /* if (*p==0) { //end of seq_name
302 pend=p; //doesn't matter if it's accession or "db"-like
303 break;
304 }*/
305 if (*p=='|' || *p==0) {
306 int curlen=p-pstart;
307 if (*p==0 || (hasDigits && curlen>3) || curlen>7 || accst!=NULL) {//previous token was an accession
308 pend=p; //advance pend
309 if (end_acc==NULL) end_acc=p;
310 if (accst==NULL) accst=pstart;
311 break;
312 }
313 else { //first pipe char or no digits
314 accst=p+1;
315 }
316 hasDigits=false;//reset flag
317 } // | separator
318 }
319 if (pend!=pstart) return pend;
320 else return NULL;
321 }
322
323
324
325 char* nextKeyDelim(char* pstart, char* pmax) {
326 if (pstart==NULL || pstart>=pmax) return NULL;
327 char* p=pstart;
328 while (*p!=keyDelim && p!=pmax && *p!=0) p++;
329 return p;
330 }
331
332 char* endSpToken(char* str) {
333 if (str==NULL) return NULL;
334 char* p=str;
335 //while (*p!=' ' && *p!='\t' && *p!='\v' && *p!=0) p++;
336 while (!isspace(*p) && *p!=0) p++;
337 *p=0;
338 return p;
339 }
340
341 #define NRDB_CHARSEP "\1\2\3\4"
342 #define NRDB_STRSEP "^|^"
343
344 //nrdbp is positioned at the end of current nrdb concatenated
345 //defline, or NULL if there is no more
346 inline void NRDB_Rec(char* &nrdbp, char* defline) {
347 nrdbp=strpbrk(defline, NRDB_CHARSEP);
348 if (nrdbp==NULL) {
349 nrdbp=strstr(defline, NRDB_STRSEP);
350 if (nrdbp!=NULL) {
351 *nrdbp='\0';
352 nrdbp+=2;
353 }
354 }
355 else *nrdbp='\0';
356 }
357
358 //-c/-C/-a/-A indexing: key up to the first space or second '|'
359 //receives the full defline
360 void addKeyCompact(char* defline,
361 off_t fpos, uint32 reclen) {
362 //we got the first token found on the defline
363 num_recs++;
364 char* nrdb_end;
365 //breaks defline at the next nrdb concatenation point
366 NRDB_Rec(nrdb_end, defline);
367 //isolate the first token in this nrdb concatenation
368 char* token_end=endSpToken(defline); //this puts 0 at the first space encountered
369 if (!compact_plus) { //shortcut key
370 //only the first db|accession construct will be indexed, if found
371 char* end_acc1=NULL; //end of first accession
372 char* acc1st=NULL;
373 char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st);
374 if (end_acc1!=NULL) { //has acceptable shortcut
375 *end_acc1=0;
376 add_cdbkey(defline, fpos, reclen);
377 return;
378 }
379 if (dbacc_end!=NULL) {
380 *dbacc_end=0;
381 add_cdbkey(defline, fpos, reclen);
382 return;
383 }
384 //store this whole non-space token as key:
385 add_cdbkey(defline, fpos, reclen);
386 return;
387 }
388 //from now on only -C/-a/-A treatment:
389 for(;;) {
390 //defline is on the first token
391 if (strlen(defline)>0) //add whole non-space token as the "full key"
392 add_cdbkey(defline, fpos, reclen);
393 //add the db|accession constructs as keys
394 char* dbacc_start=defline;
395 char* firstacc_end=NULL;
396 char* accst=NULL;
397 char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
398 while (dbacc_end!=NULL) {
399 if (firstacc_end!=NULL && firstacc_end<dbacc_end) {
400 char c=*firstacc_end;
401 *firstacc_end=0;
402 if (!acc_only)
403 add_cdbkey(dbacc_start, fpos, reclen);
404 if (acc_mode) {
405 add_cdbkey(accst, fpos, reclen);
406 }
407 *firstacc_end=c;
408 }
409 if (dbacc_start==defline && dbacc_end==token_end) {
410 if (acc_mode && accst!=NULL)
411 add_cdbkey(accst, fpos, reclen);
412 break; //the whole seq_name was only one db entry
413 }
414 *dbacc_end=0; //end key here
415 if (!acc_only)
416 add_cdbkey(dbacc_start, fpos, reclen);
417 if (acc_mode)
418 add_cdbkey(accst, fpos, reclen);
419 if (dbacc_end==token_end)
420 break; //reached the end of this whole seq_name (non-space token)
421 dbacc_start=dbacc_end+1;
422 firstacc_end=NULL;
423 dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
424 }
425 // -- get to next concatenated defline, if any:
426 if (compact_plus && nrdb_end!=NULL) {
427 defline=nrdb_end+1; //look for the next nrdb concatenation
428 NRDB_Rec(nrdb_end, defline);
429 //isolate the first token in this nrdb record
430 token_end=endSpToken(defline);
431 }
432 else break;
433 } //for
434 }
435
436 void addKeyDelim(char* defline,
437 off_t fpos, uint32 reclen) {
438 //we got the first token found on the defline
439 num_recs++;
440 char* nrdb_end;
441 //breaks defline at the next nrdb concatenation point
442 NRDB_Rec(nrdb_end, defline);
443 //isolate the first space-delimited token in this nrdb concatenation
444 char* token_end=endSpToken(defline);
445 for(;;) {
446 //defline is on the first token
447 if (strlen(defline)==0) break;
448 //add the db|accession constructs as keys
449 char* k_start=defline;
450 char* k_end=NULL;
451 while ((k_end=nextKeyDelim(k_start, token_end))!=NULL) {
452 *k_end=0;
453 add_cdbkey(k_start, fpos, reclen);
454 k_start=k_end+1;
455 }
456 // -- get to next concatenated defline, if any:
457 if (nrdb_end!=NULL) {
458 defline=nrdb_end+1; //look for the next nrdb concatenation
459 NRDB_Rec(nrdb_end, defline);
460 //isolate the first space-delimited token in this nrdb record
461 token_end=endSpToken(defline);
462 }
463 else break;
464 } //for
465 }
466
467 int readWords(FILE* f, GHash<int>& xhash) {
468 int c;
469 int count=0;
470 char name[256];
471 int len=0;
472 while ((c=getc(f))!=EOF) {
473 if (isspace(c) || c==',' || c==';') {
474 if (len>0) {
475 name[len]='\0';
476 xhash.Add(name, new int(1));
477 count++;
478 len=0;
479 }
480 continue;
481 }
482 //a non-space
483 name[len]=(char) c;
484 len++;
485 if (len>255) {
486 name[len-1]='\0';
487 GError("Error reading words file: token too long ('%s') !\n",name);
488 }
489 }
490 if (len>0) {
491 name[len]='\0';
492 xhash.Add(name, new int(1));
493 count++;
494 }
495 return count;
496 }
497
498
499
500 //========================== MAIN ===============================
501 int main(int argc, char **argv) {
502 FILE* f_read=NULL;
503 off_t fdbsize;
504 int ch;
505 char* zfilename;
506 char* fname;
507 char* marker; //record marker
508 int maxkeys=0;
509 int multikey=0;
510 record_marker[0]='>';
511 record_marker[1]=0;
512 GArgs args(argc, argv, "icvDQCaAmn:o:r:z:w:f:s:d:");
513 int e=args.isError();
514 if (e>0)
515 GError("%s Invalid argument: %s\n", USAGE, argv[e] );
516 if (args.getOpt('v')!=NULL) {
517 printf("%s\n",VERSION);
518 return 0;
519 }
520 fastq = (args.getOpt('Q')!=NULL);
521 gFastaSeq=(args.getOpt('G')!=NULL);
522 if (fastq && gFastaSeq)
523 GError("Error: options -Q and -G are mutually exclusive.\n");
524 if (args.getOpt('D')) keyDelim='|';
525 else if (args.getOpt('d')!=NULL) keyDelim=args.getOpt('d')[0];
526 multikey = (args.getOpt('m')!=NULL);
527 if (multikey) {
528 fields[numFields]=1;
529 numFields++;
530 fields[numFields]=MAX_UINT;
531 numFields++;
532 }
533 caseInsensitive = (args.getOpt('i')!=NULL);
534 acc_only=(args.getOpt('a')!=NULL);
535 acc_mode=(acc_only || args.getOpt('A')!=NULL);
536 compact_plus=(args.getOpt('C')!=NULL || acc_mode);
537 wordJunk = (char *)args.getOpt('s');
538 if (wordJunk==NULL) wordJunk=(char*)defWordJunk;
539 int compact=(args.getOpt('c')!=NULL || compact_plus);
540 if ((compact && multikey) || (multikey && keyDelim)) {
541 GError("%s Error: invalid flag combination.\n", USAGE);
542 }
543 char* argfields = (char*)args.getOpt('f');
544 char* s = (char*)args.getOpt('n');
545 if (s!=NULL) {
546 maxkeys = atoi(s);
547 if (maxkeys>1 && (compact || multikey || keyDelim || argfields!=NULL))
548 GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE);
549 multikey=1;
550 numFields=maxkeys;
551 if (numFields>254) GError(ERR_TOOMANYFIELDS);
552 for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i;
553 }
554
555 if (argfields!=NULL) { //parse all the field #s
556 if (multikey || keyDelim || compact)
557 GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE);
558 char* pbrk;
559 int prevnum=0;
560 char prevsep='\0';
561 numFields=0;
562 char sep;
563 char *p=argfields;
564 do {
565 pbrk=strpbrk(p,",-");
566 if (pbrk==NULL) {
567 sep='\0';
568 pbrk=p+strlen(p);
569 if (prevsep == '-' && *p=='\0' && prevnum>0) {
570 //open ended range -- ending with '-'
571 fields[numFields]=prevnum;
572 numFields++;
573 if (numFields>253) GError(ERR_TOOMANYFIELDS);
574 fields[numFields]=MAX_UINT;
575 numFields++;
576 //GMessage("--- stored %d, %d\n",prevnum, MAX_UINT);
577 break;
578 }// ending with '-'
579 } // '\0'
580 else { sep=*pbrk; *pbrk = '\0'; }
581 int num = atoi(p);
582 if (num<=0 || num>254 )
583 GError("%s Error: invalid syntax for -f option.\n", USAGE);
584 if (prevsep == '-') { //store a range
585 for (int i=prevnum;i<=num;i++) {
586 fields[numFields]=i;
587 numFields++;
588 if (numFields>254) GError(ERR_TOOMANYFIELDS);
589 }
590 }
591 else if (sep!='-') {
592 fields[numFields]=num;
593 numFields++;
594 if (numFields>254) GError(ERR_TOOMANYFIELDS);
595 }
596
597 prevsep=sep;
598 prevnum=num;
599 p=pbrk+1;
600 } while (sep != '\0'); //range parsing loop
601 if (numFields<=0 || numFields>254 )
602 GError("%s Error at parsing -f option.\n", USAGE);
603 //GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields);
604 qsort(fields, numFields, sizeof(fields[0]), &qcmpInt);
605 multikey=1;
606 /*-- --------debug:
607 for (unsigned int i=0;i<numFields-1;i++) {
608 GMessage("%d,", fields[i]);
609 }
610 GMessage("%d\n",fields[numFields-1]);
611 exit(0); */
612 } //fields
613 if (fastq) {
614 record_marker[0]='@';
615 record_marker_len=1;
616 }
617 if (args.getOpt('r')!=NULL) {//non-FASTA record delimiter?
618 if (fastq) {
619 GMessage("Option -r ignored because -Q was given (-Q sets delimiter to '@')\n");
620 }
621 else {
622 marker=(char*)args.getOpt('r'); //
623 int v=0;
624 if (strlen(marker)>126)
625 GError("Error: the specified record delimiter is too long. "
626 "Maximum accepted is 126\n");
627 //special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves
628 if (strlen(marker)==4 && (marker[0]=='\\' || (marker[0]=='0' && toupper(marker[1])=='X') )) {
629 if (marker[0]=='\\') {
630 marker++;
631 v=strtol(marker, NULL, 8);
632 }
633 else v=strtol(marker, NULL, 16);
634 if (v==0 || v>255)
635 GError("Invalid record delimiter: should be only one character,\n"
636 "'\\NNN' (octal value), '0xNN' (hexadecimal value)");
637 record_marker[0]=v;
638 record_marker_len=1;
639 }
640 else {
641 strcpy(record_marker, marker);
642 record_marker_len=strlen(record_marker);
643 }
644 }
645 }
646 char* stopwords=(char*)args.getOpt('w'); //stop words filename given?
647 if (stopwords!=NULL) {
648 FILE* fstopwords=NULL;
649 if ((fstopwords=fopen(stopwords, "r"))==NULL)
650 GError("Cannot open stop words file '%s'!\n", stopwords);
651 int c=readWords(fstopwords, stopList);
652 GMessage("Loaded %d stop words.\n", c);
653 fclose(fstopwords);
654 useStopWords=(c>0);
655 }
656 if ((zfilename=(char*)args.getOpt('z')) !=NULL) {
657 do_compress=true;
658 #ifndef ENABLE_COMPRESSION
659 GError("Error: compression requested but not enabled when cdbfasta was compiled\n")
660 #endif
661 strcpy(fztmp,zfilename);
662 strcat(fztmp,"_ztmp");
663 zf=fopen(fztmp,"wb");
664 if (zf==NULL)
665 GError("Error creating file '%s'\n'", fztmp);
666 }
667 char* outfile=(char*) args.getOpt('o');
668 int numfiles = args.startNonOpt();
669 if (numfiles==0)
670 GError("%sError: no fasta file given.\n", USAGE);
671 fname=(char*) args.nextNonOpt(); //first fasta file given
672 if (do_compress) { //-------- compression case -------------------
673 if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0)
674 f_read=stdin;
675 else f_read= fopen(fname, "rb");
676 if (f_read == NULL) die_read(fname);
677 fname=zfilename; //forget the input file name, keep the output
678 }
679 else {//
680 int fdread= open(fname, O_RDONLY|O_BINARY);
681 if (fdread == -1) die_read(fname);
682 struct stat dbstat;
683 fstat(fdread, &dbstat);
684 fdbsize=dbstat.st_size;
685 close(fdread);
686 f_read= fopen(fname, "rb");
687 if (f_read == NULL) die_read(fname);
688 }
689
690 char idxfile[365];
691 if (outfile==NULL) {
692 if (do_compress) {
693 strcpy(ftmp, zfilename);
694 strcat(ftmp, ".cidx");
695 strcpy(idxfile, ftmp);
696 strcat(ftmp, "_tmp");
697 }
698 else {
699 strcpy(ftmp, fname);
700 strcat(ftmp, ".cidx");
701 strcpy(idxfile, ftmp);
702 strcat(ftmp, "_tmp");
703 }
704 //should add the process ID, time and user to make this unique?
705 }
706 else {
707 strcpy(ftmp, outfile);
708 strcpy(idxfile, outfile);
709 strcat(ftmp, "_tmp");
710 }
711
712 cdbidx=new GCdbWrite(ftmp); //test if this was successful?
713 if (keyDelim>0) {
714 addKeyFunc=&addKeyDelim;
715 }
716 else {
717 if (compact)
718 addKeyFunc=&addKeyCompact;
719 else if (multikey)
720 addKeyFunc = &addKeyMulti;
721 else addKeyFunc = &addKey;
722 }
723 off_t recpos=0;
724 off_t r=0;
725 unsigned int recsize=0;
726 char* key=NULL;
727 bool fullDefline=(multikey || compact_plus);
728 GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE);
729 if (do_compress) { //---------------- compression case -------------
730 if (fastq) GError("Error: sorry, compression is not supported with fastq format\n");
731 //TODO: use a clever block compression/indexing scheme with virtual offsets, like bgzf
732 // -- this should take care of the fastq compression
733 fdbsize=0;
734 GCdbz cdbz(zf); // zlib interface
735 recpos=cdbz.getZRecPos();
736 while ((key=cdbz.compress(readbuf,record_marker))!=NULL) {
737 recsize=cdbz.getZRecSize();
738 if (!fullDefline) {
739 //find first space after the record_marker and place a '\0' there
740 for (int i=record_marker_len; key[i]!='\0';i++) {
741 if (isspace(key[i])) { key[i]='\0';break; }
742 }
743 }
744 addKeyFunc(key, recpos, recsize);
745 recpos = cdbz.getZRecPos();
746 }
747 remove(zfilename);
748 cdbz.compress_end();
749 fclose(zf);
750 //determine the size of this file:
751 int ftmp= open(fztmp, O_RDONLY|O_BINARY);
752 if (ftmp == -1) die_read(fztmp);
753 struct stat dbstat;
754 fstat(ftmp, &dbstat);
755 fdbsize=dbstat.st_size;
756 //rename it to the intended file name
757 if (rename(fztmp,zfilename) != 0) {
758 GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename);
759 perror("rename");
760 }
761 } //compression requested
762 else { // not compressed -- plain (buffered) file access
763 bool defline=false;
764 bool seen_defline=false;
765 int kbufsize=KBUFSIZE;
766 if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case
767 else { GMALLOC(key, 1024); }
768 key[0]=0; //no keys have been parsed yet
769 int kidx=-1;
770 num_recs=0;
771 num_keys=0;
772 char prevch=0;
773 //first iteration -- for the beginning of file
774 if (readbuf->peekCmp(record_marker, record_marker_len)==0) {
775 //new record start found (defline)
776 recpos=readbuf->getPos(); //new record pos
777 defline=true; //we're in defline
778 seen_defline=true;
779 readbuf->skip(record_marker_len);
780 kidx=0;
781 }//new record start
782 int linecounter=0; //number of non-header lines for current record
783 int fq_recloc=(fastq) ? 0 : 4; //0=seq header line, 1=sequence string, 2=q-header line, 3=qvstring, 4=not fastq
784 int fq_lendata[4]={0,0,0,0}; //keep track of fastq seq len, qv len
785 int last_linelen=0; //length of last non-header line in a FASTA record
786 int first_linelen=0; //length of the first non-header line in a FASTA record
787 int last_eol_len=0;
788 bool mustbeLastLine=false;
789 bool wasEOL=false;
790 while ((ch=readbuf->getch())>0) {
791 bool isEOL=false;
792 if (ch=='\n' || ch=='\r') {
793 isEOL=true;
794 last_eol_len=1;
795 }
796 if (wasEOL && isEOL) {
797 //skipped double-char EoL (DOS or empty lines)
798 if (prevch=='\n' && ch=='\r') last_eol_len=2;
799 prevch=ch;
800 wasEOL=isEOL;
801 continue;
802 }
803 if (defline) {
804 // ===> in the header line (including its EOL)
805 if (isEOL) {
806 // --> end of header line
807 if (kidx>=0) { key[kidx]=0;kidx=-1; }
808 defline=false;
809 last_linelen=0;
810 first_linelen=0;
811 linecounter=0;
812 fq_recloc= (fastq) ? 0 : 4;
813 mustbeLastLine=false;
814 memset((void*)fq_lendata, 0, 4*sizeof(int));
815 prevch=ch;
816 wasEOL=isEOL;
817 continue;
818 }
819 else {
820 // --> within header, before EOL
821 if (kidx>=0) { //still parsing keys
822 if ((isspace(ch) || ch<31)
823 && fullDefline==false) {
824 key[kidx]=0;
825 kidx=-1;
826 wasEOL=false;
827 prevch=ch;
828 continue;
829 }
830 key[kidx]=(char)ch;
831 kidx++;
832 if (kidx>=kbufsize) {
833 kbufsize+=KBUFSIZE;
834 GREALLOC(key, kbufsize);
835 }
836 } //still parsing keys
837 } // <-- before EoL
838 } // <=== in the header line
839 else {
840 // ===> not in header line (could be "sequence block", or before first defline
841 if (isEOL) {
842 // <-- at EoL in "sequence" block (record body)
843 if (seen_defline) {
844 linecounter++; //counting sequence lines
845 if (gFastaSeq && linecounter>1) {
846 if (last_linelen>first_linelen)
847 die_gseqformat(key);
848 if (last_linelen<first_linelen)
849 mustbeLastLine=true;
850 }
851 }
852 } // --> at EoL
853 else {
854 // <-- before EoL
855 // could be a new header, should we check?
856 if (wasEOL) {
857 // - start of a new line
858 bool newRecStart=false;
859 bool checkNewRec = (!fastq || fq_lendata[1]<=fq_lendata[3]);
860 if (checkNewRec && ch==record_marker[0]) {
861 newRecStart=(record_marker_len>1) ?
862 (readbuf->peekCmp(&record_marker[1], record_marker_len-1)==0) :
863 true;
864 if (newRecStart) {
865 // new record start (new header line coming up)
866 seen_defline=true;
867 recsize = readbuf->getPos()-recpos-1-last_eol_len; //previous recsize
868 if (recsize>(off_t)(record_marker_len+1) && key[0]!='\0') {
869 //add previous record
870 //TODO: validate gFastaSeq or fastq here?
871 if (fastq && fq_lendata[1]!=fq_lendata[3])
872 die_fastqformat(key, fq_lendata[1], fq_lendata[3]);
873 addKeyFunc(key, recpos, recsize);
874 }
875 recpos=readbuf->getPos()-1; //new record pos (after reading this EOL)
876 if (record_marker_len>1)
877 readbuf->skip(record_marker_len-1); //skip past the record marker
878 defline=true; //we'll be in the header line in the next iteration
879 linecounter=0;
880 mustbeLastLine=false;
881 kidx=0; //reset keys
882 prevch=record_marker[record_marker_len-1];
883 wasEOL=(prevch=='\n' || prevch=='\r');
884 continue;
885 } // new record start
886 } //checking for new record start (record_marker match)
887 if (!seen_defline) {
888 prevch=ch;
889 wasEOL=isEOL;
890 continue;
891 }
892 if (mustbeLastLine && !newRecStart)
893 die_gseqformat(key);
894 // start of a new line in the "sequence" block
895 last_linelen=0;
896 mustbeLastLine=false;
897 if (fq_recloc<4) {
898 //take care of multi-line fastq parsing
899 if (fq_recloc==1 && ch=='+') {
900 fq_recloc=2; //q-header line
901 }
902 else if (fq_recloc==0 || fq_recloc==2) fq_recloc++;
903 } //fastq parsing
904 } // - start of a new non-header line (wasEoL)
905 if (!seen_defline) {
906 prevch=ch;
907 wasEOL=isEOL;
908 continue;
909 }
910 if (fq_recloc<4) {
911 fq_lendata[fq_recloc]++;
912 }
913 last_linelen++;
914 if (linecounter==0) first_linelen++;
915 } // --> record body, before EoL
916 } // <=== not in header line
917
918 prevch=ch;
919 wasEOL=isEOL;
920 }//while getch
921 recsize=readbuf->getPos()-recpos;
922 if (recsize>0) {//add last record, if there
923 if (prevch=='\n' || prevch=='\r') recsize-=last_eol_len;
924 if (fullDefline && kidx>0) {//close the defline string
925 if (prevch!='\n' && prevch!='\r') kidx++;
926 key[kidx-1]='\0';
927 }
928 addKeyFunc(key, recpos, recsize);
929 linecounter=0;
930 //GMessage("adding key=%s\n",key);
931 }
932 delete readbuf;
933 }
934 if (f_read!=stdin) fclose(f_read);
935 if (cdbidx->finish() == -1) die_write("");
936
937 // === add some statistics at the end of the cdb index file!
938 r=lseek(cdbidx->getfd(), 0, SEEK_END);
939 cdbInfo info;
940 memcpy((void*)info.tag, (void*)"CDBX", 4);
941 info.idxflags=0;
942 if (multikey) info.idxflags |= CDBMSK_OPT_MULTI;
943 if (do_compress) {
944 info.idxflags |= CDBMSK_OPT_COMPRESS;
945 GMessage("Input data were compressed into file '%s'\n",fname);
946 }
947 if (compact) {
948 if (compact_plus)
949 info.idxflags |= CDBMSK_OPT_CADD;
950 else
951 info.idxflags |= CDBMSK_OPT_C;
952 }
953 if (gFastaSeq) {
954 info.idxflags |= CDBMSK_OPT_GSEQ;
955 }
956 info.num_records=gcvt_uint(&num_recs);
957 info.num_keys=gcvt_uint(&num_keys);
958 info.dbsize=gcvt_offt(&fdbsize);
959 info.idxflags=gcvt_uint(&info.idxflags);
960 int nlen=strlen(fname);
961 info.dbnamelen=gcvt_uint(&nlen);
962 r=write(cdbidx->getfd(), fname, nlen);
963 if (r!=nlen)
964 GError(ERR_W_DBSTAT);
965 r=write(cdbidx->getfd(), &info, cdbInfoSIZE);
966 if (r!=cdbInfoSIZE)
967 GError(ERR_W_DBSTAT);
968 delete cdbidx;
969 GFREE(key);
970 remove(idxfile);
971 if (rename(ftmp,idxfile) == -1)
972 GError("Error: unable to rename %s to %s",ftmp,idxfile);
973 GMessage("%d entries from file %s were indexed in file %s\n",
974 num_recs, fname, idxfile);
975 return 0;
976 }