1 |
#include <ctype.h> |
2 |
#include <fcntl.h> |
3 |
#include <string.h> |
4 |
#include <sys/stat.h> |
5 |
#include "GBase.h" |
6 |
#include "GArgs.h" |
7 |
#include "GHash.hh" |
8 |
#include "gcdb.h" |
9 |
#ifdef ENABLE_COMPRESSION |
10 |
#include "gcdbz.h" |
11 |
#endif |
12 |
#ifdef __WIN32__ |
13 |
#define VERSION "cdbfasta version 0.995w" |
14 |
#else |
15 |
#define VERSION "cdbfasta version 0.995" |
16 |
#endif |
17 |
|
18 |
#define USAGE "Usage:\n\ |
19 |
cdbfasta <fastafile> [-o <index_file>] [-r <record_delimiter>]\n\ |
20 |
[-z <compressed_db>] [-i] [-m|-n <numkeys>|-f<LIST>]|-c|-C]\n\ |
21 |
[-w <stopwords_list>] [-s <stripendchars>] [{-Q|-G}] [-v]\n\ |
22 |
\n\ |
23 |
Creates an index file for records from a multi-fasta file.\n\ |
24 |
By default (without -m/-n/-c/-C option), only the first \n\ |
25 |
space-delimited token from the defline is used as a key.\n\ |
26 |
\n\ |
27 |
<fastafile> is the multi-fasta file to index; \n\ |
28 |
-o the index file will be named <index_file>; if not given,\n\ |
29 |
the index filename is database name plus the suffix '.cidx'\n\ |
30 |
-r <record_delimiter> a string of characters at the beginning of line\n\ |
31 |
marking the start of a record (default: '>')\n\ |
32 |
-Q treat input as fastq format, i.e. with '@' as record delimiter\n\ |
33 |
and with records expected to have at least 4 lines\n\ |
34 |
-z database is compressed into the file <compressed_db>\n\ |
35 |
before indexing (<fastafile> can be \"-\" or \"stdin\" \n\ |
36 |
in order to get the input records from stdin)\n\ |
37 |
-s strip extraneous characters from *around* the space delimited\n\ |
38 |
tokens, for the multikey options below (-m,-n,-f);\n\ |
39 |
Default <stripendchars> set is: '\",`.(){}/[]!:;~|><+-\n\ |
40 |
-m (\"multi-key\" option) create hash entries pointing to \n\ |
41 |
the same record for all tokens found in\n\ |
42 |
the defline\n\ |
43 |
-n <numkeys> same as -m, but only takes the first <numkeys>\n\ |
44 |
tokens from the defline\n\ |
45 |
-f indexes *space* delimited tokens (fields) in the defline as given\n\ |
46 |
by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\ |
47 |
-w <stopwordslist> exclude from indexing all the words found\n\ |
48 |
in the file <stopwordslist> (for options -m, -n and -k)\n\ |
49 |
-i do case insensitive indexing (i.e. create additional keys for \n\ |
50 |
all-lowercase tokens used for indexing from the defline \n\ |
51 |
-c for deflines in the format: db1|accession1|db2|accession2|...,\n\ |
52 |
only the first db-accession pair ('db1|accession1') is taken as key\n\ |
53 |
-C like -c, but also subsequent db|accession constructs are indexed,\n\ |
54 |
along with the full (default) token; additionally,\n\ |
55 |
all nrdb concatenated accessions found in the defline \n\ |
56 |
are parsed and stored (assuming 0x01 or '^|^' as separators)\n\ |
57 |
-a accession mode: like -C option, but indexes the 'accession'\n\ |
58 |
part for all 'db|accession' constructs found\n\ |
59 |
-A like -a and -C together (both accessions and 'db|accession'\n\ |
60 |
constructs are used as keys\n\ |
61 |
-D index each pipe ('|') delimited token found in the record identifier\n\ |
62 |
(e.g. >key1|key2|key3|.. )\n\ |
63 |
-d same as -D but using a custom key delimiter <kdelim> instead of the pipe\n\ |
64 |
character '|'\n\ |
65 |
-G FASTA records are treated as large genomic sequences (e.g. full \n\ |
66 |
chromosomes/contigs) and their formatting is checked for suitability\n\ |
67 |
for fast range queries (i.e. uniform line length within each record)\n\ |
68 |
-v show program version and exit\n" |
69 |
|
70 |
/* |
71 |
*/ |
72 |
|
73 |
#define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n" |
74 |
//16K initial defline buffer |
75 |
#define KBUFSIZE 0x4000 |
76 |
#ifndef O_BINARY |
77 |
#define O_BINARY 0x0000 |
78 |
#endif |
79 |
#define MAX_KEYLEN 1024 |
80 |
//64K input buffer |
81 |
#define GREADBUF_SIZE 0x010000 |
82 |
|
83 |
|
84 |
typedef void (*addFuncType)(char*, off_t, uint32); |
85 |
|
86 |
char ftmp[365]; |
87 |
char fztmp[365]; |
88 |
char record_marker[127]; //record delimiter |
89 |
int record_marker_len=1; |
90 |
int num_recs; |
91 |
int num_keys; |
92 |
|
93 |
off_t last_cdbfpos=0; |
94 |
|
95 |
int compact_plus; //shortcut key and |
96 |
bool acc_mode=false; |
97 |
bool acc_only=false; |
98 |
bool do_compress=false; // compression used |
99 |
bool fastq=false; |
100 |
bool gFastaSeq=false; |
101 |
char keyDelim=0; |
102 |
|
103 |
FILE* zf=NULL; //compressed file handle |
104 |
|
105 |
//store just offset and record length |
106 |
const char* defWordJunk="'\",`.(){}/[]!:;~|><+-"; |
107 |
char* wordJunk=NULL; |
108 |
bool caseInsensitive=false; //case insensitive storage |
109 |
bool useStopWords=false; |
110 |
unsigned int numFields=0; |
111 |
// we have fields[numFields-1]=MAX_UINT as defined in gcdb.h -- |
112 |
// as an indicator of taking every single token in the defline, |
113 |
// or for open ended ranges (e.g. -f5- ) |
114 |
unsigned int fields[255]; //array of numFields field indices (1-based) |
115 |
GHash<int> stopList; |
116 |
//static int datalen=sizeof(uint32)+sizeof(off_t); |
117 |
|
118 |
char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key |
119 |
|
120 |
GCdbWrite* cdbidx; |
121 |
addFuncType addKeyFunc; |
122 |
|
123 |
#define ERR_W_DBSTAT "Error writing the database statististics!\n" |
124 |
|
125 |
void die_write(const char* fname) { |
126 |
GError("Error: cdbhash was unable to write into file %s\n", |
127 |
fname); |
128 |
|
129 |
} |
130 |
|
131 |
void die_read(const char* infile) { |
132 |
GError("Error: cdbhash was unable to read the input file %s\n",infile); |
133 |
} |
134 |
|
135 |
|
136 |
void die_readformat(const char* infile) { |
137 |
GError("Error: bad format for file %s; is it a fastA file?\n", |
138 |
infile); |
139 |
} |
140 |
|
141 |
void die_gseqformat(const char* seqname) { |
142 |
GError("Error: invalid FASTA sequence format for %s (not uniform line length)\n", |
143 |
seqname); |
144 |
} |
145 |
|
146 |
void die_fastqformat(const char* seqid, int seqlen, int qvlen) { |
147 |
GError("Error: invalid FASTQ sequence format for %s (seqlen=%d, qvlen=%d)\n", |
148 |
seqid); |
149 |
} |
150 |
|
151 |
|
152 |
bool add_cdbkey(char* key, off_t fpos, uint32 reclen, int16_t linelen=0, byte elen=0) { |
153 |
|
154 |
unsigned int klen=strlen(key); |
155 |
if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true; |
156 |
if (klen<1) { |
157 |
/* GMessage("Warning: zero length key found following key '%s'\n", |
158 |
lastKey); |
159 |
*/ |
160 |
return false; |
161 |
} |
162 |
//------------ adding record ----------------- |
163 |
num_keys++; |
164 |
strncpy(lastKey, key, MAX_KEYLEN-1); |
165 |
lastKey[MAX_KEYLEN-1]='\0'; |
166 |
if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset |
167 |
uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32 |
168 |
/* |
169 |
if (linelen>0) { |
170 |
//record with line len |
171 |
CIdxSeqData recdata; |
172 |
recdata.fpos=gcvt_offt(&v); |
173 |
recdata.reclen=gcvt_uint(&reclen); |
174 |
recdata.linelen=gcvt_int16(&linelen); |
175 |
recdata.elen=elen; |
176 |
if (cdbidx->addrec(key,klen,(char*)&recdata,IdxSeqDataSIZE)==-1) |
177 |
GError("Error adding cdb record with key '%s'\n",key); |
178 |
} |
179 |
|
180 |
else { //plain record */ |
181 |
CIdxData recdata; |
182 |
recdata.fpos=gcvt_offt(&v); |
183 |
recdata.reclen=gcvt_uint(&reclen); |
184 |
if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1) |
185 |
GError("Error adding cdb record with key '%s'\n",key); |
186 |
//} |
187 |
} |
188 |
else {//32 bit file offset is enough |
189 |
/* |
190 |
if (linelen>0) { |
191 |
CIdxSeqData32 recdata; |
192 |
uint32 v=(uint32) fpos; |
193 |
recdata.fpos=gcvt_uint(&v); |
194 |
recdata.reclen=gcvt_uint(&reclen); |
195 |
recdata.linelen=gcvt_int16(&linelen); |
196 |
recdata.elen=elen; |
197 |
if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1) |
198 |
GError("Error adding cdb record with key '%s'\n",key); |
199 |
} |
200 |
else { */ |
201 |
CIdxData32 recdata; |
202 |
uint32 v=(uint32) fpos; |
203 |
recdata.fpos=gcvt_uint(&v); |
204 |
recdata.reclen=gcvt_uint(&reclen); |
205 |
if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1) |
206 |
GError("Error adding cdb record with key '%s'\n",key); |
207 |
//} |
208 |
} |
209 |
last_cdbfpos=fpos; |
210 |
return true; |
211 |
} |
212 |
|
213 |
//default indexing: key directly passed -- |
214 |
// as the first space delimited token |
215 |
void addKey(char* key, off_t fpos, |
216 |
uint32 reclen) { |
217 |
num_recs++; |
218 |
add_cdbkey(key, fpos, reclen); |
219 |
if (caseInsensitive) { |
220 |
char* lckey=loCase(key); |
221 |
if (strcmp(lckey, key)!=0) |
222 |
add_cdbkey(lckey, fpos, reclen); |
223 |
GFREE(lckey); |
224 |
} |
225 |
} |
226 |
|
227 |
//the whole defline is passed |
228 |
void addKeyMulti(char* defline, |
229 |
off_t fpos, uint32 reclen) { |
230 |
char* p=defline; |
231 |
unsigned int fieldno=0; |
232 |
char* pn; |
233 |
num_recs++; |
234 |
bool stillParsing=true; |
235 |
unsigned int fidx=0; //index in fields[] array |
236 |
while (stillParsing) { |
237 |
while ((*p)==' ' || (*p)=='\t') p++; |
238 |
if (*p == '\0') break; |
239 |
//skip any extraneous characters at the beginning of the token |
240 |
while (chrInStr(*p, wordJunk)) p++; |
241 |
//skip any padding spaces or other extraneous characters |
242 |
//at the beginning of the word |
243 |
if (*p == '\0') break; |
244 |
pn=p; |
245 |
while (*pn!='\0' && !isspace(*pn)) pn++; |
246 |
//found next space or end of string |
247 |
fieldno++; |
248 |
while (fields[fidx]<fieldno && fidx<numFields-1) fidx++; |
249 |
//GMessage("> p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields, |
250 |
// fieldno, fields[numFields-1], pn+1); |
251 |
stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1])); |
252 |
char* pend = pn-1; //pend is on the last non-space in the current token |
253 |
//--strip the ending junk, if any |
254 |
while (chrInStr(*pend, wordJunk) && pend>p) pend--; |
255 |
if (pend<pn-1) *(pend+1)='\0'; |
256 |
else *pn='\0'; |
257 |
|
258 |
if (strlen(p)>0) { |
259 |
if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) { |
260 |
if (useStopWords && stopList.hasKey(p)) { |
261 |
p=pn+1; |
262 |
continue; |
263 |
} |
264 |
//--- store this key with the same current record data: |
265 |
add_cdbkey(p, fpos, reclen); |
266 |
//---storage code ends here |
267 |
if (caseInsensitive) { |
268 |
char* lcp=loCase(p); |
269 |
if (strcmp(lcp,p)!=0) |
270 |
add_cdbkey(lcp, fpos, reclen); |
271 |
GFREE(lcp); |
272 |
} |
273 |
} |
274 |
//if (isEnd) break; //if all the token were stored |
275 |
} |
276 |
p=pn+1;//p is now on the next token's start |
277 |
} //token parsing loop |
278 |
} |
279 |
|
280 |
|
281 |
int qcmpInt(const void *p1, const void *p2) { |
282 |
//int n1=*((int*)p1); |
283 |
//int n2=*((int*)p2); |
284 |
const unsigned int *a = (unsigned int *) p1; |
285 |
const unsigned int *b = (unsigned int *) p2; |
286 |
if (*a < *b) return -1; |
287 |
else if (*a > *b) return 1; |
288 |
else return 0; |
289 |
} |
290 |
|
291 |
|
292 |
char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) { |
293 |
if (pstart==NULL || *pstart==0) return NULL; |
294 |
bool hasDigits=false; |
295 |
char* pend=pstart; |
296 |
end_acc=NULL; //end of first accession |
297 |
accst=NULL; |
298 |
while (*pstart=='|') pstart++; |
299 |
for(char* p=pstart;;p++) { |
300 |
if (hasDigits==false && *p>='0' && *p<='9') hasDigits=true; |
301 |
/* if (*p==0) { //end of seq_name |
302 |
pend=p; //doesn't matter if it's accession or "db"-like |
303 |
break; |
304 |
}*/ |
305 |
if (*p=='|' || *p==0) { |
306 |
int curlen=p-pstart; |
307 |
if (*p==0 || (hasDigits && curlen>3) || curlen>7 || accst!=NULL) {//previous token was an accession |
308 |
pend=p; //advance pend |
309 |
if (end_acc==NULL) end_acc=p; |
310 |
if (accst==NULL) accst=pstart; |
311 |
break; |
312 |
} |
313 |
else { //first pipe char or no digits |
314 |
accst=p+1; |
315 |
} |
316 |
hasDigits=false;//reset flag |
317 |
} // | separator |
318 |
} |
319 |
if (pend!=pstart) return pend; |
320 |
else return NULL; |
321 |
} |
322 |
|
323 |
|
324 |
|
325 |
char* nextKeyDelim(char* pstart, char* pmax) { |
326 |
if (pstart==NULL || pstart>=pmax) return NULL; |
327 |
char* p=pstart; |
328 |
while (*p!=keyDelim && p!=pmax && *p!=0) p++; |
329 |
return p; |
330 |
} |
331 |
|
332 |
char* endSpToken(char* str) { |
333 |
if (str==NULL) return NULL; |
334 |
char* p=str; |
335 |
//while (*p!=' ' && *p!='\t' && *p!='\v' && *p!=0) p++; |
336 |
while (!isspace(*p) && *p!=0) p++; |
337 |
*p=0; |
338 |
return p; |
339 |
} |
340 |
|
341 |
#define NRDB_CHARSEP "\1\2\3\4" |
342 |
#define NRDB_STRSEP "^|^" |
343 |
|
344 |
//nrdbp is positioned at the end of current nrdb concatenated |
345 |
//defline, or NULL if there is no more |
346 |
inline void NRDB_Rec(char* &nrdbp, char* defline) { |
347 |
nrdbp=strpbrk(defline, NRDB_CHARSEP); |
348 |
if (nrdbp==NULL) { |
349 |
nrdbp=strstr(defline, NRDB_STRSEP); |
350 |
if (nrdbp!=NULL) { |
351 |
*nrdbp='\0'; |
352 |
nrdbp+=2; |
353 |
} |
354 |
} |
355 |
else *nrdbp='\0'; |
356 |
} |
357 |
|
358 |
//-c/-C/-a/-A indexing: key up to the first space or second '|' |
359 |
//receives the full defline |
360 |
void addKeyCompact(char* defline, |
361 |
off_t fpos, uint32 reclen) { |
362 |
//we got the first token found on the defline |
363 |
num_recs++; |
364 |
char* nrdb_end; |
365 |
//breaks defline at the next nrdb concatenation point |
366 |
NRDB_Rec(nrdb_end, defline); |
367 |
//isolate the first token in this nrdb concatenation |
368 |
char* token_end=endSpToken(defline); //this puts 0 at the first space encountered |
369 |
if (!compact_plus) { //shortcut key |
370 |
//only the first db|accession construct will be indexed, if found |
371 |
char* end_acc1=NULL; //end of first accession |
372 |
char* acc1st=NULL; |
373 |
char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st); |
374 |
if (end_acc1!=NULL) { //has acceptable shortcut |
375 |
*end_acc1=0; |
376 |
add_cdbkey(defline, fpos, reclen); |
377 |
return; |
378 |
} |
379 |
if (dbacc_end!=NULL) { |
380 |
*dbacc_end=0; |
381 |
add_cdbkey(defline, fpos, reclen); |
382 |
return; |
383 |
} |
384 |
//store this whole non-space token as key: |
385 |
add_cdbkey(defline, fpos, reclen); |
386 |
return; |
387 |
} |
388 |
//from now on only -C/-a/-A treatment: |
389 |
for(;;) { |
390 |
//defline is on the first token |
391 |
if (strlen(defline)>0) //add whole non-space token as the "full key" |
392 |
add_cdbkey(defline, fpos, reclen); |
393 |
//add the db|accession constructs as keys |
394 |
char* dbacc_start=defline; |
395 |
char* firstacc_end=NULL; |
396 |
char* accst=NULL; |
397 |
char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst); |
398 |
while (dbacc_end!=NULL) { |
399 |
if (firstacc_end!=NULL && firstacc_end<dbacc_end) { |
400 |
char c=*firstacc_end; |
401 |
*firstacc_end=0; |
402 |
if (!acc_only) |
403 |
add_cdbkey(dbacc_start, fpos, reclen); |
404 |
if (acc_mode) { |
405 |
add_cdbkey(accst, fpos, reclen); |
406 |
} |
407 |
*firstacc_end=c; |
408 |
} |
409 |
if (dbacc_start==defline && dbacc_end==token_end) { |
410 |
if (acc_mode && accst!=NULL) |
411 |
add_cdbkey(accst, fpos, reclen); |
412 |
break; //the whole seq_name was only one db entry |
413 |
} |
414 |
*dbacc_end=0; //end key here |
415 |
if (!acc_only) |
416 |
add_cdbkey(dbacc_start, fpos, reclen); |
417 |
if (acc_mode) |
418 |
add_cdbkey(accst, fpos, reclen); |
419 |
if (dbacc_end==token_end) |
420 |
break; //reached the end of this whole seq_name (non-space token) |
421 |
dbacc_start=dbacc_end+1; |
422 |
firstacc_end=NULL; |
423 |
dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst); |
424 |
} |
425 |
// -- get to next concatenated defline, if any: |
426 |
if (compact_plus && nrdb_end!=NULL) { |
427 |
defline=nrdb_end+1; //look for the next nrdb concatenation |
428 |
NRDB_Rec(nrdb_end, defline); |
429 |
//isolate the first token in this nrdb record |
430 |
token_end=endSpToken(defline); |
431 |
} |
432 |
else break; |
433 |
} //for |
434 |
} |
435 |
|
436 |
void addKeyDelim(char* defline, |
437 |
off_t fpos, uint32 reclen) { |
438 |
//we got the first token found on the defline |
439 |
num_recs++; |
440 |
char* nrdb_end; |
441 |
//breaks defline at the next nrdb concatenation point |
442 |
NRDB_Rec(nrdb_end, defline); |
443 |
//isolate the first space-delimited token in this nrdb concatenation |
444 |
char* token_end=endSpToken(defline); |
445 |
for(;;) { |
446 |
//defline is on the first token |
447 |
if (strlen(defline)==0) break; |
448 |
//add the db|accession constructs as keys |
449 |
char* k_start=defline; |
450 |
char* k_end=NULL; |
451 |
while ((k_end=nextKeyDelim(k_start, token_end))!=NULL) { |
452 |
*k_end=0; |
453 |
add_cdbkey(k_start, fpos, reclen); |
454 |
k_start=k_end+1; |
455 |
} |
456 |
// -- get to next concatenated defline, if any: |
457 |
if (nrdb_end!=NULL) { |
458 |
defline=nrdb_end+1; //look for the next nrdb concatenation |
459 |
NRDB_Rec(nrdb_end, defline); |
460 |
//isolate the first space-delimited token in this nrdb record |
461 |
token_end=endSpToken(defline); |
462 |
} |
463 |
else break; |
464 |
} //for |
465 |
} |
466 |
|
467 |
int readWords(FILE* f, GHash<int>& xhash) { |
468 |
int c; |
469 |
int count=0; |
470 |
char name[256]; |
471 |
int len=0; |
472 |
while ((c=getc(f))!=EOF) { |
473 |
if (isspace(c) || c==',' || c==';') { |
474 |
if (len>0) { |
475 |
name[len]='\0'; |
476 |
xhash.Add(name, new int(1)); |
477 |
count++; |
478 |
len=0; |
479 |
} |
480 |
continue; |
481 |
} |
482 |
//a non-space |
483 |
name[len]=(char) c; |
484 |
len++; |
485 |
if (len>255) { |
486 |
name[len-1]='\0'; |
487 |
GError("Error reading words file: token too long ('%s') !\n",name); |
488 |
} |
489 |
} |
490 |
if (len>0) { |
491 |
name[len]='\0'; |
492 |
xhash.Add(name, new int(1)); |
493 |
count++; |
494 |
} |
495 |
return count; |
496 |
} |
497 |
|
498 |
|
499 |
|
500 |
//========================== MAIN =============================== |
501 |
int main(int argc, char **argv) { |
502 |
FILE* f_read=NULL; |
503 |
off_t fdbsize; |
504 |
int ch; |
505 |
char* zfilename; |
506 |
char* fname; |
507 |
char* marker; //record marker |
508 |
int maxkeys=0; |
509 |
int multikey=0; |
510 |
record_marker[0]='>'; |
511 |
record_marker[1]=0; |
512 |
GArgs args(argc, argv, "icvDQCaAmn:o:r:z:w:f:s:d:"); |
513 |
int e=args.isError(); |
514 |
if (e>0) |
515 |
GError("%s Invalid argument: %s\n", USAGE, argv[e] ); |
516 |
if (args.getOpt('v')!=NULL) { |
517 |
printf("%s\n",VERSION); |
518 |
return 0; |
519 |
} |
520 |
fastq = (args.getOpt('Q')!=NULL); |
521 |
gFastaSeq=(args.getOpt('G')!=NULL); |
522 |
if (fastq && gFastaSeq) |
523 |
GError("Error: options -Q and -G are mutually exclusive.\n"); |
524 |
if (args.getOpt('D')) keyDelim='|'; |
525 |
else if (args.getOpt('d')!=NULL) keyDelim=args.getOpt('d')[0]; |
526 |
multikey = (args.getOpt('m')!=NULL); |
527 |
if (multikey) { |
528 |
fields[numFields]=1; |
529 |
numFields++; |
530 |
fields[numFields]=MAX_UINT; |
531 |
numFields++; |
532 |
} |
533 |
caseInsensitive = (args.getOpt('i')!=NULL); |
534 |
acc_only=(args.getOpt('a')!=NULL); |
535 |
acc_mode=(acc_only || args.getOpt('A')!=NULL); |
536 |
compact_plus=(args.getOpt('C')!=NULL || acc_mode); |
537 |
wordJunk = (char *)args.getOpt('s'); |
538 |
if (wordJunk==NULL) wordJunk=(char*)defWordJunk; |
539 |
int compact=(args.getOpt('c')!=NULL || compact_plus); |
540 |
if ((compact && multikey) || (multikey && keyDelim)) { |
541 |
GError("%s Error: invalid flag combination.\n", USAGE); |
542 |
} |
543 |
char* argfields = (char*)args.getOpt('f'); |
544 |
char* s = (char*)args.getOpt('n'); |
545 |
if (s!=NULL) { |
546 |
maxkeys = atoi(s); |
547 |
if (maxkeys>1 && (compact || multikey || keyDelim || argfields!=NULL)) |
548 |
GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE); |
549 |
multikey=1; |
550 |
numFields=maxkeys; |
551 |
if (numFields>254) GError(ERR_TOOMANYFIELDS); |
552 |
for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i; |
553 |
} |
554 |
|
555 |
if (argfields!=NULL) { //parse all the field #s |
556 |
if (multikey || keyDelim || compact) |
557 |
GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE); |
558 |
char* pbrk; |
559 |
int prevnum=0; |
560 |
char prevsep='\0'; |
561 |
numFields=0; |
562 |
char sep; |
563 |
char *p=argfields; |
564 |
do { |
565 |
pbrk=strpbrk(p,",-"); |
566 |
if (pbrk==NULL) { |
567 |
sep='\0'; |
568 |
pbrk=p+strlen(p); |
569 |
if (prevsep == '-' && *p=='\0' && prevnum>0) { |
570 |
//open ended range -- ending with '-' |
571 |
fields[numFields]=prevnum; |
572 |
numFields++; |
573 |
if (numFields>253) GError(ERR_TOOMANYFIELDS); |
574 |
fields[numFields]=MAX_UINT; |
575 |
numFields++; |
576 |
//GMessage("--- stored %d, %d\n",prevnum, MAX_UINT); |
577 |
break; |
578 |
}// ending with '-' |
579 |
} // '\0' |
580 |
else { sep=*pbrk; *pbrk = '\0'; } |
581 |
int num = atoi(p); |
582 |
if (num<=0 || num>254 ) |
583 |
GError("%s Error: invalid syntax for -f option.\n", USAGE); |
584 |
if (prevsep == '-') { //store a range |
585 |
for (int i=prevnum;i<=num;i++) { |
586 |
fields[numFields]=i; |
587 |
numFields++; |
588 |
if (numFields>254) GError(ERR_TOOMANYFIELDS); |
589 |
} |
590 |
} |
591 |
else if (sep!='-') { |
592 |
fields[numFields]=num; |
593 |
numFields++; |
594 |
if (numFields>254) GError(ERR_TOOMANYFIELDS); |
595 |
} |
596 |
|
597 |
prevsep=sep; |
598 |
prevnum=num; |
599 |
p=pbrk+1; |
600 |
} while (sep != '\0'); //range parsing loop |
601 |
if (numFields<=0 || numFields>254 ) |
602 |
GError("%s Error at parsing -f option.\n", USAGE); |
603 |
//GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields); |
604 |
qsort(fields, numFields, sizeof(fields[0]), &qcmpInt); |
605 |
multikey=1; |
606 |
/*-- --------debug: |
607 |
for (unsigned int i=0;i<numFields-1;i++) { |
608 |
GMessage("%d,", fields[i]); |
609 |
} |
610 |
GMessage("%d\n",fields[numFields-1]); |
611 |
exit(0); */ |
612 |
} //fields |
613 |
if (fastq) { |
614 |
record_marker[0]='@'; |
615 |
record_marker_len=1; |
616 |
} |
617 |
if (args.getOpt('r')!=NULL) {//non-FASTA record delimiter? |
618 |
if (fastq) { |
619 |
GMessage("Option -r ignored because -Q was given (-Q sets delimiter to '@')\n"); |
620 |
} |
621 |
else { |
622 |
marker=(char*)args.getOpt('r'); // |
623 |
int v=0; |
624 |
if (strlen(marker)>126) |
625 |
GError("Error: the specified record delimiter is too long. " |
626 |
"Maximum accepted is 126\n"); |
627 |
//special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves |
628 |
if (strlen(marker)==4 && (marker[0]=='\\' || (marker[0]=='0' && toupper(marker[1])=='X') )) { |
629 |
if (marker[0]=='\\') { |
630 |
marker++; |
631 |
v=strtol(marker, NULL, 8); |
632 |
} |
633 |
else v=strtol(marker, NULL, 16); |
634 |
if (v==0 || v>255) |
635 |
GError("Invalid record delimiter: should be only one character,\n" |
636 |
"'\\NNN' (octal value), '0xNN' (hexadecimal value)"); |
637 |
record_marker[0]=v; |
638 |
record_marker_len=1; |
639 |
} |
640 |
else { |
641 |
strcpy(record_marker, marker); |
642 |
record_marker_len=strlen(record_marker); |
643 |
} |
644 |
} |
645 |
} |
646 |
char* stopwords=(char*)args.getOpt('w'); //stop words filename given? |
647 |
if (stopwords!=NULL) { |
648 |
FILE* fstopwords=NULL; |
649 |
if ((fstopwords=fopen(stopwords, "r"))==NULL) |
650 |
GError("Cannot open stop words file '%s'!\n", stopwords); |
651 |
int c=readWords(fstopwords, stopList); |
652 |
GMessage("Loaded %d stop words.\n", c); |
653 |
fclose(fstopwords); |
654 |
useStopWords=(c>0); |
655 |
} |
656 |
if ((zfilename=(char*)args.getOpt('z')) !=NULL) { |
657 |
do_compress=true; |
658 |
#ifndef ENABLE_COMPRESSION |
659 |
GError("Error: compression requested but not enabled when cdbfasta was compiled\n") |
660 |
#endif |
661 |
strcpy(fztmp,zfilename); |
662 |
strcat(fztmp,"_ztmp"); |
663 |
zf=fopen(fztmp,"wb"); |
664 |
if (zf==NULL) |
665 |
GError("Error creating file '%s'\n'", fztmp); |
666 |
} |
667 |
char* outfile=(char*) args.getOpt('o'); |
668 |
int numfiles = args.startNonOpt(); |
669 |
if (numfiles==0) |
670 |
GError("%sError: no fasta file given.\n", USAGE); |
671 |
fname=(char*) args.nextNonOpt(); //first fasta file given |
672 |
if (do_compress) { //-------- compression case ------------------- |
673 |
if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0) |
674 |
f_read=stdin; |
675 |
else f_read= fopen(fname, "rb"); |
676 |
if (f_read == NULL) die_read(fname); |
677 |
fname=zfilename; //forget the input file name, keep the output |
678 |
} |
679 |
else {// |
680 |
int fdread= open(fname, O_RDONLY|O_BINARY); |
681 |
if (fdread == -1) die_read(fname); |
682 |
struct stat dbstat; |
683 |
fstat(fdread, &dbstat); |
684 |
fdbsize=dbstat.st_size; |
685 |
close(fdread); |
686 |
f_read= fopen(fname, "rb"); |
687 |
if (f_read == NULL) die_read(fname); |
688 |
} |
689 |
|
690 |
char idxfile[365]; |
691 |
if (outfile==NULL) { |
692 |
if (do_compress) { |
693 |
strcpy(ftmp, zfilename); |
694 |
strcat(ftmp, ".cidx"); |
695 |
strcpy(idxfile, ftmp); |
696 |
strcat(ftmp, "_tmp"); |
697 |
} |
698 |
else { |
699 |
strcpy(ftmp, fname); |
700 |
strcat(ftmp, ".cidx"); |
701 |
strcpy(idxfile, ftmp); |
702 |
strcat(ftmp, "_tmp"); |
703 |
} |
704 |
//should add the process ID, time and user to make this unique? |
705 |
} |
706 |
else { |
707 |
strcpy(ftmp, outfile); |
708 |
strcpy(idxfile, outfile); |
709 |
strcat(ftmp, "_tmp"); |
710 |
} |
711 |
|
712 |
cdbidx=new GCdbWrite(ftmp); //test if this was successful? |
713 |
if (keyDelim>0) { |
714 |
addKeyFunc=&addKeyDelim; |
715 |
} |
716 |
else { |
717 |
if (compact) |
718 |
addKeyFunc=&addKeyCompact; |
719 |
else if (multikey) |
720 |
addKeyFunc = &addKeyMulti; |
721 |
else addKeyFunc = &addKey; |
722 |
} |
723 |
off_t recpos=0; |
724 |
off_t r=0; |
725 |
unsigned int recsize=0; |
726 |
char* key=NULL; |
727 |
bool fullDefline=(multikey || compact_plus); |
728 |
GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE); |
729 |
if (do_compress) { //---------------- compression case ------------- |
730 |
if (fastq) GError("Error: sorry, compression is not supported with fastq format\n"); |
731 |
//TODO: use a clever block compression/indexing scheme with virtual offsets, like bgzf |
732 |
// -- this should take care of the fastq compression |
733 |
fdbsize=0; |
734 |
GCdbz cdbz(zf); // zlib interface |
735 |
recpos=cdbz.getZRecPos(); |
736 |
while ((key=cdbz.compress(readbuf,record_marker))!=NULL) { |
737 |
recsize=cdbz.getZRecSize(); |
738 |
if (!fullDefline) { |
739 |
//find first space after the record_marker and place a '\0' there |
740 |
for (int i=record_marker_len; key[i]!='\0';i++) { |
741 |
if (isspace(key[i])) { key[i]='\0';break; } |
742 |
} |
743 |
} |
744 |
addKeyFunc(key, recpos, recsize); |
745 |
recpos = cdbz.getZRecPos(); |
746 |
} |
747 |
remove(zfilename); |
748 |
cdbz.compress_end(); |
749 |
fclose(zf); |
750 |
//determine the size of this file: |
751 |
int ftmp= open(fztmp, O_RDONLY|O_BINARY); |
752 |
if (ftmp == -1) die_read(fztmp); |
753 |
struct stat dbstat; |
754 |
fstat(ftmp, &dbstat); |
755 |
fdbsize=dbstat.st_size; |
756 |
//rename it to the intended file name |
757 |
if (rename(fztmp,zfilename) != 0) { |
758 |
GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename); |
759 |
perror("rename"); |
760 |
} |
761 |
} //compression requested |
762 |
else { // not compressed -- plain (buffered) file access |
763 |
bool defline=false; |
764 |
bool seen_defline=false; |
765 |
int kbufsize=KBUFSIZE; |
766 |
if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case |
767 |
else { GMALLOC(key, 1024); } |
768 |
key[0]=0; //no keys have been parsed yet |
769 |
int kidx=-1; |
770 |
num_recs=0; |
771 |
num_keys=0; |
772 |
char prevch=0; |
773 |
//first iteration -- for the beginning of file |
774 |
if (readbuf->peekCmp(record_marker, record_marker_len)==0) { |
775 |
//new record start found (defline) |
776 |
recpos=readbuf->getPos(); //new record pos |
777 |
defline=true; //we're in defline |
778 |
seen_defline=true; |
779 |
readbuf->skip(record_marker_len); |
780 |
kidx=0; |
781 |
}//new record start |
782 |
int linecounter=0; //number of non-header lines for current record |
783 |
int fq_recloc=(fastq) ? 0 : 4; //0=seq header line, 1=sequence string, 2=q-header line, 3=qvstring, 4=not fastq |
784 |
int fq_lendata[4]={0,0,0,0}; //keep track of fastq seq len, qv len |
785 |
int last_linelen=0; //length of last non-header line in a FASTA record |
786 |
int first_linelen=0; //length of the first non-header line in a FASTA record |
787 |
int last_eol_len=0; |
788 |
bool mustbeLastLine=false; |
789 |
bool wasEOL=false; |
790 |
while ((ch=readbuf->getch())>0) { |
791 |
bool isEOL=false; |
792 |
if (ch=='\n' || ch=='\r') { |
793 |
isEOL=true; |
794 |
last_eol_len=1; |
795 |
} |
796 |
if (wasEOL && isEOL) { |
797 |
//skipped double-char EoL (DOS or empty lines) |
798 |
if (prevch=='\n' && ch=='\r') last_eol_len=2; |
799 |
prevch=ch; |
800 |
wasEOL=isEOL; |
801 |
continue; |
802 |
} |
803 |
if (defline) { |
804 |
// ===> in the header line (including its EOL) |
805 |
if (isEOL) { |
806 |
// --> end of header line |
807 |
if (kidx>=0) { key[kidx]=0;kidx=-1; } |
808 |
defline=false; |
809 |
last_linelen=0; |
810 |
first_linelen=0; |
811 |
linecounter=0; |
812 |
fq_recloc= (fastq) ? 0 : 4; |
813 |
mustbeLastLine=false; |
814 |
memset((void*)fq_lendata, 0, 4*sizeof(int)); |
815 |
prevch=ch; |
816 |
wasEOL=isEOL; |
817 |
continue; |
818 |
} |
819 |
else { |
820 |
// --> within header, before EOL |
821 |
if (kidx>=0) { //still parsing keys |
822 |
if ((isspace(ch) || ch<31) |
823 |
&& fullDefline==false) { |
824 |
key[kidx]=0; |
825 |
kidx=-1; |
826 |
wasEOL=false; |
827 |
prevch=ch; |
828 |
continue; |
829 |
} |
830 |
key[kidx]=(char)ch; |
831 |
kidx++; |
832 |
if (kidx>=kbufsize) { |
833 |
kbufsize+=KBUFSIZE; |
834 |
GREALLOC(key, kbufsize); |
835 |
} |
836 |
} //still parsing keys |
837 |
} // <-- before EoL |
838 |
} // <=== in the header line |
839 |
else { |
840 |
// ===> not in header line (could be "sequence block", or before first defline |
841 |
if (isEOL) { |
842 |
// <-- at EoL in "sequence" block (record body) |
843 |
if (seen_defline) { |
844 |
linecounter++; //counting sequence lines |
845 |
if (gFastaSeq && linecounter>1) { |
846 |
if (last_linelen>first_linelen) |
847 |
die_gseqformat(key); |
848 |
if (last_linelen<first_linelen) |
849 |
mustbeLastLine=true; |
850 |
} |
851 |
} |
852 |
} // --> at EoL |
853 |
else { |
854 |
// <-- before EoL |
855 |
// could be a new header, should we check? |
856 |
if (wasEOL) { |
857 |
// - start of a new line |
858 |
bool newRecStart=false; |
859 |
bool checkNewRec = (!fastq || fq_lendata[1]<=fq_lendata[3]); |
860 |
if (checkNewRec && ch==record_marker[0]) { |
861 |
newRecStart=(record_marker_len>1) ? |
862 |
(readbuf->peekCmp(&record_marker[1], record_marker_len-1)==0) : |
863 |
true; |
864 |
if (newRecStart) { |
865 |
// new record start (new header line coming up) |
866 |
seen_defline=true; |
867 |
recsize = readbuf->getPos()-recpos-1-last_eol_len; //previous recsize |
868 |
if (recsize>(off_t)(record_marker_len+1) && key[0]!='\0') { |
869 |
//add previous record |
870 |
//TODO: validate gFastaSeq or fastq here? |
871 |
if (fastq && fq_lendata[1]!=fq_lendata[3]) |
872 |
die_fastqformat(key, fq_lendata[1], fq_lendata[3]); |
873 |
addKeyFunc(key, recpos, recsize); |
874 |
} |
875 |
recpos=readbuf->getPos()-1; //new record pos (after reading this EOL) |
876 |
if (record_marker_len>1) |
877 |
readbuf->skip(record_marker_len-1); //skip past the record marker |
878 |
defline=true; //we'll be in the header line in the next iteration |
879 |
linecounter=0; |
880 |
mustbeLastLine=false; |
881 |
kidx=0; //reset keys |
882 |
prevch=record_marker[record_marker_len-1]; |
883 |
wasEOL=(prevch=='\n' || prevch=='\r'); |
884 |
continue; |
885 |
} // new record start |
886 |
} //checking for new record start (record_marker match) |
887 |
if (!seen_defline) { |
888 |
prevch=ch; |
889 |
wasEOL=isEOL; |
890 |
continue; |
891 |
} |
892 |
if (mustbeLastLine && !newRecStart) |
893 |
die_gseqformat(key); |
894 |
// start of a new line in the "sequence" block |
895 |
last_linelen=0; |
896 |
mustbeLastLine=false; |
897 |
if (fq_recloc<4) { |
898 |
//take care of multi-line fastq parsing |
899 |
if (fq_recloc==1 && ch=='+') { |
900 |
fq_recloc=2; //q-header line |
901 |
} |
902 |
else if (fq_recloc==0 || fq_recloc==2) fq_recloc++; |
903 |
} //fastq parsing |
904 |
} // - start of a new non-header line (wasEoL) |
905 |
if (!seen_defline) { |
906 |
prevch=ch; |
907 |
wasEOL=isEOL; |
908 |
continue; |
909 |
} |
910 |
if (fq_recloc<4) { |
911 |
fq_lendata[fq_recloc]++; |
912 |
} |
913 |
last_linelen++; |
914 |
if (linecounter==0) first_linelen++; |
915 |
} // --> record body, before EoL |
916 |
} // <=== not in header line |
917 |
|
918 |
prevch=ch; |
919 |
wasEOL=isEOL; |
920 |
}//while getch |
921 |
recsize=readbuf->getPos()-recpos; |
922 |
if (recsize>0) {//add last record, if there |
923 |
if (prevch=='\n' || prevch=='\r') recsize-=last_eol_len; |
924 |
if (fullDefline && kidx>0) {//close the defline string |
925 |
if (prevch!='\n' && prevch!='\r') kidx++; |
926 |
key[kidx-1]='\0'; |
927 |
} |
928 |
addKeyFunc(key, recpos, recsize); |
929 |
linecounter=0; |
930 |
//GMessage("adding key=%s\n",key); |
931 |
} |
932 |
delete readbuf; |
933 |
} |
934 |
if (f_read!=stdin) fclose(f_read); |
935 |
if (cdbidx->finish() == -1) die_write(""); |
936 |
|
937 |
// === add some statistics at the end of the cdb index file! |
938 |
r=lseek(cdbidx->getfd(), 0, SEEK_END); |
939 |
cdbInfo info; |
940 |
memcpy((void*)info.tag, (void*)"CDBX", 4); |
941 |
info.idxflags=0; |
942 |
if (multikey) info.idxflags |= CDBMSK_OPT_MULTI; |
943 |
if (do_compress) { |
944 |
info.idxflags |= CDBMSK_OPT_COMPRESS; |
945 |
GMessage("Input data were compressed into file '%s'\n",fname); |
946 |
} |
947 |
if (compact) { |
948 |
if (compact_plus) |
949 |
info.idxflags |= CDBMSK_OPT_CADD; |
950 |
else |
951 |
info.idxflags |= CDBMSK_OPT_C; |
952 |
} |
953 |
if (gFastaSeq) { |
954 |
info.idxflags |= CDBMSK_OPT_GSEQ; |
955 |
} |
956 |
info.num_records=gcvt_uint(&num_recs); |
957 |
info.num_keys=gcvt_uint(&num_keys); |
958 |
info.dbsize=gcvt_offt(&fdbsize); |
959 |
info.idxflags=gcvt_uint(&info.idxflags); |
960 |
int nlen=strlen(fname); |
961 |
info.dbnamelen=gcvt_uint(&nlen); |
962 |
r=write(cdbidx->getfd(), fname, nlen); |
963 |
if (r!=nlen) |
964 |
GError(ERR_W_DBSTAT); |
965 |
r=write(cdbidx->getfd(), &info, cdbInfoSIZE); |
966 |
if (r!=cdbInfoSIZE) |
967 |
GError(ERR_W_DBSTAT); |
968 |
delete cdbidx; |
969 |
GFREE(key); |
970 |
remove(idxfile); |
971 |
if (rename(ftmp,idxfile) == -1) |
972 |
GError("Error: unable to rename %s to %s",ftmp,idxfile); |
973 |
GMessage("%d entries from file %s were indexed in file %s\n", |
974 |
num_recs, fname, idxfile); |
975 |
return 0; |
976 |
} |