ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cdbfasta/cdbfasta.cpp
Revision: 15
Committed: Mon Jul 18 20:53:45 2011 UTC (11 years, 2 months ago) by gpertea
File size: 33280 byte(s)
Log Message:
sync with local src

Line User Rev File contents
1 gpertea 8 #include <ctype.h>
2     #include <fcntl.h>
3     #include <string.h>
4     #include <sys/stat.h>
5     #include "GBase.h"
6     #include "GArgs.h"
7     #include "GHash.hh"
8     #include "gcdb.h"
9     #ifdef ENABLE_COMPRESSION
10     #include "gcdbz.h"
11     #endif
12     #ifdef __WIN32__
13 gpertea 15 #define VERSION "cdbfasta version 0.995w"
14 gpertea 8 #else
15 gpertea 15 #define VERSION "cdbfasta version 0.995"
16 gpertea 8 #endif
17    
18     #define USAGE "Usage:\n\
19     cdbfasta <fastafile> [-o <index_file>] [-r <record_delimiter>]\n\
20     [-z <compressed_db>] [-i] [-m|-n <numkeys>|-f<LIST>]|-c|-C]\n\
21 gpertea 15 [-w <stopwords_list>] [-s <stripendchars>] [{-Q|-G}] [-v]\n\
22 gpertea 8 \n\
23     Creates an index file for records from a multi-fasta file.\n\
24     By default (without -m/-n/-c/-C option), only the first \n\
25     space-delimited token from the defline is used as a key.\n\
26     \n\
27     <fastafile> is the multi-fasta file to index; \n\
28     -o the index file will be named <index_file>; if not given,\n\
29     the index filename is database name plus the suffix '.cidx'\n\
30     -r <record_delimiter> a string of characters at the beginning of line\n\
31     marking the start of a record (default: '>')\n\
32     -Q treat input as fastq format, i.e. with '@' as record delimiter\n\
33     and with records expected to have at least 4 lines\n\
34     -z database is compressed into the file <compressed_db>\n\
35     before indexing (<fastafile> can be \"-\" or \"stdin\" \n\
36     in order to get the input records from stdin)\n\
37     -s strip extraneous characters from *around* the space delimited\n\
38     tokens, for the multikey options below (-m,-n,-f);\n\
39     Default <stripendchars> set is: '\",`.(){}/[]!:;~|><+-\n\
40     -m (\"multi-key\" option) create hash entries pointing to \n\
41     the same record for all tokens found in\n\
42     the defline\n\
43     -n <numkeys> same as -m, but only takes the first <numkeys>\n\
44     tokens from the defline\n\
45     -f indexes *space* delimited tokens (fields) in the defline as given\n\
46     by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\
47     -w <stopwordslist> exclude from indexing all the words found\n\
48     in the file <stopwordslist> (for options -m, -n and -k)\n\
49     -i do case insensitive indexing (i.e. create additional keys for \n\
50     all-lowercase tokens used for indexing from the defline \n\
51     -c for deflines in the format: db1|accession1|db2|accession2|...,\n\
52     only the first db-accession pair ('db1|accession1') is taken as key\n\
53     -C like -c, but also subsequent db|accession constructs are indexed,\n\
54     along with the full (default) token; additionally,\n\
55     all nrdb concatenated accessions found in the defline \n\
56     are parsed and stored (assuming 0x01 or '^|^' as separators)\n\
57     -a accession mode: like -C option, but indexes the 'accession'\n\
58     part for all 'db|accession' constructs found\n\
59     -A like -a and -C together (both accessions and 'db|accession'\n\
60     constructs are used as keys\n\
61 gpertea 15 -D index each pipe ('|') delimited token found in the record identifier\n\
62     (e.g. >key1|key2|key3|.. )\n\
63     -d same as -D but using a custom key delimiter <kdelim> instead of the pipe\n\
64     character '|'\n\
65     -G FASTA records are treated as large genomic sequences (e.g. full \n\
66     chromosomes/contigs) and their formatting is checked for suitability\n\
67     for fast range queries (i.e. uniform line length within each record)\n\
68 gpertea 8 -v show program version and exit\n"
69    
70 gpertea 15 /*
71     */
72 gpertea 8
73     #define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n"
74     //16K initial defline buffer
75     #define KBUFSIZE 0x4000
76     #ifndef O_BINARY
77     #define O_BINARY 0x0000
78     #endif
79     #define MAX_KEYLEN 1024
80     //64K input buffer
81     #define GREADBUF_SIZE 0x010000
82    
83    
84     typedef void (*addFuncType)(char*, off_t, uint32);
85    
86     char ftmp[365];
87     char fztmp[365];
88     char record_marker[127]; //record delimiter
89     int record_marker_len=1;
90     int num_recs;
91     int num_keys;
92    
93     off_t last_cdbfpos=0;
94    
95     int compact_plus; //shortcut key and
96     bool acc_mode=false;
97     bool acc_only=false;
98     bool do_compress=false; // compression used
99     bool fastq=false;
100 gpertea 15 bool gFastaSeq=false;
101     char keyDelim=0;
102 gpertea 8
103     FILE* zf=NULL; //compressed file handle
104    
105     //store just offset and record length
106     const char* defWordJunk="'\",`.(){}/[]!:;~|><+-";
107     char* wordJunk=NULL;
108     bool caseInsensitive=false; //case insensitive storage
109     bool useStopWords=false;
110     unsigned int numFields=0;
111     // we have fields[numFields-1]=MAX_UINT as defined in gcdb.h --
112     // as an indicator of taking every single token in the defline,
113     // or for open ended ranges (e.g. -f5- )
114     unsigned int fields[255]; //array of numFields field indices (1-based)
115     GHash<int> stopList;
116     //static int datalen=sizeof(uint32)+sizeof(off_t);
117    
118     char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key
119    
120     GCdbWrite* cdbidx;
121     addFuncType addKeyFunc;
122    
123     #define ERR_W_DBSTAT "Error writing the database statististics!\n"
124    
125     void die_write(const char* fname) {
126     GError("Error: cdbhash was unable to write into file %s\n",
127     fname);
128    
129     }
130    
131     void die_read(const char* infile) {
132     GError("Error: cdbhash was unable to read the input file %s\n",infile);
133     }
134    
135    
136     void die_readformat(const char* infile) {
137     GError("Error: bad format for file %s; is it a fastA file?\n",
138     infile);
139     }
140    
141 gpertea 15 void die_gseqformat(const char* seqname) {
142     GError("Error: invalid FASTA sequence format for %s (not uniform line length)\n",
143     seqname);
144     }
145 gpertea 8
146 gpertea 15 void die_fastqformat(const char* seqid, int seqlen, int qvlen) {
147     GError("Error: invalid FASTQ sequence format for %s (seqlen=%d, qvlen=%d)\n",
148     seqid);
149     }
150    
151    
152     bool add_cdbkey(char* key, off_t fpos, uint32 reclen, int16_t linelen=0, byte elen=0) {
153 gpertea 8
154     unsigned int klen=strlen(key);
155     if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true;
156     if (klen<1) {
157 gpertea 15 /* GMessage("Warning: zero length key found following key '%s'\n",
158 gpertea 8 lastKey);
159 gpertea 15 */
160 gpertea 8 return false;
161     }
162     //------------ adding record -----------------
163     num_keys++;
164     strncpy(lastKey, key, MAX_KEYLEN-1);
165     lastKey[MAX_KEYLEN-1]='\0';
166     if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset
167     uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32
168 gpertea 15 /*
169     if (linelen>0) {
170     //record with line len
171     CIdxSeqData recdata;
172     recdata.fpos=gcvt_offt(&v);
173     recdata.reclen=gcvt_uint(&reclen);
174     recdata.linelen=gcvt_int16(&linelen);
175     recdata.elen=elen;
176     if (cdbidx->addrec(key,klen,(char*)&recdata,IdxSeqDataSIZE)==-1)
177     GError("Error adding cdb record with key '%s'\n",key);
178     }
179    
180     else { //plain record */
181     CIdxData recdata;
182     recdata.fpos=gcvt_offt(&v);
183     recdata.reclen=gcvt_uint(&reclen);
184     if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1)
185     GError("Error adding cdb record with key '%s'\n",key);
186     //}
187 gpertea 8 }
188 gpertea 15 else {//32 bit file offset is enough
189     /*
190     if (linelen>0) {
191     CIdxSeqData32 recdata;
192     uint32 v=(uint32) fpos;
193     recdata.fpos=gcvt_uint(&v);
194     recdata.reclen=gcvt_uint(&reclen);
195     recdata.linelen=gcvt_int16(&linelen);
196     recdata.elen=elen;
197     if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
198     GError("Error adding cdb record with key '%s'\n",key);
199     }
200     else { */
201     CIdxData32 recdata;
202     uint32 v=(uint32) fpos;
203     recdata.fpos=gcvt_uint(&v);
204     recdata.reclen=gcvt_uint(&reclen);
205     if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
206     GError("Error adding cdb record with key '%s'\n",key);
207     //}
208 gpertea 8 }
209     last_cdbfpos=fpos;
210     return true;
211     }
212    
213     //default indexing: key directly passed --
214     // as the first space delimited token
215     void addKey(char* key, off_t fpos,
216     uint32 reclen) {
217     num_recs++;
218     add_cdbkey(key, fpos, reclen);
219     if (caseInsensitive) {
220     char* lckey=loCase(key);
221     if (strcmp(lckey, key)!=0)
222     add_cdbkey(lckey, fpos, reclen);
223     GFREE(lckey);
224     }
225     }
226    
227     //the whole defline is passed
228     void addKeyMulti(char* defline,
229     off_t fpos, uint32 reclen) {
230     char* p=defline;
231     unsigned int fieldno=0;
232     char* pn;
233     num_recs++;
234     bool stillParsing=true;
235     unsigned int fidx=0; //index in fields[] array
236     while (stillParsing) {
237     while ((*p)==' ' || (*p)=='\t') p++;
238     if (*p == '\0') break;
239     //skip any extraneous characters at the beginning of the token
240     while (chrInStr(*p, wordJunk)) p++;
241     //skip any padding spaces or other extraneous characters
242     //at the beginning of the word
243     if (*p == '\0') break;
244     pn=p;
245     while (*pn!='\0' && !isspace(*pn)) pn++;
246     //found next space or end of string
247     fieldno++;
248     while (fields[fidx]<fieldno && fidx<numFields-1) fidx++;
249     //GMessage("> p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields,
250     // fieldno, fields[numFields-1], pn+1);
251     stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1]));
252     char* pend = pn-1; //pend is on the last non-space in the current token
253     //--strip the ending junk, if any
254     while (chrInStr(*pend, wordJunk) && pend>p) pend--;
255     if (pend<pn-1) *(pend+1)='\0';
256     else *pn='\0';
257    
258     if (strlen(p)>0) {
259     if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) {
260     if (useStopWords && stopList.hasKey(p)) {
261     p=pn+1;
262     continue;
263     }
264     //--- store this key with the same current record data:
265     add_cdbkey(p, fpos, reclen);
266     //---storage code ends here
267     if (caseInsensitive) {
268     char* lcp=loCase(p);
269     if (strcmp(lcp,p)!=0)
270     add_cdbkey(lcp, fpos, reclen);
271     GFREE(lcp);
272     }
273     }
274     //if (isEnd) break; //if all the token were stored
275     }
276     p=pn+1;//p is now on the next token's start
277     } //token parsing loop
278     }
279    
280    
281     int qcmpInt(const void *p1, const void *p2) {
282     //int n1=*((int*)p1);
283     //int n2=*((int*)p2);
284     const unsigned int *a = (unsigned int *) p1;
285     const unsigned int *b = (unsigned int *) p2;
286     if (*a < *b) return -1;
287     else if (*a > *b) return 1;
288     else return 0;
289     }
290    
291    
292     char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) {
293     if (pstart==NULL || *pstart==0) return NULL;
294     bool hasDigits=false;
295     char* pend=pstart;
296     end_acc=NULL; //end of first accession
297     accst=NULL;
298     while (*pstart=='|') pstart++;
299     for(char* p=pstart;;p++) {
300     if (hasDigits==false && *p>='0' && *p<='9') hasDigits=true;
301     /* if (*p==0) { //end of seq_name
302     pend=p; //doesn't matter if it's accession or "db"-like
303     break;
304     }*/
305     if (*p=='|' || *p==0) {
306     int curlen=p-pstart;
307     if (*p==0 || (hasDigits && curlen>3) || curlen>7 || accst!=NULL) {//previous token was an accession
308     pend=p; //advance pend
309     if (end_acc==NULL) end_acc=p;
310     if (accst==NULL) accst=pstart;
311     break;
312     }
313     else { //first pipe char or no digits
314     accst=p+1;
315     }
316     hasDigits=false;//reset flag
317     } // | separator
318     }
319     if (pend!=pstart) return pend;
320     else return NULL;
321     }
322    
323    
324    
325 gpertea 15 char* nextKeyDelim(char* pstart, char* pmax) {
326     if (pstart==NULL || pstart>=pmax) return NULL;
327     char* p=pstart;
328     while (*p!=keyDelim && p!=pmax && *p!=0) p++;
329     return p;
330     }
331 gpertea 8
332 gpertea 15 char* endSpToken(char* str) {
333 gpertea 8 if (str==NULL) return NULL;
334     char* p=str;
335 gpertea 15 //while (*p!=' ' && *p!='\t' && *p!='\v' && *p!=0) p++;
336     while (!isspace(*p) && *p!=0) p++;
337 gpertea 8 *p=0;
338     return p;
339     }
340    
341     #define NRDB_CHARSEP "\1\2\3\4"
342     #define NRDB_STRSEP "^|^"
343    
344     //nrdbp is positioned at the end of current nrdb concatenated
345     //defline, or NULL if there is no more
346     inline void NRDB_Rec(char* &nrdbp, char* defline) {
347     nrdbp=strpbrk(defline, NRDB_CHARSEP);
348     if (nrdbp==NULL) {
349     nrdbp=strstr(defline, NRDB_STRSEP);
350     if (nrdbp!=NULL) {
351     *nrdbp='\0';
352     nrdbp+=2;
353     }
354     }
355     else *nrdbp='\0';
356     }
357    
358     //-c/-C/-a/-A indexing: key up to the first space or second '|'
359     //receives the full defline
360     void addKeyCompact(char* defline,
361     off_t fpos, uint32 reclen) {
362     //we got the first token found on the defline
363     num_recs++;
364     char* nrdb_end;
365     //breaks defline at the next nrdb concatenation point
366     NRDB_Rec(nrdb_end, defline);
367     //isolate the first token in this nrdb concatenation
368 gpertea 15 char* token_end=endSpToken(defline); //this puts 0 at the first space encountered
369 gpertea 8 if (!compact_plus) { //shortcut key
370     //only the first db|accession construct will be indexed, if found
371     char* end_acc1=NULL; //end of first accession
372     char* acc1st=NULL;
373     char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st);
374     if (end_acc1!=NULL) { //has acceptable shortcut
375     *end_acc1=0;
376     add_cdbkey(defline, fpos, reclen);
377     return;
378     }
379     if (dbacc_end!=NULL) {
380     *dbacc_end=0;
381     add_cdbkey(defline, fpos, reclen);
382     return;
383     }
384     //store this whole non-space token as key:
385     add_cdbkey(defline, fpos, reclen);
386     return;
387     }
388     //from now on only -C/-a/-A treatment:
389     for(;;) {
390     //defline is on the first token
391     if (strlen(defline)>0) //add whole non-space token as the "full key"
392     add_cdbkey(defline, fpos, reclen);
393     //add the db|accession constructs as keys
394     char* dbacc_start=defline;
395     char* firstacc_end=NULL;
396     char* accst=NULL;
397     char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
398     while (dbacc_end!=NULL) {
399     if (firstacc_end!=NULL && firstacc_end<dbacc_end) {
400     char c=*firstacc_end;
401     *firstacc_end=0;
402     if (!acc_only)
403     add_cdbkey(dbacc_start, fpos, reclen);
404     if (acc_mode) {
405     add_cdbkey(accst, fpos, reclen);
406     }
407     *firstacc_end=c;
408     }
409     if (dbacc_start==defline && dbacc_end==token_end) {
410     if (acc_mode && accst!=NULL)
411     add_cdbkey(accst, fpos, reclen);
412     break; //the whole seq_name was only one db entry
413     }
414     *dbacc_end=0; //end key here
415     if (!acc_only)
416     add_cdbkey(dbacc_start, fpos, reclen);
417     if (acc_mode)
418     add_cdbkey(accst, fpos, reclen);
419     if (dbacc_end==token_end)
420     break; //reached the end of this whole seq_name (non-space token)
421     dbacc_start=dbacc_end+1;
422     firstacc_end=NULL;
423     dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
424     }
425     // -- get to next concatenated defline, if any:
426     if (compact_plus && nrdb_end!=NULL) {
427     defline=nrdb_end+1; //look for the next nrdb concatenation
428     NRDB_Rec(nrdb_end, defline);
429     //isolate the first token in this nrdb record
430 gpertea 15 token_end=endSpToken(defline);
431 gpertea 8 }
432     else break;
433     } //for
434     }
435    
436 gpertea 15 void addKeyDelim(char* defline,
437     off_t fpos, uint32 reclen) {
438     //we got the first token found on the defline
439     num_recs++;
440     char* nrdb_end;
441     //breaks defline at the next nrdb concatenation point
442     NRDB_Rec(nrdb_end, defline);
443     //isolate the first space-delimited token in this nrdb concatenation
444     char* token_end=endSpToken(defline);
445     for(;;) {
446     //defline is on the first token
447     if (strlen(defline)==0) break;
448     //add the db|accession constructs as keys
449     char* k_start=defline;
450     char* k_end=NULL;
451     while ((k_end=nextKeyDelim(k_start, token_end))!=NULL) {
452     *k_end=0;
453     add_cdbkey(k_start, fpos, reclen);
454     k_start=k_end+1;
455     }
456     // -- get to next concatenated defline, if any:
457     if (nrdb_end!=NULL) {
458     defline=nrdb_end+1; //look for the next nrdb concatenation
459     NRDB_Rec(nrdb_end, defline);
460     //isolate the first space-delimited token in this nrdb record
461     token_end=endSpToken(defline);
462     }
463     else break;
464     } //for
465     }
466    
467 gpertea 8 int readWords(FILE* f, GHash<int>& xhash) {
468     int c;
469     int count=0;
470     char name[256];
471     int len=0;
472     while ((c=getc(f))!=EOF) {
473     if (isspace(c) || c==',' || c==';') {
474     if (len>0) {
475     name[len]='\0';
476     xhash.Add(name, new int(1));
477     count++;
478     len=0;
479     }
480     continue;
481     }
482     //a non-space
483     name[len]=(char) c;
484     len++;
485     if (len>255) {
486     name[len-1]='\0';
487     GError("Error reading words file: token too long ('%s') !\n",name);
488     }
489     }
490     if (len>0) {
491     name[len]='\0';
492     xhash.Add(name, new int(1));
493     count++;
494     }
495     return count;
496     }
497    
498    
499    
500     //========================== MAIN ===============================
501     int main(int argc, char **argv) {
502     FILE* f_read=NULL;
503     off_t fdbsize;
504     int ch;
505     char* zfilename;
506     char* fname;
507     char* marker; //record marker
508     int maxkeys=0;
509 gpertea 15 int multikey=0;
510 gpertea 8 record_marker[0]='>';
511     record_marker[1]=0;
512 gpertea 15 GArgs args(argc, argv, "icvDQCaAmn:o:r:z:w:f:s:d:");
513 gpertea 8 int e=args.isError();
514     if (e>0)
515     GError("%s Invalid argument: %s\n", USAGE, argv[e] );
516     if (args.getOpt('v')!=NULL) {
517     printf("%s\n",VERSION);
518     return 0;
519     }
520     fastq = (args.getOpt('Q')!=NULL);
521 gpertea 15 gFastaSeq=(args.getOpt('G')!=NULL);
522     if (fastq && gFastaSeq)
523     GError("Error: options -Q and -G are mutually exclusive.\n");
524     if (args.getOpt('D')) keyDelim='|';
525     else if (args.getOpt('d')!=NULL) keyDelim=args.getOpt('d')[0];
526 gpertea 8 multikey = (args.getOpt('m')!=NULL);
527     if (multikey) {
528     fields[numFields]=1;
529     numFields++;
530     fields[numFields]=MAX_UINT;
531     numFields++;
532     }
533     caseInsensitive = (args.getOpt('i')!=NULL);
534     acc_only=(args.getOpt('a')!=NULL);
535     acc_mode=(acc_only || args.getOpt('A')!=NULL);
536     compact_plus=(args.getOpt('C')!=NULL || acc_mode);
537     wordJunk = (char *)args.getOpt('s');
538     if (wordJunk==NULL) wordJunk=(char*)defWordJunk;
539     int compact=(args.getOpt('c')!=NULL || compact_plus);
540 gpertea 15 if ((compact && multikey) || (multikey && keyDelim)) {
541     GError("%s Error: invalid flag combination.\n", USAGE);
542 gpertea 8 }
543 gpertea 15 char* argfields = (char*)args.getOpt('f');
544 gpertea 8 char* s = (char*)args.getOpt('n');
545     if (s!=NULL) {
546     maxkeys = atoi(s);
547 gpertea 15 if (maxkeys>1 && (compact || multikey || keyDelim || argfields!=NULL))
548     GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE);
549 gpertea 8 multikey=1;
550     numFields=maxkeys;
551     if (numFields>254) GError(ERR_TOOMANYFIELDS);
552     for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i;
553     }
554 gpertea 15
555 gpertea 8 if (argfields!=NULL) { //parse all the field #s
556 gpertea 15 if (multikey || keyDelim || compact)
557     GError("%s Error: invalid options (-m, -c/C, -n, -D/-d and -f are exclusive)\n", USAGE);
558 gpertea 8 char* pbrk;
559     int prevnum=0;
560     char prevsep='\0';
561     numFields=0;
562     char sep;
563     char *p=argfields;
564     do {
565     pbrk=strpbrk(p,",-");
566     if (pbrk==NULL) {
567     sep='\0';
568     pbrk=p+strlen(p);
569     if (prevsep == '-' && *p=='\0' && prevnum>0) {
570     //open ended range -- ending with '-'
571     fields[numFields]=prevnum;
572     numFields++;
573     if (numFields>253) GError(ERR_TOOMANYFIELDS);
574     fields[numFields]=MAX_UINT;
575     numFields++;
576     //GMessage("--- stored %d, %d\n",prevnum, MAX_UINT);
577     break;
578     }// ending with '-'
579     } // '\0'
580     else { sep=*pbrk; *pbrk = '\0'; }
581     int num = atoi(p);
582     if (num<=0 || num>254 )
583     GError("%s Error: invalid syntax for -f option.\n", USAGE);
584     if (prevsep == '-') { //store a range
585     for (int i=prevnum;i<=num;i++) {
586     fields[numFields]=i;
587     numFields++;
588     if (numFields>254) GError(ERR_TOOMANYFIELDS);
589     }
590     }
591     else if (sep!='-') {
592     fields[numFields]=num;
593     numFields++;
594     if (numFields>254) GError(ERR_TOOMANYFIELDS);
595     }
596    
597     prevsep=sep;
598     prevnum=num;
599     p=pbrk+1;
600     } while (sep != '\0'); //range parsing loop
601     if (numFields<=0 || numFields>254 )
602     GError("%s Error at parsing -f option.\n", USAGE);
603     //GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields);
604     qsort(fields, numFields, sizeof(fields[0]), &qcmpInt);
605     multikey=1;
606     /*-- --------debug:
607     for (unsigned int i=0;i<numFields-1;i++) {
608     GMessage("%d,", fields[i]);
609     }
610     GMessage("%d\n",fields[numFields-1]);
611     exit(0); */
612     } //fields
613     if (fastq) {
614     record_marker[0]='@';
615     record_marker_len=1;
616     }
617     if (args.getOpt('r')!=NULL) {//non-FASTA record delimiter?
618     if (fastq) {
619     GMessage("Option -r ignored because -Q was given (-Q sets delimiter to '@')\n");
620     }
621 gpertea 15 else {
622 gpertea 8 marker=(char*)args.getOpt('r'); //
623     int v=0;
624     if (strlen(marker)>126)
625     GError("Error: the specified record delimiter is too long. "
626     "Maximum accepted is 126\n");
627     //special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves
628 gpertea 15 if (strlen(marker)==4 && (marker[0]=='\\' || (marker[0]=='0' && toupper(marker[1])=='X') )) {
629 gpertea 8 if (marker[0]=='\\') {
630     marker++;
631     v=strtol(marker, NULL, 8);
632     }
633     else v=strtol(marker, NULL, 16);
634     if (v==0 || v>255)
635     GError("Invalid record delimiter: should be only one character,\n"
636     "'\\NNN' (octal value), '0xNN' (hexadecimal value)");
637     record_marker[0]=v;
638     record_marker_len=1;
639     }
640     else {
641     strcpy(record_marker, marker);
642     record_marker_len=strlen(record_marker);
643     }
644     }
645     }
646     char* stopwords=(char*)args.getOpt('w'); //stop words filename given?
647     if (stopwords!=NULL) {
648     FILE* fstopwords=NULL;
649     if ((fstopwords=fopen(stopwords, "r"))==NULL)
650     GError("Cannot open stop words file '%s'!\n", stopwords);
651     int c=readWords(fstopwords, stopList);
652     GMessage("Loaded %d stop words.\n", c);
653     fclose(fstopwords);
654     useStopWords=(c>0);
655     }
656     if ((zfilename=(char*)args.getOpt('z')) !=NULL) {
657     do_compress=true;
658     #ifndef ENABLE_COMPRESSION
659     GError("Error: compression requested but not enabled when cdbfasta was compiled\n")
660     #endif
661     strcpy(fztmp,zfilename);
662     strcat(fztmp,"_ztmp");
663     zf=fopen(fztmp,"wb");
664     if (zf==NULL)
665     GError("Error creating file '%s'\n'", fztmp);
666     }
667     char* outfile=(char*) args.getOpt('o');
668     int numfiles = args.startNonOpt();
669     if (numfiles==0)
670     GError("%sError: no fasta file given.\n", USAGE);
671     fname=(char*) args.nextNonOpt(); //first fasta file given
672     if (do_compress) { //-------- compression case -------------------
673     if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0)
674     f_read=stdin;
675     else f_read= fopen(fname, "rb");
676     if (f_read == NULL) die_read(fname);
677     fname=zfilename; //forget the input file name, keep the output
678     }
679     else {//
680     int fdread= open(fname, O_RDONLY|O_BINARY);
681     if (fdread == -1) die_read(fname);
682     struct stat dbstat;
683     fstat(fdread, &dbstat);
684     fdbsize=dbstat.st_size;
685     close(fdread);
686     f_read= fopen(fname, "rb");
687     if (f_read == NULL) die_read(fname);
688     }
689    
690     char idxfile[365];
691     if (outfile==NULL) {
692     if (do_compress) {
693     strcpy(ftmp, zfilename);
694     strcat(ftmp, ".cidx");
695     strcpy(idxfile, ftmp);
696     strcat(ftmp, "_tmp");
697     }
698     else {
699     strcpy(ftmp, fname);
700     strcat(ftmp, ".cidx");
701     strcpy(idxfile, ftmp);
702     strcat(ftmp, "_tmp");
703     }
704     //should add the process ID, time and user to make this unique?
705     }
706     else {
707     strcpy(ftmp, outfile);
708     strcpy(idxfile, outfile);
709     strcat(ftmp, "_tmp");
710     }
711    
712     cdbidx=new GCdbWrite(ftmp); //test if this was successful?
713 gpertea 15 if (keyDelim>0) {
714     addKeyFunc=&addKeyDelim;
715     }
716     else {
717     if (compact)
718 gpertea 8 addKeyFunc=&addKeyCompact;
719 gpertea 15 else if (multikey)
720 gpertea 8 addKeyFunc = &addKeyMulti;
721     else addKeyFunc = &addKey;
722 gpertea 15 }
723 gpertea 8 off_t recpos=0;
724     off_t r=0;
725     unsigned int recsize=0;
726     char* key=NULL;
727     bool fullDefline=(multikey || compact_plus);
728     GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE);
729     if (do_compress) { //---------------- compression case -------------
730     if (fastq) GError("Error: sorry, compression is not supported with fastq format\n");
731 gpertea 15 //TODO: use a clever block compression/indexing scheme with virtual offsets, like bgzf
732     // -- this should take care of the fastq compression
733 gpertea 8 fdbsize=0;
734     GCdbz cdbz(zf); // zlib interface
735     recpos=cdbz.getZRecPos();
736     while ((key=cdbz.compress(readbuf,record_marker))!=NULL) {
737     recsize=cdbz.getZRecSize();
738     if (!fullDefline) {
739     //find first space after the record_marker and place a '\0' there
740     for (int i=record_marker_len; key[i]!='\0';i++) {
741     if (isspace(key[i])) { key[i]='\0';break; }
742     }
743     }
744     addKeyFunc(key, recpos, recsize);
745     recpos = cdbz.getZRecPos();
746     }
747     remove(zfilename);
748     cdbz.compress_end();
749     fclose(zf);
750     //determine the size of this file:
751     int ftmp= open(fztmp, O_RDONLY|O_BINARY);
752     if (ftmp == -1) die_read(fztmp);
753     struct stat dbstat;
754     fstat(ftmp, &dbstat);
755     fdbsize=dbstat.st_size;
756     //rename it to the intended file name
757     if (rename(fztmp,zfilename) != 0) {
758     GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename);
759     perror("rename");
760     }
761     } //compression requested
762 gpertea 15 else { // not compressed -- plain (buffered) file access
763 gpertea 8 bool defline=false;
764 gpertea 15 bool seen_defline=false;
765 gpertea 8 int kbufsize=KBUFSIZE;
766     if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case
767     else { GMALLOC(key, 1024); }
768 gpertea 15 key[0]=0; //no keys have been parsed yet
769 gpertea 8 int kidx=-1;
770     num_recs=0;
771     num_keys=0;
772 gpertea 15 char prevch=0;
773     //first iteration -- for the beginning of file
774 gpertea 8 if (readbuf->peekCmp(record_marker, record_marker_len)==0) {
775     //new record start found (defline)
776     recpos=readbuf->getPos(); //new record pos
777     defline=true; //we're in defline
778 gpertea 15 seen_defline=true;
779 gpertea 8 readbuf->skip(record_marker_len);
780     kidx=0;
781     }//new record start
782 gpertea 15 int linecounter=0; //number of non-header lines for current record
783     int fq_recloc=(fastq) ? 0 : 4; //0=seq header line, 1=sequence string, 2=q-header line, 3=qvstring, 4=not fastq
784     int fq_lendata[4]={0,0,0,0}; //keep track of fastq seq len, qv len
785     int last_linelen=0; //length of last non-header line in a FASTA record
786     int first_linelen=0; //length of the first non-header line in a FASTA record
787     int last_eol_len=0;
788     bool mustbeLastLine=false;
789     bool wasEOL=false;
790 gpertea 8 while ((ch=readbuf->getch())>0) {
791 gpertea 15 bool isEOL=false;
792     if (ch=='\n' || ch=='\r') {
793     isEOL=true;
794     last_eol_len=1;
795     }
796     if (wasEOL && isEOL) {
797     //skipped double-char EoL (DOS or empty lines)
798     if (prevch=='\n' && ch=='\r') last_eol_len=2;
799     prevch=ch;
800     wasEOL=isEOL;
801     continue;
802     }
803     if (defline) {
804     // ===> in the header line (including its EOL)
805     if (isEOL) {
806     // --> end of header line
807     if (kidx>=0) { key[kidx]=0;kidx=-1; }
808     defline=false;
809     last_linelen=0;
810     first_linelen=0;
811     linecounter=0;
812     fq_recloc= (fastq) ? 0 : 4;
813     mustbeLastLine=false;
814     memset((void*)fq_lendata, 0, 4*sizeof(int));
815     prevch=ch;
816     wasEOL=isEOL;
817     continue;
818     }
819     else {
820     // --> within header, before EOL
821     if (kidx>=0) { //still parsing keys
822     if ((isspace(ch) || ch<31)
823     && fullDefline==false) {
824     key[kidx]=0;
825     kidx=-1;
826     wasEOL=false;
827     prevch=ch;
828     continue;
829     }
830     key[kidx]=(char)ch;
831     kidx++;
832     if (kidx>=kbufsize) {
833     kbufsize+=KBUFSIZE;
834     GREALLOC(key, kbufsize);
835 gpertea 8 }
836 gpertea 15 } //still parsing keys
837     } // <-- before EoL
838     } // <=== in the header line
839     else {
840     // ===> not in header line (could be "sequence block", or before first defline
841     if (isEOL) {
842     // <-- at EoL in "sequence" block (record body)
843     if (seen_defline) {
844     linecounter++; //counting sequence lines
845     if (gFastaSeq && linecounter>1) {
846     if (last_linelen>first_linelen)
847     die_gseqformat(key);
848     if (last_linelen<first_linelen)
849     mustbeLastLine=true;
850     }
851     }
852     } // --> at EoL
853     else {
854     // <-- before EoL
855     // could be a new header, should we check?
856     if (wasEOL) {
857     // - start of a new line
858     bool newRecStart=false;
859     bool checkNewRec = (!fastq || fq_lendata[1]<=fq_lendata[3]);
860     if (checkNewRec && ch==record_marker[0]) {
861     newRecStart=(record_marker_len>1) ?
862     (readbuf->peekCmp(&record_marker[1], record_marker_len-1)==0) :
863     true;
864     if (newRecStart) {
865     // new record start (new header line coming up)
866     seen_defline=true;
867     recsize = readbuf->getPos()-recpos-1-last_eol_len; //previous recsize
868     if (recsize>(off_t)(record_marker_len+1) && key[0]!='\0') {
869     //add previous record
870     //TODO: validate gFastaSeq or fastq here?
871     if (fastq && fq_lendata[1]!=fq_lendata[3])
872     die_fastqformat(key, fq_lendata[1], fq_lendata[3]);
873     addKeyFunc(key, recpos, recsize);
874     }
875     recpos=readbuf->getPos()-1; //new record pos (after reading this EOL)
876     if (record_marker_len>1)
877     readbuf->skip(record_marker_len-1); //skip past the record marker
878     defline=true; //we'll be in the header line in the next iteration
879     linecounter=0;
880     mustbeLastLine=false;
881     kidx=0; //reset keys
882     prevch=record_marker[record_marker_len-1];
883     wasEOL=(prevch=='\n' || prevch=='\r');
884     continue;
885     } // new record start
886     } //checking for new record start (record_marker match)
887     if (!seen_defline) {
888     prevch=ch;
889     wasEOL=isEOL;
890     continue;
891     }
892     if (mustbeLastLine && !newRecStart)
893     die_gseqformat(key);
894     // start of a new line in the "sequence" block
895     last_linelen=0;
896     mustbeLastLine=false;
897     if (fq_recloc<4) {
898     //take care of multi-line fastq parsing
899     if (fq_recloc==1 && ch=='+') {
900     fq_recloc=2; //q-header line
901     }
902     else if (fq_recloc==0 || fq_recloc==2) fq_recloc++;
903     } //fastq parsing
904     } // - start of a new non-header line (wasEoL)
905     if (!seen_defline) {
906     prevch=ch;
907     wasEOL=isEOL;
908     continue;
909     }
910     if (fq_recloc<4) {
911     fq_lendata[fq_recloc]++;
912     }
913     last_linelen++;
914     if (linecounter==0) first_linelen++;
915     } // --> record body, before EoL
916     } // <=== not in header line
917    
918     prevch=ch;
919     wasEOL=isEOL;
920     }//while getch
921 gpertea 8 recsize=readbuf->getPos()-recpos;
922     if (recsize>0) {//add last record, if there
923 gpertea 15 if (prevch=='\n' || prevch=='\r') recsize-=last_eol_len;
924 gpertea 8 if (fullDefline && kidx>0) {//close the defline string
925 gpertea 15 if (prevch!='\n' && prevch!='\r') kidx++;
926 gpertea 8 key[kidx-1]='\0';
927     }
928     addKeyFunc(key, recpos, recsize);
929     linecounter=0;
930     //GMessage("adding key=%s\n",key);
931     }
932     delete readbuf;
933     }
934     if (f_read!=stdin) fclose(f_read);
935     if (cdbidx->finish() == -1) die_write("");
936    
937     // === add some statistics at the end of the cdb index file!
938     r=lseek(cdbidx->getfd(), 0, SEEK_END);
939     cdbInfo info;
940     memcpy((void*)info.tag, (void*)"CDBX", 4);
941     info.idxflags=0;
942     if (multikey) info.idxflags |= CDBMSK_OPT_MULTI;
943     if (do_compress) {
944     info.idxflags |= CDBMSK_OPT_COMPRESS;
945     GMessage("Input data were compressed into file '%s'\n",fname);
946     }
947     if (compact) {
948     if (compact_plus)
949     info.idxflags |= CDBMSK_OPT_CADD;
950     else
951     info.idxflags |= CDBMSK_OPT_C;
952     }
953 gpertea 15 if (gFastaSeq) {
954     info.idxflags |= CDBMSK_OPT_GSEQ;
955     }
956 gpertea 8 info.num_records=gcvt_uint(&num_recs);
957     info.num_keys=gcvt_uint(&num_keys);
958     info.dbsize=gcvt_offt(&fdbsize);
959     info.idxflags=gcvt_uint(&info.idxflags);
960     int nlen=strlen(fname);
961     info.dbnamelen=gcvt_uint(&nlen);
962     r=write(cdbidx->getfd(), fname, nlen);
963     if (r!=nlen)
964     GError(ERR_W_DBSTAT);
965     r=write(cdbidx->getfd(), &info, cdbInfoSIZE);
966     if (r!=cdbInfoSIZE)
967     GError(ERR_W_DBSTAT);
968     delete cdbidx;
969     GFREE(key);
970     remove(idxfile);
971     if (rename(ftmp,idxfile) == -1)
972     GError("Error: unable to rename %s to %s",ftmp,idxfile);
973     GMessage("%d entries from file %s were indexed in file %s\n",
974     num_recs, fname, idxfile);
975     return 0;
976     }