ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFastaFile.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 3 months ago) by gpertea
File size: 21884 byte(s)
Log Message:
added my gclib source files

Line User Rev File contents
1 gpertea 2 #ifndef GFASTAFILE_H
2     #define GFASTAFILE_H
3     #include <stdio.h>
4     #include "gdna.h"
5    
6     #define CAPINC 64
7     #define SEQCAPINC 256
8     #define DEF_FASTA_DELIM ">"
9    
10     class FastaSeq { /* fasta record storage */
11     public:
12     int id_cap; /* allocated size of the sequence name string*/
13     char *id; /* id only, up to first space */
14     int namelen; // real length of seq name
15     char *descr; /* any comment on the defline, after the first space */
16     int d_cap; /* allocated size of the description */
17     int descrlen; /* real length of the description */
18     //-------actual sequence :
19     int s_cap; /* allocated length of the sequence string */
20     int len; /* the actual string length of seq */
21     char* seq; /* the sequence buffer itself */
22     //----
23     FastaSeq(char* cname, char* cdescr=NULL, char* cseq=NULL) {
24     //ntCompTableInit();
25     if (cname==NULL) {
26     FastaSeq();
27     return;
28     }
29     int l=strlen(cname);
30     GMALLOC(id, l+1);strcpy(id,cname);
31     id_cap=l+1;
32     namelen=l;
33     if (cdescr==NULL) {
34     GMALLOC(descr, CAPINC);
35     descr[0]='\0';
36     d_cap=CAPINC;
37     descrlen=0;
38     }
39     else {//copy given description
40     l=strlen(cdescr);
41     GMALLOC(descr, l+1);
42     strcpy(descr,cdescr);
43     d_cap=l+1;
44     descrlen=l;
45     }
46     if (cseq==NULL) {
47     GMALLOC(seq, SEQCAPINC);
48     seq[0]='\0';
49     len=0;
50     s_cap=SEQCAPINC;
51     }
52     else {
53     l=strlen(cseq);
54     GMALLOC(seq, l+1);
55     strcpy(seq,cseq);
56     len=l;
57     s_cap=l+1;
58     }
59     }
60     void init(int seqalloc=0) {
61     //ntCompTableInit();
62     GMALLOC(id, CAPINC);
63     id_cap=CAPINC;
64     namelen=0;
65     id[0]='\0';
66     GMALLOC(descr, CAPINC);
67     descr[0]='\0';
68     d_cap=CAPINC;
69     descrlen=0;
70     if (seqalloc<=0) {
71     s_cap=SEQCAPINC;
72     GMALLOC(seq, SEQCAPINC);
73     }
74     else {
75     s_cap=seqalloc;
76     GMALLOC(seq, seqalloc);
77     }
78     seq[0]='\0';
79     len=0;
80     }
81     FastaSeq(int seqalloc=0) {
82     init(seqalloc);
83     }
84     void clear() {
85     GFREE(id);id_cap=0;namelen=0;id=NULL;
86     GFREE(descr);d_cap=0;descrlen=0;descr=NULL;
87     GFREE(seq);s_cap=0;len=0;seq=NULL;
88     }
89     ~FastaSeq() {
90     clear();
91     }
92     int getNameLen() { return namelen; }
93     const char* getName() { return (const char*) id; }
94     const char* name() { return (const char*) id; }
95     const char* getSeqName() { return (const char*) id; }
96     const char* getId() { return (const char*) id; }
97     const char* getDescr() { return (const char*) descr; }
98     int getDescrLen() { return descrlen; }
99     const char* getSeq() { return (const char*) seq; }
100     int getSeqLen() { return len; }
101     void extendId(char c) {
102     if (namelen+1 >= id_cap) {
103     id_cap += CAPINC;
104     GREALLOC(id, id_cap);
105     }
106     id[namelen]= c;
107     namelen++;
108     }
109     void extendSeqName(char c) { extendId(c); }
110     void extendName(char c) { extendId(c); }
111     void extendDescr(char c) {
112     if (descrlen+1 >= d_cap) {
113     d_cap += CAPINC;
114     GREALLOC(descr, d_cap);
115     }
116     descr[descrlen]= c;
117     descrlen++;
118     }
119     void endId() { id[namelen]=0; }
120     void endName() { id[namelen]=0; }
121     void endSeqName() { id[namelen]=0; }
122     void endDescr() { descr[descrlen]=0; }
123     void endSeq() { seq[len]=0; }
124     void extendSeq(char c) {
125     if (len+1 >= s_cap) {
126     s_cap += SEQCAPINC;
127     GREALLOC(seq, s_cap);
128     }
129     seq[len]= c;
130     len++;
131     }
132     void compactIdMem() { if (namelen>0) {
133     GREALLOC(id, namelen+1); id_cap=namelen+1;
134     } }
135     void compactDescrMem() { if (descrlen>0) {
136     GREALLOC(descr, descrlen+1); d_cap=descrlen+1; } }
137     void compactSeqMem() { if (len>0) {
138     GREALLOC(seq, len+1); s_cap=len+1; } }
139     void compactMem() {
140     compactIdMem();
141     compactDescrMem();
142     compactSeqMem();
143     }
144     char* detachSeqPtr() { //such that the sequence allocated memory is no longer
145     // freed when the FastaSeq object is destroyed
146     // the returned pointer MUST be deallocated by the the user, later!
147     char* p=seq;
148     GMALLOC(seq, SEQCAPINC);
149     s_cap=SEQCAPINC;
150     len=0;
151     return p;
152     }
153     char* setSeqPtr(char* newseq, int newlen=0, int newcap=0) {
154     if (newlen==0) newlen=strlen(newseq);
155     if (newcap<=newlen) newcap=newlen+1;
156     GFREE(seq);
157     seq=newseq;
158     len=newlen;
159     s_cap=newcap;
160     return seq;
161     }
162     void reset() {// allocated space remains the same!
163     namelen=0;id[0]=0;
164     descrlen=0;descr[0]=0;
165     len=0;seq[0]=0;
166     }
167     //reverse-complement a nucleotide sequence:
168     void reverseComplement() {
169     if (len==0) return;
170     //ntCompTableInit();
171     reverseChars(seq,len);
172     for (int i=0;i<len;i++) seq[i]=ntComplement(seq[i]);
173     }
174     //printing fasta formatted sequence to a file stream
175     void fprint(FILE* fout, int line_len=60, bool defline=false) {
176     if (defline) {
177     if (descrlen>0) fprintf(fout, "%s %s\n", id, descr);
178     else fprintf(fout, ">%s\n", id);
179     }
180     int l=len;
181     char* p=seq;
182     while (l>0) {
183     int to_write=GMIN(line_len, l);
184     fwrite(p,1,to_write,fout);
185     fprintf(fout,"\n");
186     p+=line_len;
187     l-=line_len;
188     }
189     }
190     //
191     static void write(FILE *fh, char* seqid, char* descr, char* seq,
192     const int linelen=60, const int seqlen=0) {
193     int i, idlen, ilen;
194     //char *s;
195     //s = (seqid == NULL) ? (char*)"ANONYMOUS" : seqid;
196     // write header line only if given!
197     idlen=(seqid==NULL)? 0 : strlen(seqid);
198     if (idlen>0) {
199     if (*seqid != '>') putc('>', fh);
200     fwrite(seqid, 1, idlen, fh);
201     i=(descr==NULL)? 0 : strlen(descr);
202     if (i>0) {
203     putc(' ',fh);
204     fwrite(descr, 1, i, fh);
205     }
206     putc('\n', fh);
207     }
208     if (linelen>0) { //fixed line length
209     int len = (seqlen>0) ? seqlen : strlen(seq);
210     ilen=0;
211     for (i=0; i < len; i++, ilen++) {
212     if (ilen == linelen) {
213     putc('\n', fh);
214     ilen = 0;
215     }
216     putc(seq[i], fh);
217     } //for
218     putc('\n', fh);
219     }
220     else { //no line length limit
221     fprintf(fh, "\n%s\n", seq);
222     }
223     fflush(fh);
224     }
225    
226    
227     };
228    
229     typedef int charFunc(char c, int pos, FastaSeq* fseq); //char processing function
230     /* passes:
231     c = current sequence character (generally aminoacid or nucleotide)
232     pos = 0-based coordinate of the given character within the sequence
233     fseq = FastaSeq pointer (useful for retrieving sequence defline info)
234     the return value is not used yet
235     */
236    
237    
238     //(for reading/writing variable length records, etc.)
239     enum fileMode {
240     fmRead,
241     fmWrite
242     };
243    
244     class GFastaFile {
245     char* fname;
246     FILE* fh;
247     fileMode fmode;
248    
249     long int rec_fpos; //the input stream offset of the current record to be read
250     long int cur_fpos; //the input stream offset of the current byte to be read
251     uint seqcoord; //1-based coordinate of the current record's sequence reading position
252     //(updated by getSeqRange() mostly)
253     protected:
254     void bad_fastafmt() {
255     GError("Error parsing file '%s'. Not a Fasta file?\n", fname);
256     }
257     void check_eof(int c) {
258     if (c == EOF) bad_fastafmt();
259     }
260     public:
261     GFastaFile(const char* filename, fileMode filemode=fmRead) {
262     fh=NULL;
263     cur_fpos=0;
264     rec_fpos=0;
265     fmode=filemode;
266     seqcoord=0;
267     const char *mode=(filemode==fmRead) ? "rb" : "wb";
268     if (filename == NULL || filename[0]=='\0') {
269     fh = (filemode == fmRead) ? stdin : stdout;
270     fname=NULL;
271     }
272     else {
273     if ((fh = fopen(filename, mode)) == NULL)
274     GError("Cannot open file '%s'!", filename);
275     fname=Gstrdup(filename);
276     }
277     /*
278     GCALLOC(curseqid, CAPINC);
279     curseqidlen=CAPINC;
280     GCALLOC(curdescr, CAPINC);
281     curdescrlen=CAPINC;*/
282     }
283    
284     //attach a GFastaFile object to an already open handle
285     GFastaFile(FILE* fhandle, fileMode filemode=fmRead, const char* filename=NULL) {
286     fh=fhandle;
287     cur_fpos=ftell(fh);
288     fmode=filemode;
289     rec_fpos=cur_fpos;
290     seqcoord=0;
291     if (filename == NULL || filename[0]=='\0') {
292     fname=NULL;
293     }
294     else
295     fname=Gstrdup(filename);
296     }
297    
298    
299     void reset() {
300     if (fh!=NULL && fh!=stdout && fh!=stdin) {
301     fseek(fh,0L, SEEK_SET);
302     cur_fpos=0;
303     rec_fpos=0;
304     }
305     else GError("Cannot use GFastaFile::reset() on stdin, stdout or NULL handles.\n");
306     }
307    
308     void seek(int pos) {
309     if (fh!=NULL && fh!=stdout && fh!=stdin) {
310     fseek(fh, pos, SEEK_SET);
311     cur_fpos=pos;
312     seqcoord=0; //seqcoord agnostic after a seek
313     }
314     else GError("Cannot use GFastaFile::seek() on stdin, stdout or NULL handles.\n");
315     }
316    
317     ~GFastaFile() {
318     if (fh!=NULL && fh!=stdout && fh!=stdin) fclose(fh);
319     fh=NULL;
320     GFREE(fname);
321     /*GFREE(curseqid);
322     GFREE(curdescr);*/
323     }
324    
325     int getReadPos() { return cur_fpos; } /* returns current read position in the
326     input stream (can be used within callback) */
327     int ReadSeqPos() {return rec_fpos; } /* returns the input stream offset of the last fasta
328     record processed by getFastaSeq*/
329     bool readHeader(FastaSeq& seq) { return (readHeader(&seq)!=NULL); }
330     FastaSeq* readSeq(int seqalloc=0) {
331     //allocate a new FastaSeq, reads the next record and returns it
332     //caller is responsible for deallocating returned FastaSeq memory!
333     FastaSeq* r=readHeader(NULL, seqalloc);
334     int len=0;
335     char before=1; //newline before indicator
336     int c=-1;
337     //load the whole sequence in FastaSeq
338     while ((c = getc(fh)) != EOF && c != '>') {
339     cur_fpos++;
340     //if (isspace(c) || c<31)
341     if (c<=32) {
342     before = (c=='\n' || c=='\r')?1:0;
343     continue; /* skip spaces */
344     }
345     if (len >= r->s_cap-1) {
346     GREALLOC(r->seq, r->s_cap + SEQCAPINC);
347     r->s_cap+=SEQCAPINC;
348     }
349     r->seq[len] = c;
350     before=0;
351     len++;
352     }
353     r->seq[len] = '\0';
354     r->len=len;
355     return r;
356     }
357     FastaSeq* readHeader(FastaSeq* seq=NULL, int seqalloc=0) {
358     /* reads the Fasta sequence header
359     the first character must be '>' for this call, after any spaces,
360     if seq is NULL a new FastaSeq object is allocated and returned,
361     otherwise id and descr are updated */
362     seqcoord=0;
363     int* buflen;
364     int* buflenstr;
365     char** buf;
366     int before;
367     if (feof(fh)) return NULL;
368     int c = getc(fh);
369     if (c==EOF) return NULL;
370     cur_fpos++;
371     while (c!=EOF && c<=32) { c=getc(fh); cur_fpos++; }//skip spaces etc.
372     if (c == EOF) return NULL;
373     if (c != '>')
374     bad_fastafmt();
375     if (seq==NULL) seq=new FastaSeq(seqalloc);
376     else { seq->clear(); seq->init(seqalloc); }
377    
378     int len = 0; //chars accumulated so far
379     buflen=&(seq->id_cap);
380     buf=&(seq->id);
381     buflenstr=&(seq->namelen);
382     before=1;
383     while ((c = getc(fh)) != EOF) {
384     cur_fpos++;
385     if (c=='\n' || c=='\r') break;
386     if (len >= *buflen-1) {
387     GREALLOC(*buf, *buflen + CAPINC);
388     *buflen+=CAPINC;
389     }
390     if (before && (c<=32)) {
391     // space encountered => seq_name finished
392     before=0;
393     (*buf)[len]='\0';
394     *buflenstr=len;
395     buf=&seq->descr;
396     buflen=&seq->d_cap;
397     buflenstr=&seq->descrlen;
398     len=0;
399     if (c!=1) // special case, nrdb concatenation
400     continue; // skip this space
401     }
402     (*buf)[len]=c;
403     len++;
404     }
405     (*buf)[len]='\0'; /* terminate the comment string */
406     *buflenstr = len;
407     check_eof(c); /* it's wrong to have eof here */
408     seqcoord=1;
409     return (seq->namelen==0) ? NULL : seq;
410     }
411    
412     FastaSeq *getFastaSeq(bool& is_last, FastaSeq* seq, charFunc* callbackFn = NULL ) {
413     /* seq must be a pointer to a initialized FastaSeq structure
414     if seq is NULL, the sequence is not actually read,
415     but just skipped and the file pointer set accordingly, while
416     the returned "pointer" will not be a FastaSeq one but just NULL or not NULL
417     (depending if eof was encountered)
418     if callbackFn is NULL, the sequence is read entirely in memory in a FastaSeq.seq field
419     otherwise only the defline is parsed into FastaSeq::id and FastaSeq::descr but actual
420     sequence letters are passed one by one to the callback function
421     and the actual sequence is never stored in memory (unless the callback does it)
422     */
423     int c, len;
424     int before;
425     rec_fpos=cur_fpos;
426     len = 0; //chars accumulated so far
427     if (fh==NULL || feof(fh)) return NULL;
428     // -------- read the defline first
429     if (seq==NULL) { // navigate only! don't read/parse anything but the record delimiter
430     before=1;
431     while ((c = getc(fh)) != EOF && c != '\n' && c !='\r') cur_fpos++; // skip defline
432     if (c==EOF && cur_fpos<=rec_fpos+2) return NULL;
433     check_eof(c); /* it's wrong to have eof here! */
434     cur_fpos++; //to account for the '\n' read
435     /*----- read the sequence now: */
436     before=1; /* "newline before" flag */
437     while ((c = getc(fh)) != EOF && c != '>') {
438     cur_fpos++;
439     before = (c=='\n' || c=='\r') ? 1 : 0;
440     }
441     //we should end up at a '>' character here, or EOF
442     } /* fasta fmt navigation to next sequence, no seq storage */
443     else { // sequence storage:
444     if (!readHeader(seq)) {
445     is_last=true;
446     return NULL;
447     }
448     /*----- read the actual sequence now: */
449     len=0;
450     before=1; //newline before indicator
451     if (callbackFn==NULL) { //load the whole sequence in FastaSeq
452     while ((c = getc(fh)) != EOF && c != '>') {
453     cur_fpos++;
454     //if (isspace(c) || c<31)
455     if (c<=32) {
456     before = (c=='\n' || c=='\r')?1:0;
457     continue; /* skip spaces */
458     }
459     if (len >= seq->s_cap-1) {
460     GREALLOC(seq->seq, seq->s_cap + CAPINC);
461     seq->s_cap+=CAPINC;
462     }
463     seq->seq[len] = c;
464     before=0;
465     len++;
466     }
467     seq->seq[len] = '\0';
468     seq->len=len;
469     } /* sequence storage */
470     else { //use the callback for each letter, do not store the whole sequence in FastaSeq
471     while ((c = getc(fh)) != EOF && c != '>') {
472     cur_fpos++;
473     if (c<=32) {
474     before = (c=='\n' || c=='\r')?1:0;
475     continue; /* skip spaces within sequence*/
476     }
477     (*callbackFn)(c, len, seq); //call the user function for each letter
478     before=0;
479     len++;
480     }
481     seq->len=len;
482     } /* callback sequence reading (no storage)*/
483     } /* sequence parsing */
484     if (c=='>') {
485     if (!before) bad_fastafmt(); /* '>' must only be at start of line,
486     never within the sequence ! */
487     is_last=false; /* FALSE - not the last one */
488     ungetc(c, fh);
489     }
490     else is_last=true; /* TRUE - eof() here */
491     return ((seq==NULL) ? (FastaSeq*)fh : seq); //alwayws return non NULL here!
492     } //getFastaSeq
493    
494     //simplified call to ignore the is_last flag
495     FastaSeq *getFastaSeq(FastaSeq* seq, charFunc* callbackFn = NULL) {
496     bool b;
497     if (fh==NULL || feof(fh)) return NULL;
498     return getFastaSeq(b, seq, callbackFn);
499     }
500    
501    
502     uint seqSkip(uint slen, int& c){
503     //assumes the header was read !
504     //skip exactly slen characters in the actual aa or nt sequence
505     //(spaces are not counted)
506     uint skipacc=0;
507     while (skipacc<slen && ((c=getc(fh))!= EOF && c != '>')) {
508     cur_fpos++;
509     if (c<=32) continue; //skip spaces and other non-ASCII characters
510     seqcoord++;
511     skipacc++;
512     }
513     return skipacc; //may terminate prematurely
514     }
515    
516     /* read a sequence range from the current FASTA record
517     this is much faster when rcoord>=seqcoord (i.e. when sequence
518     ranges are read sequentially)
519     if rcoord>=seqcoord assumes the header has been read already!
520     Returns the actual length of the sequence returned (0 if rcoord>seq_length)
521     and updates seqcoord, cur_fpos accordingly (rec_fpos is unchanged)
522     */
523     uint getSeqRange(FastaSeq& seq, uint rcoord, uint rlen=0) {
524     int c;
525     uint len;
526     int before;
527     rec_fpos=cur_fpos;
528     if (!seqcoord || seqcoord>rcoord) {
529     // slow -- go back to the beginning of the record
530     seek(rec_fpos);
531     readHeader(&seq); //this will also reset seqcoord to 1
532     }
533     if (rcoord!=seqcoord) {
534     seqSkip(rcoord-seqcoord, c);
535     check_eof(c);
536     if (c=='>')
537     GError("Error: '>' character found while skipping through sequence!\n");
538     }
539     len = 0; //chars accumulated so far
540     seq.seq[0]='\0';
541     seq.len=0;
542     //----- read the actual subsequence now:
543     len=0;
544     before=1; //"newline before" flag
545     while ((c = getc(fh)) != EOF && c != '>') {
546     cur_fpos++;
547     if (c<=32) continue; // skip spaces
548     if (len >= (uint) (seq.s_cap-1)) {
549     GREALLOC(seq.seq, seq.s_cap + CAPINC);
550     seq.s_cap+=CAPINC;
551     }
552     seq.seq[len] = c;
553     len++;
554     seqcoord++;
555     if (rlen>0 && len==rlen) break;
556     }
557     seq.seq[len] = '\0';
558     seq.len=len;
559     if (c=='>') bad_fastafmt(); /* '>' must only be at start of line,
560     never within the sequence ! */
561     return len;
562     } //getSeqRange
563    
564     //only for writing
565     void putFastaSeq(FastaSeq *fa, const int linelen=60) {
566     writeFasta(fh, fa->id, fa->descr, fa->seq, linelen);
567     }
568    
569     static void writeFasta(FILE *fh, char* seqid, char* descr, char* seq, const int linelen=60, const int seqlen=0) {
570     FastaSeq::write(fh, seqid, descr, seq, linelen, seqlen);
571     }
572    
573     };
574    
575     // ------------- FASTA parser/handler ----
576     // REQUIRES the first character processed after init()
577     // to be the first character of the record delimiter
578     // (default: ">")
579    
580    
581     class GFastaCharHandler {
582     protected:
583     char* recdelim;
584     charFunc* seqCallBack;
585     bool in_delim;
586     int delim_pos;
587     bool in_seqname;
588     bool in_descr;
589     bool in_seq;
590     FastaSeq* rec;
591     unsigned int seq_pos;
592     void reset() {
593     in_delim=true;
594     delim_pos=0;
595     in_seqname=false;
596     in_descr=false;
597     in_seq=false;
598     seq_pos=0;
599     }
600     public:
601     GFastaCharHandler(char* recdel=DEF_FASTA_DELIM) {
602     reset();
603     rec=NULL;
604     recdelim=recdel;
605     seqCallBack=NULL;
606     }
607     GFastaCharHandler(charFunc* chrCallBack, FastaSeq* r=NULL, char* recdel=DEF_FASTA_DELIM) {
608     reset();
609     rec=r;
610     recdelim=recdel;
611     seqCallBack=chrCallBack;
612     if (rec!=NULL) rec->reset();
613     }
614     void init() {
615     init(rec, seqCallBack);
616     }
617     void init(charFunc* chrCallBack) {
618     init(rec,chrCallBack);
619     }
620     void init(FastaSeq* r) {
621     init(r,seqCallBack);
622     }
623     void init(FastaSeq* r, charFunc* chrCallBack) {
624     rec=r;
625     seqCallBack=chrCallBack;
626     if (rec==NULL)
627     GError("GFastaCharHandler::init() Error: cannot use NULL FastaSeq!\n");
628     rec->reset();
629     reset();
630     }
631     void done() {
632     if (rec==NULL)
633     GError("GFastaCharHandler::done() Error: cannot use NULL FastaSeq!\n");
634     rec->endId();
635     rec->endDescr();
636     rec->endSeq();
637     }
638    
639     //~GFastaCharHandler();
640    
641     void processChar(char c) {
642     if (in_delim) { //skip record delimiter -- but it must be there!
643     if (recdelim[delim_pos]!=c) {//the only way to detect an Id starting
644     in_seqname=true;
645     in_delim=false;
646     }
647     delim_pos++;
648     }
649     if (in_seqname) {
650     if (rec->namelen>0 && c<=32) {
651     //breaking out of seq_name
652     rec->endId();
653     if (c=='\n' || c=='\r') { //end defline
654     in_seqname=false;
655     in_seq=true;
656     }
657     else { //seqname break, not defline end
658     in_seqname=false;
659     in_descr=true;
660     }
661     } // seqname termination
662     else { //seqname continues
663     if (c>32) rec->extendId(c);
664     }
665     return;
666     } // in_seqname
667     if (in_descr) {
668     if (c=='\n' || c=='\r') { //end defline
669     rec->endDescr();
670     in_descr=false;
671     in_seq=true;
672     }
673     else rec->extendDescr(c);
674     return;
675     } // in_descr
676     if (in_seq && c>32) {
677     seq_pos++; // 1-based sequence position !
678     if (seqCallBack==NULL) rec->extendSeq(c);
679     else (*seqCallBack)(c,seq_pos,rec);
680     }
681     }
682    
683     };
684    
685    
686     #endif