ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFastaFile.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 5 months ago) by gpertea
File size: 21884 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #ifndef GFASTAFILE_H
2 #define GFASTAFILE_H
3 #include <stdio.h>
4 #include "gdna.h"
5
6 #define CAPINC 64
7 #define SEQCAPINC 256
8 #define DEF_FASTA_DELIM ">"
9
10 class FastaSeq { /* fasta record storage */
11 public:
12 int id_cap; /* allocated size of the sequence name string*/
13 char *id; /* id only, up to first space */
14 int namelen; // real length of seq name
15 char *descr; /* any comment on the defline, after the first space */
16 int d_cap; /* allocated size of the description */
17 int descrlen; /* real length of the description */
18 //-------actual sequence :
19 int s_cap; /* allocated length of the sequence string */
20 int len; /* the actual string length of seq */
21 char* seq; /* the sequence buffer itself */
22 //----
23 FastaSeq(char* cname, char* cdescr=NULL, char* cseq=NULL) {
24 //ntCompTableInit();
25 if (cname==NULL) {
26 FastaSeq();
27 return;
28 }
29 int l=strlen(cname);
30 GMALLOC(id, l+1);strcpy(id,cname);
31 id_cap=l+1;
32 namelen=l;
33 if (cdescr==NULL) {
34 GMALLOC(descr, CAPINC);
35 descr[0]='\0';
36 d_cap=CAPINC;
37 descrlen=0;
38 }
39 else {//copy given description
40 l=strlen(cdescr);
41 GMALLOC(descr, l+1);
42 strcpy(descr,cdescr);
43 d_cap=l+1;
44 descrlen=l;
45 }
46 if (cseq==NULL) {
47 GMALLOC(seq, SEQCAPINC);
48 seq[0]='\0';
49 len=0;
50 s_cap=SEQCAPINC;
51 }
52 else {
53 l=strlen(cseq);
54 GMALLOC(seq, l+1);
55 strcpy(seq,cseq);
56 len=l;
57 s_cap=l+1;
58 }
59 }
60 void init(int seqalloc=0) {
61 //ntCompTableInit();
62 GMALLOC(id, CAPINC);
63 id_cap=CAPINC;
64 namelen=0;
65 id[0]='\0';
66 GMALLOC(descr, CAPINC);
67 descr[0]='\0';
68 d_cap=CAPINC;
69 descrlen=0;
70 if (seqalloc<=0) {
71 s_cap=SEQCAPINC;
72 GMALLOC(seq, SEQCAPINC);
73 }
74 else {
75 s_cap=seqalloc;
76 GMALLOC(seq, seqalloc);
77 }
78 seq[0]='\0';
79 len=0;
80 }
81 FastaSeq(int seqalloc=0) {
82 init(seqalloc);
83 }
84 void clear() {
85 GFREE(id);id_cap=0;namelen=0;id=NULL;
86 GFREE(descr);d_cap=0;descrlen=0;descr=NULL;
87 GFREE(seq);s_cap=0;len=0;seq=NULL;
88 }
89 ~FastaSeq() {
90 clear();
91 }
92 int getNameLen() { return namelen; }
93 const char* getName() { return (const char*) id; }
94 const char* name() { return (const char*) id; }
95 const char* getSeqName() { return (const char*) id; }
96 const char* getId() { return (const char*) id; }
97 const char* getDescr() { return (const char*) descr; }
98 int getDescrLen() { return descrlen; }
99 const char* getSeq() { return (const char*) seq; }
100 int getSeqLen() { return len; }
101 void extendId(char c) {
102 if (namelen+1 >= id_cap) {
103 id_cap += CAPINC;
104 GREALLOC(id, id_cap);
105 }
106 id[namelen]= c;
107 namelen++;
108 }
109 void extendSeqName(char c) { extendId(c); }
110 void extendName(char c) { extendId(c); }
111 void extendDescr(char c) {
112 if (descrlen+1 >= d_cap) {
113 d_cap += CAPINC;
114 GREALLOC(descr, d_cap);
115 }
116 descr[descrlen]= c;
117 descrlen++;
118 }
119 void endId() { id[namelen]=0; }
120 void endName() { id[namelen]=0; }
121 void endSeqName() { id[namelen]=0; }
122 void endDescr() { descr[descrlen]=0; }
123 void endSeq() { seq[len]=0; }
124 void extendSeq(char c) {
125 if (len+1 >= s_cap) {
126 s_cap += SEQCAPINC;
127 GREALLOC(seq, s_cap);
128 }
129 seq[len]= c;
130 len++;
131 }
132 void compactIdMem() { if (namelen>0) {
133 GREALLOC(id, namelen+1); id_cap=namelen+1;
134 } }
135 void compactDescrMem() { if (descrlen>0) {
136 GREALLOC(descr, descrlen+1); d_cap=descrlen+1; } }
137 void compactSeqMem() { if (len>0) {
138 GREALLOC(seq, len+1); s_cap=len+1; } }
139 void compactMem() {
140 compactIdMem();
141 compactDescrMem();
142 compactSeqMem();
143 }
144 char* detachSeqPtr() { //such that the sequence allocated memory is no longer
145 // freed when the FastaSeq object is destroyed
146 // the returned pointer MUST be deallocated by the the user, later!
147 char* p=seq;
148 GMALLOC(seq, SEQCAPINC);
149 s_cap=SEQCAPINC;
150 len=0;
151 return p;
152 }
153 char* setSeqPtr(char* newseq, int newlen=0, int newcap=0) {
154 if (newlen==0) newlen=strlen(newseq);
155 if (newcap<=newlen) newcap=newlen+1;
156 GFREE(seq);
157 seq=newseq;
158 len=newlen;
159 s_cap=newcap;
160 return seq;
161 }
162 void reset() {// allocated space remains the same!
163 namelen=0;id[0]=0;
164 descrlen=0;descr[0]=0;
165 len=0;seq[0]=0;
166 }
167 //reverse-complement a nucleotide sequence:
168 void reverseComplement() {
169 if (len==0) return;
170 //ntCompTableInit();
171 reverseChars(seq,len);
172 for (int i=0;i<len;i++) seq[i]=ntComplement(seq[i]);
173 }
174 //printing fasta formatted sequence to a file stream
175 void fprint(FILE* fout, int line_len=60, bool defline=false) {
176 if (defline) {
177 if (descrlen>0) fprintf(fout, "%s %s\n", id, descr);
178 else fprintf(fout, ">%s\n", id);
179 }
180 int l=len;
181 char* p=seq;
182 while (l>0) {
183 int to_write=GMIN(line_len, l);
184 fwrite(p,1,to_write,fout);
185 fprintf(fout,"\n");
186 p+=line_len;
187 l-=line_len;
188 }
189 }
190 //
191 static void write(FILE *fh, char* seqid, char* descr, char* seq,
192 const int linelen=60, const int seqlen=0) {
193 int i, idlen, ilen;
194 //char *s;
195 //s = (seqid == NULL) ? (char*)"ANONYMOUS" : seqid;
196 // write header line only if given!
197 idlen=(seqid==NULL)? 0 : strlen(seqid);
198 if (idlen>0) {
199 if (*seqid != '>') putc('>', fh);
200 fwrite(seqid, 1, idlen, fh);
201 i=(descr==NULL)? 0 : strlen(descr);
202 if (i>0) {
203 putc(' ',fh);
204 fwrite(descr, 1, i, fh);
205 }
206 putc('\n', fh);
207 }
208 if (linelen>0) { //fixed line length
209 int len = (seqlen>0) ? seqlen : strlen(seq);
210 ilen=0;
211 for (i=0; i < len; i++, ilen++) {
212 if (ilen == linelen) {
213 putc('\n', fh);
214 ilen = 0;
215 }
216 putc(seq[i], fh);
217 } //for
218 putc('\n', fh);
219 }
220 else { //no line length limit
221 fprintf(fh, "\n%s\n", seq);
222 }
223 fflush(fh);
224 }
225
226
227 };
228
229 typedef int charFunc(char c, int pos, FastaSeq* fseq); //char processing function
230 /* passes:
231 c = current sequence character (generally aminoacid or nucleotide)
232 pos = 0-based coordinate of the given character within the sequence
233 fseq = FastaSeq pointer (useful for retrieving sequence defline info)
234 the return value is not used yet
235 */
236
237
238 //(for reading/writing variable length records, etc.)
239 enum fileMode {
240 fmRead,
241 fmWrite
242 };
243
244 class GFastaFile {
245 char* fname;
246 FILE* fh;
247 fileMode fmode;
248
249 long int rec_fpos; //the input stream offset of the current record to be read
250 long int cur_fpos; //the input stream offset of the current byte to be read
251 uint seqcoord; //1-based coordinate of the current record's sequence reading position
252 //(updated by getSeqRange() mostly)
253 protected:
254 void bad_fastafmt() {
255 GError("Error parsing file '%s'. Not a Fasta file?\n", fname);
256 }
257 void check_eof(int c) {
258 if (c == EOF) bad_fastafmt();
259 }
260 public:
261 GFastaFile(const char* filename, fileMode filemode=fmRead) {
262 fh=NULL;
263 cur_fpos=0;
264 rec_fpos=0;
265 fmode=filemode;
266 seqcoord=0;
267 const char *mode=(filemode==fmRead) ? "rb" : "wb";
268 if (filename == NULL || filename[0]=='\0') {
269 fh = (filemode == fmRead) ? stdin : stdout;
270 fname=NULL;
271 }
272 else {
273 if ((fh = fopen(filename, mode)) == NULL)
274 GError("Cannot open file '%s'!", filename);
275 fname=Gstrdup(filename);
276 }
277 /*
278 GCALLOC(curseqid, CAPINC);
279 curseqidlen=CAPINC;
280 GCALLOC(curdescr, CAPINC);
281 curdescrlen=CAPINC;*/
282 }
283
284 //attach a GFastaFile object to an already open handle
285 GFastaFile(FILE* fhandle, fileMode filemode=fmRead, const char* filename=NULL) {
286 fh=fhandle;
287 cur_fpos=ftell(fh);
288 fmode=filemode;
289 rec_fpos=cur_fpos;
290 seqcoord=0;
291 if (filename == NULL || filename[0]=='\0') {
292 fname=NULL;
293 }
294 else
295 fname=Gstrdup(filename);
296 }
297
298
299 void reset() {
300 if (fh!=NULL && fh!=stdout && fh!=stdin) {
301 fseek(fh,0L, SEEK_SET);
302 cur_fpos=0;
303 rec_fpos=0;
304 }
305 else GError("Cannot use GFastaFile::reset() on stdin, stdout or NULL handles.\n");
306 }
307
308 void seek(int pos) {
309 if (fh!=NULL && fh!=stdout && fh!=stdin) {
310 fseek(fh, pos, SEEK_SET);
311 cur_fpos=pos;
312 seqcoord=0; //seqcoord agnostic after a seek
313 }
314 else GError("Cannot use GFastaFile::seek() on stdin, stdout or NULL handles.\n");
315 }
316
317 ~GFastaFile() {
318 if (fh!=NULL && fh!=stdout && fh!=stdin) fclose(fh);
319 fh=NULL;
320 GFREE(fname);
321 /*GFREE(curseqid);
322 GFREE(curdescr);*/
323 }
324
325 int getReadPos() { return cur_fpos; } /* returns current read position in the
326 input stream (can be used within callback) */
327 int ReadSeqPos() {return rec_fpos; } /* returns the input stream offset of the last fasta
328 record processed by getFastaSeq*/
329 bool readHeader(FastaSeq& seq) { return (readHeader(&seq)!=NULL); }
330 FastaSeq* readSeq(int seqalloc=0) {
331 //allocate a new FastaSeq, reads the next record and returns it
332 //caller is responsible for deallocating returned FastaSeq memory!
333 FastaSeq* r=readHeader(NULL, seqalloc);
334 int len=0;
335 char before=1; //newline before indicator
336 int c=-1;
337 //load the whole sequence in FastaSeq
338 while ((c = getc(fh)) != EOF && c != '>') {
339 cur_fpos++;
340 //if (isspace(c) || c<31)
341 if (c<=32) {
342 before = (c=='\n' || c=='\r')?1:0;
343 continue; /* skip spaces */
344 }
345 if (len >= r->s_cap-1) {
346 GREALLOC(r->seq, r->s_cap + SEQCAPINC);
347 r->s_cap+=SEQCAPINC;
348 }
349 r->seq[len] = c;
350 before=0;
351 len++;
352 }
353 r->seq[len] = '\0';
354 r->len=len;
355 return r;
356 }
357 FastaSeq* readHeader(FastaSeq* seq=NULL, int seqalloc=0) {
358 /* reads the Fasta sequence header
359 the first character must be '>' for this call, after any spaces,
360 if seq is NULL a new FastaSeq object is allocated and returned,
361 otherwise id and descr are updated */
362 seqcoord=0;
363 int* buflen;
364 int* buflenstr;
365 char** buf;
366 int before;
367 if (feof(fh)) return NULL;
368 int c = getc(fh);
369 if (c==EOF) return NULL;
370 cur_fpos++;
371 while (c!=EOF && c<=32) { c=getc(fh); cur_fpos++; }//skip spaces etc.
372 if (c == EOF) return NULL;
373 if (c != '>')
374 bad_fastafmt();
375 if (seq==NULL) seq=new FastaSeq(seqalloc);
376 else { seq->clear(); seq->init(seqalloc); }
377
378 int len = 0; //chars accumulated so far
379 buflen=&(seq->id_cap);
380 buf=&(seq->id);
381 buflenstr=&(seq->namelen);
382 before=1;
383 while ((c = getc(fh)) != EOF) {
384 cur_fpos++;
385 if (c=='\n' || c=='\r') break;
386 if (len >= *buflen-1) {
387 GREALLOC(*buf, *buflen + CAPINC);
388 *buflen+=CAPINC;
389 }
390 if (before && (c<=32)) {
391 // space encountered => seq_name finished
392 before=0;
393 (*buf)[len]='\0';
394 *buflenstr=len;
395 buf=&seq->descr;
396 buflen=&seq->d_cap;
397 buflenstr=&seq->descrlen;
398 len=0;
399 if (c!=1) // special case, nrdb concatenation
400 continue; // skip this space
401 }
402 (*buf)[len]=c;
403 len++;
404 }
405 (*buf)[len]='\0'; /* terminate the comment string */
406 *buflenstr = len;
407 check_eof(c); /* it's wrong to have eof here */
408 seqcoord=1;
409 return (seq->namelen==0) ? NULL : seq;
410 }
411
412 FastaSeq *getFastaSeq(bool& is_last, FastaSeq* seq, charFunc* callbackFn = NULL ) {
413 /* seq must be a pointer to a initialized FastaSeq structure
414 if seq is NULL, the sequence is not actually read,
415 but just skipped and the file pointer set accordingly, while
416 the returned "pointer" will not be a FastaSeq one but just NULL or not NULL
417 (depending if eof was encountered)
418 if callbackFn is NULL, the sequence is read entirely in memory in a FastaSeq.seq field
419 otherwise only the defline is parsed into FastaSeq::id and FastaSeq::descr but actual
420 sequence letters are passed one by one to the callback function
421 and the actual sequence is never stored in memory (unless the callback does it)
422 */
423 int c, len;
424 int before;
425 rec_fpos=cur_fpos;
426 len = 0; //chars accumulated so far
427 if (fh==NULL || feof(fh)) return NULL;
428 // -------- read the defline first
429 if (seq==NULL) { // navigate only! don't read/parse anything but the record delimiter
430 before=1;
431 while ((c = getc(fh)) != EOF && c != '\n' && c !='\r') cur_fpos++; // skip defline
432 if (c==EOF && cur_fpos<=rec_fpos+2) return NULL;
433 check_eof(c); /* it's wrong to have eof here! */
434 cur_fpos++; //to account for the '\n' read
435 /*----- read the sequence now: */
436 before=1; /* "newline before" flag */
437 while ((c = getc(fh)) != EOF && c != '>') {
438 cur_fpos++;
439 before = (c=='\n' || c=='\r') ? 1 : 0;
440 }
441 //we should end up at a '>' character here, or EOF
442 } /* fasta fmt navigation to next sequence, no seq storage */
443 else { // sequence storage:
444 if (!readHeader(seq)) {
445 is_last=true;
446 return NULL;
447 }
448 /*----- read the actual sequence now: */
449 len=0;
450 before=1; //newline before indicator
451 if (callbackFn==NULL) { //load the whole sequence in FastaSeq
452 while ((c = getc(fh)) != EOF && c != '>') {
453 cur_fpos++;
454 //if (isspace(c) || c<31)
455 if (c<=32) {
456 before = (c=='\n' || c=='\r')?1:0;
457 continue; /* skip spaces */
458 }
459 if (len >= seq->s_cap-1) {
460 GREALLOC(seq->seq, seq->s_cap + CAPINC);
461 seq->s_cap+=CAPINC;
462 }
463 seq->seq[len] = c;
464 before=0;
465 len++;
466 }
467 seq->seq[len] = '\0';
468 seq->len=len;
469 } /* sequence storage */
470 else { //use the callback for each letter, do not store the whole sequence in FastaSeq
471 while ((c = getc(fh)) != EOF && c != '>') {
472 cur_fpos++;
473 if (c<=32) {
474 before = (c=='\n' || c=='\r')?1:0;
475 continue; /* skip spaces within sequence*/
476 }
477 (*callbackFn)(c, len, seq); //call the user function for each letter
478 before=0;
479 len++;
480 }
481 seq->len=len;
482 } /* callback sequence reading (no storage)*/
483 } /* sequence parsing */
484 if (c=='>') {
485 if (!before) bad_fastafmt(); /* '>' must only be at start of line,
486 never within the sequence ! */
487 is_last=false; /* FALSE - not the last one */
488 ungetc(c, fh);
489 }
490 else is_last=true; /* TRUE - eof() here */
491 return ((seq==NULL) ? (FastaSeq*)fh : seq); //alwayws return non NULL here!
492 } //getFastaSeq
493
494 //simplified call to ignore the is_last flag
495 FastaSeq *getFastaSeq(FastaSeq* seq, charFunc* callbackFn = NULL) {
496 bool b;
497 if (fh==NULL || feof(fh)) return NULL;
498 return getFastaSeq(b, seq, callbackFn);
499 }
500
501
502 uint seqSkip(uint slen, int& c){
503 //assumes the header was read !
504 //skip exactly slen characters in the actual aa or nt sequence
505 //(spaces are not counted)
506 uint skipacc=0;
507 while (skipacc<slen && ((c=getc(fh))!= EOF && c != '>')) {
508 cur_fpos++;
509 if (c<=32) continue; //skip spaces and other non-ASCII characters
510 seqcoord++;
511 skipacc++;
512 }
513 return skipacc; //may terminate prematurely
514 }
515
516 /* read a sequence range from the current FASTA record
517 this is much faster when rcoord>=seqcoord (i.e. when sequence
518 ranges are read sequentially)
519 if rcoord>=seqcoord assumes the header has been read already!
520 Returns the actual length of the sequence returned (0 if rcoord>seq_length)
521 and updates seqcoord, cur_fpos accordingly (rec_fpos is unchanged)
522 */
523 uint getSeqRange(FastaSeq& seq, uint rcoord, uint rlen=0) {
524 int c;
525 uint len;
526 int before;
527 rec_fpos=cur_fpos;
528 if (!seqcoord || seqcoord>rcoord) {
529 // slow -- go back to the beginning of the record
530 seek(rec_fpos);
531 readHeader(&seq); //this will also reset seqcoord to 1
532 }
533 if (rcoord!=seqcoord) {
534 seqSkip(rcoord-seqcoord, c);
535 check_eof(c);
536 if (c=='>')
537 GError("Error: '>' character found while skipping through sequence!\n");
538 }
539 len = 0; //chars accumulated so far
540 seq.seq[0]='\0';
541 seq.len=0;
542 //----- read the actual subsequence now:
543 len=0;
544 before=1; //"newline before" flag
545 while ((c = getc(fh)) != EOF && c != '>') {
546 cur_fpos++;
547 if (c<=32) continue; // skip spaces
548 if (len >= (uint) (seq.s_cap-1)) {
549 GREALLOC(seq.seq, seq.s_cap + CAPINC);
550 seq.s_cap+=CAPINC;
551 }
552 seq.seq[len] = c;
553 len++;
554 seqcoord++;
555 if (rlen>0 && len==rlen) break;
556 }
557 seq.seq[len] = '\0';
558 seq.len=len;
559 if (c=='>') bad_fastafmt(); /* '>' must only be at start of line,
560 never within the sequence ! */
561 return len;
562 } //getSeqRange
563
564 //only for writing
565 void putFastaSeq(FastaSeq *fa, const int linelen=60) {
566 writeFasta(fh, fa->id, fa->descr, fa->seq, linelen);
567 }
568
569 static void writeFasta(FILE *fh, char* seqid, char* descr, char* seq, const int linelen=60, const int seqlen=0) {
570 FastaSeq::write(fh, seqid, descr, seq, linelen, seqlen);
571 }
572
573 };
574
575 // ------------- FASTA parser/handler ----
576 // REQUIRES the first character processed after init()
577 // to be the first character of the record delimiter
578 // (default: ">")
579
580
581 class GFastaCharHandler {
582 protected:
583 char* recdelim;
584 charFunc* seqCallBack;
585 bool in_delim;
586 int delim_pos;
587 bool in_seqname;
588 bool in_descr;
589 bool in_seq;
590 FastaSeq* rec;
591 unsigned int seq_pos;
592 void reset() {
593 in_delim=true;
594 delim_pos=0;
595 in_seqname=false;
596 in_descr=false;
597 in_seq=false;
598 seq_pos=0;
599 }
600 public:
601 GFastaCharHandler(char* recdel=DEF_FASTA_DELIM) {
602 reset();
603 rec=NULL;
604 recdelim=recdel;
605 seqCallBack=NULL;
606 }
607 GFastaCharHandler(charFunc* chrCallBack, FastaSeq* r=NULL, char* recdel=DEF_FASTA_DELIM) {
608 reset();
609 rec=r;
610 recdelim=recdel;
611 seqCallBack=chrCallBack;
612 if (rec!=NULL) rec->reset();
613 }
614 void init() {
615 init(rec, seqCallBack);
616 }
617 void init(charFunc* chrCallBack) {
618 init(rec,chrCallBack);
619 }
620 void init(FastaSeq* r) {
621 init(r,seqCallBack);
622 }
623 void init(FastaSeq* r, charFunc* chrCallBack) {
624 rec=r;
625 seqCallBack=chrCallBack;
626 if (rec==NULL)
627 GError("GFastaCharHandler::init() Error: cannot use NULL FastaSeq!\n");
628 rec->reset();
629 reset();
630 }
631 void done() {
632 if (rec==NULL)
633 GError("GFastaCharHandler::done() Error: cannot use NULL FastaSeq!\n");
634 rec->endId();
635 rec->endDescr();
636 rec->endSeq();
637 }
638
639 //~GFastaCharHandler();
640
641 void processChar(char c) {
642 if (in_delim) { //skip record delimiter -- but it must be there!
643 if (recdelim[delim_pos]!=c) {//the only way to detect an Id starting
644 in_seqname=true;
645 in_delim=false;
646 }
647 delim_pos++;
648 }
649 if (in_seqname) {
650 if (rec->namelen>0 && c<=32) {
651 //breaking out of seq_name
652 rec->endId();
653 if (c=='\n' || c=='\r') { //end defline
654 in_seqname=false;
655 in_seq=true;
656 }
657 else { //seqname break, not defline end
658 in_seqname=false;
659 in_descr=true;
660 }
661 } // seqname termination
662 else { //seqname continues
663 if (c>32) rec->extendId(c);
664 }
665 return;
666 } // in_seqname
667 if (in_descr) {
668 if (c=='\n' || c=='\r') { //end defline
669 rec->endDescr();
670 in_descr=false;
671 in_seq=true;
672 }
673 else rec->extendDescr(c);
674 return;
675 } // in_descr
676 if (in_seq && c>32) {
677 seq_pos++; // 1-based sequence position !
678 if (seqCallBack==NULL) rec->extendSeq(c);
679 else (*seqCallBack)(c,seq_pos,rec);
680 }
681 }
682
683 };
684
685
686 #endif