ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 7 months ago) by gpertea
File size: 24599 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #ifndef GFF_H
2 #define GFF_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include "GBase.h"
9 #include "gdna.h"
10 #include "codons.h"
11 #include "GFaSeqGet.h"
12 #include "GList.hh"
13 #include "GHash.hh"
14
15 /*
16 const byte exMskMajSpliceL = 0x01;
17 const byte exMskMajSpliceR = 0x02;
18 const byte exMskMinSpliceL = 0x04;
19 const byte exMskMinSpliceR = 0x08;
20 const byte exMskTag = 0x80;
21 */
22 //reserved Gffnames::feats entries
23 extern const int gff_fid_mRNA;
24 extern const int gff_fid_exon;
25 extern const int gff_fid_CDS;
26 extern bool gff_warns; //show parser warnings
27
28 extern const uint GFF_MAX_LOCUS;
29 extern const uint GFF_MAX_EXON;
30 extern const uint GFF_MAX_INTRON;
31
32 #define GFF_LINELEN 2048
33 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
34
35
36 enum GffExonType {
37 exgffNone=0,
38 exgffStop, //from "stop_codon" feature
39 exgffCDS, //from "CDS" feature
40 exgffUTR, //from "UTR" feature
41 exgffExon, //from "exon" feature
42 };
43
44 class GffReader;
45
46 class GffLine {
47 public:
48 char* line;
49 char* gseqname;
50 char* track;
51 char* ftype; //feature name: mRNA/gene/exon/CDS
52 uint fstart;
53 uint fend;
54 uint qstart; //overlap coords on query, if available
55 uint qend;
56 uint qlen; //query len, if given
57 double score;
58 char strand;
59 bool skip;
60 bool is_cds; //"cds" and "stop_codon" features
61 bool is_exon; //"exon" and "utr" features
62 char exontype; // gffExonType
63 bool is_mrna;
64 char phase; // '.' , '0', '1' or '2'
65 char* gname; //gene_id or Name= value (or an otherwise parsed gene denominator
66 //(for grouping isoforms)
67 char* info; //the last, attributes' field, unparsed
68 //
69 char* Parent; // if a Parent=.. attribute was found
70 char* ID; // if a ID=.. attribute was parsed
71 GffLine(GffReader* reader, const char* l); //parse the line accordingly
72 GffLine() {
73 line=NULL;
74 gseqname=NULL;
75 track=NULL;
76 ftype=NULL;
77 fstart=0;
78 fend=0;
79 info=NULL;
80 Parent=NULL;
81 ID=NULL;
82 gname=NULL;
83 skip=true;
84 qstart=0;
85 qend=0;
86 qlen=0;
87 exontype=0;
88 is_cds=false;
89 is_mrna=false;
90 is_exon=false;
91 }
92 ~GffLine() {
93 GFREE(line);
94 GFREE(Parent);
95 GFREE(ID);
96 GFREE(gname);
97 }
98 };
99
100 class GffAttr {
101 public:
102 int attr_id;
103 char* attr_val;
104 GffAttr(int an_id, const char* av=NULL) {
105 attr_id=an_id;
106 attr_val=NULL;
107 if (av!=NULL) {
108 char* lastch = (char*)(av+strlen(av)-1);
109 //remove spaces at the end:
110 while (*lastch==' ' && lastch!=av) lastch--;
111 lastch[1]=0;
112 //practical use: if it doesn't have any spaces just strip those useless double quotes
113 if (av[0]=='"' && strpbrk(av+1," ;")==NULL) {
114 if (*lastch=='"') *lastch=0;
115 attr_val=Gstrdup(av+1);
116 }
117 else attr_val=Gstrdup(av);
118 }
119
120 }
121 ~GffAttr() {
122 GFREE(attr_val);
123 }
124 bool operator==(GffAttr& d){
125 return (this==&d);
126 }
127 bool operator>(GffAttr& d){
128 return (this>&d);
129 }
130 bool operator<(GffAttr& d){
131 return (this<&d);
132 }
133
134 };
135
136 class GffNameList;
137 class GffNames;
138
139 class GffNameInfo {
140 friend class GffNameList;
141 protected:
142 //unsigned char shared;
143 int idx;
144 public:
145 char* name;
146 GffNameInfo() { name=NULL; idx=-1; }
147 GffNameInfo(const char* n) {
148 name=Gstrdup(n);
149 }
150 /*
151 GffNameInfo(const char* n, bool strshared=false) {
152 idx=-1;
153 if (strshared) {
154 shared=1;
155 name=(char *)n;
156 }
157 else {
158 shared=0;
159 name = (n==NULL) ? NULL : Gstrdup(n);
160 }
161 }
162 */
163
164 ~GffNameInfo() {
165 //if (shared==0)
166 GFREE(name);
167 }
168
169 bool operator==(GffNameInfo& d){
170 return (strcmp(this->name, d.name)==0);
171 }
172 bool operator>(GffNameInfo& d){
173 return (strcmp(this->name, d.name)>0);
174 }
175 bool operator<(GffNameInfo& d){
176 return (strcmp(this->name, d.name)<0);
177 }
178 };
179
180 /*
181 int compareNameId(const pointer item1, const pointer item2) {
182 GffNameInfo* ni1=(GffNameInfo*)item1;
183 GffNameInfo* ni2=(GffNameInfo*)item2;
184 return (ni1->id > ni2->id )? 1 : (ni1->id==ni2->id ? 0 : -1);
185 }
186 */
187
188 class GffNameList:public GList<GffNameInfo> {
189 friend class GffNameInfo;
190 friend class GffNames;
191 protected:
192 GHash<GffNameInfo> byName;//hash with shared keys
193 int idlast; //fList index of last added/reused name
194 void addStatic(const char* tname) {// fast add
195 GffNameInfo* f=new GffNameInfo(tname);
196 idlast=this->Add(f);
197 f->idx=idlast;
198 byName.shkAdd(f->name,f);
199 }
200 public:
201 GffNameList():GList<GffNameInfo>(false,true,true), byName(false) {
202 idlast=-1;
203 }
204 char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
205 int lastNameId() { return idlast; }
206 char* getName(int nid) { //retrieve name by its ID
207 if (nid<0 || nid>=fCount)
208 GError("GffNameList Error: invalid index (%d)\n",nid);
209 return fList[nid]->name;
210 }
211
212 int addName(const char* tname) {//returns or create an id for the given name
213 //check idlast first, chances are it's the same feature name checked
214 if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
215 return idlast;
216 GffNameInfo* f=byName.Find(tname);
217 int fidx=-1;
218 if (f!=NULL) fidx=f->idx;
219 else {//add new entry
220 f=new GffNameInfo(tname);
221 fidx=this->Add(f);
222 f->idx=fidx;
223 byName.shkAdd(f->name,f);
224 }
225 idlast=fidx;
226 return fidx;
227 }
228 int getId(const char* tname) { //only returns a name id# if found
229 GffNameInfo* f=byName.Find(tname);
230 if (f==NULL) return -1;
231 return f->idx;
232 }
233 int removeName(const char* tname) {
234 GError("Error: removing names from GffNameList not allowed!\n");
235 return -1;
236 }
237 };
238
239 class GffNames {
240 public:
241 int numrefs;
242 GffNameList tracks;
243 GffNameList gseqs;
244 GffNameList attrs;
245 GffNameList feats; //feature names - anything except 'mRNA', 'exon', 'CDS' gets stored here
246 GffNames():tracks(),gseqs(),attrs(), feats() {
247 numrefs=0;
248 //the order below is critical!
249 //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
250 feats.addStatic("mRNA");//index 0=gff_fid_mRNA
251 feats.addStatic("exon");//index 1=gff_fid_exon
252 feats.addStatic("CDS"); //index 2=gff_fid_CDS
253 }
254 };
255
256 void gffnames_ref(GffNames* &n);
257 void gffnames_unref(GffNames* &n);
258
259 enum GffPrintMode {
260 pgtfAny, //print record as read
261 pgtfExon,
262 pgtfCDS,
263 pgffAny, //print record as read
264 pgffExon,
265 pgffCDS,
266 pgffBoth,
267 };
268
269
270 class GffAttrs:public GList<GffAttr> {
271 public:
272 GffAttrs():GList<GffAttr>(false,true,false) { }
273 char* getAttr(GffNames* names, const char* attrname) {
274 int aid=names->attrs.getId(attrname);
275 if (aid>=0)
276 for (int i=0;i<Count();i++)
277 if (aid==Get(i)->attr_id) return Get(i)->attr_val;
278 return NULL;
279 }
280 };
281
282
283 class GffExon : public GSeg {
284 public:
285 void* uptr; //for later exensibility
286 GffAttrs* attrs; //other attributes kept for this exon
287 double score; // gff score column
288 char phase; //GFF phase column - for CDS segments only
289 // '.' = undefined (UTR), '0','1','2' for CDS exons
290 char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
291 int qstart; // for mRNA/protein exon mappings: coordinates on query
292 int qend;
293 GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
294 uptr=NULL;
295 attrs=NULL;
296 if (s<e) {
297 start=s;
298 end=e;
299 }
300 else {
301 start=e;
302 end=s;
303 }
304 if (qs<qe) {
305 qstart=qs;
306 qend=qe;
307 } else {
308 qstart=qe;
309 qend=qs;
310 }
311 score=sc;
312 phase=fr;
313 exontype=et;
314 } //constructor
315
316 char* getAttr(GffNames* names, const char* atrname) {
317 if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
318 return attrs->getAttr(names, atrname);
319 }
320
321 ~GffExon() { //destructor
322 if (attrs!=NULL) delete attrs;
323 }
324 };
325
326
327 class GffCDSeg:public GSeg {
328 public:
329 char phase;
330 int exonidx;
331 };
332 //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
333 class GffObj:public GSeg {
334 //utility segment-merging function for addExon()
335 void expandExon(int xovl, uint segstart, uint segend,
336 char exontype, double sc, char fr, int qs, int qe);
337 protected:
338 //coordinate transformation data:
339 uint xstart; //absolute genomic coordinates of reference region
340 uint xend;
341 char xstatus; //coordinate transform status:
342 //0 : (start,end) coordinates are absolute
343 //'+' : (start,end) coords are relative to xstart..xend region
344 //'-' : (start,end) are relative to the reverse complement of xstart..xend region
345 //--
346 char* gffID; // ID name for mRNA (parent) feature
347 char* gname; // value of Name attribute (if given)
348 //-- friends:
349 friend class GffReader;
350 friend class GffExon;
351 public:
352 bool hasErrors; //overlapping exons, or too short introns, etc.
353 static GffNames* names; // common string storage that holds the various attribute names etc.
354 int track_id; // index of track name in names->tracks
355 int gseq_id; // index of genomic sequence name in names->gseqs
356 int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
357 int subftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
358 //if ftype_id==gff_fid_mRNA then this value is ignored
359 GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
360 int udata; //user data, flags etc.
361 void* uptr; //user pointer (to a parent object, cluster, locus etc.)
362 GffObj* ulink; //link to another GffObj (user controlled field)
363 // mRNA specific fields:
364 bool isCDS; //just a CDS, no UTRs
365 bool partial; //partial CDS
366 uint CDstart; //CDS start coord
367 uint CDend; //CDS end coord
368 char CDphase; //initial phase for CDS start
369
370 bool ismRNA() { return (ftype_id==gff_fid_mRNA); }
371
372 int addExon(uint segstart, uint segend, double sc=0, char fr='.',
373 int qs=0, int qe=0, bool iscds=false, char exontype=0);
374
375 int addExon(GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
376 void removeExon(int idx);
377 /*
378 uint gstart; // global feature coordinates on genomic sequence
379 uint gend; // ALWAYS gstart <= gend
380 */
381 char strand; //true if features are on the reverse complement strand
382 double gscore;
383 double uscore; //custom, user-computed score, if needed
384 int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
385
386 //--------- optional data:
387 int qlen; //query length, start, end - if available
388 int qstart;
389 int qend;
390 int qcov; //query coverage - percent
391 GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
392 //constructor by gff line parsing:
393 GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
394 //if gfline->Parent!=NULL then this will also add the first sub-feature
395 // otherwise, only the main feature is created
396 void clearAttrs() {
397 if (attrs!=NULL) delete attrs;
398 }
399 GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,true) { //exons: sorted, free, unique
400 gffID=NULL;
401 uptr=NULL;
402 ulink=NULL;
403 udata=0;
404 hasErrors=false;
405 ftype_id=-1;
406 subftype_id=-1;
407 if (anid!=NULL) gffID=Gstrdup(anid);
408 gffnames_ref(names);
409 qlen=0;
410 qstart=0;
411 qend=0;
412 qcov=0;
413 partial=true;
414 isCDS=false;
415 CDstart=0; // hasCDS <=> CDstart>0
416 CDend=0;
417 CDphase=0;
418 gseq_id=-1;
419 track_id=-1;
420 /*
421 gstart=0;
422 gend=0;
423 */
424 xstart=0;
425 xend=0;
426 xstatus=0;
427 strand=0;
428 gscore=0;
429 uscore=0;
430 attrs=NULL;
431 covlen=0;
432 gname=NULL;
433 }
434 ~GffObj() {
435 GFREE(gffID);
436 GFREE(gname);
437 if (attrs!=NULL) delete attrs;
438 gffnames_unref(names);
439 }
440 /*
441 void addFeature(GffLine* gfline);
442 int getFeatureId(int fnid);
443 int getFeatureId(const char* fname);
444 GFeature* hasFeature(char* parentID);
445 void promote(GFeature* subf, GffLine* gfline); */
446 //--------------
447 GffObj* finalize(bool mergeCloseExons=false); //finalize parsing: must be called in order to merge adjacent/close proximity subfeatures
448 void parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr=false);
449 const char* getSubfName() { //returns the generic feature type of the entries in exons array
450 int sid=subftype_id;
451 if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
452 return names->feats.getName(sid);
453 }
454 const char* getFeatureName() {
455 return names->feats.getName(ftype_id);
456 }
457 void addAttr(const char* attrname, char* attrvalue);
458 const char* getAttrName(int i) {
459 if (attrs==NULL) return NULL;
460 return names->attrs.getName(attrs->Get(i)->attr_id);
461 }
462 char* getAttr(const char* atrname) {
463 if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
464 return attrs->getAttr(names, atrname);
465 }
466
467 char* getAttrValue(int i) {
468 if (attrs==NULL) return NULL;
469 return attrs->Get(i)->attr_val;
470 }
471 const char* getGSeqName() {
472 return names->gseqs.getName(gseq_id);
473 }
474 const char* getTrackName() {
475 return names->tracks.getName(track_id);
476 }
477 /*
478 bool overlap(GffObj* mrna) {
479 //basic overlap: just segment ends
480 return overlap(*mrna);
481 }
482 bool overlap(GffObj& d) {
483 //ignores strand and gseq_id -- the user must do that in advance
484 // just rough locus overlap, exons may not overlap
485 return (gstart<d.gstart ? (d.gstart<=gend) : (gstart<=d.gend));
486 }
487
488 bool overlap(uint s, uint e) {
489 if (s>e) swap(s,e);
490 return (gstart<s ? (s<=gend) : (gstart<=e));
491 }
492 */
493 bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
494 //ignores strand!
495 if (s>e) swap(s,e);
496 for (int i=0;i<exons.Count();i++) {
497 if (exons[i]->overlap(s,e)) return true;
498 }
499 return false;
500 }
501 bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
502 //if (gseq_id!=m.gseq_id) return false;
503 // ignores strand and gseq_id, must check in advance
504 for (int i=0;i<exons.Count();i++) {
505 for (int j=0;j<m.exons.Count();j++) {
506 if (exons[i]->start>m.exons[j]->end) continue;
507 if (m.exons[j]->start>exons[i]->end) break;
508 //-- overlap if we are here:
509 // if (exons[i]->overlap(m.exons[j])) return true;
510 return true;
511 }
512 }
513 return false;
514 }
515
516 int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
517 //return the exons' index for the overlapping exon
518 //ovlen, if given, will return the overlap length
519 if (s>e) swap(s,e);
520 for (int i=0;i<exons.Count();i++) {
521 if (exons[i]->start>e) break;
522 if (s>exons[i]->end) continue;
523 //-- overlap if we are here:
524 if (ovlen!=NULL) {
525 int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
526 *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
527 }
528 return i;
529 } //for each exon
530 *ovlen=0;
531 return -1;
532 }
533
534 int exonOverlapLen(GffObj& m) {
535 if (start>m.end || m.start>end) return 0;
536 int i=0;
537 int j=0;
538 int ovlen=0;
539 while (i<exons.Count() && j<m.exons.Count()) {
540 uint istart=exons[i]->start;
541 uint iend=exons[i]->end;
542 uint jstart=m.exons[j]->start;
543 uint jend=m.exons[j]->end;
544 if (istart>jend) { j++; continue; }
545 if (jstart>iend) { i++; continue; }
546 //exon overlap
547 uint ovstart=GMAX(istart,jstart);
548 if (iend<jend) {
549 ovlen+=iend-ovstart+1;
550 i++;
551 }
552 else {
553 ovlen+=jend-ovstart+1;
554 j++;
555 }
556 }//while comparing exons
557 return ovlen;
558 }
559
560 bool exonOverlap(GffObj* m) {
561 return exonOverlap(*m);
562 }
563 //---------- coordinate transformation
564 void xcoord(uint grstart, uint grend, char xstrand='+') {
565 //relative coordinate transform, and reverse-complement transform if xstrand is '-'
566 //does nothing if xstatus is the same already
567 if (xstatus) {
568 if (xstatus==xstrand && grstart==xstart && grend==xend) return;
569 unxcoord();//restore original coordinates
570 }
571 xstatus=xstrand;
572 xstart=grstart;
573 xend=grend;
574 if (CDstart>0) xcoordseg(CDstart, CDend);
575 for (int i=0;i<exons.Count();i++) {
576 xcoordseg(exons[i]->start, exons[i]->end);
577 }
578 if (xstatus=='-') {
579 exons.Reverse();
580 int flen=end-start;
581 start=xend-end+1;
582 end=start+flen;
583 }
584 else {
585 start=start-xstart+1;
586 end=end-xstart+1;
587 }
588 }
589
590 //transform an arbitrary segment based on current xstatus/xstart-xend
591 void xcoordseg(uint& segstart, uint &segend) {
592 if (xstatus==0) return;
593 if (xstatus=='-') {
594 int flen=segend-segstart;
595 segstart=xend-segend+1;
596 segend=segstart+flen;
597 return;
598 }
599 else {
600 segstart=segstart-xstart+1;
601 segend=segend-xstart+1;
602 }
603 }
604
605 void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
606 if (xstatus==0) return; //nothing to do, no transformation appplied
607 if (CDstart>0) unxcoordseg(CDstart, CDend);
608 //restore all GffExon intervals too
609 for (int i=0;i<exons.Count();i++) {
610 unxcoordseg(exons[i]->start, exons[i]->end);
611 }
612 if (xstatus=='-') {
613 exons.Reverse();
614 int flen=end-start;
615 start=xend-end+1;
616 end=start+flen;
617 }
618 else {
619 start=start+xstart-1;
620 end=end+xstart-1;
621 }
622 xstatus=0;
623 }
624 void unxcoordseg(uint& astart, uint &aend) {
625 //restore an arbitrary interval -- does NOT change the transform state!
626 if (xstatus==0) return;
627 if (xstatus=='-') {
628 int flen=aend-astart;
629 astart=xend-aend+1;
630 aend=astart+flen;
631 }
632 else {
633 astart=astart+xstart-1;
634 aend=aend+xstart-1;
635 }
636 }
637 //---------------------
638 bool operator==(GffObj& d){
639 return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
640 }
641 bool operator>(GffObj& d){
642 if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
643 if (start==d.start) {
644 if (end==d.end) return strcmp(gffID, d.gffID)>0;
645 else return end>d.end;
646 } else return (start>d.start);
647 }
648 bool operator<(GffObj& d){
649 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
650 if (start==d.start) {
651 if (end==d.end) return strcmp(gffID, d.gffID)<0;
652 else return end<d.end;
653 } else return (start<d.start);
654 }
655 char* getID() { return gffID; }
656 char* getGene() { return gname; }
657 //void calcScore();
658 /*int exonCount() { return exoncount; }
659 GffExon* getExonSegs() { return exons; }
660 int cdsCount() { return cdscount; }
661 GffExon* getCDSegs() { return cds; } */
662
663 /* bool validateMapping(int qrylen, bool pmap_parsing, int minpid,
664 int mincov, int maxintron);*/
665 /* int addSeg(char* feat, int nstart, int nend, int sc=0, char fr='.',
666 char* tid=NULL, char* tinfo=NULL);*/
667 int addSeg(GffLine* gfline);
668 int addSeg(int fnid, GffLine* gfline);
669 // (int fnid, char* feat, int nstart, int nend, int sc=0,
670 // char fr='.', char* tid=NULL, char* tinfo=NULL);
671 void getCDSegs(GArray<GffCDSeg>& cds);
672 void printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
673 uint segstart, uint segend, int exidx, char phase, bool gff3);
674 void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, char* tlabel=NULL);
675 void printGtf(FILE* fout, char* tlabel=NULL) {
676 printGxf(fout, pgtfAny, tlabel);
677 }
678 void printGff(FILE* fout, char* tlabel=NULL) {
679 printGxf(fout, pgffAny, tlabel);
680 }
681 void print_mRNAGff(FILE* fout, char* tlabel=NULL, bool showCDS=false) {
682 if (ismRNA())
683 printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel);
684 }
685 void printSummary(FILE* fout=NULL);
686 void getCDS_ends(uint& cds_start, uint& cds_end);
687 void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
688 char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
689 uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
690 char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
691 //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
692 bool empty() { return (start==0); }
693 };
694
695 //int cmpGMapScore(const pointer a, const pointer b);
696
697 typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
698 //user callback after parsing a mapping object:
699 // Returns: "done with it" status:
700 // TRUE if gobj is no longer needed so it's FREEd upon return
701 // FALSE if the user needs the gobj pointer and is responsible for
702 // collecting and freeing all GffObj objects
703
704
705 //GSeqStat: collect basic stats about a common underlying genomic sequence
706 // for multiple GffObj
707 class GSeqStat {
708 public:
709 int gseqid; //gseq id in the global static pool of gseqs
710 char* gseqname; //just a pointer to the name of gseq
711 //int fcount;//number of features on this gseq
712 uint mincoord;
713 uint maxcoord;
714 GList<GffObj> gflst;
715 GSeqStat(int id=-1, char* name=NULL):gflst(true,false,false) {
716 gseqid=id;
717 gseqname=name;
718 mincoord=MAXUINT;
719 maxcoord=0;
720 }
721 bool operator>(GSeqStat& g) {
722 return (gseqid>g.gseqid);
723 }
724 bool operator<(GSeqStat& g) {
725 return (gseqid<g.gseqid);
726 }
727 bool operator==(GSeqStat& g) {
728 return (gseqid==g.gseqid);
729 }
730 };
731
732
733 int gfo_cmpByLoc(const pointer p1, const pointer p2);
734
735 class GfList: public GList<GffObj> {
736 //just adding the option to sort by genomic sequence and coordinate
737 public:
738 GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
739 if (sortbyloc) this->setSorted((GCompareProc*)gfo_cmpByLoc);
740 }
741 };
742
743 class GffReader {
744 friend class GffObj;
745 friend class GffLine;
746 char* linebuf;
747 off_t fpos;
748 int buflen;
749 protected:
750 FILE* fh;
751 char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
752 GffNames* names; //just a pointer to the global static Gff names repository in GffObj
753 GffLine* gffline;
754 bool mrnaOnly; //read only mRNAs ? (exon/CDS features only)
755 bool sortbyloc; //sort by location: genomic sequence and start coordinate
756 GHash<GffObj> phash; //transcript_id (Parent~Contig) => GffObj pointer
757 char* gfoBuildId(const char* id, const char* ctg);
758 void gfoRemove(const char* id, const char* ctg);
759 void gfoAdd(const char* id, const char* ctg, GffObj* gfo);
760 GffObj* gfoFind(const char* id, const char* ctg);
761 public:
762 GfList gflst; //all read gflst
763 GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
764 GffReader(FILE* f, bool justmrna=false, bool sort=false):phash(false),gflst(sort), gseqstats(true,true,true) {
765 names=NULL;
766 gffline=NULL;
767 mrnaOnly=justmrna;
768 sortbyloc=sort;
769 fpos=0;
770 fname=NULL;
771 fh=f;
772 GMALLOC(linebuf, GFF_LINELEN);
773 buflen=GFF_LINELEN-1;
774 }
775 GffReader(char* fn, bool justmrna=false, bool sort=false):phash(false),gflst(sort) {
776 names=NULL;
777 fname=Gstrdup(fn);
778 mrnaOnly=justmrna;
779 sortbyloc=sort;
780 fh=fopen(fname, "rb");
781 fpos=0;
782 gffline=NULL;
783 GMALLOC(linebuf, GFF_LINELEN);
784 buflen=GFF_LINELEN-1;
785 }
786
787 virtual ~GffReader() {
788 gffline=NULL;
789 fpos=0;
790 delete gffline;
791 gflst.Clear();
792 phash.Clear();
793 gseqstats.Clear();
794 GFREE(fname);
795 GFREE(linebuf);
796 }
797
798 GffLine* nextGffLine();
799
800 // parse -> block parsing functions -- do not use,
801 // they always assume that subfeatures (exons) are grouped together by parent
802 GffObj* parse(bool keepAttr=false, bool noExonAttr=true);
803 void parseAll(GffRecFunc* gproc, bool keepAttr=false, bool noExonAttr=true, void* userptr1=NULL, void* userptr2=NULL);
804
805 // use this instead of parse: load all subfeatures, re-group them in memory:
806 void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); //just reads all gff records into gflst GList
807
808 }; // end of GffReader
809
810 #endif