ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 4 months ago) by gpertea
File size: 24599 byte(s)
Log Message:
added my gclib source files

Line User Rev File contents
1 gpertea 2 #ifndef GFF_H
2     #define GFF_H
3    
4     #ifdef HAVE_CONFIG_H
5     #include <config.h>
6     #endif
7    
8     #include "GBase.h"
9     #include "gdna.h"
10     #include "codons.h"
11     #include "GFaSeqGet.h"
12     #include "GList.hh"
13     #include "GHash.hh"
14    
15     /*
16     const byte exMskMajSpliceL = 0x01;
17     const byte exMskMajSpliceR = 0x02;
18     const byte exMskMinSpliceL = 0x04;
19     const byte exMskMinSpliceR = 0x08;
20     const byte exMskTag = 0x80;
21     */
22     //reserved Gffnames::feats entries
23     extern const int gff_fid_mRNA;
24     extern const int gff_fid_exon;
25     extern const int gff_fid_CDS;
26     extern bool gff_warns; //show parser warnings
27    
28     extern const uint GFF_MAX_LOCUS;
29     extern const uint GFF_MAX_EXON;
30     extern const uint GFF_MAX_INTRON;
31    
32     #define GFF_LINELEN 2048
33     #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
34    
35    
36     enum GffExonType {
37     exgffNone=0,
38     exgffStop, //from "stop_codon" feature
39     exgffCDS, //from "CDS" feature
40     exgffUTR, //from "UTR" feature
41     exgffExon, //from "exon" feature
42     };
43    
44     class GffReader;
45    
46     class GffLine {
47     public:
48     char* line;
49     char* gseqname;
50     char* track;
51     char* ftype; //feature name: mRNA/gene/exon/CDS
52     uint fstart;
53     uint fend;
54     uint qstart; //overlap coords on query, if available
55     uint qend;
56     uint qlen; //query len, if given
57     double score;
58     char strand;
59     bool skip;
60     bool is_cds; //"cds" and "stop_codon" features
61     bool is_exon; //"exon" and "utr" features
62     char exontype; // gffExonType
63     bool is_mrna;
64     char phase; // '.' , '0', '1' or '2'
65     char* gname; //gene_id or Name= value (or an otherwise parsed gene denominator
66     //(for grouping isoforms)
67     char* info; //the last, attributes' field, unparsed
68     //
69     char* Parent; // if a Parent=.. attribute was found
70     char* ID; // if a ID=.. attribute was parsed
71     GffLine(GffReader* reader, const char* l); //parse the line accordingly
72     GffLine() {
73     line=NULL;
74     gseqname=NULL;
75     track=NULL;
76     ftype=NULL;
77     fstart=0;
78     fend=0;
79     info=NULL;
80     Parent=NULL;
81     ID=NULL;
82     gname=NULL;
83     skip=true;
84     qstart=0;
85     qend=0;
86     qlen=0;
87     exontype=0;
88     is_cds=false;
89     is_mrna=false;
90     is_exon=false;
91     }
92     ~GffLine() {
93     GFREE(line);
94     GFREE(Parent);
95     GFREE(ID);
96     GFREE(gname);
97     }
98     };
99    
100     class GffAttr {
101     public:
102     int attr_id;
103     char* attr_val;
104     GffAttr(int an_id, const char* av=NULL) {
105     attr_id=an_id;
106     attr_val=NULL;
107     if (av!=NULL) {
108     char* lastch = (char*)(av+strlen(av)-1);
109     //remove spaces at the end:
110     while (*lastch==' ' && lastch!=av) lastch--;
111     lastch[1]=0;
112     //practical use: if it doesn't have any spaces just strip those useless double quotes
113     if (av[0]=='"' && strpbrk(av+1," ;")==NULL) {
114     if (*lastch=='"') *lastch=0;
115     attr_val=Gstrdup(av+1);
116     }
117     else attr_val=Gstrdup(av);
118     }
119    
120     }
121     ~GffAttr() {
122     GFREE(attr_val);
123     }
124     bool operator==(GffAttr& d){
125     return (this==&d);
126     }
127     bool operator>(GffAttr& d){
128     return (this>&d);
129     }
130     bool operator<(GffAttr& d){
131     return (this<&d);
132     }
133    
134     };
135    
136     class GffNameList;
137     class GffNames;
138    
139     class GffNameInfo {
140     friend class GffNameList;
141     protected:
142     //unsigned char shared;
143     int idx;
144     public:
145     char* name;
146     GffNameInfo() { name=NULL; idx=-1; }
147     GffNameInfo(const char* n) {
148     name=Gstrdup(n);
149     }
150     /*
151     GffNameInfo(const char* n, bool strshared=false) {
152     idx=-1;
153     if (strshared) {
154     shared=1;
155     name=(char *)n;
156     }
157     else {
158     shared=0;
159     name = (n==NULL) ? NULL : Gstrdup(n);
160     }
161     }
162     */
163    
164     ~GffNameInfo() {
165     //if (shared==0)
166     GFREE(name);
167     }
168    
169     bool operator==(GffNameInfo& d){
170     return (strcmp(this->name, d.name)==0);
171     }
172     bool operator>(GffNameInfo& d){
173     return (strcmp(this->name, d.name)>0);
174     }
175     bool operator<(GffNameInfo& d){
176     return (strcmp(this->name, d.name)<0);
177     }
178     };
179    
180     /*
181     int compareNameId(const pointer item1, const pointer item2) {
182     GffNameInfo* ni1=(GffNameInfo*)item1;
183     GffNameInfo* ni2=(GffNameInfo*)item2;
184     return (ni1->id > ni2->id )? 1 : (ni1->id==ni2->id ? 0 : -1);
185     }
186     */
187    
188     class GffNameList:public GList<GffNameInfo> {
189     friend class GffNameInfo;
190     friend class GffNames;
191     protected:
192     GHash<GffNameInfo> byName;//hash with shared keys
193     int idlast; //fList index of last added/reused name
194     void addStatic(const char* tname) {// fast add
195     GffNameInfo* f=new GffNameInfo(tname);
196     idlast=this->Add(f);
197     f->idx=idlast;
198     byName.shkAdd(f->name,f);
199     }
200     public:
201     GffNameList():GList<GffNameInfo>(false,true,true), byName(false) {
202     idlast=-1;
203     }
204     char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
205     int lastNameId() { return idlast; }
206     char* getName(int nid) { //retrieve name by its ID
207     if (nid<0 || nid>=fCount)
208     GError("GffNameList Error: invalid index (%d)\n",nid);
209     return fList[nid]->name;
210     }
211    
212     int addName(const char* tname) {//returns or create an id for the given name
213     //check idlast first, chances are it's the same feature name checked
214     if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
215     return idlast;
216     GffNameInfo* f=byName.Find(tname);
217     int fidx=-1;
218     if (f!=NULL) fidx=f->idx;
219     else {//add new entry
220     f=new GffNameInfo(tname);
221     fidx=this->Add(f);
222     f->idx=fidx;
223     byName.shkAdd(f->name,f);
224     }
225     idlast=fidx;
226     return fidx;
227     }
228     int getId(const char* tname) { //only returns a name id# if found
229     GffNameInfo* f=byName.Find(tname);
230     if (f==NULL) return -1;
231     return f->idx;
232     }
233     int removeName(const char* tname) {
234     GError("Error: removing names from GffNameList not allowed!\n");
235     return -1;
236     }
237     };
238    
239     class GffNames {
240     public:
241     int numrefs;
242     GffNameList tracks;
243     GffNameList gseqs;
244     GffNameList attrs;
245     GffNameList feats; //feature names - anything except 'mRNA', 'exon', 'CDS' gets stored here
246     GffNames():tracks(),gseqs(),attrs(), feats() {
247     numrefs=0;
248     //the order below is critical!
249     //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
250     feats.addStatic("mRNA");//index 0=gff_fid_mRNA
251     feats.addStatic("exon");//index 1=gff_fid_exon
252     feats.addStatic("CDS"); //index 2=gff_fid_CDS
253     }
254     };
255    
256     void gffnames_ref(GffNames* &n);
257     void gffnames_unref(GffNames* &n);
258    
259     enum GffPrintMode {
260     pgtfAny, //print record as read
261     pgtfExon,
262     pgtfCDS,
263     pgffAny, //print record as read
264     pgffExon,
265     pgffCDS,
266     pgffBoth,
267     };
268    
269    
270     class GffAttrs:public GList<GffAttr> {
271     public:
272     GffAttrs():GList<GffAttr>(false,true,false) { }
273     char* getAttr(GffNames* names, const char* attrname) {
274     int aid=names->attrs.getId(attrname);
275     if (aid>=0)
276     for (int i=0;i<Count();i++)
277     if (aid==Get(i)->attr_id) return Get(i)->attr_val;
278     return NULL;
279     }
280     };
281    
282    
283     class GffExon : public GSeg {
284     public:
285     void* uptr; //for later exensibility
286     GffAttrs* attrs; //other attributes kept for this exon
287     double score; // gff score column
288     char phase; //GFF phase column - for CDS segments only
289     // '.' = undefined (UTR), '0','1','2' for CDS exons
290     char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
291     int qstart; // for mRNA/protein exon mappings: coordinates on query
292     int qend;
293     GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
294     uptr=NULL;
295     attrs=NULL;
296     if (s<e) {
297     start=s;
298     end=e;
299     }
300     else {
301     start=e;
302     end=s;
303     }
304     if (qs<qe) {
305     qstart=qs;
306     qend=qe;
307     } else {
308     qstart=qe;
309     qend=qs;
310     }
311     score=sc;
312     phase=fr;
313     exontype=et;
314     } //constructor
315    
316     char* getAttr(GffNames* names, const char* atrname) {
317     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
318     return attrs->getAttr(names, atrname);
319     }
320    
321     ~GffExon() { //destructor
322     if (attrs!=NULL) delete attrs;
323     }
324     };
325    
326    
327     class GffCDSeg:public GSeg {
328     public:
329     char phase;
330     int exonidx;
331     };
332     //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
333     class GffObj:public GSeg {
334     //utility segment-merging function for addExon()
335     void expandExon(int xovl, uint segstart, uint segend,
336     char exontype, double sc, char fr, int qs, int qe);
337     protected:
338     //coordinate transformation data:
339     uint xstart; //absolute genomic coordinates of reference region
340     uint xend;
341     char xstatus; //coordinate transform status:
342     //0 : (start,end) coordinates are absolute
343     //'+' : (start,end) coords are relative to xstart..xend region
344     //'-' : (start,end) are relative to the reverse complement of xstart..xend region
345     //--
346     char* gffID; // ID name for mRNA (parent) feature
347     char* gname; // value of Name attribute (if given)
348     //-- friends:
349     friend class GffReader;
350     friend class GffExon;
351     public:
352     bool hasErrors; //overlapping exons, or too short introns, etc.
353     static GffNames* names; // common string storage that holds the various attribute names etc.
354     int track_id; // index of track name in names->tracks
355     int gseq_id; // index of genomic sequence name in names->gseqs
356     int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
357     int subftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
358     //if ftype_id==gff_fid_mRNA then this value is ignored
359     GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
360     int udata; //user data, flags etc.
361     void* uptr; //user pointer (to a parent object, cluster, locus etc.)
362     GffObj* ulink; //link to another GffObj (user controlled field)
363     // mRNA specific fields:
364     bool isCDS; //just a CDS, no UTRs
365     bool partial; //partial CDS
366     uint CDstart; //CDS start coord
367     uint CDend; //CDS end coord
368     char CDphase; //initial phase for CDS start
369    
370     bool ismRNA() { return (ftype_id==gff_fid_mRNA); }
371    
372     int addExon(uint segstart, uint segend, double sc=0, char fr='.',
373     int qs=0, int qe=0, bool iscds=false, char exontype=0);
374    
375     int addExon(GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
376     void removeExon(int idx);
377     /*
378     uint gstart; // global feature coordinates on genomic sequence
379     uint gend; // ALWAYS gstart <= gend
380     */
381     char strand; //true if features are on the reverse complement strand
382     double gscore;
383     double uscore; //custom, user-computed score, if needed
384     int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
385    
386     //--------- optional data:
387     int qlen; //query length, start, end - if available
388     int qstart;
389     int qend;
390     int qcov; //query coverage - percent
391     GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
392     //constructor by gff line parsing:
393     GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
394     //if gfline->Parent!=NULL then this will also add the first sub-feature
395     // otherwise, only the main feature is created
396     void clearAttrs() {
397     if (attrs!=NULL) delete attrs;
398     }
399     GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,true) { //exons: sorted, free, unique
400     gffID=NULL;
401     uptr=NULL;
402     ulink=NULL;
403     udata=0;
404     hasErrors=false;
405     ftype_id=-1;
406     subftype_id=-1;
407     if (anid!=NULL) gffID=Gstrdup(anid);
408     gffnames_ref(names);
409     qlen=0;
410     qstart=0;
411     qend=0;
412     qcov=0;
413     partial=true;
414     isCDS=false;
415     CDstart=0; // hasCDS <=> CDstart>0
416     CDend=0;
417     CDphase=0;
418     gseq_id=-1;
419     track_id=-1;
420     /*
421     gstart=0;
422     gend=0;
423     */
424     xstart=0;
425     xend=0;
426     xstatus=0;
427     strand=0;
428     gscore=0;
429     uscore=0;
430     attrs=NULL;
431     covlen=0;
432     gname=NULL;
433     }
434     ~GffObj() {
435     GFREE(gffID);
436     GFREE(gname);
437     if (attrs!=NULL) delete attrs;
438     gffnames_unref(names);
439     }
440     /*
441     void addFeature(GffLine* gfline);
442     int getFeatureId(int fnid);
443     int getFeatureId(const char* fname);
444     GFeature* hasFeature(char* parentID);
445     void promote(GFeature* subf, GffLine* gfline); */
446     //--------------
447     GffObj* finalize(bool mergeCloseExons=false); //finalize parsing: must be called in order to merge adjacent/close proximity subfeatures
448     void parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr=false);
449     const char* getSubfName() { //returns the generic feature type of the entries in exons array
450     int sid=subftype_id;
451     if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
452     return names->feats.getName(sid);
453     }
454     const char* getFeatureName() {
455     return names->feats.getName(ftype_id);
456     }
457     void addAttr(const char* attrname, char* attrvalue);
458     const char* getAttrName(int i) {
459     if (attrs==NULL) return NULL;
460     return names->attrs.getName(attrs->Get(i)->attr_id);
461     }
462     char* getAttr(const char* atrname) {
463     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
464     return attrs->getAttr(names, atrname);
465     }
466    
467     char* getAttrValue(int i) {
468     if (attrs==NULL) return NULL;
469     return attrs->Get(i)->attr_val;
470     }
471     const char* getGSeqName() {
472     return names->gseqs.getName(gseq_id);
473     }
474     const char* getTrackName() {
475     return names->tracks.getName(track_id);
476     }
477     /*
478     bool overlap(GffObj* mrna) {
479     //basic overlap: just segment ends
480     return overlap(*mrna);
481     }
482     bool overlap(GffObj& d) {
483     //ignores strand and gseq_id -- the user must do that in advance
484     // just rough locus overlap, exons may not overlap
485     return (gstart<d.gstart ? (d.gstart<=gend) : (gstart<=d.gend));
486     }
487    
488     bool overlap(uint s, uint e) {
489     if (s>e) swap(s,e);
490     return (gstart<s ? (s<=gend) : (gstart<=e));
491     }
492     */
493     bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
494     //ignores strand!
495     if (s>e) swap(s,e);
496     for (int i=0;i<exons.Count();i++) {
497     if (exons[i]->overlap(s,e)) return true;
498     }
499     return false;
500     }
501     bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
502     //if (gseq_id!=m.gseq_id) return false;
503     // ignores strand and gseq_id, must check in advance
504     for (int i=0;i<exons.Count();i++) {
505     for (int j=0;j<m.exons.Count();j++) {
506     if (exons[i]->start>m.exons[j]->end) continue;
507     if (m.exons[j]->start>exons[i]->end) break;
508     //-- overlap if we are here:
509     // if (exons[i]->overlap(m.exons[j])) return true;
510     return true;
511     }
512     }
513     return false;
514     }
515    
516     int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
517     //return the exons' index for the overlapping exon
518     //ovlen, if given, will return the overlap length
519     if (s>e) swap(s,e);
520     for (int i=0;i<exons.Count();i++) {
521     if (exons[i]->start>e) break;
522     if (s>exons[i]->end) continue;
523     //-- overlap if we are here:
524     if (ovlen!=NULL) {
525     int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
526     *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
527     }
528     return i;
529     } //for each exon
530     *ovlen=0;
531     return -1;
532     }
533    
534     int exonOverlapLen(GffObj& m) {
535     if (start>m.end || m.start>end) return 0;
536     int i=0;
537     int j=0;
538     int ovlen=0;
539     while (i<exons.Count() && j<m.exons.Count()) {
540     uint istart=exons[i]->start;
541     uint iend=exons[i]->end;
542     uint jstart=m.exons[j]->start;
543     uint jend=m.exons[j]->end;
544     if (istart>jend) { j++; continue; }
545     if (jstart>iend) { i++; continue; }
546     //exon overlap
547     uint ovstart=GMAX(istart,jstart);
548     if (iend<jend) {
549     ovlen+=iend-ovstart+1;
550     i++;
551     }
552     else {
553     ovlen+=jend-ovstart+1;
554     j++;
555     }
556     }//while comparing exons
557     return ovlen;
558     }
559    
560     bool exonOverlap(GffObj* m) {
561     return exonOverlap(*m);
562     }
563     //---------- coordinate transformation
564     void xcoord(uint grstart, uint grend, char xstrand='+') {
565     //relative coordinate transform, and reverse-complement transform if xstrand is '-'
566     //does nothing if xstatus is the same already
567     if (xstatus) {
568     if (xstatus==xstrand && grstart==xstart && grend==xend) return;
569     unxcoord();//restore original coordinates
570     }
571     xstatus=xstrand;
572     xstart=grstart;
573     xend=grend;
574     if (CDstart>0) xcoordseg(CDstart, CDend);
575     for (int i=0;i<exons.Count();i++) {
576     xcoordseg(exons[i]->start, exons[i]->end);
577     }
578     if (xstatus=='-') {
579     exons.Reverse();
580     int flen=end-start;
581     start=xend-end+1;
582     end=start+flen;
583     }
584     else {
585     start=start-xstart+1;
586     end=end-xstart+1;
587     }
588     }
589    
590     //transform an arbitrary segment based on current xstatus/xstart-xend
591     void xcoordseg(uint& segstart, uint &segend) {
592     if (xstatus==0) return;
593     if (xstatus=='-') {
594     int flen=segend-segstart;
595     segstart=xend-segend+1;
596     segend=segstart+flen;
597     return;
598     }
599     else {
600     segstart=segstart-xstart+1;
601     segend=segend-xstart+1;
602     }
603     }
604    
605     void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
606     if (xstatus==0) return; //nothing to do, no transformation appplied
607     if (CDstart>0) unxcoordseg(CDstart, CDend);
608     //restore all GffExon intervals too
609     for (int i=0;i<exons.Count();i++) {
610     unxcoordseg(exons[i]->start, exons[i]->end);
611     }
612     if (xstatus=='-') {
613     exons.Reverse();
614     int flen=end-start;
615     start=xend-end+1;
616     end=start+flen;
617     }
618     else {
619     start=start+xstart-1;
620     end=end+xstart-1;
621     }
622     xstatus=0;
623     }
624     void unxcoordseg(uint& astart, uint &aend) {
625     //restore an arbitrary interval -- does NOT change the transform state!
626     if (xstatus==0) return;
627     if (xstatus=='-') {
628     int flen=aend-astart;
629     astart=xend-aend+1;
630     aend=astart+flen;
631     }
632     else {
633     astart=astart+xstart-1;
634     aend=aend+xstart-1;
635     }
636     }
637     //---------------------
638     bool operator==(GffObj& d){
639     return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
640     }
641     bool operator>(GffObj& d){
642     if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
643     if (start==d.start) {
644     if (end==d.end) return strcmp(gffID, d.gffID)>0;
645     else return end>d.end;
646     } else return (start>d.start);
647     }
648     bool operator<(GffObj& d){
649     if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
650     if (start==d.start) {
651     if (end==d.end) return strcmp(gffID, d.gffID)<0;
652     else return end<d.end;
653     } else return (start<d.start);
654     }
655     char* getID() { return gffID; }
656     char* getGene() { return gname; }
657     //void calcScore();
658     /*int exonCount() { return exoncount; }
659     GffExon* getExonSegs() { return exons; }
660     int cdsCount() { return cdscount; }
661     GffExon* getCDSegs() { return cds; } */
662    
663     /* bool validateMapping(int qrylen, bool pmap_parsing, int minpid,
664     int mincov, int maxintron);*/
665     /* int addSeg(char* feat, int nstart, int nend, int sc=0, char fr='.',
666     char* tid=NULL, char* tinfo=NULL);*/
667     int addSeg(GffLine* gfline);
668     int addSeg(int fnid, GffLine* gfline);
669     // (int fnid, char* feat, int nstart, int nend, int sc=0,
670     // char fr='.', char* tid=NULL, char* tinfo=NULL);
671     void getCDSegs(GArray<GffCDSeg>& cds);
672     void printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
673     uint segstart, uint segend, int exidx, char phase, bool gff3);
674     void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, char* tlabel=NULL);
675     void printGtf(FILE* fout, char* tlabel=NULL) {
676     printGxf(fout, pgtfAny, tlabel);
677     }
678     void printGff(FILE* fout, char* tlabel=NULL) {
679     printGxf(fout, pgffAny, tlabel);
680     }
681     void print_mRNAGff(FILE* fout, char* tlabel=NULL, bool showCDS=false) {
682     if (ismRNA())
683     printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel);
684     }
685     void printSummary(FILE* fout=NULL);
686     void getCDS_ends(uint& cds_start, uint& cds_end);
687     void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
688     char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
689     uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
690     char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
691     //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
692     bool empty() { return (start==0); }
693     };
694    
695     //int cmpGMapScore(const pointer a, const pointer b);
696    
697     typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
698     //user callback after parsing a mapping object:
699     // Returns: "done with it" status:
700     // TRUE if gobj is no longer needed so it's FREEd upon return
701     // FALSE if the user needs the gobj pointer and is responsible for
702     // collecting and freeing all GffObj objects
703    
704    
705     //GSeqStat: collect basic stats about a common underlying genomic sequence
706     // for multiple GffObj
707     class GSeqStat {
708     public:
709     int gseqid; //gseq id in the global static pool of gseqs
710     char* gseqname; //just a pointer to the name of gseq
711     //int fcount;//number of features on this gseq
712     uint mincoord;
713     uint maxcoord;
714     GList<GffObj> gflst;
715     GSeqStat(int id=-1, char* name=NULL):gflst(true,false,false) {
716     gseqid=id;
717     gseqname=name;
718     mincoord=MAXUINT;
719     maxcoord=0;
720     }
721     bool operator>(GSeqStat& g) {
722     return (gseqid>g.gseqid);
723     }
724     bool operator<(GSeqStat& g) {
725     return (gseqid<g.gseqid);
726     }
727     bool operator==(GSeqStat& g) {
728     return (gseqid==g.gseqid);
729     }
730     };
731    
732    
733     int gfo_cmpByLoc(const pointer p1, const pointer p2);
734    
735     class GfList: public GList<GffObj> {
736     //just adding the option to sort by genomic sequence and coordinate
737     public:
738     GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
739     if (sortbyloc) this->setSorted((GCompareProc*)gfo_cmpByLoc);
740     }
741     };
742    
743     class GffReader {
744     friend class GffObj;
745     friend class GffLine;
746     char* linebuf;
747     off_t fpos;
748     int buflen;
749     protected:
750     FILE* fh;
751     char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
752     GffNames* names; //just a pointer to the global static Gff names repository in GffObj
753     GffLine* gffline;
754     bool mrnaOnly; //read only mRNAs ? (exon/CDS features only)
755     bool sortbyloc; //sort by location: genomic sequence and start coordinate
756     GHash<GffObj> phash; //transcript_id (Parent~Contig) => GffObj pointer
757     char* gfoBuildId(const char* id, const char* ctg);
758     void gfoRemove(const char* id, const char* ctg);
759     void gfoAdd(const char* id, const char* ctg, GffObj* gfo);
760     GffObj* gfoFind(const char* id, const char* ctg);
761     public:
762     GfList gflst; //all read gflst
763     GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
764     GffReader(FILE* f, bool justmrna=false, bool sort=false):phash(false),gflst(sort), gseqstats(true,true,true) {
765     names=NULL;
766     gffline=NULL;
767     mrnaOnly=justmrna;
768     sortbyloc=sort;
769     fpos=0;
770     fname=NULL;
771     fh=f;
772     GMALLOC(linebuf, GFF_LINELEN);
773     buflen=GFF_LINELEN-1;
774     }
775     GffReader(char* fn, bool justmrna=false, bool sort=false):phash(false),gflst(sort) {
776     names=NULL;
777     fname=Gstrdup(fn);
778     mrnaOnly=justmrna;
779     sortbyloc=sort;
780     fh=fopen(fname, "rb");
781     fpos=0;
782     gffline=NULL;
783     GMALLOC(linebuf, GFF_LINELEN);
784     buflen=GFF_LINELEN-1;
785     }
786    
787     virtual ~GffReader() {
788     gffline=NULL;
789     fpos=0;
790     delete gffline;
791     gflst.Clear();
792     phash.Clear();
793     gseqstats.Clear();
794     GFREE(fname);
795     GFREE(linebuf);
796     }
797    
798     GffLine* nextGffLine();
799    
800     // parse -> block parsing functions -- do not use,
801     // they always assume that subfeatures (exons) are grouped together by parent
802     GffObj* parse(bool keepAttr=false, bool noExonAttr=true);
803     void parseAll(GffRecFunc* gproc, bool keepAttr=false, bool noExonAttr=true, void* userptr1=NULL, void* userptr2=NULL);
804    
805     // use this instead of parse: load all subfeatures, re-group them in memory:
806     void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); //just reads all gff records into gflst GList
807    
808     }; // end of GffReader
809    
810     #endif