ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
Revision: 286
Committed: Mon Oct 15 17:31:47 2012 UTC (7 years ago) by gpertea
File size: 35016 byte(s)
Log Message:
slight mods for rejoin use

Line User Rev File contents
1 gpertea 2 #ifndef GFF_H
2     #define GFF_H
3    
4     #include "GBase.h"
5     #include "gdna.h"
6     #include "codons.h"
7     #include "GFaSeqGet.h"
8     #include "GList.hh"
9     #include "GHash.hh"
10    
11     /*
12     const byte exMskMajSpliceL = 0x01;
13     const byte exMskMajSpliceR = 0x02;
14     const byte exMskMinSpliceL = 0x04;
15     const byte exMskMinSpliceR = 0x08;
16     const byte exMskTag = 0x80;
17     */
18 gpertea 16
19     //reserved Gffnames::feats entries -- basic feature types
20     extern const int gff_fid_mRNA; // "mRNA" feature name
21     extern const int gff_fid_transcript; // *RNA, *transcript feature name
22 gpertea 2 extern const int gff_fid_exon;
23 gpertea 16 extern const int gff_fid_CDS; //never really used, except for display only
24     //use gff_fid_exon instead
25 gpertea 2 extern const uint GFF_MAX_LOCUS;
26     extern const uint GFF_MAX_EXON;
27     extern const uint GFF_MAX_INTRON;
28    
29 gpertea 16 extern const uint gfo_flag_CHILDREN_PROMOTED;
30     extern const uint gfo_flag_HAS_ERRORS;
31     extern const uint gfo_flag_IS_GENE;
32 gpertea 153 extern const uint gfo_flag_HAS_GFF_ID; //found a GFF3 formatted main feature with its own ID
33 gpertea 16 extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly
34     //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
35     extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
36     extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
37     extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
38     //when GffReader is destroyed
39     extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
40     extern const byte gfo_flagShift_LEVEL;
41    
42     extern bool gff_show_warnings;
43    
44 gpertea 2 #define GFF_LINELEN 2048
45     #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
46    
47    
48     enum GffExonType {
49 gpertea 16 exgffNone=0, //not a recognizable exon or CDS segment
50     exgffStart, //from "start_codon" feature (within CDS)
51     exgffStop, //from "stop_codon" feature (may be outside CDS)
52 gpertea 2 exgffCDS, //from "CDS" feature
53     exgffUTR, //from "UTR" feature
54 gpertea 16 exgffCDSUTR, //from a merge of UTR and CDS feature
55 gpertea 2 exgffExon, //from "exon" feature
56     };
57    
58     class GffReader;
59    
60     class GffLine {
61 gpertea 16 char* _parents; //stores a copy of the Parent attribute value,
62     //with commas replaced by \0
63     int _parents_len;
64 gpertea 2 public:
65 gpertea 16 char* dupline; //duplicate of original line
66     char* line; //this will have tabs replaced by \0
67     int llen;
68 gpertea 2 char* gseqname;
69     char* track;
70     char* ftype; //feature name: mRNA/gene/exon/CDS
71 gpertea 16 char* info; //the last, attributes' field, unparsed
72 gpertea 2 uint fstart;
73     uint fend;
74     uint qstart; //overlap coords on query, if available
75     uint qend;
76     uint qlen; //query len, if given
77     double score;
78     char strand;
79     bool skip;
80 gpertea 16 bool is_gff3; //if the line appears to be in GFF3 format
81 gpertea 2 bool is_cds; //"cds" and "stop_codon" features
82     bool is_exon; //"exon" and "utr" features
83     char exontype; // gffExonType
84 gpertea 16 bool is_transcript; //if current feature is *RNA or *transcript
85     bool is_gene; //if current feature is *gene
86 gpertea 2 char phase; // '.' , '0', '1' or '2'
87 gpertea 16 // -- allocated strings:
88     char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
89     char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
90 gpertea 2 //
91 gpertea 16 char** parents; //for GTF only parents[0] is used
92     int num_parents;
93     char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
94 gpertea 2 GffLine(GffReader* reader, const char* l); //parse the line accordingly
95 gpertea 16 void discardParent() {
96     GFREE(_parents);
97     _parents_len=0;
98     num_parents=0;
99     parents=NULL;
100     }
101 gpertea 150 char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
102 gpertea 16 GffLine(GffLine* l) { //a copy constructor
103     memcpy((void*)this, (void*)l, sizeof(GffLine));
104     line=NULL;
105     GMALLOC(line, llen+1);
106     memcpy(line, l->line, llen+1);
107     GMALLOC(dupline, llen+1);
108     memcpy(dupline, l->dupline, llen+1);
109     //--offsets within line[]
110     gseqname=line+(l->gseqname-l->line);
111     track=line+(l->track-l->line);
112     ftype=line+(l->ftype-l->line);
113     info=line+(l->info-l->line);
114     //Parent=Gstrdup(l->Parent);
115     if (l->_parents_len>0) {
116     _parents_len=l->_parents_len;
117     GMALLOC(_parents, _parents_len);
118     memcpy(_parents, l->_parents, _parents_len);
119     num_parents=l->num_parents;
120     for (int i=0;i<num_parents;i++) {
121     parents[i]=_parents+(l->parents[i] - l->_parents);
122     }
123     }
124     //-- allocated string copies:
125     ID=Gstrdup(l->ID);
126     if (l->gene_name!=NULL)
127     gene_name=Gstrdup(l->gene_name);
128     if (l->gene_id!=NULL)
129     gene_id=Gstrdup(l->gene_id);
130     }
131 gpertea 2 GffLine() {
132     line=NULL;
133 gpertea 16 dupline=NULL;
134 gpertea 2 gseqname=NULL;
135     track=NULL;
136     ftype=NULL;
137     fstart=0;
138     fend=0;
139 gpertea 248 strand=0;phase=0;
140     llen=0;score=0;
141 gpertea 2 info=NULL;
142 gpertea 16 _parents=NULL;
143     _parents_len=0;
144     parents=NULL;
145     num_parents=0;
146 gpertea 2 ID=NULL;
147 gpertea 16 gene_name=NULL;
148     gene_id=NULL;
149 gpertea 2 skip=true;
150     qstart=0;
151     qend=0;
152     qlen=0;
153     exontype=0;
154     is_cds=false;
155 gpertea 16 is_gff3=false;
156     is_transcript=false;
157     is_gene=false;
158 gpertea 2 is_exon=false;
159     }
160     ~GffLine() {
161 gpertea 16 GFREE(dupline);
162 gpertea 2 GFREE(line);
163 gpertea 16 GFREE(_parents);
164     GFREE(parents);
165 gpertea 2 GFREE(ID);
166 gpertea 16 GFREE(gene_name);
167     GFREE(gene_id);
168 gpertea 2 }
169     };
170    
171     class GffAttr {
172     public:
173     int attr_id;
174     char* attr_val;
175     GffAttr(int an_id, const char* av=NULL) {
176     attr_id=an_id;
177     attr_val=NULL;
178 gpertea 16 setValue(av);
179 gpertea 2 }
180     ~GffAttr() {
181     GFREE(attr_val);
182     }
183 gpertea 16 void setValue(const char* av) {
184     if (attr_val!=NULL) {
185     GFREE(attr_val);
186     }
187     if (av==NULL || av[0]==0) return;
188     //trim spaces
189     const char* vstart=av;
190     while (*vstart==' ') av++;
191     const char* vend=vstart;
192     bool keep_dq=false;
193     while (vend[1]!=0) {
194     if (*vend==' ' && vend[1]!=' ') keep_dq=true;
195     else if (*vend==';') keep_dq=true;
196     vend++;
197     }
198     //remove spaces at the end:
199     while (*vend==' ' && vend!=vstart) vend--;
200     //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
201     if (!keep_dq && *vstart=='"' && *vend=='"') {
202     vend--;
203     vstart++;
204     }
205     attr_val=Gstrdup(vstart, vend);
206     }
207 gpertea 2 bool operator==(GffAttr& d){
208     return (this==&d);
209     }
210     bool operator>(GffAttr& d){
211     return (this>&d);
212     }
213     bool operator<(GffAttr& d){
214     return (this<&d);
215     }
216    
217     };
218    
219     class GffNameList;
220     class GffNames;
221    
222     class GffNameInfo {
223     friend class GffNameList;
224     protected:
225     int idx;
226     public:
227     char* name;
228 gpertea 248 GffNameInfo(const char* n=NULL):idx(-1),name(NULL) {
229     if (n) name=Gstrdup(n);
230 gpertea 2 }
231    
232     ~GffNameInfo() {
233 gpertea 16 GFREE(name);
234     }
235 gpertea 2
236     bool operator==(GffNameInfo& d){
237     return (strcmp(this->name, d.name)==0);
238     }
239     bool operator<(GffNameInfo& d){
240     return (strcmp(this->name, d.name)<0);
241     }
242     };
243    
244     class GffNameList:public GList<GffNameInfo> {
245     friend class GffNameInfo;
246     friend class GffNames;
247     protected:
248     GHash<GffNameInfo> byName;//hash with shared keys
249     int idlast; //fList index of last added/reused name
250     void addStatic(const char* tname) {// fast add
251     GffNameInfo* f=new GffNameInfo(tname);
252     idlast=this->Add(f);
253     f->idx=idlast;
254     byName.shkAdd(f->name,f);
255     }
256     public:
257 gpertea 227 GffNameList(int init_capacity=6):GList<GffNameInfo>(init_capacity, false,true,true), byName(false) {
258 gpertea 2 idlast=-1;
259 gpertea 227 setCapacity(init_capacity);
260 gpertea 2 }
261     char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
262     int lastNameId() { return idlast; }
263     char* getName(int nid) { //retrieve name by its ID
264     if (nid<0 || nid>=fCount)
265     GError("GffNameList Error: invalid index (%d)\n",nid);
266     return fList[nid]->name;
267     }
268    
269     int addName(const char* tname) {//returns or create an id for the given name
270     //check idlast first, chances are it's the same feature name checked
271 gpertea 227 /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
272     return idlast;*/
273 gpertea 2 GffNameInfo* f=byName.Find(tname);
274     int fidx=-1;
275     if (f!=NULL) fidx=f->idx;
276     else {//add new entry
277     f=new GffNameInfo(tname);
278     fidx=this->Add(f);
279     f->idx=fidx;
280     byName.shkAdd(f->name,f);
281     }
282     idlast=fidx;
283     return fidx;
284     }
285 gpertea 16
286     int addNewName(const char* tname) {
287     GffNameInfo* f=new GffNameInfo(tname);
288     int fidx=this->Add(f);
289     f->idx=fidx;
290     byName.shkAdd(f->name,f);
291     return fidx;
292     }
293    
294 gpertea 2 int getId(const char* tname) { //only returns a name id# if found
295     GffNameInfo* f=byName.Find(tname);
296     if (f==NULL) return -1;
297     return f->idx;
298     }
299 gpertea 16 int removeName() {
300 gpertea 2 GError("Error: removing names from GffNameList not allowed!\n");
301     return -1;
302     }
303     };
304    
305     class GffNames {
306     public:
307     int numrefs;
308     GffNameList tracks;
309     GffNameList gseqs;
310     GffNameList attrs;
311 gpertea 16 GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
312 gpertea 2 GffNames():tracks(),gseqs(),attrs(), feats() {
313     numrefs=0;
314     //the order below is critical!
315     //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
316     feats.addStatic("mRNA");//index 0=gff_fid_mRNA
317 gpertea 16 feats.addStatic("transcript");//index 1=gff_fid_transcript
318 gpertea 2 feats.addStatic("exon");//index 1=gff_fid_exon
319     feats.addStatic("CDS"); //index 2=gff_fid_CDS
320     }
321     };
322    
323     void gffnames_ref(GffNames* &n);
324     void gffnames_unref(GffNames* &n);
325    
326     enum GffPrintMode {
327     pgtfAny, //print record as read
328     pgtfExon,
329     pgtfCDS,
330     pgffAny, //print record as read
331     pgffExon,
332     pgffCDS,
333     pgffBoth,
334     };
335    
336    
337     class GffAttrs:public GList<GffAttr> {
338     public:
339     GffAttrs():GList<GffAttr>(false,true,false) { }
340 gpertea 16 void add_or_update(GffNames* names, const char* attrname, const char* val) {
341     int aid=names->attrs.getId(attrname);
342     if (aid>=0) {
343     //attribute found in the dictionary
344     for (int i=0;i<Count();i++) {
345     //do we have it?
346     if (aid==Get(i)->attr_id) {
347     //update the value
348     Get(i)->setValue(val);
349     return;
350     }
351     }
352     }
353     else {
354     aid=names->attrs.addNewName(attrname);
355     }
356     this->Add(new GffAttr(aid, val));
357     }
358    
359 gpertea 2 char* getAttr(GffNames* names, const char* attrname) {
360     int aid=names->attrs.getId(attrname);
361     if (aid>=0)
362     for (int i=0;i<Count();i++)
363     if (aid==Get(i)->attr_id) return Get(i)->attr_val;
364     return NULL;
365     }
366 gpertea 16 char* getAttr(int aid) {
367     if (aid>=0)
368     for (int i=0;i<Count();i++)
369     if (aid==Get(i)->attr_id) return Get(i)->attr_val;
370     return NULL;
371     }
372 gpertea 2 };
373    
374    
375     class GffExon : public GSeg {
376     public:
377 gpertea 16 void* uptr; //for later extensions
378 gpertea 2 GffAttrs* attrs; //other attributes kept for this exon
379     double score; // gff score column
380     char phase; //GFF phase column - for CDS segments only
381     // '.' = undefined (UTR), '0','1','2' for CDS exons
382     char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
383     int qstart; // for mRNA/protein exon mappings: coordinates on query
384     int qend;
385     GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
386     uptr=NULL;
387     attrs=NULL;
388     if (s<e) {
389     start=s;
390     end=e;
391     }
392     else {
393     start=e;
394     end=s;
395     }
396     if (qs<qe) {
397     qstart=qs;
398     qend=qe;
399     } else {
400     qstart=qe;
401     qend=qs;
402     }
403     score=sc;
404     phase=fr;
405     exontype=et;
406     } //constructor
407    
408     char* getAttr(GffNames* names, const char* atrname) {
409     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
410     return attrs->getAttr(names, atrname);
411     }
412    
413 gpertea 16 char* getAttr(int aid) {
414     if (attrs==NULL) return NULL;
415     return attrs->getAttr(aid);
416     }
417    
418 gpertea 2 ~GffExon() { //destructor
419     if (attrs!=NULL) delete attrs;
420     }
421     };
422    
423    
424     class GffCDSeg:public GSeg {
425     public:
426     char phase;
427     int exonidx;
428     };
429     //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
430     class GffObj:public GSeg {
431     //utility segment-merging function for addExon()
432     void expandExon(int xovl, uint segstart, uint segend,
433     char exontype, double sc, char fr, int qs, int qe);
434     protected:
435     //coordinate transformation data:
436     uint xstart; //absolute genomic coordinates of reference region
437     uint xend;
438     char xstatus; //coordinate transform status:
439     //0 : (start,end) coordinates are absolute
440     //'+' : (start,end) coords are relative to xstart..xend region
441     //'-' : (start,end) are relative to the reverse complement of xstart..xend region
442     //--
443     char* gffID; // ID name for mRNA (parent) feature
444 gpertea 16 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
445     char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
446     unsigned int flags;
447 gpertea 2 //-- friends:
448     friend class GffReader;
449     friend class GffExon;
450     public:
451 gpertea 16 static GffNames* names; // dictionary storage that holds the various attribute names etc.
452 gpertea 2 int track_id; // index of track name in names->tracks
453     int gseq_id; // index of genomic sequence name in names->gseqs
454     int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
455 gpertea 16 int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
456 gpertea 2 //if ftype_id==gff_fid_mRNA then this value is ignored
457     GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
458 gpertea 16 GPVec<GffObj> children;
459     GffObj* parent;
460 gpertea 2 int udata; //user data, flags etc.
461     void* uptr; //user pointer (to a parent object, cluster, locus etc.)
462     GffObj* ulink; //link to another GffObj (user controlled field)
463     // mRNA specific fields:
464     bool isCDS; //just a CDS, no UTRs
465     bool partial; //partial CDS
466     uint CDstart; //CDS start coord
467     uint CDend; //CDS end coord
468     char CDphase; //initial phase for CDS start
469 gpertea 16 bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); }
470     void hasErrors(bool v) {
471     if (v) flags |= gfo_flag_HAS_ERRORS;
472     else flags &= ~gfo_flag_HAS_ERRORS;
473     }
474 gpertea 153 bool hasGffID() { return ((flags & gfo_flag_HAS_GFF_ID)!=0); }
475     void hasGffID(bool v) {
476     if (v) flags |= gfo_flag_HAS_GFF_ID;
477     else flags &= ~gfo_flag_HAS_GFF_ID;
478 gpertea 16 }
479     bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); }
480     void createdByExon(bool v) {
481     if (v) flags |= gfo_flag_BY_EXON;
482     else flags &= ~gfo_flag_BY_EXON;
483     }
484     bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); }
485     void isGene(bool v) {
486     if (v) flags |= gfo_flag_IS_GENE;
487     else flags &= ~gfo_flag_IS_GENE;
488     }
489     bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); }
490     void isDiscarded(bool v) {
491     if (v) flags |= gfo_flag_DISCARDED;
492     else flags &= ~gfo_flag_DISCARDED;
493     }
494    
495     bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); }
496     void isUsed(bool v) {
497     if (v) flags |= gfo_flag_LST_KEEP;
498     else flags &= ~gfo_flag_LST_KEEP;
499     }
500     bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); }
501     void isTranscript(bool v) {
502     if (v) flags |= gfo_flag_IS_TRANSCRIPT;
503     else flags &= ~gfo_flag_IS_TRANSCRIPT;
504     }
505     bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); }
506     void promotedChildren(bool v) {
507     if (v) flags |= gfo_flag_CHILDREN_PROMOTED;
508     else flags &= ~gfo_flag_CHILDREN_PROMOTED;
509     }
510     void setLevel(byte v) {
511     if (v==0) flags &= ~gfo_flag_LEVEL_MSK;
512     else flags &= ~(((uint)v) << gfo_flagShift_LEVEL);
513     }
514     byte incLevel() {
515     uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
516     v++;
517     flags &= ~(v << gfo_flagShift_LEVEL);
518     return v;
519     }
520     byte getLevel() {
521     return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
522     }
523 gpertea 2
524 gpertea 16 bool isValidTranscript() {
525     //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
526     return (isTranscript() && exons.Count()>0);
527     }
528    
529 gpertea 2
530     int addExon(uint segstart, uint segend, double sc=0, char fr='.',
531     int qs=0, int qe=0, bool iscds=false, char exontype=0);
532    
533 gpertea 16 int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
534    
535 gpertea 2 void removeExon(int idx);
536 gpertea 16 void removeExon(GffExon* p);
537 gpertea 2 char strand; //true if features are on the reverse complement strand
538     double gscore;
539     double uscore; //custom, user-computed score, if needed
540     int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
541 gpertea 16
542 gpertea 2 //--------- optional data:
543     int qlen; //query length, start, end - if available
544     int qstart;
545     int qend;
546     int qcov; //query coverage - percent
547     GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
548     //constructor by gff line parsing:
549     GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
550     //if gfline->Parent!=NULL then this will also add the first sub-feature
551     // otherwise, only the main feature is created
552     void clearAttrs() {
553 gpertea 16 if (attrs!=NULL) {
554     bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
555     delete attrs; attrs=NULL;
556     if (sharedattrs) exons[0]->attrs=NULL;
557     }
558 gpertea 2 }
559 gpertea 16 GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
560     //exons: sorted, free, non-unique
561 gpertea 2 gffID=NULL;
562     uptr=NULL;
563     ulink=NULL;
564 gpertea 16 flags=0;
565 gpertea 2 udata=0;
566 gpertea 16 parent=NULL;
567 gpertea 2 ftype_id=-1;
568 gpertea 16 exon_ftype_id=-1;
569 gpertea 2 if (anid!=NULL) gffID=Gstrdup(anid);
570     gffnames_ref(names);
571     qlen=0;
572     qstart=0;
573     qend=0;
574     qcov=0;
575     partial=true;
576     isCDS=false;
577     CDstart=0; // hasCDS <=> CDstart>0
578     CDend=0;
579     CDphase=0;
580     gseq_id=-1;
581     track_id=-1;
582     xstart=0;
583     xend=0;
584     xstatus=0;
585 gpertea 16 strand='.';
586 gpertea 2 gscore=0;
587     uscore=0;
588     attrs=NULL;
589     covlen=0;
590 gpertea 16 gene_name=NULL;
591     geneID=NULL;
592 gpertea 2 }
593     ~GffObj() {
594     GFREE(gffID);
595 gpertea 16 GFREE(gene_name);
596     GFREE(geneID);
597     clearAttrs();
598 gpertea 2 gffnames_unref(names);
599     }
600     //--------------
601 gpertea 16 GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
602     bool keepAttrs=false, bool noExonAttr=true);
603     //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
604     void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
605 gpertea 2 const char* getSubfName() { //returns the generic feature type of the entries in exons array
606 gpertea 16 int sid=exon_ftype_id;
607 gpertea 2 if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
608     return names->feats.getName(sid);
609     }
610 gpertea 55 void addCDS(uint cd_start, uint cd_end, char phase=0);
611    
612 gpertea 16 bool monoFeature() {
613     return (exons.Count()==0 ||
614 gpertea 63 (exons.Count()==1 && //exon_ftype_id==ftype_id &&
615 gpertea 50 exons[0]->end==this->end && exons[0]->start==this->start));
616 gpertea 16 }
617    
618     bool hasCDS() { return (CDstart>0); }
619    
620 gpertea 2 const char* getFeatureName() {
621     return names->feats.getName(ftype_id);
622     }
623 gpertea 16 void setFeatureName(const char* feature);
624    
625     void addAttr(const char* attrname, const char* attrvalue);
626     int removeAttr(const char* attrname, const char* attrval=NULL);
627     int removeAttr(int aid, const char* attrval=NULL);
628     int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
629     int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
630 gpertea 2 const char* getAttrName(int i) {
631     if (attrs==NULL) return NULL;
632     return names->attrs.getName(attrs->Get(i)->attr_id);
633     }
634 gpertea 16 char* getAttr(const char* attrname, bool checkFirstExon=false) {
635     if (names==NULL || attrname==NULL) return NULL;
636     char* r=NULL;
637     if (attrs==NULL) {
638     if (!checkFirstExon) return NULL;
639     }
640     else r=attrs->getAttr(names, attrname);
641     if (r!=NULL) return r;
642     if (checkFirstExon && exons.Count()>0) {
643     r=exons[0]->getAttr(names, attrname);
644     }
645     return r;
646 gpertea 2 }
647    
648 gpertea 16 char* getExonAttr(GffExon* exon, const char* attrname) {
649     if (exon==NULL || attrname==NULL) return NULL;
650     return exon->getAttr(names, attrname);
651     }
652    
653     char* getExonAttr(int exonidx, const char* attrname) {
654     if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
655     return exons[exonidx]->getAttr(names, attrname);
656     }
657    
658 gpertea 2 char* getAttrValue(int i) {
659     if (attrs==NULL) return NULL;
660     return attrs->Get(i)->attr_val;
661     }
662     const char* getGSeqName() {
663     return names->gseqs.getName(gseq_id);
664     }
665 gpertea 16
666     const char* getRefName() {
667     return names->gseqs.getName(gseq_id);
668     }
669     void setRefName(const char* newname);
670    
671 gpertea 2 const char* getTrackName() {
672     return names->tracks.getName(track_id);
673     }
674 gpertea 16 bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
675 gpertea 2 //ignores strand!
676 gpertea 144 if (s>e) Gswap(s,e);
677 gpertea 2 for (int i=0;i<exons.Count();i++) {
678     if (exons[i]->overlap(s,e)) return true;
679     }
680     return false;
681     }
682     bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
683     //if (gseq_id!=m.gseq_id) return false;
684     // ignores strand and gseq_id, must check in advance
685     for (int i=0;i<exons.Count();i++) {
686     for (int j=0;j<m.exons.Count();j++) {
687     if (exons[i]->start>m.exons[j]->end) continue;
688     if (m.exons[j]->start>exons[i]->end) break;
689     //-- overlap if we are here:
690     return true;
691     }
692     }
693     return false;
694     }
695 gpertea 16
696 gpertea 2 int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
697 gpertea 16 //return the exons' index for the overlapping OR ADJACENT exon
698 gpertea 2 //ovlen, if given, will return the overlap length
699 gpertea 144 if (s>e) Gswap(s,e);
700 gpertea 16 s--;e++; //to also catch adjacent exons
701 gpertea 2 for (int i=0;i<exons.Count();i++) {
702     if (exons[i]->start>e) break;
703     if (s>exons[i]->end) continue;
704     //-- overlap if we are here:
705     if (ovlen!=NULL) {
706 gpertea 16 s++;e--;
707 gpertea 2 int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
708     *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
709     }
710     return i;
711     } //for each exon
712     *ovlen=0;
713     return -1;
714     }
715 gpertea 16
716 gpertea 2 int exonOverlapLen(GffObj& m) {
717     if (start>m.end || m.start>end) return 0;
718     int i=0;
719     int j=0;
720     int ovlen=0;
721     while (i<exons.Count() && j<m.exons.Count()) {
722     uint istart=exons[i]->start;
723     uint iend=exons[i]->end;
724     uint jstart=m.exons[j]->start;
725     uint jend=m.exons[j]->end;
726     if (istart>jend) { j++; continue; }
727     if (jstart>iend) { i++; continue; }
728     //exon overlap
729     uint ovstart=GMAX(istart,jstart);
730     if (iend<jend) {
731     ovlen+=iend-ovstart+1;
732     i++;
733     }
734     else {
735     ovlen+=jend-ovstart+1;
736     j++;
737     }
738     }//while comparing exons
739     return ovlen;
740     }
741    
742     bool exonOverlap(GffObj* m) {
743     return exonOverlap(*m);
744     }
745     //---------- coordinate transformation
746     void xcoord(uint grstart, uint grend, char xstrand='+') {
747     //relative coordinate transform, and reverse-complement transform if xstrand is '-'
748     //does nothing if xstatus is the same already
749     if (xstatus) {
750     if (xstatus==xstrand && grstart==xstart && grend==xend) return;
751     unxcoord();//restore original coordinates
752     }
753     xstatus=xstrand;
754     xstart=grstart;
755     xend=grend;
756     if (CDstart>0) xcoordseg(CDstart, CDend);
757     for (int i=0;i<exons.Count();i++) {
758     xcoordseg(exons[i]->start, exons[i]->end);
759     }
760     if (xstatus=='-') {
761     exons.Reverse();
762     int flen=end-start;
763     start=xend-end+1;
764     end=start+flen;
765     }
766     else {
767     start=start-xstart+1;
768     end=end-xstart+1;
769     }
770     }
771    
772     //transform an arbitrary segment based on current xstatus/xstart-xend
773     void xcoordseg(uint& segstart, uint &segend) {
774     if (xstatus==0) return;
775     if (xstatus=='-') {
776     int flen=segend-segstart;
777     segstart=xend-segend+1;
778     segend=segstart+flen;
779     return;
780     }
781     else {
782     segstart=segstart-xstart+1;
783     segend=segend-xstart+1;
784     }
785     }
786    
787     void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
788     if (xstatus==0) return; //nothing to do, no transformation appplied
789     if (CDstart>0) unxcoordseg(CDstart, CDend);
790     //restore all GffExon intervals too
791     for (int i=0;i<exons.Count();i++) {
792     unxcoordseg(exons[i]->start, exons[i]->end);
793     }
794     if (xstatus=='-') {
795     exons.Reverse();
796     int flen=end-start;
797     start=xend-end+1;
798     end=start+flen;
799     }
800     else {
801     start=start+xstart-1;
802     end=end+xstart-1;
803     }
804     xstatus=0;
805     }
806     void unxcoordseg(uint& astart, uint &aend) {
807     //restore an arbitrary interval -- does NOT change the transform state!
808     if (xstatus==0) return;
809     if (xstatus=='-') {
810     int flen=aend-astart;
811     astart=xend-aend+1;
812     aend=astart+flen;
813     }
814     else {
815     astart=astart+xstart-1;
816     aend=aend+xstart-1;
817     }
818     }
819     //---------------------
820     bool operator==(GffObj& d){
821     return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
822     }
823     bool operator>(GffObj& d){
824     if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
825     if (start==d.start) {
826 gpertea 16 if (getLevel()==d.getLevel()) {
827     if (end==d.end) return (strcmp(gffID, d.gffID)>0);
828     else return (end>d.end);
829     } else return (getLevel()>d.getLevel());
830 gpertea 2 } else return (start>d.start);
831     }
832     bool operator<(GffObj& d){
833     if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
834     if (start==d.start) {
835 gpertea 16 if (getLevel()==d.getLevel()) {
836     if (end==d.end) return strcmp(gffID, d.gffID)<0;
837 gpertea 2 else return end<d.end;
838 gpertea 16 } else return (getLevel()<d.getLevel());
839 gpertea 2 } else return (start<d.start);
840     }
841     char* getID() { return gffID; }
842 gpertea 16 char* getGeneID() { return geneID; }
843     char* getGeneName() { return gene_name; }
844     void setGeneName(const char* gname) {
845     GFREE(gene_name);
846     if (gname) gene_name=Gstrdup(gname);
847     }
848     void setGeneID(const char* gene_id) {
849     GFREE(geneID);
850     if (gene_id) geneID=Gstrdup(gene_id);
851     }
852 gpertea 2 int addSeg(GffLine* gfline);
853     int addSeg(int fnid, GffLine* gfline);
854     void getCDSegs(GArray<GffCDSeg>& cds);
855 gpertea 16
856     void updateExonPhase(); //for CDS-only features, updates GExon::phase
857    
858     void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
859     bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
860     void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
861     const char* tlabel=NULL, const char* gfparent=NULL);
862     void printGtf(FILE* fout, const char* tlabel=NULL) {
863 gpertea 2 printGxf(fout, pgtfAny, tlabel);
864     }
865 gpertea 16 void printGff(FILE* fout, const char* tlabel=NULL,
866     const char* gfparent=NULL) {
867     printGxf(fout, pgffAny, tlabel, gfparent);
868 gpertea 2 }
869 gpertea 16 void printTranscriptGff(FILE* fout, char* tlabel=NULL,
870     bool showCDS=false, const char* gfparent=NULL) {
871     if (isValidTranscript())
872     printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
873 gpertea 2 }
874     void printSummary(FILE* fout=NULL);
875     void getCDS_ends(uint& cds_start, uint& cds_end);
876     void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
877     char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
878     uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
879 gpertea 16 char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
880 gpertea 2 char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
881     //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
882     bool empty() { return (start==0); }
883     };
884    
885     typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
886     //user callback after parsing a mapping object:
887     // Returns: "done with it" status:
888     // TRUE if gobj is no longer needed so it's FREEd upon return
889     // FALSE if the user needs the gobj pointer and is responsible for
890     // collecting and freeing all GffObj objects
891    
892    
893     //GSeqStat: collect basic stats about a common underlying genomic sequence
894     // for multiple GffObj
895     class GSeqStat {
896     public:
897     int gseqid; //gseq id in the global static pool of gseqs
898     char* gseqname; //just a pointer to the name of gseq
899 gpertea 286 int fcount;//number of features on this gseq
900 gpertea 2 uint mincoord;
901     uint maxcoord;
902 gpertea 16 uint maxfeat_len; //maximum feature length on this genomic sequence
903     GffObj* maxfeat;
904     GSeqStat(int id=-1, char* name=NULL) {
905 gpertea 2 gseqid=id;
906     gseqname=name;
907 gpertea 286 fcount=0;
908 gpertea 2 mincoord=MAXUINT;
909     maxcoord=0;
910 gpertea 16 maxfeat_len=0;
911     maxfeat=NULL;
912 gpertea 2 }
913     bool operator>(GSeqStat& g) {
914     return (gseqid>g.gseqid);
915     }
916     bool operator<(GSeqStat& g) {
917     return (gseqid<g.gseqid);
918     }
919     bool operator==(GSeqStat& g) {
920     return (gseqid==g.gseqid);
921     }
922     };
923    
924    
925     int gfo_cmpByLoc(const pointer p1, const pointer p2);
926    
927     class GfList: public GList<GffObj> {
928 gpertea 16 //just adding the option to sort by genomic sequence and coordinate
929     bool mustSort;
930 gpertea 2 public:
931     GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
932 gpertea 16 //GffObjs in this list are NOT deleted when the list is cleared
933     //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
934     mustSort=sortbyloc;
935     }
936     void sortedByLoc(bool v=true) {
937     bool prev=mustSort;
938     mustSort=v;
939     if (fCount>0 && mustSort && !prev) {
940     this->setSorted((GCompareProc*)gfo_cmpByLoc);
941     }
942     }
943     void finalize(GffReader* gfr, bool mergeCloseExons,
944 gpertea 286 bool keepAttrs=false, bool noExonAttr=true);
945    
946 gpertea 16 void freeAll() {
947     for (int i=0;i<fCount;i++) {
948     delete fList[i];
949     fList[i]=NULL;
950     }
951     Clear();
952     }
953     void freeUnused() {
954     for (int i=0;i<fCount;i++) {
955     if (fList[i]->isUsed()) continue;
956     //inform the children
957     for (int c=0;c<fList[i]->children.Count();c++) {
958     fList[i]->children[c]->parent=NULL;
959     }
960     delete fList[i];
961     fList[i]=NULL;
962     }
963     Clear();
964     }
965    
966 gpertea 2 };
967    
968 gpertea 151 struct GfoHolder {
969 gpertea 16 int idx; //position in GffReader::gflst array
970 gpertea 151 GffObj* gffobj;
971 gpertea 16 GfoHolder(GffObj* gfo=NULL, int i=0) {
972     idx=i;
973     gffobj=gfo;
974     }
975     };
976    
977     class CNonExon { //utility class used in subfeature promotion
978     public:
979     int idx;
980     GffObj* parent;
981     GffExon* exon;
982     GffLine* gffline;
983     CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
984     parent=p;
985     exon=e;
986     idx=i;
987     gffline=new GffLine(gl);
988     }
989     ~CNonExon() {
990     delete gffline;
991     }
992     };
993    
994    
995 gpertea 2 class GffReader {
996     friend class GffObj;
997     friend class GffLine;
998     char* linebuf;
999     off_t fpos;
1000     int buflen;
1001     protected:
1002 gpertea 16 bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
1003 gpertea 2 FILE* fh;
1004     char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
1005     GffNames* names; //just a pointer to the global static Gff names repository in GffObj
1006     GffLine* gffline;
1007 gpertea 16 bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1008     GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1009     // of discarded parent IDs
1010 gpertea 151 GHash< GVec<GfoHolder> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
1011 gpertea 153 //GHash<int> tids; //just for transcript_id uniqueness
1012 gpertea 2 char* gfoBuildId(const char* id, const char* ctg);
1013 gpertea 151 //void gfoRemove(const char* id, const char* ctg);
1014     GfoHolder* gfoAdd(GffObj* gfo, int idx);
1015     GfoHolder* gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx);
1016 gpertea 195 // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
1017     GfoHolder* gfoFind(const char* id, const char* ctg=NULL, GVec<GfoHolder>** glst=NULL,
1018     char strand=0, uint start=0, uint end=0);
1019 gpertea 16 CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
1020     void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
1021     GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1022     bool keepAttr, bool noExonAttr);
1023 gpertea 286 GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
1024 gpertea 2 public:
1025 gpertea 16 GfList gflst; //accumulate GffObjs being read
1026     GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
1027 gpertea 151 GffObj* parent=NULL, GffExon* pexon=NULL, GVec<GfoHolder>* glst=NULL);
1028 gpertea 16 GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1029     GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1030     bool keepAttr);
1031     GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
1032     bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
1033 gpertea 286 GPVec<GSeqStat> gseqStats; //only populated after finalize()
1034 gpertea 16 GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
1035 gpertea 286 phash(true), gseqstats(true,true,true), gflst(sortbyloc), gseqStats(1, false) {
1036 gpertea 16 gff_warns=gff_show_warnings;
1037 gpertea 2 names=NULL;
1038     gffline=NULL;
1039 gpertea 16 transcriptsOnly=t_only;
1040 gpertea 2 fpos=0;
1041     fname=NULL;
1042     fh=f;
1043     GMALLOC(linebuf, GFF_LINELEN);
1044     buflen=GFF_LINELEN-1;
1045     }
1046 gpertea 16 void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
1047     fname=NULL;
1048     fh=f;
1049     if (fh!=NULL) rewind(fh);
1050     fpos=0;
1051     transcriptsOnly=t_only;
1052     gflst.sortedByLoc(sortbyloc);
1053     }
1054     GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
1055 gpertea 286 gseqstats(true,true,true), gflst(sort), gseqStats(1,false) {
1056 gpertea 16 gff_warns=gff_show_warnings;
1057 gpertea 2 names=NULL;
1058     fname=Gstrdup(fn);
1059 gpertea 16 transcriptsOnly=t_only;
1060 gpertea 2 fh=fopen(fname, "rb");
1061     fpos=0;
1062     gffline=NULL;
1063     GMALLOC(linebuf, GFF_LINELEN);
1064     buflen=GFF_LINELEN-1;
1065     }
1066    
1067 gpertea 16 ~GffReader() {
1068     delete gffline;
1069 gpertea 2 gffline=NULL;
1070     fpos=0;
1071 gpertea 16 gflst.freeUnused();
1072 gpertea 2 gflst.Clear();
1073 gpertea 16 discarded_ids.Clear();
1074 gpertea 2 phash.Clear();
1075     gseqstats.Clear();
1076     GFREE(fname);
1077     GFREE(linebuf);
1078     }
1079    
1080 gpertea 16 void showWarnings(bool v=true) {
1081     gff_warns=v;
1082     gff_show_warnings=v;
1083     }
1084    
1085 gpertea 2 GffLine* nextGffLine();
1086    
1087 gpertea 16 // load all subfeatures, re-group them:
1088     void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
1089 gpertea 2
1090     }; // end of GffReader
1091    
1092     #endif