ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
(Generate patch)
# Line 1 | Line 1
1   #ifndef GFF_H
2   #define GFF_H
3  
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
4   #include "GBase.h"
5   #include "gdna.h"
6   #include "codons.h"
# Line 19 | Line 15
15   const byte exMskMinSpliceR = 0x08;
16   const byte exMskTag = 0x80;
17   */
22 //reserved Gffnames::feats entries
23 extern const int gff_fid_mRNA;
24 extern const int gff_fid_exon;
25 extern const int gff_fid_CDS;
26 extern bool gff_warns; //show parser warnings
18  
19 + //reserved Gffnames::feats entries -- basic feature types
20 + extern const int gff_fid_mRNA; // "mRNA" feature name
21 + extern const int gff_fid_transcript; // *RNA, *transcript feature name
22 + extern const int gff_fid_exon;
23 + extern const int gff_fid_CDS; //never really used, except for display only
24 +                              //use gff_fid_exon instead
25   extern const uint GFF_MAX_LOCUS;
26   extern const uint GFF_MAX_EXON;
27   extern const uint GFF_MAX_INTRON;
28  
29 + extern const uint gfo_flag_CHILDREN_PROMOTED;
30 + extern const uint gfo_flag_HAS_ERRORS;
31 + extern const uint gfo_flag_IS_GENE;
32 + extern const uint gfo_flag_HAS_GFF_ID; //found a GFF3 formatted main feature with its own ID
33 + extern const uint gfo_flag_BY_EXON;  //created by subfeature (exon) directly
34 +                      //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
35 + extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
36 + extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
37 + extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
38 +                                     //when GffReader is destroyed
39 + extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
40 + extern const byte gfo_flagShift_LEVEL;
41 +
42 + extern bool gff_show_warnings;
43 +
44   #define GFF_LINELEN 2048
45   #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
46  
47  
48   enum GffExonType {
49 <  exgffNone=0,
50 <  exgffStop, //from "stop_codon" feature
49 >  exgffNone=0,  //not a recognizable exon or CDS segment
50 >  exgffStart, //from "start_codon" feature (within CDS)
51 >  exgffStop, //from "stop_codon" feature (may be outside CDS)
52    exgffCDS,  //from "CDS" feature
53    exgffUTR,  //from "UTR" feature
54 +  exgffCDSUTR, //from a merge of UTR and CDS feature
55    exgffExon, //from "exon" feature
56   };
57  
58   class GffReader;
59  
60   class GffLine {
61 +    char* _parents; //stores a copy of the Parent attribute value,
62 +       //with commas replaced by \0
63 +    int _parents_len;
64   public:
65 <    char* line;
65 >    char* dupline; //duplicate of original line
66 >    char* line; //this will have tabs replaced by \0
67 >    int llen;
68      char* gseqname;
69      char* track;
70      char* ftype; //feature name: mRNA/gene/exon/CDS
71 +    char* info; //the last, attributes' field, unparsed
72      uint fstart;
73      uint fend;
74      uint qstart; //overlap coords on query, if available
# Line 57 | Line 77
77      double score;
78      char strand;
79      bool skip;
80 +    bool is_gff3; //if the line appears to be in GFF3 format
81      bool is_cds; //"cds" and "stop_codon" features
82      bool is_exon; //"exon" and "utr" features
83      char exontype; // gffExonType
84 <    bool is_mrna;
84 >    bool is_transcript; //if current feature is *RNA or *transcript
85 >    bool is_gene; //if current feature is *gene
86      char phase;  // '.' , '0', '1' or '2'
87 <    char* gname; //gene_id or Name= value (or an otherwise parsed gene denominator
88 <                 //(for grouping isoforms)
89 <    char* info; //the last, attributes' field, unparsed
87 >    // -- allocated strings:
88 >    char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
89 >    char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
90      //
91 <    char* Parent; // if a Parent=.. attribute was found
92 <    char* ID;     // if a ID=.. attribute was parsed
91 >    char** parents; //for GTF only parents[0] is used
92 >    int num_parents;
93 >    char* ID;     // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
94      GffLine(GffReader* reader, const char* l); //parse the line accordingly
95 +    void discardParent() {
96 +       GFREE(_parents);
97 +       _parents_len=0;
98 +       num_parents=0;
99 +       parents=NULL;
100 +       }
101 +    char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
102 +    GffLine(GffLine* l) { //a copy constructor
103 +      memcpy((void*)this, (void*)l, sizeof(GffLine));
104 +      line=NULL;
105 +      GMALLOC(line, llen+1);
106 +      memcpy(line, l->line, llen+1);
107 +      GMALLOC(dupline, llen+1);
108 +      memcpy(dupline, l->dupline, llen+1);
109 +      //--offsets within line[]
110 +      gseqname=line+(l->gseqname-l->line);
111 +      track=line+(l->track-l->line);
112 +      ftype=line+(l->ftype-l->line);
113 +      info=line+(l->info-l->line);
114 +      //Parent=Gstrdup(l->Parent);
115 +      if (l->_parents_len>0) {
116 +         _parents_len=l->_parents_len;
117 +         GMALLOC(_parents, _parents_len);
118 +         memcpy(_parents, l->_parents, _parents_len);
119 +         num_parents=l->num_parents;
120 +         for (int i=0;i<num_parents;i++) {
121 +            parents[i]=_parents+(l->parents[i] - l->_parents);
122 +            }
123 +         }
124 +      //-- allocated string copies:
125 +      ID=Gstrdup(l->ID);
126 +      if (l->gene_name!=NULL)
127 +          gene_name=Gstrdup(l->gene_name);
128 +      if (l->gene_id!=NULL)
129 +          gene_id=Gstrdup(l->gene_id);
130 +      }
131      GffLine() {
132        line=NULL;
133 +      dupline=NULL;
134        gseqname=NULL;
135        track=NULL;
136        ftype=NULL;
137        fstart=0;
138        fend=0;
139        info=NULL;
140 <      Parent=NULL;
140 >      _parents=NULL;
141 >      _parents_len=0;
142 >      parents=NULL;
143 >      num_parents=0;
144        ID=NULL;
145 <      gname=NULL;
145 >      gene_name=NULL;
146 >      gene_id=NULL;
147        skip=true;
148        qstart=0;
149        qend=0;
150        qlen=0;
151        exontype=0;
152        is_cds=false;
153 <      is_mrna=false;
153 >      is_gff3=false;
154 >      is_transcript=false;
155 >      is_gene=false;
156        is_exon=false;
157        }
158      ~GffLine() {
159 +      GFREE(dupline);
160        GFREE(line);
161 <      GFREE(Parent);
161 >      GFREE(_parents);
162 >      GFREE(parents);
163        GFREE(ID);
164 <      GFREE(gname);
164 >      GFREE(gene_name);
165 >      GFREE(gene_id);
166       }
167   };
168  
# Line 104 | Line 173
173     GffAttr(int an_id, const char* av=NULL) {
174       attr_id=an_id;
175       attr_val=NULL;
176 <     if (av!=NULL) {
108 <       char* lastch = (char*)(av+strlen(av)-1);
109 <       //remove spaces at the end:
110 <       while (*lastch==' ' && lastch!=av) lastch--;
111 <       lastch[1]=0;
112 <       //practical use: if it doesn't have any spaces just strip those useless double quotes
113 <       if (av[0]=='"' && strpbrk(av+1," ;")==NULL) {
114 <               if (*lastch=='"') *lastch=0;
115 <               attr_val=Gstrdup(av+1);
116 <               }
117 <          else attr_val=Gstrdup(av);
118 <       }
119 <
176 >     setValue(av);
177       }
178    ~GffAttr() {
179       GFREE(attr_val);
180       }
181 +  void setValue(const char* av) {
182 +     if (attr_val!=NULL) {
183 +        GFREE(attr_val);
184 +        }
185 +     if (av==NULL || av[0]==0) return;
186 +     //trim spaces
187 +     const char* vstart=av;
188 +     while (*vstart==' ') av++;
189 +     const char* vend=vstart;
190 +     bool keep_dq=false;
191 +     while (vend[1]!=0) {
192 +        if (*vend==' ' && vend[1]!=' ') keep_dq=true;
193 +          else if (*vend==';') keep_dq=true;
194 +        vend++;
195 +        }
196 +     //remove spaces at the end:
197 +     while (*vend==' ' && vend!=vstart) vend--;
198 +     //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
199 +     if (!keep_dq && *vstart=='"' && *vend=='"') {
200 +               vend--;
201 +               vstart++;
202 +               }
203 +     attr_val=Gstrdup(vstart, vend);
204 +     }
205    bool operator==(GffAttr& d){
206        return (this==&d);
207        }
# Line 139 | Line 220
220   class GffNameInfo {
221    friend class GffNameList;
222   protected:
142   //unsigned char shared;
223     int idx;
224   public:
225     char* name;
# Line 147 | Line 227
227     GffNameInfo(const char* n) {
228       name=Gstrdup(n);
229       }
150   /*
151   GffNameInfo(const char* n, bool strshared=false) {
152     idx=-1;
153     if (strshared) {
154        shared=1;
155        name=(char *)n;
156        }
157     else {
158        shared=0;
159        name = (n==NULL) ? NULL : Gstrdup(n);
160        }
161     }
162     */
230  
231     ~GffNameInfo() {
232 <      //if (shared==0)
233 <          GFREE(name);
167 <    }
232 >      GFREE(name);
233 >     }
234  
235     bool operator==(GffNameInfo& d){
236         return (strcmp(this->name, d.name)==0);
237         }
172   bool operator>(GffNameInfo& d){
173      return (strcmp(this->name, d.name)>0);
174      }
238     bool operator<(GffNameInfo& d){
239       return (strcmp(this->name, d.name)<0);
240       }
241   };
242  
180 /*
181 int compareNameId(const pointer item1, const pointer item2) {
182  GffNameInfo* ni1=(GffNameInfo*)item1;
183  GffNameInfo* ni2=(GffNameInfo*)item2;
184  return (ni1->id > ni2->id )? 1 : (ni1->id==ni2->id ? 0 : -1);
185 }
186 */
187
243   class GffNameList:public GList<GffNameInfo> {
244    friend class GffNameInfo;
245    friend class GffNames;
# Line 225 | Line 280
280     idlast=fidx;
281     return fidx;
282     }
283 +
284 + int addNewName(const char* tname) {
285 +    GffNameInfo* f=new GffNameInfo(tname);
286 +    int fidx=this->Add(f);
287 +    f->idx=fidx;
288 +    byName.shkAdd(f->name,f);
289 +    return fidx;
290 +    }
291 +
292   int getId(const char* tname) { //only returns a name id# if found
293      GffNameInfo* f=byName.Find(tname);
294      if (f==NULL) return -1;
295      return f->idx;
296      }
297 < int removeName(const char* tname) {
297 > int removeName() {
298     GError("Error: removing names from GffNameList not allowed!\n");
299     return -1;
300     }
# Line 242 | Line 306
306     GffNameList tracks;
307     GffNameList gseqs;
308     GffNameList attrs;
309 <   GffNameList feats; //feature names - anything except 'mRNA', 'exon', 'CDS' gets stored here
309 >   GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
310     GffNames():tracks(),gseqs(),attrs(), feats() {
311      numrefs=0;
312      //the order below is critical!
313      //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
314      feats.addStatic("mRNA");//index 0=gff_fid_mRNA
315 +    feats.addStatic("transcript");//index 1=gff_fid_transcript
316      feats.addStatic("exon");//index 1=gff_fid_exon
317      feats.addStatic("CDS"); //index 2=gff_fid_CDS
318      }
# Line 270 | Line 335
335   class GffAttrs:public GList<GffAttr> {
336    public:
337      GffAttrs():GList<GffAttr>(false,true,false) { }
338 +    void add_or_update(GffNames* names, const char* attrname, const char* val) {
339 +      int aid=names->attrs.getId(attrname);
340 +      if (aid>=0) {
341 +         //attribute found in the dictionary
342 +         for (int i=0;i<Count();i++) {
343 +            //do we have it?
344 +            if (aid==Get(i)->attr_id) {
345 +                //update the value
346 +                Get(i)->setValue(val);
347 +                return;
348 +                }
349 +            }
350 +         }
351 +        else {
352 +         aid=names->attrs.addNewName(attrname);
353 +         }
354 +      this->Add(new GffAttr(aid, val));
355 +      }
356 +      
357      char* getAttr(GffNames* names, const char* attrname) {
358        int aid=names->attrs.getId(attrname);
359        if (aid>=0)
# Line 277 | Line 361
361            if (aid==Get(i)->attr_id) return Get(i)->attr_val;
362        return NULL;
363        }
364 +    char* getAttr(int aid) {
365 +      if (aid>=0)
366 +        for (int i=0;i<Count();i++)
367 +          if (aid==Get(i)->attr_id) return Get(i)->attr_val;
368 +      return NULL;
369 +      }
370   };
371  
372  
373   class GffExon : public GSeg {
374   public:
375 <  void* uptr; //for later exensibility
375 >  void* uptr; //for later extensions
376    GffAttrs* attrs; //other attributes kept for this exon
377    double score; // gff score column
378    char phase; //GFF phase column - for CDS segments only
# Line 318 | Line 408
408     return attrs->getAttr(names, atrname);
409     }
410  
411 + char* getAttr(int aid) {
412 +   if (attrs==NULL) return NULL;
413 +   return attrs->getAttr(aid);
414 +   }
415 +
416   ~GffExon() { //destructor
417     if (attrs!=NULL) delete attrs;
418     }
# Line 344 | Line 439
439              //'-' : (start,end) are relative to the reverse complement of xstart..xend region
440     //--
441     char* gffID; // ID name for mRNA (parent) feature
442 <   char* gname; // value of Name attribute (if given)
442 >   char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
443 >   char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
444 >   unsigned int flags;
445     //-- friends:
446     friend class GffReader;
447     friend class GffExon;
448   public:
449 <  bool hasErrors; //overlapping exons, or too short introns, etc.
353 <  static GffNames* names; // common string storage that holds the various attribute names etc.
449 >  static GffNames* names; // dictionary storage that holds the various attribute names etc.
450    int track_id; // index of track name in names->tracks
451    int gseq_id; // index of genomic sequence name in names->gseqs
452    int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
453 <  int subftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
453 >  int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
454                     //if ftype_id==gff_fid_mRNA then this value is ignored
455    GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
456 +  GPVec<GffObj> children;
457 +  GffObj* parent;
458    int udata; //user data, flags etc.
459    void* uptr; //user pointer (to a parent object, cluster, locus etc.)
460    GffObj* ulink; //link to another GffObj (user controlled field)
# Line 366 | Line 464
464    uint CDstart; //CDS start coord
465    uint CDend;   //CDS end coord
466    char CDphase; //initial phase for CDS start
467 +  bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); }
468 +  void hasErrors(bool v) {
469 +      if (v) flags |= gfo_flag_HAS_ERRORS;
470 +        else flags &= ~gfo_flag_HAS_ERRORS;
471 +      }
472 +  bool hasGffID() { return ((flags & gfo_flag_HAS_GFF_ID)!=0); }
473 +  void hasGffID(bool v) {
474 +      if (v) flags |= gfo_flag_HAS_GFF_ID;
475 +        else flags &= ~gfo_flag_HAS_GFF_ID;
476 +      }
477 +  bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); }
478 +  void createdByExon(bool v) {
479 +      if (v) flags |= gfo_flag_BY_EXON;
480 +        else flags &= ~gfo_flag_BY_EXON;
481 +      }
482 +  bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); }
483 +  void isGene(bool v) {
484 +      if (v) flags |= gfo_flag_IS_GENE;
485 +        else flags &= ~gfo_flag_IS_GENE;
486 +      }
487 +  bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); }
488 +  void isDiscarded(bool v) {
489 +      if (v) flags |= gfo_flag_DISCARDED;
490 +        else flags &= ~gfo_flag_DISCARDED;
491 +      }
492 +      
493 +  bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); }
494 +  void isUsed(bool v) {
495 +      if (v) flags |= gfo_flag_LST_KEEP;
496 +        else flags &= ~gfo_flag_LST_KEEP;
497 +      }
498 +  bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); }
499 +  void isTranscript(bool v) {
500 +      if (v) flags |= gfo_flag_IS_TRANSCRIPT;
501 +        else flags &= ~gfo_flag_IS_TRANSCRIPT;
502 +      }
503 +  bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); }
504 +  void promotedChildren(bool v) {
505 +    if (v) flags |= gfo_flag_CHILDREN_PROMOTED;
506 +      else flags &= ~gfo_flag_CHILDREN_PROMOTED;
507 +     }
508 +  void setLevel(byte v) {
509 +    if (v==0) flags &= ~gfo_flag_LEVEL_MSK;
510 +         else flags &= ~(((uint)v) << gfo_flagShift_LEVEL);
511 +    }
512 +  byte incLevel() {
513 +    uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
514 +    v++;
515 +    flags &= ~(v << gfo_flagShift_LEVEL);
516 +    return v;
517 +    }
518 +  byte getLevel() {
519 +    return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
520 +    }
521  
522 <  bool ismRNA() { return (ftype_id==gff_fid_mRNA); }
522 >  bool isValidTranscript() {
523 >    //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
524 >    return (isTranscript() && exons.Count()>0);
525 >    }
526 >  
527  
528    int addExon(uint segstart, uint segend, double sc=0, char fr='.',
529               int qs=0, int qe=0, bool iscds=false, char exontype=0);
530  
531 <  int addExon(GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
531 >  int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
532 >
533    void removeExon(int idx);
534 <  /*
378 <  uint  gstart; // global feature coordinates on genomic sequence
379 <  uint  gend;   // ALWAYS gstart <= gend
380 <  */
534 >  void removeExon(GffExon* p);
535    char  strand; //true if features are on the reverse complement strand
536    double gscore;
537    double uscore; //custom, user-computed score, if needed
538    int covlen; //total coverage of  reference genomic sequence (sum of maxcf segment lengths)
539 <
539 >  
540     //--------- optional data:
541    int qlen; //query length, start, end - if available
542    int qstart;
# Line 394 | Line 548
548     //if gfline->Parent!=NULL then this will also add the first sub-feature
549     // otherwise, only the main feature is created
550    void clearAttrs() {
551 <    if (attrs!=NULL) delete attrs;
551 >    if (attrs!=NULL) {
552 >      bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
553 >      delete attrs; attrs=NULL;
554 >      if (sharedattrs) exons[0]->attrs=NULL;
555 >      }
556      }
557 <  GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,true) { //exons: sorted, free, unique
557 >  GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
558 >                                   //exons: sorted, free, non-unique
559         gffID=NULL;
560         uptr=NULL;
561         ulink=NULL;
562 +       flags=0;
563         udata=0;
564 <       hasErrors=false;
564 >       parent=NULL;
565         ftype_id=-1;
566 <       subftype_id=-1;
566 >       exon_ftype_id=-1;
567         if (anid!=NULL) gffID=Gstrdup(anid);
568         gffnames_ref(names);
569         qlen=0;
# Line 417 | Line 577
577         CDphase=0;
578         gseq_id=-1;
579         track_id=-1;
420       /*
421       gstart=0;
422       gend=0;
423       */
580         xstart=0;
581         xend=0;
582         xstatus=0;
583 <       strand=0;
583 >       strand='.';
584         gscore=0;
585         uscore=0;
586         attrs=NULL;
587         covlen=0;
588 <       gname=NULL;
588 >       gene_name=NULL;
589 >       geneID=NULL;
590         }
591     ~GffObj() {
592         GFREE(gffID);
593 <       GFREE(gname);
594 <       if (attrs!=NULL) delete attrs;
593 >       GFREE(gene_name);
594 >       GFREE(geneID);
595 >       clearAttrs();
596         gffnames_unref(names);
597         }
440   /*
441    void addFeature(GffLine* gfline);
442    int getFeatureId(int fnid);
443    int getFeatureId(const char* fname);
444    GFeature* hasFeature(char* parentID);
445    void promote(GFeature* subf, GffLine* gfline); */
598     //--------------
599 <   GffObj* finalize(bool mergeCloseExons=false); //finalize parsing: must be called in order to merge adjacent/close proximity subfeatures
600 <   void parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr=false);
599 >   GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
600 >               bool keepAttrs=false, bool noExonAttr=true);
601 >               //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
602 >   void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
603     const char* getSubfName() { //returns the generic feature type of the entries in exons array
604 <     int sid=subftype_id;
604 >     int sid=exon_ftype_id;
605       if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
606       return names->feats.getName(sid);
607       }
608 +   void addCDS(uint cd_start, uint cd_end, char phase=0);
609 +  
610 +   bool monoFeature() {
611 +     return (exons.Count()==0 ||
612 +          (exons.Count()==1 &&  //exon_ftype_id==ftype_id &&
613 +              exons[0]->end==this->end && exons[0]->start==this->start));
614 +     }
615 +
616 +   bool hasCDS() { return (CDstart>0); }
617 +
618     const char* getFeatureName() {
619       return names->feats.getName(ftype_id);
620       }
621 <   void addAttr(const char* attrname, char* attrvalue);
621 >   void setFeatureName(const char* feature);
622 >  
623 >   void addAttr(const char* attrname, const char* attrvalue);
624 >   int removeAttr(const char* attrname, const char* attrval=NULL);
625 >   int removeAttr(int aid, const char* attrval=NULL);
626 >   int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
627 >   int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
628     const char* getAttrName(int i) {
629       if (attrs==NULL) return NULL;
630       return names->attrs.getName(attrs->Get(i)->attr_id);
631       }
632 <   char* getAttr(const char* atrname) {
633 <     if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
634 <     return attrs->getAttr(names, atrname);
632 >   char* getAttr(const char* attrname, bool checkFirstExon=false) {
633 >     if (names==NULL || attrname==NULL) return NULL;
634 >     char* r=NULL;
635 >     if (attrs==NULL) {
636 >         if (!checkFirstExon) return NULL;
637 >         }
638 >       else r=attrs->getAttr(names, attrname);
639 >     if (r!=NULL) return r;
640 >     if (checkFirstExon && exons.Count()>0) {
641 >        r=exons[0]->getAttr(names, attrname);
642 >        }
643 >     return r;
644       }
645  
646 +   char* getExonAttr(GffExon* exon, const char* attrname) {
647 +      if (exon==NULL || attrname==NULL) return NULL;
648 +      return exon->getAttr(names, attrname);
649 +      }
650 +
651 +   char* getExonAttr(int exonidx, const char* attrname) {
652 +      if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
653 +      return exons[exonidx]->getAttr(names, attrname);
654 +      }
655 +
656     char* getAttrValue(int i) {
657       if (attrs==NULL) return NULL;
658       return attrs->Get(i)->attr_val;
# Line 471 | Line 660
660     const char* getGSeqName() {
661       return names->gseqs.getName(gseq_id);
662       }
663 +
664 +   const char* getRefName() {
665 +     return names->gseqs.getName(gseq_id);
666 +     }
667 +   void setRefName(const char* newname);
668 +  
669     const char* getTrackName() {
670       return names->tracks.getName(track_id);
671       }
672 <   /*
478 <   bool overlap(GffObj* mrna) {
479 <     //basic overlap: just segment ends
480 <     return overlap(*mrna);
481 <     }
482 <   bool overlap(GffObj& d) {
483 <      //ignores strand and gseq_id -- the user must do that in advance
484 <     // just rough locus overlap, exons may not overlap
485 <      return (gstart<d.gstart ? (d.gstart<=gend) : (gstart<=d.gend));
486 <      }
487 <
488 <    bool overlap(uint s, uint e) {
489 <      if (s>e) swap(s,e);
490 <      return (gstart<s ? (s<=gend) : (gstart<=e));
491 <      }
492 <    */
493 <    bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
672 >   bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
673        //ignores strand!
674 <      if (s>e) swap(s,e);
674 >      if (s>e) Gswap(s,e);
675        for (int i=0;i<exons.Count();i++) {
676           if (exons[i]->overlap(s,e)) return true;
677           }
# Line 506 | Line 685
685              if (exons[i]->start>m.exons[j]->end) continue;
686              if (m.exons[j]->start>exons[i]->end) break;
687              //-- overlap if we are here:
509            // if (exons[i]->overlap(m.exons[j])) return true;
688              return true;
689              }
690           }
691        return false;
692        }
693 <
693 >    
694      int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
695 <      //return the exons' index for the overlapping exon
695 >      //return the exons' index for the overlapping OR ADJACENT exon
696        //ovlen, if given, will return the overlap length
697 <      if (s>e) swap(s,e);
697 >      if (s>e) Gswap(s,e);
698 >      s--;e++; //to also catch adjacent exons
699        for (int i=0;i<exons.Count();i++) {
700              if (exons[i]->start>e) break;
701              if (s>exons[i]->end) continue;
702              //-- overlap if we are here:
703              if (ovlen!=NULL) {
704 +              s++;e--;
705                int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
706                *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
707                }
# Line 530 | Line 710
710        *ovlen=0;
711        return -1;
712        }
713 <
713 >    
714      int exonOverlapLen(GffObj& m) {
715        if (start>m.end || m.start>end) return 0;
716        int i=0;
# Line 641 | Line 821
821     bool operator>(GffObj& d){
822        if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
823        if (start==d.start) {
824 <         if (end==d.end) return strcmp(gffID, d.gffID)>0;
825 <                      else return end>d.end;
824 >         if (getLevel()==d.getLevel()) {
825 >             if (end==d.end) return (strcmp(gffID, d.gffID)>0);
826 >                        else return (end>d.end);
827 >             } else return (getLevel()>d.getLevel());
828           } else return (start>d.start);
829        }
830     bool operator<(GffObj& d){
831       if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
832       if (start==d.start) {
833 <        if (end==d.end) return strcmp(gffID, d.gffID)<0;
833 >         if (getLevel()==d.getLevel()) {
834 >            if (end==d.end) return strcmp(gffID, d.gffID)<0;
835                       else return end<d.end;
836 +            } else return (getLevel()<d.getLevel());
837          } else return (start<d.start);
838       }
839     char* getID() { return gffID; }
840 <   char* getGene() { return gname; }
841 <   //void calcScore();
842 <   /*int exonCount() { return exoncount; }
843 <   GffExon* getExonSegs() { return exons; }
844 <   int cdsCount() { return cdscount; }
845 <   GffExon* getCDSegs() { return cds; } */
846 <
847 <   /* bool validateMapping(int qrylen, bool pmap_parsing, int minpid,
848 <                                         int mincov, int maxintron);*/
849 <   /* int addSeg(char* feat, int nstart, int nend, int sc=0, char fr='.',
666 <                                          char* tid=NULL, char* tinfo=NULL);*/
840 >   char* getGeneID() { return geneID; }
841 >   char* getGeneName() { return gene_name; }
842 >   void setGeneName(const char* gname) {
843 >        GFREE(gene_name);
844 >        if (gname) gene_name=Gstrdup(gname);
845 >        }
846 >   void setGeneID(const char* gene_id) {
847 >        GFREE(geneID);
848 >        if (gene_id) geneID=Gstrdup(gene_id);
849 >        }
850     int addSeg(GffLine* gfline);
851     int addSeg(int fnid, GffLine* gfline);
669    // (int fnid, char* feat, int nstart, int nend, int sc=0,
670    //                           char fr='.', char* tid=NULL, char* tinfo=NULL);
852     void getCDSegs(GArray<GffCDSeg>& cds);
853 <   void printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
854 <                                uint segstart, uint segend, int exidx, char phase, bool gff3);
855 <   void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, char* tlabel=NULL);
856 <   void printGtf(FILE* fout, char* tlabel=NULL) {
853 >
854 >   void updateExonPhase(); //for CDS-only features, updates GExon::phase
855 >
856 >   void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
857 >          bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
858 >   void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
859 >             const char* tlabel=NULL, const char* gfparent=NULL);
860 >   void printGtf(FILE* fout, const char* tlabel=NULL) {
861        printGxf(fout, pgtfAny, tlabel);
862        }
863 <   void printGff(FILE* fout, char* tlabel=NULL) {
864 <      printGxf(fout, pgffAny, tlabel);
863 >   void printGff(FILE* fout, const char* tlabel=NULL,
864 >                                const char* gfparent=NULL) {
865 >      printGxf(fout, pgffAny, tlabel, gfparent);
866        }
867 <   void print_mRNAGff(FILE* fout, char* tlabel=NULL, bool showCDS=false) {
868 <      if (ismRNA())
869 <         printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel);
867 >   void printTranscriptGff(FILE* fout, char* tlabel=NULL,
868 >                            bool showCDS=false, const char* gfparent=NULL) {
869 >      if (isValidTranscript())
870 >         printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
871        }
872     void printSummary(FILE* fout=NULL);
873     void getCDS_ends(uint& cds_start, uint& cds_end);
874     void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
875     char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
876             uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
877 +    char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
878     char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
879     //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
880     bool empty() { return (start==0); }
881   };
882  
695 //int cmpGMapScore(const pointer a, const pointer b);
696
883   typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
884   //user callback after parsing a mapping object:
885   // Returns: "done with it" status:
# Line 711 | Line 897
897     //int fcount;//number of features on this gseq
898     uint mincoord;
899     uint maxcoord;
900 <   GList<GffObj> gflst;
901 <   GSeqStat(int id=-1, char* name=NULL):gflst(true,false,false) {
900 >   uint maxfeat_len; //maximum feature length on this genomic sequence
901 >   GffObj* maxfeat;
902 >   GSeqStat(int id=-1, char* name=NULL) {
903       gseqid=id;
904       gseqname=name;
905       mincoord=MAXUINT;
906       maxcoord=0;
907 +     maxfeat_len=0;
908 +     maxfeat=NULL;
909       }
910     bool operator>(GSeqStat& g) {
911      return (gseqid>g.gseqid);
# Line 733 | Line 922
922   int gfo_cmpByLoc(const pointer p1, const pointer p2);
923  
924   class GfList: public GList<GffObj> {
925 < //just adding the option to sort by genomic sequence and coordinate
925 >  //just adding the option to sort by genomic sequence and coordinate
926 >   bool mustSort;
927   public:
928     GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
929 <    if (sortbyloc) this->setSorted((GCompareProc*)gfo_cmpByLoc);
930 <    }
929 >     //GffObjs in this list are NOT deleted when the list is cleared
930 >     //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
931 >     mustSort=sortbyloc;
932 >     }
933 >   void sortedByLoc(bool v=true) {
934 >     bool prev=mustSort;
935 >     mustSort=v;
936 >     if (fCount>0 && mustSort && !prev) {
937 >       this->setSorted((GCompareProc*)gfo_cmpByLoc);
938 >       }
939 >     }
940 >   void finalize(GffReader* gfr, bool mergeCloseExons,
941 >                bool keepAttrs=false, bool noExonAttr=true) { //if set, enforce sort by locus
942 >     if (mustSort) { //force (re-)sorting
943 >        this->setSorted(false);
944 >        this->setSorted((GCompareProc*)gfo_cmpByLoc);
945 >        }
946 >     int delcount=0;
947 >     for (int i=0;i<Count();i++) {
948 >       //finish the parsing for each GffObj
949 >       fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
950 >       }
951 >     if (delcount>0) this->Pack();
952 >     }
953 >   void freeAll() {
954 >     for (int i=0;i<fCount;i++) {
955 >       delete fList[i];
956 >       fList[i]=NULL;
957 >       }
958 >     Clear();
959 >     }
960 >   void freeUnused() {
961 >     for (int i=0;i<fCount;i++) {
962 >       if (fList[i]->isUsed()) continue;
963 >       //inform the children
964 >       for (int c=0;c<fList[i]->children.Count();c++) {
965 >          fList[i]->children[c]->parent=NULL;
966 >          }
967 >       delete fList[i];
968 >       fList[i]=NULL;
969 >       }
970 >     Clear();
971 >     }
972 >
973   };
974  
975 + struct GfoHolder {
976 +   int idx; //position in GffReader::gflst array
977 +   GffObj* gffobj;
978 +   GfoHolder(GffObj* gfo=NULL, int i=0) {
979 +     idx=i;
980 +     gffobj=gfo;
981 +     }
982 + };
983 +
984 + class CNonExon { //utility class used in subfeature promotion
985 + public:
986 +   int idx;
987 +   GffObj* parent;
988 +   GffExon* exon;
989 +   GffLine* gffline;
990 +   CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
991 +     parent=p;
992 +     exon=e;
993 +     idx=i;
994 +     gffline=new GffLine(gl);
995 +     }
996 +  ~CNonExon() {
997 +     delete gffline;
998 +     }
999 + };
1000 +
1001 +
1002   class GffReader {
1003    friend class GffObj;
1004    friend class GffLine;
# Line 747 | Line 1006
1006    off_t fpos;
1007    int buflen;
1008   protected:
1009 +  bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
1010    FILE* fh;
1011    char* fname;  //optional fasta file with the underlying genomic sequence to be attached to this reader
1012    GffNames* names; //just a pointer to the global static Gff names repository in GffObj
1013    GffLine* gffline;
1014 <  bool mrnaOnly; //read only mRNAs ? (exon/CDS features only)
1015 <  bool sortbyloc; //sort by location: genomic sequence and start coordinate
1016 <  GHash<GffObj> phash; //transcript_id (Parent~Contig) => GffObj pointer
1014 >  bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1015 >  GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1016 >                            // of discarded parent IDs
1017 >  GHash< GVec<GfoHolder> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
1018 >  //GHash<int> tids; //just for transcript_id uniqueness
1019    char* gfoBuildId(const char* id, const char* ctg);
1020 <  void gfoRemove(const char* id, const char* ctg);
1021 <  void gfoAdd(const char* id, const char* ctg, GffObj* gfo);
1022 <  GffObj* gfoFind(const char* id, const char* ctg);
1020 >  //void gfoRemove(const char* id, const char* ctg);
1021 >  GfoHolder* gfoAdd(GffObj* gfo, int idx);
1022 >  GfoHolder* gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx);
1023 >  // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
1024 >  GfoHolder* gfoFind(const char* id, const char* ctg=NULL, GVec<GfoHolder>** glst=NULL,
1025 >                                                 char strand=0, uint start=0, uint end=0);
1026 >  CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
1027 >  void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
1028 >  GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1029 >                                  bool keepAttr, bool noExonAttr);
1030   public:
1031 <  GfList gflst; //all read gflst
1031 >  GfList gflst; //accumulate GffObjs being read
1032 >  GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
1033 >                               GffObj* parent=NULL, GffExon* pexon=NULL, GVec<GfoHolder>* glst=NULL);
1034 >  GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1035 >  GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1036 >                                         bool keepAttr);
1037 >  GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
1038 >  bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
1039    GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
1040 <  GffReader(FILE* f, bool justmrna=false, bool sort=false):phash(false),gflst(sort), gseqstats(true,true,true) {
1040 >  GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
1041 >                       phash(true), gflst(sortbyloc), gseqstats(true,true,true) {
1042 >      gff_warns=gff_show_warnings;
1043        names=NULL;
1044        gffline=NULL;
1045 <      mrnaOnly=justmrna;
768 <      sortbyloc=sort;
1045 >      transcriptsOnly=t_only;
1046        fpos=0;
1047        fname=NULL;
1048        fh=f;
1049        GMALLOC(linebuf, GFF_LINELEN);
1050        buflen=GFF_LINELEN-1;
1051        }
1052 <  GffReader(char* fn, bool justmrna=false, bool sort=false):phash(false),gflst(sort) {
1052 >  void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
1053 >      fname=NULL;
1054 >      fh=f;
1055 >      if (fh!=NULL) rewind(fh);
1056 >      fpos=0;
1057 >      transcriptsOnly=t_only;
1058 >      gflst.sortedByLoc(sortbyloc);
1059 >      }
1060 >  GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
1061 >                             gflst(sort),gseqstats(true,true,true) {
1062 >      gff_warns=gff_show_warnings;
1063        names=NULL;
1064        fname=Gstrdup(fn);
1065 <      mrnaOnly=justmrna;
779 <      sortbyloc=sort;
1065 >      transcriptsOnly=t_only;
1066        fh=fopen(fname, "rb");
1067        fpos=0;
1068        gffline=NULL;
# Line 784 | Line 1070
1070        buflen=GFF_LINELEN-1;
1071        }
1072  
1073 <  virtual ~GffReader() {
1073 > ~GffReader() {
1074 >      delete gffline;
1075        gffline=NULL;
1076        fpos=0;
1077 <      delete gffline;
1077 >      gflst.freeUnused();
1078        gflst.Clear();
1079 +      discarded_ids.Clear();
1080        phash.Clear();
1081        gseqstats.Clear();
1082        GFREE(fname);
1083        GFREE(linebuf);
1084        }
1085  
1086 +  void showWarnings(bool v=true) {
1087 +      gff_warns=v;
1088 +      gff_show_warnings=v;
1089 +      }
1090 +      
1091    GffLine* nextGffLine();
1092  
1093 <  // parse -> block parsing functions -- do not use,
1094 <  // they always assume that subfeatures (exons) are grouped together by parent
802 <  GffObj* parse(bool keepAttr=false, bool noExonAttr=true);
803 <  void parseAll(GffRecFunc* gproc, bool keepAttr=false, bool noExonAttr=true, void* userptr1=NULL, void* userptr2=NULL);
804 <
805 <  // use this instead of parse: load all subfeatures, re-group them in memory:
806 <  void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); //just reads all gff records into gflst GList
1093 >  // load all subfeatures, re-group them:
1094 >  void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
1095  
1096   }; // end of GffReader
1097  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines