ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
Revision: 248
Committed: Wed May 23 22:10:55 2012 UTC (7 years, 1 month ago) by gpertea
File size: 35308 byte(s)
Log Message:
put GVec and GPVec in GVec.hh, added GIntervalTree.hh

Line File contents
1 #ifndef GFF_H
2 #define GFF_H
3
4 #include "GBase.h"
5 #include "gdna.h"
6 #include "codons.h"
7 #include "GFaSeqGet.h"
8 #include "GList.hh"
9 #include "GHash.hh"
10
11 /*
12 const byte exMskMajSpliceL = 0x01;
13 const byte exMskMajSpliceR = 0x02;
14 const byte exMskMinSpliceL = 0x04;
15 const byte exMskMinSpliceR = 0x08;
16 const byte exMskTag = 0x80;
17 */
18
19 //reserved Gffnames::feats entries -- basic feature types
20 extern const int gff_fid_mRNA; // "mRNA" feature name
21 extern const int gff_fid_transcript; // *RNA, *transcript feature name
22 extern const int gff_fid_exon;
23 extern const int gff_fid_CDS; //never really used, except for display only
24 //use gff_fid_exon instead
25 extern const uint GFF_MAX_LOCUS;
26 extern const uint GFF_MAX_EXON;
27 extern const uint GFF_MAX_INTRON;
28
29 extern const uint gfo_flag_CHILDREN_PROMOTED;
30 extern const uint gfo_flag_HAS_ERRORS;
31 extern const uint gfo_flag_IS_GENE;
32 extern const uint gfo_flag_HAS_GFF_ID; //found a GFF3 formatted main feature with its own ID
33 extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly
34 //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
35 extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
36 extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
37 extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
38 //when GffReader is destroyed
39 extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
40 extern const byte gfo_flagShift_LEVEL;
41
42 extern bool gff_show_warnings;
43
44 #define GFF_LINELEN 2048
45 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
46
47
48 enum GffExonType {
49 exgffNone=0, //not a recognizable exon or CDS segment
50 exgffStart, //from "start_codon" feature (within CDS)
51 exgffStop, //from "stop_codon" feature (may be outside CDS)
52 exgffCDS, //from "CDS" feature
53 exgffUTR, //from "UTR" feature
54 exgffCDSUTR, //from a merge of UTR and CDS feature
55 exgffExon, //from "exon" feature
56 };
57
58 class GffReader;
59
60 class GffLine {
61 char* _parents; //stores a copy of the Parent attribute value,
62 //with commas replaced by \0
63 int _parents_len;
64 public:
65 char* dupline; //duplicate of original line
66 char* line; //this will have tabs replaced by \0
67 int llen;
68 char* gseqname;
69 char* track;
70 char* ftype; //feature name: mRNA/gene/exon/CDS
71 char* info; //the last, attributes' field, unparsed
72 uint fstart;
73 uint fend;
74 uint qstart; //overlap coords on query, if available
75 uint qend;
76 uint qlen; //query len, if given
77 double score;
78 char strand;
79 bool skip;
80 bool is_gff3; //if the line appears to be in GFF3 format
81 bool is_cds; //"cds" and "stop_codon" features
82 bool is_exon; //"exon" and "utr" features
83 char exontype; // gffExonType
84 bool is_transcript; //if current feature is *RNA or *transcript
85 bool is_gene; //if current feature is *gene
86 char phase; // '.' , '0', '1' or '2'
87 // -- allocated strings:
88 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
89 char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
90 //
91 char** parents; //for GTF only parents[0] is used
92 int num_parents;
93 char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
94 GffLine(GffReader* reader, const char* l); //parse the line accordingly
95 void discardParent() {
96 GFREE(_parents);
97 _parents_len=0;
98 num_parents=0;
99 parents=NULL;
100 }
101 char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false);
102 GffLine(GffLine* l) { //a copy constructor
103 memcpy((void*)this, (void*)l, sizeof(GffLine));
104 line=NULL;
105 GMALLOC(line, llen+1);
106 memcpy(line, l->line, llen+1);
107 GMALLOC(dupline, llen+1);
108 memcpy(dupline, l->dupline, llen+1);
109 //--offsets within line[]
110 gseqname=line+(l->gseqname-l->line);
111 track=line+(l->track-l->line);
112 ftype=line+(l->ftype-l->line);
113 info=line+(l->info-l->line);
114 //Parent=Gstrdup(l->Parent);
115 if (l->_parents_len>0) {
116 _parents_len=l->_parents_len;
117 GMALLOC(_parents, _parents_len);
118 memcpy(_parents, l->_parents, _parents_len);
119 num_parents=l->num_parents;
120 for (int i=0;i<num_parents;i++) {
121 parents[i]=_parents+(l->parents[i] - l->_parents);
122 }
123 }
124 //-- allocated string copies:
125 ID=Gstrdup(l->ID);
126 if (l->gene_name!=NULL)
127 gene_name=Gstrdup(l->gene_name);
128 if (l->gene_id!=NULL)
129 gene_id=Gstrdup(l->gene_id);
130 }
131 GffLine() {
132 line=NULL;
133 dupline=NULL;
134 gseqname=NULL;
135 track=NULL;
136 ftype=NULL;
137 fstart=0;
138 fend=0;
139 strand=0;phase=0;
140 llen=0;score=0;
141 info=NULL;
142 _parents=NULL;
143 _parents_len=0;
144 parents=NULL;
145 num_parents=0;
146 ID=NULL;
147 gene_name=NULL;
148 gene_id=NULL;
149 skip=true;
150 qstart=0;
151 qend=0;
152 qlen=0;
153 exontype=0;
154 is_cds=false;
155 is_gff3=false;
156 is_transcript=false;
157 is_gene=false;
158 is_exon=false;
159 }
160 ~GffLine() {
161 GFREE(dupline);
162 GFREE(line);
163 GFREE(_parents);
164 GFREE(parents);
165 GFREE(ID);
166 GFREE(gene_name);
167 GFREE(gene_id);
168 }
169 };
170
171 class GffAttr {
172 public:
173 int attr_id;
174 char* attr_val;
175 GffAttr(int an_id, const char* av=NULL) {
176 attr_id=an_id;
177 attr_val=NULL;
178 setValue(av);
179 }
180 ~GffAttr() {
181 GFREE(attr_val);
182 }
183 void setValue(const char* av) {
184 if (attr_val!=NULL) {
185 GFREE(attr_val);
186 }
187 if (av==NULL || av[0]==0) return;
188 //trim spaces
189 const char* vstart=av;
190 while (*vstart==' ') av++;
191 const char* vend=vstart;
192 bool keep_dq=false;
193 while (vend[1]!=0) {
194 if (*vend==' ' && vend[1]!=' ') keep_dq=true;
195 else if (*vend==';') keep_dq=true;
196 vend++;
197 }
198 //remove spaces at the end:
199 while (*vend==' ' && vend!=vstart) vend--;
200 //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
201 if (!keep_dq && *vstart=='"' && *vend=='"') {
202 vend--;
203 vstart++;
204 }
205 attr_val=Gstrdup(vstart, vend);
206 }
207 bool operator==(GffAttr& d){
208 return (this==&d);
209 }
210 bool operator>(GffAttr& d){
211 return (this>&d);
212 }
213 bool operator<(GffAttr& d){
214 return (this<&d);
215 }
216
217 };
218
219 class GffNameList;
220 class GffNames;
221
222 class GffNameInfo {
223 friend class GffNameList;
224 protected:
225 int idx;
226 public:
227 char* name;
228 GffNameInfo(const char* n=NULL):idx(-1),name(NULL) {
229 if (n) name=Gstrdup(n);
230 }
231
232 ~GffNameInfo() {
233 GFREE(name);
234 }
235
236 bool operator==(GffNameInfo& d){
237 return (strcmp(this->name, d.name)==0);
238 }
239 bool operator<(GffNameInfo& d){
240 return (strcmp(this->name, d.name)<0);
241 }
242 };
243
244 class GffNameList:public GList<GffNameInfo> {
245 friend class GffNameInfo;
246 friend class GffNames;
247 protected:
248 GHash<GffNameInfo> byName;//hash with shared keys
249 int idlast; //fList index of last added/reused name
250 void addStatic(const char* tname) {// fast add
251 GffNameInfo* f=new GffNameInfo(tname);
252 idlast=this->Add(f);
253 f->idx=idlast;
254 byName.shkAdd(f->name,f);
255 }
256 public:
257 GffNameList(int init_capacity=6):GList<GffNameInfo>(init_capacity, false,true,true), byName(false) {
258 idlast=-1;
259 setCapacity(init_capacity);
260 }
261 char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
262 int lastNameId() { return idlast; }
263 char* getName(int nid) { //retrieve name by its ID
264 if (nid<0 || nid>=fCount)
265 GError("GffNameList Error: invalid index (%d)\n",nid);
266 return fList[nid]->name;
267 }
268
269 int addName(const char* tname) {//returns or create an id for the given name
270 //check idlast first, chances are it's the same feature name checked
271 /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
272 return idlast;*/
273 GffNameInfo* f=byName.Find(tname);
274 int fidx=-1;
275 if (f!=NULL) fidx=f->idx;
276 else {//add new entry
277 f=new GffNameInfo(tname);
278 fidx=this->Add(f);
279 f->idx=fidx;
280 byName.shkAdd(f->name,f);
281 }
282 idlast=fidx;
283 return fidx;
284 }
285
286 int addNewName(const char* tname) {
287 GffNameInfo* f=new GffNameInfo(tname);
288 int fidx=this->Add(f);
289 f->idx=fidx;
290 byName.shkAdd(f->name,f);
291 return fidx;
292 }
293
294 int getId(const char* tname) { //only returns a name id# if found
295 GffNameInfo* f=byName.Find(tname);
296 if (f==NULL) return -1;
297 return f->idx;
298 }
299 int removeName() {
300 GError("Error: removing names from GffNameList not allowed!\n");
301 return -1;
302 }
303 };
304
305 class GffNames {
306 public:
307 int numrefs;
308 GffNameList tracks;
309 GffNameList gseqs;
310 GffNameList attrs;
311 GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
312 GffNames():tracks(),gseqs(),attrs(), feats() {
313 numrefs=0;
314 //the order below is critical!
315 //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
316 feats.addStatic("mRNA");//index 0=gff_fid_mRNA
317 feats.addStatic("transcript");//index 1=gff_fid_transcript
318 feats.addStatic("exon");//index 1=gff_fid_exon
319 feats.addStatic("CDS"); //index 2=gff_fid_CDS
320 }
321 };
322
323 void gffnames_ref(GffNames* &n);
324 void gffnames_unref(GffNames* &n);
325
326 enum GffPrintMode {
327 pgtfAny, //print record as read
328 pgtfExon,
329 pgtfCDS,
330 pgffAny, //print record as read
331 pgffExon,
332 pgffCDS,
333 pgffBoth,
334 };
335
336
337 class GffAttrs:public GList<GffAttr> {
338 public:
339 GffAttrs():GList<GffAttr>(false,true,false) { }
340 void add_or_update(GffNames* names, const char* attrname, const char* val) {
341 int aid=names->attrs.getId(attrname);
342 if (aid>=0) {
343 //attribute found in the dictionary
344 for (int i=0;i<Count();i++) {
345 //do we have it?
346 if (aid==Get(i)->attr_id) {
347 //update the value
348 Get(i)->setValue(val);
349 return;
350 }
351 }
352 }
353 else {
354 aid=names->attrs.addNewName(attrname);
355 }
356 this->Add(new GffAttr(aid, val));
357 }
358
359 char* getAttr(GffNames* names, const char* attrname) {
360 int aid=names->attrs.getId(attrname);
361 if (aid>=0)
362 for (int i=0;i<Count();i++)
363 if (aid==Get(i)->attr_id) return Get(i)->attr_val;
364 return NULL;
365 }
366 char* getAttr(int aid) {
367 if (aid>=0)
368 for (int i=0;i<Count();i++)
369 if (aid==Get(i)->attr_id) return Get(i)->attr_val;
370 return NULL;
371 }
372 };
373
374
375 class GffExon : public GSeg {
376 public:
377 void* uptr; //for later extensions
378 GffAttrs* attrs; //other attributes kept for this exon
379 double score; // gff score column
380 char phase; //GFF phase column - for CDS segments only
381 // '.' = undefined (UTR), '0','1','2' for CDS exons
382 char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
383 int qstart; // for mRNA/protein exon mappings: coordinates on query
384 int qend;
385 GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
386 uptr=NULL;
387 attrs=NULL;
388 if (s<e) {
389 start=s;
390 end=e;
391 }
392 else {
393 start=e;
394 end=s;
395 }
396 if (qs<qe) {
397 qstart=qs;
398 qend=qe;
399 } else {
400 qstart=qe;
401 qend=qs;
402 }
403 score=sc;
404 phase=fr;
405 exontype=et;
406 } //constructor
407
408 char* getAttr(GffNames* names, const char* atrname) {
409 if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
410 return attrs->getAttr(names, atrname);
411 }
412
413 char* getAttr(int aid) {
414 if (attrs==NULL) return NULL;
415 return attrs->getAttr(aid);
416 }
417
418 ~GffExon() { //destructor
419 if (attrs!=NULL) delete attrs;
420 }
421 };
422
423
424 class GffCDSeg:public GSeg {
425 public:
426 char phase;
427 int exonidx;
428 };
429 //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
430 class GffObj:public GSeg {
431 //utility segment-merging function for addExon()
432 void expandExon(int xovl, uint segstart, uint segend,
433 char exontype, double sc, char fr, int qs, int qe);
434 protected:
435 //coordinate transformation data:
436 uint xstart; //absolute genomic coordinates of reference region
437 uint xend;
438 char xstatus; //coordinate transform status:
439 //0 : (start,end) coordinates are absolute
440 //'+' : (start,end) coords are relative to xstart..xend region
441 //'-' : (start,end) are relative to the reverse complement of xstart..xend region
442 //--
443 char* gffID; // ID name for mRNA (parent) feature
444 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
445 char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
446 unsigned int flags;
447 //-- friends:
448 friend class GffReader;
449 friend class GffExon;
450 public:
451 static GffNames* names; // dictionary storage that holds the various attribute names etc.
452 int track_id; // index of track name in names->tracks
453 int gseq_id; // index of genomic sequence name in names->gseqs
454 int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
455 int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
456 //if ftype_id==gff_fid_mRNA then this value is ignored
457 GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
458 GPVec<GffObj> children;
459 GffObj* parent;
460 int udata; //user data, flags etc.
461 void* uptr; //user pointer (to a parent object, cluster, locus etc.)
462 GffObj* ulink; //link to another GffObj (user controlled field)
463 // mRNA specific fields:
464 bool isCDS; //just a CDS, no UTRs
465 bool partial; //partial CDS
466 uint CDstart; //CDS start coord
467 uint CDend; //CDS end coord
468 char CDphase; //initial phase for CDS start
469 bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); }
470 void hasErrors(bool v) {
471 if (v) flags |= gfo_flag_HAS_ERRORS;
472 else flags &= ~gfo_flag_HAS_ERRORS;
473 }
474 bool hasGffID() { return ((flags & gfo_flag_HAS_GFF_ID)!=0); }
475 void hasGffID(bool v) {
476 if (v) flags |= gfo_flag_HAS_GFF_ID;
477 else flags &= ~gfo_flag_HAS_GFF_ID;
478 }
479 bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); }
480 void createdByExon(bool v) {
481 if (v) flags |= gfo_flag_BY_EXON;
482 else flags &= ~gfo_flag_BY_EXON;
483 }
484 bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); }
485 void isGene(bool v) {
486 if (v) flags |= gfo_flag_IS_GENE;
487 else flags &= ~gfo_flag_IS_GENE;
488 }
489 bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); }
490 void isDiscarded(bool v) {
491 if (v) flags |= gfo_flag_DISCARDED;
492 else flags &= ~gfo_flag_DISCARDED;
493 }
494
495 bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); }
496 void isUsed(bool v) {
497 if (v) flags |= gfo_flag_LST_KEEP;
498 else flags &= ~gfo_flag_LST_KEEP;
499 }
500 bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); }
501 void isTranscript(bool v) {
502 if (v) flags |= gfo_flag_IS_TRANSCRIPT;
503 else flags &= ~gfo_flag_IS_TRANSCRIPT;
504 }
505 bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); }
506 void promotedChildren(bool v) {
507 if (v) flags |= gfo_flag_CHILDREN_PROMOTED;
508 else flags &= ~gfo_flag_CHILDREN_PROMOTED;
509 }
510 void setLevel(byte v) {
511 if (v==0) flags &= ~gfo_flag_LEVEL_MSK;
512 else flags &= ~(((uint)v) << gfo_flagShift_LEVEL);
513 }
514 byte incLevel() {
515 uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
516 v++;
517 flags &= ~(v << gfo_flagShift_LEVEL);
518 return v;
519 }
520 byte getLevel() {
521 return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
522 }
523
524 bool isValidTranscript() {
525 //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
526 return (isTranscript() && exons.Count()>0);
527 }
528
529
530 int addExon(uint segstart, uint segend, double sc=0, char fr='.',
531 int qs=0, int qe=0, bool iscds=false, char exontype=0);
532
533 int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
534
535 void removeExon(int idx);
536 void removeExon(GffExon* p);
537 char strand; //true if features are on the reverse complement strand
538 double gscore;
539 double uscore; //custom, user-computed score, if needed
540 int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
541
542 //--------- optional data:
543 int qlen; //query length, start, end - if available
544 int qstart;
545 int qend;
546 int qcov; //query coverage - percent
547 GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
548 //constructor by gff line parsing:
549 GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
550 //if gfline->Parent!=NULL then this will also add the first sub-feature
551 // otherwise, only the main feature is created
552 void clearAttrs() {
553 if (attrs!=NULL) {
554 bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
555 delete attrs; attrs=NULL;
556 if (sharedattrs) exons[0]->attrs=NULL;
557 }
558 }
559 GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
560 //exons: sorted, free, non-unique
561 gffID=NULL;
562 uptr=NULL;
563 ulink=NULL;
564 flags=0;
565 udata=0;
566 parent=NULL;
567 ftype_id=-1;
568 exon_ftype_id=-1;
569 if (anid!=NULL) gffID=Gstrdup(anid);
570 gffnames_ref(names);
571 qlen=0;
572 qstart=0;
573 qend=0;
574 qcov=0;
575 partial=true;
576 isCDS=false;
577 CDstart=0; // hasCDS <=> CDstart>0
578 CDend=0;
579 CDphase=0;
580 gseq_id=-1;
581 track_id=-1;
582 xstart=0;
583 xend=0;
584 xstatus=0;
585 strand='.';
586 gscore=0;
587 uscore=0;
588 attrs=NULL;
589 covlen=0;
590 gene_name=NULL;
591 geneID=NULL;
592 }
593 ~GffObj() {
594 GFREE(gffID);
595 GFREE(gene_name);
596 GFREE(geneID);
597 clearAttrs();
598 gffnames_unref(names);
599 }
600 //--------------
601 GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
602 bool keepAttrs=false, bool noExonAttr=true);
603 //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
604 void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
605 const char* getSubfName() { //returns the generic feature type of the entries in exons array
606 int sid=exon_ftype_id;
607 if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
608 return names->feats.getName(sid);
609 }
610 void addCDS(uint cd_start, uint cd_end, char phase=0);
611
612 bool monoFeature() {
613 return (exons.Count()==0 ||
614 (exons.Count()==1 && //exon_ftype_id==ftype_id &&
615 exons[0]->end==this->end && exons[0]->start==this->start));
616 }
617
618 bool hasCDS() { return (CDstart>0); }
619
620 const char* getFeatureName() {
621 return names->feats.getName(ftype_id);
622 }
623 void setFeatureName(const char* feature);
624
625 void addAttr(const char* attrname, const char* attrvalue);
626 int removeAttr(const char* attrname, const char* attrval=NULL);
627 int removeAttr(int aid, const char* attrval=NULL);
628 int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
629 int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
630 const char* getAttrName(int i) {
631 if (attrs==NULL) return NULL;
632 return names->attrs.getName(attrs->Get(i)->attr_id);
633 }
634 char* getAttr(const char* attrname, bool checkFirstExon=false) {
635 if (names==NULL || attrname==NULL) return NULL;
636 char* r=NULL;
637 if (attrs==NULL) {
638 if (!checkFirstExon) return NULL;
639 }
640 else r=attrs->getAttr(names, attrname);
641 if (r!=NULL) return r;
642 if (checkFirstExon && exons.Count()>0) {
643 r=exons[0]->getAttr(names, attrname);
644 }
645 return r;
646 }
647
648 char* getExonAttr(GffExon* exon, const char* attrname) {
649 if (exon==NULL || attrname==NULL) return NULL;
650 return exon->getAttr(names, attrname);
651 }
652
653 char* getExonAttr(int exonidx, const char* attrname) {
654 if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
655 return exons[exonidx]->getAttr(names, attrname);
656 }
657
658 char* getAttrValue(int i) {
659 if (attrs==NULL) return NULL;
660 return attrs->Get(i)->attr_val;
661 }
662 const char* getGSeqName() {
663 return names->gseqs.getName(gseq_id);
664 }
665
666 const char* getRefName() {
667 return names->gseqs.getName(gseq_id);
668 }
669 void setRefName(const char* newname);
670
671 const char* getTrackName() {
672 return names->tracks.getName(track_id);
673 }
674 bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
675 //ignores strand!
676 if (s>e) Gswap(s,e);
677 for (int i=0;i<exons.Count();i++) {
678 if (exons[i]->overlap(s,e)) return true;
679 }
680 return false;
681 }
682 bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
683 //if (gseq_id!=m.gseq_id) return false;
684 // ignores strand and gseq_id, must check in advance
685 for (int i=0;i<exons.Count();i++) {
686 for (int j=0;j<m.exons.Count();j++) {
687 if (exons[i]->start>m.exons[j]->end) continue;
688 if (m.exons[j]->start>exons[i]->end) break;
689 //-- overlap if we are here:
690 return true;
691 }
692 }
693 return false;
694 }
695
696 int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
697 //return the exons' index for the overlapping OR ADJACENT exon
698 //ovlen, if given, will return the overlap length
699 if (s>e) Gswap(s,e);
700 s--;e++; //to also catch adjacent exons
701 for (int i=0;i<exons.Count();i++) {
702 if (exons[i]->start>e) break;
703 if (s>exons[i]->end) continue;
704 //-- overlap if we are here:
705 if (ovlen!=NULL) {
706 s++;e--;
707 int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
708 *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
709 }
710 return i;
711 } //for each exon
712 *ovlen=0;
713 return -1;
714 }
715
716 int exonOverlapLen(GffObj& m) {
717 if (start>m.end || m.start>end) return 0;
718 int i=0;
719 int j=0;
720 int ovlen=0;
721 while (i<exons.Count() && j<m.exons.Count()) {
722 uint istart=exons[i]->start;
723 uint iend=exons[i]->end;
724 uint jstart=m.exons[j]->start;
725 uint jend=m.exons[j]->end;
726 if (istart>jend) { j++; continue; }
727 if (jstart>iend) { i++; continue; }
728 //exon overlap
729 uint ovstart=GMAX(istart,jstart);
730 if (iend<jend) {
731 ovlen+=iend-ovstart+1;
732 i++;
733 }
734 else {
735 ovlen+=jend-ovstart+1;
736 j++;
737 }
738 }//while comparing exons
739 return ovlen;
740 }
741
742 bool exonOverlap(GffObj* m) {
743 return exonOverlap(*m);
744 }
745 //---------- coordinate transformation
746 void xcoord(uint grstart, uint grend, char xstrand='+') {
747 //relative coordinate transform, and reverse-complement transform if xstrand is '-'
748 //does nothing if xstatus is the same already
749 if (xstatus) {
750 if (xstatus==xstrand && grstart==xstart && grend==xend) return;
751 unxcoord();//restore original coordinates
752 }
753 xstatus=xstrand;
754 xstart=grstart;
755 xend=grend;
756 if (CDstart>0) xcoordseg(CDstart, CDend);
757 for (int i=0;i<exons.Count();i++) {
758 xcoordseg(exons[i]->start, exons[i]->end);
759 }
760 if (xstatus=='-') {
761 exons.Reverse();
762 int flen=end-start;
763 start=xend-end+1;
764 end=start+flen;
765 }
766 else {
767 start=start-xstart+1;
768 end=end-xstart+1;
769 }
770 }
771
772 //transform an arbitrary segment based on current xstatus/xstart-xend
773 void xcoordseg(uint& segstart, uint &segend) {
774 if (xstatus==0) return;
775 if (xstatus=='-') {
776 int flen=segend-segstart;
777 segstart=xend-segend+1;
778 segend=segstart+flen;
779 return;
780 }
781 else {
782 segstart=segstart-xstart+1;
783 segend=segend-xstart+1;
784 }
785 }
786
787 void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
788 if (xstatus==0) return; //nothing to do, no transformation appplied
789 if (CDstart>0) unxcoordseg(CDstart, CDend);
790 //restore all GffExon intervals too
791 for (int i=0;i<exons.Count();i++) {
792 unxcoordseg(exons[i]->start, exons[i]->end);
793 }
794 if (xstatus=='-') {
795 exons.Reverse();
796 int flen=end-start;
797 start=xend-end+1;
798 end=start+flen;
799 }
800 else {
801 start=start+xstart-1;
802 end=end+xstart-1;
803 }
804 xstatus=0;
805 }
806 void unxcoordseg(uint& astart, uint &aend) {
807 //restore an arbitrary interval -- does NOT change the transform state!
808 if (xstatus==0) return;
809 if (xstatus=='-') {
810 int flen=aend-astart;
811 astart=xend-aend+1;
812 aend=astart+flen;
813 }
814 else {
815 astart=astart+xstart-1;
816 aend=aend+xstart-1;
817 }
818 }
819 //---------------------
820 bool operator==(GffObj& d){
821 return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
822 }
823 bool operator>(GffObj& d){
824 if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
825 if (start==d.start) {
826 if (getLevel()==d.getLevel()) {
827 if (end==d.end) return (strcmp(gffID, d.gffID)>0);
828 else return (end>d.end);
829 } else return (getLevel()>d.getLevel());
830 } else return (start>d.start);
831 }
832 bool operator<(GffObj& d){
833 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
834 if (start==d.start) {
835 if (getLevel()==d.getLevel()) {
836 if (end==d.end) return strcmp(gffID, d.gffID)<0;
837 else return end<d.end;
838 } else return (getLevel()<d.getLevel());
839 } else return (start<d.start);
840 }
841 char* getID() { return gffID; }
842 char* getGeneID() { return geneID; }
843 char* getGeneName() { return gene_name; }
844 void setGeneName(const char* gname) {
845 GFREE(gene_name);
846 if (gname) gene_name=Gstrdup(gname);
847 }
848 void setGeneID(const char* gene_id) {
849 GFREE(geneID);
850 if (gene_id) geneID=Gstrdup(gene_id);
851 }
852 int addSeg(GffLine* gfline);
853 int addSeg(int fnid, GffLine* gfline);
854 void getCDSegs(GArray<GffCDSeg>& cds);
855
856 void updateExonPhase(); //for CDS-only features, updates GExon::phase
857
858 void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
859 bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
860 void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
861 const char* tlabel=NULL, const char* gfparent=NULL);
862 void printGtf(FILE* fout, const char* tlabel=NULL) {
863 printGxf(fout, pgtfAny, tlabel);
864 }
865 void printGff(FILE* fout, const char* tlabel=NULL,
866 const char* gfparent=NULL) {
867 printGxf(fout, pgffAny, tlabel, gfparent);
868 }
869 void printTranscriptGff(FILE* fout, char* tlabel=NULL,
870 bool showCDS=false, const char* gfparent=NULL) {
871 if (isValidTranscript())
872 printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
873 }
874 void printSummary(FILE* fout=NULL);
875 void getCDS_ends(uint& cds_start, uint& cds_end);
876 void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
877 char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
878 uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
879 char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
880 char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
881 //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
882 bool empty() { return (start==0); }
883 };
884
885 typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
886 //user callback after parsing a mapping object:
887 // Returns: "done with it" status:
888 // TRUE if gobj is no longer needed so it's FREEd upon return
889 // FALSE if the user needs the gobj pointer and is responsible for
890 // collecting and freeing all GffObj objects
891
892
893 //GSeqStat: collect basic stats about a common underlying genomic sequence
894 // for multiple GffObj
895 class GSeqStat {
896 public:
897 int gseqid; //gseq id in the global static pool of gseqs
898 char* gseqname; //just a pointer to the name of gseq
899 //int fcount;//number of features on this gseq
900 uint mincoord;
901 uint maxcoord;
902 uint maxfeat_len; //maximum feature length on this genomic sequence
903 GffObj* maxfeat;
904 GSeqStat(int id=-1, char* name=NULL) {
905 gseqid=id;
906 gseqname=name;
907 mincoord=MAXUINT;
908 maxcoord=0;
909 maxfeat_len=0;
910 maxfeat=NULL;
911 }
912 bool operator>(GSeqStat& g) {
913 return (gseqid>g.gseqid);
914 }
915 bool operator<(GSeqStat& g) {
916 return (gseqid<g.gseqid);
917 }
918 bool operator==(GSeqStat& g) {
919 return (gseqid==g.gseqid);
920 }
921 };
922
923
924 int gfo_cmpByLoc(const pointer p1, const pointer p2);
925
926 class GfList: public GList<GffObj> {
927 //just adding the option to sort by genomic sequence and coordinate
928 bool mustSort;
929 public:
930 GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
931 //GffObjs in this list are NOT deleted when the list is cleared
932 //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
933 mustSort=sortbyloc;
934 }
935 void sortedByLoc(bool v=true) {
936 bool prev=mustSort;
937 mustSort=v;
938 if (fCount>0 && mustSort && !prev) {
939 this->setSorted((GCompareProc*)gfo_cmpByLoc);
940 }
941 }
942 void finalize(GffReader* gfr, bool mergeCloseExons,
943 bool keepAttrs=false, bool noExonAttr=true) { //if set, enforce sort by locus
944 if (mustSort) { //force (re-)sorting
945 this->setSorted(false);
946 this->setSorted((GCompareProc*)gfo_cmpByLoc);
947 }
948 int delcount=0;
949 for (int i=0;i<Count();i++) {
950 //finish the parsing for each GffObj
951 fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
952 }
953 if (delcount>0) this->Pack();
954 }
955 void freeAll() {
956 for (int i=0;i<fCount;i++) {
957 delete fList[i];
958 fList[i]=NULL;
959 }
960 Clear();
961 }
962 void freeUnused() {
963 for (int i=0;i<fCount;i++) {
964 if (fList[i]->isUsed()) continue;
965 //inform the children
966 for (int c=0;c<fList[i]->children.Count();c++) {
967 fList[i]->children[c]->parent=NULL;
968 }
969 delete fList[i];
970 fList[i]=NULL;
971 }
972 Clear();
973 }
974
975 };
976
977 struct GfoHolder {
978 int idx; //position in GffReader::gflst array
979 GffObj* gffobj;
980 GfoHolder(GffObj* gfo=NULL, int i=0) {
981 idx=i;
982 gffobj=gfo;
983 }
984 };
985
986 class CNonExon { //utility class used in subfeature promotion
987 public:
988 int idx;
989 GffObj* parent;
990 GffExon* exon;
991 GffLine* gffline;
992 CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
993 parent=p;
994 exon=e;
995 idx=i;
996 gffline=new GffLine(gl);
997 }
998 ~CNonExon() {
999 delete gffline;
1000 }
1001 };
1002
1003
1004 class GffReader {
1005 friend class GffObj;
1006 friend class GffLine;
1007 char* linebuf;
1008 off_t fpos;
1009 int buflen;
1010 protected:
1011 bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
1012 FILE* fh;
1013 char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
1014 GffNames* names; //just a pointer to the global static Gff names repository in GffObj
1015 GffLine* gffline;
1016 bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1017 GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1018 // of discarded parent IDs
1019 GHash< GVec<GfoHolder> > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
1020 //GHash<int> tids; //just for transcript_id uniqueness
1021 char* gfoBuildId(const char* id, const char* ctg);
1022 //void gfoRemove(const char* id, const char* ctg);
1023 GfoHolder* gfoAdd(GffObj* gfo, int idx);
1024 GfoHolder* gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx);
1025 // const char* id, const char* ctg, char strand, GVec<GfoHolder>** glst, uint start, uint end
1026 GfoHolder* gfoFind(const char* id, const char* ctg=NULL, GVec<GfoHolder>** glst=NULL,
1027 char strand=0, uint start=0, uint end=0);
1028 CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
1029 void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
1030 GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1031 bool keepAttr, bool noExonAttr);
1032 public:
1033 GfList gflst; //accumulate GffObjs being read
1034 GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
1035 GffObj* parent=NULL, GffExon* pexon=NULL, GVec<GfoHolder>* glst=NULL);
1036 GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1037 GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1038 bool keepAttr);
1039 GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
1040 bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
1041 GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
1042 GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
1043 phash(true), gflst(sortbyloc), gseqstats(true,true,true) {
1044 gff_warns=gff_show_warnings;
1045 names=NULL;
1046 gffline=NULL;
1047 transcriptsOnly=t_only;
1048 fpos=0;
1049 fname=NULL;
1050 fh=f;
1051 GMALLOC(linebuf, GFF_LINELEN);
1052 buflen=GFF_LINELEN-1;
1053 }
1054 void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
1055 fname=NULL;
1056 fh=f;
1057 if (fh!=NULL) rewind(fh);
1058 fpos=0;
1059 transcriptsOnly=t_only;
1060 gflst.sortedByLoc(sortbyloc);
1061 }
1062 GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
1063 gflst(sort),gseqstats(true,true,true) {
1064 gff_warns=gff_show_warnings;
1065 names=NULL;
1066 fname=Gstrdup(fn);
1067 transcriptsOnly=t_only;
1068 fh=fopen(fname, "rb");
1069 fpos=0;
1070 gffline=NULL;
1071 GMALLOC(linebuf, GFF_LINELEN);
1072 buflen=GFF_LINELEN-1;
1073 }
1074
1075 ~GffReader() {
1076 delete gffline;
1077 gffline=NULL;
1078 fpos=0;
1079 gflst.freeUnused();
1080 gflst.Clear();
1081 discarded_ids.Clear();
1082 phash.Clear();
1083 gseqstats.Clear();
1084 GFREE(fname);
1085 GFREE(linebuf);
1086 }
1087
1088 void showWarnings(bool v=true) {
1089 gff_warns=v;
1090 gff_show_warnings=v;
1091 }
1092
1093 GffLine* nextGffLine();
1094
1095 // load all subfeatures, re-group them:
1096 void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
1097
1098 }; // end of GffReader
1099
1100 #endif