ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (8 years, 3 months ago) by gpertea
File size: 34861 byte(s)
Log Message:
sync with local source

Line File contents
1 #ifndef GFF_H
2 #define GFF_H
3
4 #include "GBase.h"
5 #include "gdna.h"
6 #include "codons.h"
7 #include "GFaSeqGet.h"
8 #include "GList.hh"
9 #include "GHash.hh"
10
11 /*
12 const byte exMskMajSpliceL = 0x01;
13 const byte exMskMajSpliceR = 0x02;
14 const byte exMskMinSpliceL = 0x04;
15 const byte exMskMinSpliceR = 0x08;
16 const byte exMskTag = 0x80;
17 */
18
19 //reserved Gffnames::feats entries -- basic feature types
20 extern const int gff_fid_mRNA; // "mRNA" feature name
21 extern const int gff_fid_transcript; // *RNA, *transcript feature name
22 extern const int gff_fid_exon;
23 extern const int gff_fid_CDS; //never really used, except for display only
24 //use gff_fid_exon instead
25 extern const uint GFF_MAX_LOCUS;
26 extern const uint GFF_MAX_EXON;
27 extern const uint GFF_MAX_INTRON;
28
29 extern const uint gfo_flag_CHILDREN_PROMOTED;
30 extern const uint gfo_flag_HAS_ERRORS;
31 extern const uint gfo_flag_IS_GENE;
32 extern const uint gfo_flag_FROM_GFF3; //parsed from GFF3 formatted record
33 extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly
34 //(GTF2 and some chado gff3 dumps with exons given before their mRNA)
35 extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript'
36 extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive
37 extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated)
38 //when GffReader is destroyed
39 extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent
40 extern const byte gfo_flagShift_LEVEL;
41
42 extern bool gff_show_warnings;
43
44 #define GFF_LINELEN 2048
45 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n"
46
47
48 enum GffExonType {
49 exgffNone=0, //not a recognizable exon or CDS segment
50 exgffStart, //from "start_codon" feature (within CDS)
51 exgffStop, //from "stop_codon" feature (may be outside CDS)
52 exgffCDS, //from "CDS" feature
53 exgffUTR, //from "UTR" feature
54 exgffCDSUTR, //from a merge of UTR and CDS feature
55 exgffExon, //from "exon" feature
56 };
57
58 class GffReader;
59
60 class GffLine {
61 char* _parents; //stores a copy of the Parent attribute value,
62 //with commas replaced by \0
63 int _parents_len;
64 public:
65 char* dupline; //duplicate of original line
66 char* line; //this will have tabs replaced by \0
67 int llen;
68 char* gseqname;
69 char* track;
70 char* ftype; //feature name: mRNA/gene/exon/CDS
71 char* info; //the last, attributes' field, unparsed
72 uint fstart;
73 uint fend;
74 uint qstart; //overlap coords on query, if available
75 uint qend;
76 uint qlen; //query len, if given
77 double score;
78 char strand;
79 bool skip;
80 bool is_gff3; //if the line appears to be in GFF3 format
81 bool is_cds; //"cds" and "stop_codon" features
82 bool is_exon; //"exon" and "utr" features
83 char exontype; // gffExonType
84 bool is_transcript; //if current feature is *RNA or *transcript
85 bool is_gene; //if current feature is *gene
86 char phase; // '.' , '0', '1' or '2'
87 // -- allocated strings:
88 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3)
89 char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3)
90 //
91 char** parents; //for GTF only parents[0] is used
92 int num_parents;
93 char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id)
94 GffLine(GffReader* reader, const char* l); //parse the line accordingly
95 void discardParent() {
96 GFREE(_parents);
97 _parents_len=0;
98 num_parents=0;
99 parents=NULL;
100 }
101 char* extractAttr(const char* pre, bool caseStrict=true, bool enforce_GTF2=false);
102 GffLine(GffLine* l) { //a copy constructor
103 memcpy((void*)this, (void*)l, sizeof(GffLine));
104 line=NULL;
105 GMALLOC(line, llen+1);
106 memcpy(line, l->line, llen+1);
107 GMALLOC(dupline, llen+1);
108 memcpy(dupline, l->dupline, llen+1);
109 //--offsets within line[]
110 gseqname=line+(l->gseqname-l->line);
111 track=line+(l->track-l->line);
112 ftype=line+(l->ftype-l->line);
113 info=line+(l->info-l->line);
114 //Parent=Gstrdup(l->Parent);
115 if (l->_parents_len>0) {
116 _parents_len=l->_parents_len;
117 GMALLOC(_parents, _parents_len);
118 memcpy(_parents, l->_parents, _parents_len);
119 num_parents=l->num_parents;
120 for (int i=0;i<num_parents;i++) {
121 parents[i]=_parents+(l->parents[i] - l->_parents);
122 }
123 }
124 //-- allocated string copies:
125 ID=Gstrdup(l->ID);
126 if (l->gene_name!=NULL)
127 gene_name=Gstrdup(l->gene_name);
128 if (l->gene_id!=NULL)
129 gene_id=Gstrdup(l->gene_id);
130 }
131 GffLine() {
132 line=NULL;
133 dupline=NULL;
134 gseqname=NULL;
135 track=NULL;
136 ftype=NULL;
137 fstart=0;
138 fend=0;
139 info=NULL;
140 _parents=NULL;
141 _parents_len=0;
142 parents=NULL;
143 num_parents=0;
144 ID=NULL;
145 gene_name=NULL;
146 gene_id=NULL;
147 skip=true;
148 qstart=0;
149 qend=0;
150 qlen=0;
151 exontype=0;
152 is_cds=false;
153 is_gff3=false;
154 is_transcript=false;
155 is_gene=false;
156 is_exon=false;
157 }
158 ~GffLine() {
159 GFREE(dupline);
160 GFREE(line);
161 GFREE(_parents);
162 GFREE(parents);
163 GFREE(ID);
164 GFREE(gene_name);
165 GFREE(gene_id);
166 }
167 };
168
169 class GffAttr {
170 public:
171 int attr_id;
172 char* attr_val;
173 GffAttr(int an_id, const char* av=NULL) {
174 attr_id=an_id;
175 attr_val=NULL;
176 setValue(av);
177 }
178 ~GffAttr() {
179 GFREE(attr_val);
180 }
181 void setValue(const char* av) {
182 if (attr_val!=NULL) {
183 GFREE(attr_val);
184 }
185 if (av==NULL || av[0]==0) return;
186 //trim spaces
187 const char* vstart=av;
188 while (*vstart==' ') av++;
189 const char* vend=vstart;
190 bool keep_dq=false;
191 while (vend[1]!=0) {
192 if (*vend==' ' && vend[1]!=' ') keep_dq=true;
193 else if (*vend==';') keep_dq=true;
194 vend++;
195 }
196 //remove spaces at the end:
197 while (*vend==' ' && vend!=vstart) vend--;
198 //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes
199 if (!keep_dq && *vstart=='"' && *vend=='"') {
200 vend--;
201 vstart++;
202 }
203 attr_val=Gstrdup(vstart, vend);
204 }
205 bool operator==(GffAttr& d){
206 return (this==&d);
207 }
208 bool operator>(GffAttr& d){
209 return (this>&d);
210 }
211 bool operator<(GffAttr& d){
212 return (this<&d);
213 }
214
215 };
216
217 class GffNameList;
218 class GffNames;
219
220 class GffNameInfo {
221 friend class GffNameList;
222 protected:
223 int idx;
224 public:
225 char* name;
226 GffNameInfo() { name=NULL; idx=-1; }
227 GffNameInfo(const char* n) {
228 name=Gstrdup(n);
229 }
230
231 ~GffNameInfo() {
232 GFREE(name);
233 }
234
235 bool operator==(GffNameInfo& d){
236 return (strcmp(this->name, d.name)==0);
237 }
238 bool operator>(GffNameInfo& d){
239 return (strcmp(this->name, d.name)>0);
240 }
241 bool operator<(GffNameInfo& d){
242 return (strcmp(this->name, d.name)<0);
243 }
244 };
245
246 class GffNameList:public GList<GffNameInfo> {
247 friend class GffNameInfo;
248 friend class GffNames;
249 protected:
250 GHash<GffNameInfo> byName;//hash with shared keys
251 int idlast; //fList index of last added/reused name
252 void addStatic(const char* tname) {// fast add
253 GffNameInfo* f=new GffNameInfo(tname);
254 idlast=this->Add(f);
255 f->idx=idlast;
256 byName.shkAdd(f->name,f);
257 }
258 public:
259 GffNameList():GList<GffNameInfo>(false,true,true), byName(false) {
260 idlast=-1;
261 }
262 char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; }
263 int lastNameId() { return idlast; }
264 char* getName(int nid) { //retrieve name by its ID
265 if (nid<0 || nid>=fCount)
266 GError("GffNameList Error: invalid index (%d)\n",nid);
267 return fList[nid]->name;
268 }
269
270 int addName(const char* tname) {//returns or create an id for the given name
271 //check idlast first, chances are it's the same feature name checked
272 if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0)
273 return idlast;
274 GffNameInfo* f=byName.Find(tname);
275 int fidx=-1;
276 if (f!=NULL) fidx=f->idx;
277 else {//add new entry
278 f=new GffNameInfo(tname);
279 fidx=this->Add(f);
280 f->idx=fidx;
281 byName.shkAdd(f->name,f);
282 }
283 idlast=fidx;
284 return fidx;
285 }
286
287 int addNewName(const char* tname) {
288 GffNameInfo* f=new GffNameInfo(tname);
289 int fidx=this->Add(f);
290 f->idx=fidx;
291 byName.shkAdd(f->name,f);
292 return fidx;
293 }
294
295 int getId(const char* tname) { //only returns a name id# if found
296 GffNameInfo* f=byName.Find(tname);
297 if (f==NULL) return -1;
298 return f->idx;
299 }
300 int removeName() {
301 GError("Error: removing names from GffNameList not allowed!\n");
302 return -1;
303 }
304 };
305
306 class GffNames {
307 public:
308 int numrefs;
309 GffNameList tracks;
310 GffNameList gseqs;
311 GffNameList attrs;
312 GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc.
313 GffNames():tracks(),gseqs(),attrs(), feats() {
314 numrefs=0;
315 //the order below is critical!
316 //has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS
317 feats.addStatic("mRNA");//index 0=gff_fid_mRNA
318 feats.addStatic("transcript");//index 1=gff_fid_transcript
319 feats.addStatic("exon");//index 1=gff_fid_exon
320 feats.addStatic("CDS"); //index 2=gff_fid_CDS
321 }
322 };
323
324 void gffnames_ref(GffNames* &n);
325 void gffnames_unref(GffNames* &n);
326
327 enum GffPrintMode {
328 pgtfAny, //print record as read
329 pgtfExon,
330 pgtfCDS,
331 pgffAny, //print record as read
332 pgffExon,
333 pgffCDS,
334 pgffBoth,
335 };
336
337
338 class GffAttrs:public GList<GffAttr> {
339 public:
340 GffAttrs():GList<GffAttr>(false,true,false) { }
341 void add_or_update(GffNames* names, const char* attrname, const char* val) {
342 int aid=names->attrs.getId(attrname);
343 if (aid>=0) {
344 //attribute found in the dictionary
345 for (int i=0;i<Count();i++) {
346 //do we have it?
347 if (aid==Get(i)->attr_id) {
348 //update the value
349 Get(i)->setValue(val);
350 return;
351 }
352 }
353 }
354 else {
355 aid=names->attrs.addNewName(attrname);
356 }
357 this->Add(new GffAttr(aid, val));
358 }
359
360 char* getAttr(GffNames* names, const char* attrname) {
361 int aid=names->attrs.getId(attrname);
362 if (aid>=0)
363 for (int i=0;i<Count();i++)
364 if (aid==Get(i)->attr_id) return Get(i)->attr_val;
365 return NULL;
366 }
367 char* getAttr(int aid) {
368 if (aid>=0)
369 for (int i=0;i<Count();i++)
370 if (aid==Get(i)->attr_id) return Get(i)->attr_val;
371 return NULL;
372 }
373 };
374
375
376 class GffExon : public GSeg {
377 public:
378 void* uptr; //for later extensions
379 GffAttrs* attrs; //other attributes kept for this exon
380 double score; // gff score column
381 char phase; //GFF phase column - for CDS segments only
382 // '.' = undefined (UTR), '0','1','2' for CDS exons
383 char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon"
384 int qstart; // for mRNA/protein exon mappings: coordinates on query
385 int qend;
386 GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) {
387 uptr=NULL;
388 attrs=NULL;
389 if (s<e) {
390 start=s;
391 end=e;
392 }
393 else {
394 start=e;
395 end=s;
396 }
397 if (qs<qe) {
398 qstart=qs;
399 qend=qe;
400 } else {
401 qstart=qe;
402 qend=qs;
403 }
404 score=sc;
405 phase=fr;
406 exontype=et;
407 } //constructor
408
409 char* getAttr(GffNames* names, const char* atrname) {
410 if (attrs==NULL || names==NULL || atrname==NULL) return NULL;
411 return attrs->getAttr(names, atrname);
412 }
413
414 char* getAttr(int aid) {
415 if (attrs==NULL) return NULL;
416 return attrs->getAttr(aid);
417 }
418
419 ~GffExon() { //destructor
420 if (attrs!=NULL) delete attrs;
421 }
422 };
423
424
425 class GffCDSeg:public GSeg {
426 public:
427 char phase;
428 int exonidx;
429 };
430 //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments
431 class GffObj:public GSeg {
432 //utility segment-merging function for addExon()
433 void expandExon(int xovl, uint segstart, uint segend,
434 char exontype, double sc, char fr, int qs, int qe);
435 protected:
436 //coordinate transformation data:
437 uint xstart; //absolute genomic coordinates of reference region
438 uint xend;
439 char xstatus; //coordinate transform status:
440 //0 : (start,end) coordinates are absolute
441 //'+' : (start,end) coords are relative to xstart..xend region
442 //'-' : (start,end) are relative to the reverse complement of xstart..xend region
443 //--
444 char* gffID; // ID name for mRNA (parent) feature
445 char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3)
446 char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3)
447 unsigned int flags;
448 //-- friends:
449 friend class GffReader;
450 friend class GffExon;
451 public:
452 static GffNames* names; // dictionary storage that holds the various attribute names etc.
453 int track_id; // index of track name in names->tracks
454 int gseq_id; // index of genomic sequence name in names->gseqs
455 int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value
456 int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons")
457 //if ftype_id==gff_fid_mRNA then this value is ignored
458 GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id
459 GPVec<GffObj> children;
460 GffObj* parent;
461 int udata; //user data, flags etc.
462 void* uptr; //user pointer (to a parent object, cluster, locus etc.)
463 GffObj* ulink; //link to another GffObj (user controlled field)
464 // mRNA specific fields:
465 bool isCDS; //just a CDS, no UTRs
466 bool partial; //partial CDS
467 uint CDstart; //CDS start coord
468 uint CDend; //CDS end coord
469 char CDphase; //initial phase for CDS start
470 bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); }
471 void hasErrors(bool v) {
472 if (v) flags |= gfo_flag_HAS_ERRORS;
473 else flags &= ~gfo_flag_HAS_ERRORS;
474 }
475 bool fromGff3() { return ((flags & gfo_flag_FROM_GFF3)!=0); }
476 void fromGff3(bool v) {
477 if (v) flags |= gfo_flag_FROM_GFF3;
478 else flags &= ~gfo_flag_FROM_GFF3;
479 }
480 bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); }
481 void createdByExon(bool v) {
482 if (v) flags |= gfo_flag_BY_EXON;
483 else flags &= ~gfo_flag_BY_EXON;
484 }
485 bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); }
486 void isGene(bool v) {
487 if (v) flags |= gfo_flag_IS_GENE;
488 else flags &= ~gfo_flag_IS_GENE;
489 }
490 bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); }
491 void isDiscarded(bool v) {
492 if (v) flags |= gfo_flag_DISCARDED;
493 else flags &= ~gfo_flag_DISCARDED;
494 }
495
496 bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); }
497 void isUsed(bool v) {
498 if (v) flags |= gfo_flag_LST_KEEP;
499 else flags &= ~gfo_flag_LST_KEEP;
500 }
501 bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); }
502 void isTranscript(bool v) {
503 if (v) flags |= gfo_flag_IS_TRANSCRIPT;
504 else flags &= ~gfo_flag_IS_TRANSCRIPT;
505 }
506 bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); }
507 void promotedChildren(bool v) {
508 if (v) flags |= gfo_flag_CHILDREN_PROMOTED;
509 else flags &= ~gfo_flag_CHILDREN_PROMOTED;
510 }
511 void setLevel(byte v) {
512 if (v==0) flags &= ~gfo_flag_LEVEL_MSK;
513 else flags &= ~(((uint)v) << gfo_flagShift_LEVEL);
514 }
515 byte incLevel() {
516 uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL);
517 v++;
518 flags &= ~(v << gfo_flagShift_LEVEL);
519 return v;
520 }
521 byte getLevel() {
522 return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL));
523 }
524
525 bool isValidTranscript() {
526 //return (ftype_id==gff_fid_mRNA && exons.Count()>0);
527 return (isTranscript() && exons.Count()>0);
528 }
529
530
531 int addExon(uint segstart, uint segend, double sc=0, char fr='.',
532 int qs=0, int qe=0, bool iscds=false, char exontype=0);
533
534 int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true);
535
536 void removeExon(int idx);
537 void removeExon(GffExon* p);
538 char strand; //true if features are on the reverse complement strand
539 double gscore;
540 double uscore; //custom, user-computed score, if needed
541 int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths)
542
543 //--------- optional data:
544 int qlen; //query length, start, end - if available
545 int qstart;
546 int qend;
547 int qcov; //query coverage - percent
548 GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature
549 //constructor by gff line parsing:
550 GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true);
551 //if gfline->Parent!=NULL then this will also add the first sub-feature
552 // otherwise, only the main feature is created
553 void clearAttrs() {
554 if (attrs!=NULL) {
555 bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs);
556 delete attrs; attrs=NULL;
557 if (sharedattrs) exons[0]->attrs=NULL;
558 }
559 }
560 GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) {
561 //exons: sorted, free, non-unique
562 gffID=NULL;
563 uptr=NULL;
564 ulink=NULL;
565 flags=0;
566 udata=0;
567 parent=NULL;
568 ftype_id=-1;
569 exon_ftype_id=-1;
570 if (anid!=NULL) gffID=Gstrdup(anid);
571 gffnames_ref(names);
572 qlen=0;
573 qstart=0;
574 qend=0;
575 qcov=0;
576 partial=true;
577 isCDS=false;
578 CDstart=0; // hasCDS <=> CDstart>0
579 CDend=0;
580 CDphase=0;
581 gseq_id=-1;
582 track_id=-1;
583 xstart=0;
584 xend=0;
585 xstatus=0;
586 strand='.';
587 gscore=0;
588 uscore=0;
589 attrs=NULL;
590 covlen=0;
591 gene_name=NULL;
592 geneID=NULL;
593 }
594 ~GffObj() {
595 GFREE(gffID);
596 GFREE(gene_name);
597 GFREE(geneID);
598 clearAttrs();
599 gffnames_unref(names);
600 }
601 //--------------
602 GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false,
603 bool keepAttrs=false, bool noExonAttr=true);
604 //complete parsing: must be called in order to merge adjacent/close proximity subfeatures
605 void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false);
606 const char* getSubfName() { //returns the generic feature type of the entries in exons array
607 int sid=exon_ftype_id;
608 if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS;
609 return names->feats.getName(sid);
610 }
611 bool monoFeature() {
612 return (exons.Count()==0 ||
613 (exons.Count()==1 && exon_ftype_id==ftype_id));
614 }
615
616 bool hasCDS() { return (CDstart>0); }
617
618 const char* getFeatureName() {
619 return names->feats.getName(ftype_id);
620 }
621 void setFeatureName(const char* feature);
622
623 void addAttr(const char* attrname, const char* attrvalue);
624 int removeAttr(const char* attrname, const char* attrval=NULL);
625 int removeAttr(int aid, const char* attrval=NULL);
626 int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL);
627 int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL);
628 const char* getAttrName(int i) {
629 if (attrs==NULL) return NULL;
630 return names->attrs.getName(attrs->Get(i)->attr_id);
631 }
632 char* getAttr(const char* attrname, bool checkFirstExon=false) {
633 if (names==NULL || attrname==NULL) return NULL;
634 char* r=NULL;
635 if (attrs==NULL) {
636 if (!checkFirstExon) return NULL;
637 }
638 else r=attrs->getAttr(names, attrname);
639 if (r!=NULL) return r;
640 if (checkFirstExon && exons.Count()>0) {
641 r=exons[0]->getAttr(names, attrname);
642 }
643 return r;
644 }
645
646 char* getExonAttr(GffExon* exon, const char* attrname) {
647 if (exon==NULL || attrname==NULL) return NULL;
648 return exon->getAttr(names, attrname);
649 }
650
651 char* getExonAttr(int exonidx, const char* attrname) {
652 if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL;
653 return exons[exonidx]->getAttr(names, attrname);
654 }
655
656 char* getAttrValue(int i) {
657 if (attrs==NULL) return NULL;
658 return attrs->Get(i)->attr_val;
659 }
660 const char* getGSeqName() {
661 return names->gseqs.getName(gseq_id);
662 }
663
664 const char* getRefName() {
665 return names->gseqs.getName(gseq_id);
666 }
667 void setRefName(const char* newname);
668
669 const char* getTrackName() {
670 return names->tracks.getName(track_id);
671 }
672 bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment
673 //ignores strand!
674 if (s>e) swap(s,e);
675 for (int i=0;i<exons.Count();i++) {
676 if (exons[i]->overlap(s,e)) return true;
677 }
678 return false;
679 }
680 bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment
681 //if (gseq_id!=m.gseq_id) return false;
682 // ignores strand and gseq_id, must check in advance
683 for (int i=0;i<exons.Count();i++) {
684 for (int j=0;j<m.exons.Count();j++) {
685 if (exons[i]->start>m.exons[j]->end) continue;
686 if (m.exons[j]->start>exons[i]->end) break;
687 //-- overlap if we are here:
688 return true;
689 }
690 }
691 return false;
692 }
693
694 int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) {
695 //return the exons' index for the overlapping OR ADJACENT exon
696 //ovlen, if given, will return the overlap length
697 if (s>e) swap(s,e);
698 s--;e++; //to also catch adjacent exons
699 for (int i=0;i<exons.Count();i++) {
700 if (exons[i]->start>e) break;
701 if (s>exons[i]->end) continue;
702 //-- overlap if we are here:
703 if (ovlen!=NULL) {
704 s++;e--;
705 int ovlend= (exons[i]->end>e) ? e : exons[i]->end;
706 *ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1;
707 }
708 return i;
709 } //for each exon
710 *ovlen=0;
711 return -1;
712 }
713
714 int exonOverlapLen(GffObj& m) {
715 if (start>m.end || m.start>end) return 0;
716 int i=0;
717 int j=0;
718 int ovlen=0;
719 while (i<exons.Count() && j<m.exons.Count()) {
720 uint istart=exons[i]->start;
721 uint iend=exons[i]->end;
722 uint jstart=m.exons[j]->start;
723 uint jend=m.exons[j]->end;
724 if (istart>jend) { j++; continue; }
725 if (jstart>iend) { i++; continue; }
726 //exon overlap
727 uint ovstart=GMAX(istart,jstart);
728 if (iend<jend) {
729 ovlen+=iend-ovstart+1;
730 i++;
731 }
732 else {
733 ovlen+=jend-ovstart+1;
734 j++;
735 }
736 }//while comparing exons
737 return ovlen;
738 }
739
740 bool exonOverlap(GffObj* m) {
741 return exonOverlap(*m);
742 }
743 //---------- coordinate transformation
744 void xcoord(uint grstart, uint grend, char xstrand='+') {
745 //relative coordinate transform, and reverse-complement transform if xstrand is '-'
746 //does nothing if xstatus is the same already
747 if (xstatus) {
748 if (xstatus==xstrand && grstart==xstart && grend==xend) return;
749 unxcoord();//restore original coordinates
750 }
751 xstatus=xstrand;
752 xstart=grstart;
753 xend=grend;
754 if (CDstart>0) xcoordseg(CDstart, CDend);
755 for (int i=0;i<exons.Count();i++) {
756 xcoordseg(exons[i]->start, exons[i]->end);
757 }
758 if (xstatus=='-') {
759 exons.Reverse();
760 int flen=end-start;
761 start=xend-end+1;
762 end=start+flen;
763 }
764 else {
765 start=start-xstart+1;
766 end=end-xstart+1;
767 }
768 }
769
770 //transform an arbitrary segment based on current xstatus/xstart-xend
771 void xcoordseg(uint& segstart, uint &segend) {
772 if (xstatus==0) return;
773 if (xstatus=='-') {
774 int flen=segend-segstart;
775 segstart=xend-segend+1;
776 segend=segstart+flen;
777 return;
778 }
779 else {
780 segstart=segstart-xstart+1;
781 segend=segend-xstart+1;
782 }
783 }
784
785 void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true
786 if (xstatus==0) return; //nothing to do, no transformation appplied
787 if (CDstart>0) unxcoordseg(CDstart, CDend);
788 //restore all GffExon intervals too
789 for (int i=0;i<exons.Count();i++) {
790 unxcoordseg(exons[i]->start, exons[i]->end);
791 }
792 if (xstatus=='-') {
793 exons.Reverse();
794 int flen=end-start;
795 start=xend-end+1;
796 end=start+flen;
797 }
798 else {
799 start=start+xstart-1;
800 end=end+xstart-1;
801 }
802 xstatus=0;
803 }
804 void unxcoordseg(uint& astart, uint &aend) {
805 //restore an arbitrary interval -- does NOT change the transform state!
806 if (xstatus==0) return;
807 if (xstatus=='-') {
808 int flen=aend-astart;
809 astart=xend-aend+1;
810 aend=astart+flen;
811 }
812 else {
813 astart=astart+xstart-1;
814 aend=aend+xstart-1;
815 }
816 }
817 //---------------------
818 bool operator==(GffObj& d){
819 return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0);
820 }
821 bool operator>(GffObj& d){
822 if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
823 if (start==d.start) {
824 if (getLevel()==d.getLevel()) {
825 if (end==d.end) return (strcmp(gffID, d.gffID)>0);
826 else return (end>d.end);
827 } else return (getLevel()>d.getLevel());
828 } else return (start>d.start);
829 }
830 bool operator<(GffObj& d){
831 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
832 if (start==d.start) {
833 if (getLevel()==d.getLevel()) {
834 if (end==d.end) return strcmp(gffID, d.gffID)<0;
835 else return end<d.end;
836 } else return (getLevel()<d.getLevel());
837 } else return (start<d.start);
838 }
839 char* getID() { return gffID; }
840 char* getGeneID() { return geneID; }
841 char* getGeneName() { return gene_name; }
842 void setGeneName(const char* gname) {
843 GFREE(gene_name);
844 if (gname) gene_name=Gstrdup(gname);
845 }
846 void setGeneID(const char* gene_id) {
847 GFREE(geneID);
848 if (gene_id) geneID=Gstrdup(gene_id);
849 }
850 int addSeg(GffLine* gfline);
851 int addSeg(int fnid, GffLine* gfline);
852 void getCDSegs(GArray<GffCDSeg>& cds);
853
854 void updateExonPhase(); //for CDS-only features, updates GExon::phase
855
856 void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname,
857 bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3);
858 void printGxf(FILE* fout, GffPrintMode gffp=pgffExon,
859 const char* tlabel=NULL, const char* gfparent=NULL);
860 void printGtf(FILE* fout, const char* tlabel=NULL) {
861 printGxf(fout, pgtfAny, tlabel);
862 }
863 void printGff(FILE* fout, const char* tlabel=NULL,
864 const char* gfparent=NULL) {
865 printGxf(fout, pgffAny, tlabel, gfparent);
866 }
867 void printTranscriptGff(FILE* fout, char* tlabel=NULL,
868 bool showCDS=false, const char* gfparent=NULL) {
869 if (isValidTranscript())
870 printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent);
871 }
872 void printSummary(FILE* fout=NULL);
873 void getCDS_ends(uint& cds_start, uint& cds_end);
874 void mRNA_CDS_coords(uint& cds_start, uint& cds_end);
875 char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL,
876 uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL);
877 char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst);
878 char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL);
879 //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ?
880 bool empty() { return (start==0); }
881 };
882
883 typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2);
884 //user callback after parsing a mapping object:
885 // Returns: "done with it" status:
886 // TRUE if gobj is no longer needed so it's FREEd upon return
887 // FALSE if the user needs the gobj pointer and is responsible for
888 // collecting and freeing all GffObj objects
889
890
891 //GSeqStat: collect basic stats about a common underlying genomic sequence
892 // for multiple GffObj
893 class GSeqStat {
894 public:
895 int gseqid; //gseq id in the global static pool of gseqs
896 char* gseqname; //just a pointer to the name of gseq
897 //int fcount;//number of features on this gseq
898 uint mincoord;
899 uint maxcoord;
900 uint maxfeat_len; //maximum feature length on this genomic sequence
901 GffObj* maxfeat;
902 GSeqStat(int id=-1, char* name=NULL) {
903 gseqid=id;
904 gseqname=name;
905 mincoord=MAXUINT;
906 maxcoord=0;
907 maxfeat_len=0;
908 maxfeat=NULL;
909 }
910 bool operator>(GSeqStat& g) {
911 return (gseqid>g.gseqid);
912 }
913 bool operator<(GSeqStat& g) {
914 return (gseqid<g.gseqid);
915 }
916 bool operator==(GSeqStat& g) {
917 return (gseqid==g.gseqid);
918 }
919 };
920
921
922 int gfo_cmpByLoc(const pointer p1, const pointer p2);
923
924 class GfList: public GList<GffObj> {
925 //just adding the option to sort by genomic sequence and coordinate
926 bool mustSort;
927 public:
928 GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) {
929 //GffObjs in this list are NOT deleted when the list is cleared
930 //-- for deallocation of these objects, call freeAll() or freeUnused() as needed
931 mustSort=sortbyloc;
932 }
933 void sortedByLoc(bool v=true) {
934 bool prev=mustSort;
935 mustSort=v;
936 if (fCount>0 && mustSort && !prev) {
937 this->setSorted((GCompareProc*)gfo_cmpByLoc);
938 }
939 }
940 void finalize(GffReader* gfr, bool mergeCloseExons,
941 bool keepAttrs=false, bool noExonAttr=true) { //if set, enforce sort by locus
942 if (mustSort) { //force (re-)sorting
943 this->setSorted(false);
944 this->setSorted((GCompareProc*)gfo_cmpByLoc);
945 }
946 int delcount=0;
947 for (int i=0;i<Count();i++) {
948 //finish the parsing for each GffObj
949 fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
950 }
951 if (delcount>0) this->Pack();
952 }
953 void freeAll() {
954 for (int i=0;i<fCount;i++) {
955 delete fList[i];
956 fList[i]=NULL;
957 }
958 Clear();
959 }
960 void freeUnused() {
961 for (int i=0;i<fCount;i++) {
962 if (fList[i]->isUsed()) continue;
963 //inform the children
964 for (int c=0;c<fList[i]->children.Count();c++) {
965 fList[i]->children[c]->parent=NULL;
966 }
967 delete fList[i];
968 fList[i]=NULL;
969 }
970 Clear();
971 }
972
973 };
974
975 class GfoHolder {
976 public:
977 int idx; //position in GffReader::gflst array
978 GffObj* gffobj;
979 GfoHolder(GffObj* gfo=NULL, int i=0) {
980 idx=i;
981 gffobj=gfo;
982 }
983 };
984
985 class CNonExon { //utility class used in subfeature promotion
986 public:
987 int idx;
988 GffObj* parent;
989 GffExon* exon;
990 GffLine* gffline;
991 CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) {
992 parent=p;
993 exon=e;
994 idx=i;
995 gffline=new GffLine(gl);
996 }
997 ~CNonExon() {
998 delete gffline;
999 }
1000 };
1001
1002
1003 class GffReader {
1004 friend class GffObj;
1005 friend class GffLine;
1006 char* linebuf;
1007 off_t fpos;
1008 int buflen;
1009 protected:
1010 bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes
1011 FILE* fh;
1012 char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader
1013 GffNames* names; //just a pointer to the global static Gff names repository in GffObj
1014 GffLine* gffline;
1015 bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features
1016 GHash<int> discarded_ids; //for transcriptsOnly mode, keep track
1017 // of discarded parent IDs
1018 GHash<GfoHolder> phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj]
1019 GHash<int> tids; //transcript_id uniqueness
1020 char* gfoBuildId(const char* id, const char* ctg);
1021 void gfoRemove(const char* id, const char* ctg);
1022 GfoHolder* gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx);
1023 GfoHolder* gfoFind(const char* id, const char* ctg);
1024 CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name);
1025 void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo);
1026 GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1027 bool keepAttr, bool noExonAttr);
1028 public:
1029 GfList gflst; //accumulate GffObjs being read
1030 GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
1031 GffObj* parent=NULL, GffExon* pexon=NULL);
1032 GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx);
1033 GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1034 bool keepAttr);
1035 GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent);
1036 bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr);
1037 GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats
1038 GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true),
1039 phash(true), tids(true), gflst(sortbyloc), gseqstats(true,true,true) {
1040 gff_warns=gff_show_warnings;
1041 names=NULL;
1042 gffline=NULL;
1043 transcriptsOnly=t_only;
1044 fpos=0;
1045 fname=NULL;
1046 fh=f;
1047 GMALLOC(linebuf, GFF_LINELEN);
1048 buflen=GFF_LINELEN-1;
1049 }
1050 void init(FILE *f, bool t_only=false, bool sortbyloc=false) {
1051 fname=NULL;
1052 fh=f;
1053 if (fh!=NULL) rewind(fh);
1054 fpos=0;
1055 transcriptsOnly=t_only;
1056 gflst.sortedByLoc(sortbyloc);
1057 }
1058 GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true),
1059 tids(true),gflst(sort),gseqstats(true,true,true) {
1060 gff_warns=gff_show_warnings;
1061 names=NULL;
1062 fname=Gstrdup(fn);
1063 transcriptsOnly=t_only;
1064 fh=fopen(fname, "rb");
1065 fpos=0;
1066 gffline=NULL;
1067 GMALLOC(linebuf, GFF_LINELEN);
1068 buflen=GFF_LINELEN-1;
1069 }
1070
1071 ~GffReader() {
1072 delete gffline;
1073 gffline=NULL;
1074 fpos=0;
1075 gflst.freeUnused();
1076 gflst.Clear();
1077 discarded_ids.Clear();
1078 phash.Clear();
1079 gseqstats.Clear();
1080 GFREE(fname);
1081 GFREE(linebuf);
1082 }
1083
1084 void showWarnings(bool v=true) {
1085 gff_warns=v;
1086 gff_show_warnings=v;
1087 }
1088
1089 GffLine* nextGffLine();
1090
1091 // load all subfeatures, re-group them:
1092 void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true);
1093
1094 }; // end of GffReader
1095
1096 #endif