1 |
gpertea |
2 |
#ifndef GFF_H |
2 |
|
|
#define GFF_H |
3 |
|
|
|
4 |
|
|
#include "GBase.h" |
5 |
|
|
#include "gdna.h" |
6 |
|
|
#include "codons.h" |
7 |
|
|
#include "GFaSeqGet.h" |
8 |
|
|
#include "GList.hh" |
9 |
|
|
#include "GHash.hh" |
10 |
|
|
|
11 |
|
|
/* |
12 |
|
|
const byte exMskMajSpliceL = 0x01; |
13 |
|
|
const byte exMskMajSpliceR = 0x02; |
14 |
|
|
const byte exMskMinSpliceL = 0x04; |
15 |
|
|
const byte exMskMinSpliceR = 0x08; |
16 |
|
|
const byte exMskTag = 0x80; |
17 |
|
|
*/ |
18 |
gpertea |
16 |
|
19 |
|
|
//reserved Gffnames::feats entries -- basic feature types |
20 |
|
|
extern const int gff_fid_mRNA; // "mRNA" feature name |
21 |
|
|
extern const int gff_fid_transcript; // *RNA, *transcript feature name |
22 |
gpertea |
2 |
extern const int gff_fid_exon; |
23 |
gpertea |
16 |
extern const int gff_fid_CDS; //never really used, except for display only |
24 |
|
|
//use gff_fid_exon instead |
25 |
gpertea |
2 |
extern const uint GFF_MAX_LOCUS; |
26 |
|
|
extern const uint GFF_MAX_EXON; |
27 |
|
|
extern const uint GFF_MAX_INTRON; |
28 |
|
|
|
29 |
gpertea |
16 |
extern const uint gfo_flag_CHILDREN_PROMOTED; |
30 |
|
|
extern const uint gfo_flag_HAS_ERRORS; |
31 |
|
|
extern const uint gfo_flag_IS_GENE; |
32 |
|
|
extern const uint gfo_flag_FROM_GFF3; //parsed from GFF3 formatted record |
33 |
|
|
extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly |
34 |
|
|
//(GTF2 and some chado gff3 dumps with exons given before their mRNA) |
35 |
|
|
extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript' |
36 |
|
|
extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive |
37 |
|
|
extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated) |
38 |
|
|
//when GffReader is destroyed |
39 |
|
|
extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent |
40 |
|
|
extern const byte gfo_flagShift_LEVEL; |
41 |
|
|
|
42 |
|
|
extern bool gff_show_warnings; |
43 |
|
|
|
44 |
gpertea |
2 |
#define GFF_LINELEN 2048 |
45 |
|
|
#define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" |
46 |
|
|
|
47 |
|
|
|
48 |
|
|
enum GffExonType { |
49 |
gpertea |
16 |
exgffNone=0, //not a recognizable exon or CDS segment |
50 |
|
|
exgffStart, //from "start_codon" feature (within CDS) |
51 |
|
|
exgffStop, //from "stop_codon" feature (may be outside CDS) |
52 |
gpertea |
2 |
exgffCDS, //from "CDS" feature |
53 |
|
|
exgffUTR, //from "UTR" feature |
54 |
gpertea |
16 |
exgffCDSUTR, //from a merge of UTR and CDS feature |
55 |
gpertea |
2 |
exgffExon, //from "exon" feature |
56 |
|
|
}; |
57 |
|
|
|
58 |
|
|
class GffReader; |
59 |
|
|
|
60 |
|
|
class GffLine { |
61 |
gpertea |
16 |
char* _parents; //stores a copy of the Parent attribute value, |
62 |
|
|
//with commas replaced by \0 |
63 |
|
|
int _parents_len; |
64 |
gpertea |
2 |
public: |
65 |
gpertea |
16 |
char* dupline; //duplicate of original line |
66 |
|
|
char* line; //this will have tabs replaced by \0 |
67 |
|
|
int llen; |
68 |
gpertea |
2 |
char* gseqname; |
69 |
|
|
char* track; |
70 |
|
|
char* ftype; //feature name: mRNA/gene/exon/CDS |
71 |
gpertea |
16 |
char* info; //the last, attributes' field, unparsed |
72 |
gpertea |
2 |
uint fstart; |
73 |
|
|
uint fend; |
74 |
|
|
uint qstart; //overlap coords on query, if available |
75 |
|
|
uint qend; |
76 |
|
|
uint qlen; //query len, if given |
77 |
|
|
double score; |
78 |
|
|
char strand; |
79 |
|
|
bool skip; |
80 |
gpertea |
16 |
bool is_gff3; //if the line appears to be in GFF3 format |
81 |
gpertea |
2 |
bool is_cds; //"cds" and "stop_codon" features |
82 |
|
|
bool is_exon; //"exon" and "utr" features |
83 |
|
|
char exontype; // gffExonType |
84 |
gpertea |
16 |
bool is_transcript; //if current feature is *RNA or *transcript |
85 |
|
|
bool is_gene; //if current feature is *gene |
86 |
gpertea |
2 |
char phase; // '.' , '0', '1' or '2' |
87 |
gpertea |
16 |
// -- allocated strings: |
88 |
|
|
char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3) |
89 |
|
|
char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3) |
90 |
gpertea |
2 |
// |
91 |
gpertea |
16 |
char** parents; //for GTF only parents[0] is used |
92 |
|
|
int num_parents; |
93 |
|
|
char* ID; // if a ID=.. attribute was parsed, or a GTF with 'transcript' line (transcript_id) |
94 |
gpertea |
2 |
GffLine(GffReader* reader, const char* l); //parse the line accordingly |
95 |
gpertea |
16 |
void discardParent() { |
96 |
|
|
GFREE(_parents); |
97 |
|
|
_parents_len=0; |
98 |
|
|
num_parents=0; |
99 |
|
|
parents=NULL; |
100 |
|
|
} |
101 |
|
|
char* extractAttr(const char* pre, bool caseStrict=true, bool enforce_GTF2=false); |
102 |
|
|
GffLine(GffLine* l) { //a copy constructor |
103 |
|
|
memcpy((void*)this, (void*)l, sizeof(GffLine)); |
104 |
|
|
line=NULL; |
105 |
|
|
GMALLOC(line, llen+1); |
106 |
|
|
memcpy(line, l->line, llen+1); |
107 |
|
|
GMALLOC(dupline, llen+1); |
108 |
|
|
memcpy(dupline, l->dupline, llen+1); |
109 |
|
|
//--offsets within line[] |
110 |
|
|
gseqname=line+(l->gseqname-l->line); |
111 |
|
|
track=line+(l->track-l->line); |
112 |
|
|
ftype=line+(l->ftype-l->line); |
113 |
|
|
info=line+(l->info-l->line); |
114 |
|
|
//Parent=Gstrdup(l->Parent); |
115 |
|
|
if (l->_parents_len>0) { |
116 |
|
|
_parents_len=l->_parents_len; |
117 |
|
|
GMALLOC(_parents, _parents_len); |
118 |
|
|
memcpy(_parents, l->_parents, _parents_len); |
119 |
|
|
num_parents=l->num_parents; |
120 |
|
|
for (int i=0;i<num_parents;i++) { |
121 |
|
|
parents[i]=_parents+(l->parents[i] - l->_parents); |
122 |
|
|
} |
123 |
|
|
} |
124 |
|
|
//-- allocated string copies: |
125 |
|
|
ID=Gstrdup(l->ID); |
126 |
|
|
if (l->gene_name!=NULL) |
127 |
|
|
gene_name=Gstrdup(l->gene_name); |
128 |
|
|
if (l->gene_id!=NULL) |
129 |
|
|
gene_id=Gstrdup(l->gene_id); |
130 |
|
|
} |
131 |
gpertea |
2 |
GffLine() { |
132 |
|
|
line=NULL; |
133 |
gpertea |
16 |
dupline=NULL; |
134 |
gpertea |
2 |
gseqname=NULL; |
135 |
|
|
track=NULL; |
136 |
|
|
ftype=NULL; |
137 |
|
|
fstart=0; |
138 |
|
|
fend=0; |
139 |
|
|
info=NULL; |
140 |
gpertea |
16 |
_parents=NULL; |
141 |
|
|
_parents_len=0; |
142 |
|
|
parents=NULL; |
143 |
|
|
num_parents=0; |
144 |
gpertea |
2 |
ID=NULL; |
145 |
gpertea |
16 |
gene_name=NULL; |
146 |
|
|
gene_id=NULL; |
147 |
gpertea |
2 |
skip=true; |
148 |
|
|
qstart=0; |
149 |
|
|
qend=0; |
150 |
|
|
qlen=0; |
151 |
|
|
exontype=0; |
152 |
|
|
is_cds=false; |
153 |
gpertea |
16 |
is_gff3=false; |
154 |
|
|
is_transcript=false; |
155 |
|
|
is_gene=false; |
156 |
gpertea |
2 |
is_exon=false; |
157 |
|
|
} |
158 |
|
|
~GffLine() { |
159 |
gpertea |
16 |
GFREE(dupline); |
160 |
gpertea |
2 |
GFREE(line); |
161 |
gpertea |
16 |
GFREE(_parents); |
162 |
|
|
GFREE(parents); |
163 |
gpertea |
2 |
GFREE(ID); |
164 |
gpertea |
16 |
GFREE(gene_name); |
165 |
|
|
GFREE(gene_id); |
166 |
gpertea |
2 |
} |
167 |
|
|
}; |
168 |
|
|
|
169 |
|
|
class GffAttr { |
170 |
|
|
public: |
171 |
|
|
int attr_id; |
172 |
|
|
char* attr_val; |
173 |
|
|
GffAttr(int an_id, const char* av=NULL) { |
174 |
|
|
attr_id=an_id; |
175 |
|
|
attr_val=NULL; |
176 |
gpertea |
16 |
setValue(av); |
177 |
gpertea |
2 |
} |
178 |
|
|
~GffAttr() { |
179 |
|
|
GFREE(attr_val); |
180 |
|
|
} |
181 |
gpertea |
16 |
void setValue(const char* av) { |
182 |
|
|
if (attr_val!=NULL) { |
183 |
|
|
GFREE(attr_val); |
184 |
|
|
} |
185 |
|
|
if (av==NULL || av[0]==0) return; |
186 |
|
|
//trim spaces |
187 |
|
|
const char* vstart=av; |
188 |
|
|
while (*vstart==' ') av++; |
189 |
|
|
const char* vend=vstart; |
190 |
|
|
bool keep_dq=false; |
191 |
|
|
while (vend[1]!=0) { |
192 |
|
|
if (*vend==' ' && vend[1]!=' ') keep_dq=true; |
193 |
|
|
else if (*vend==';') keep_dq=true; |
194 |
|
|
vend++; |
195 |
|
|
} |
196 |
|
|
//remove spaces at the end: |
197 |
|
|
while (*vend==' ' && vend!=vstart) vend--; |
198 |
|
|
//practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes |
199 |
|
|
if (!keep_dq && *vstart=='"' && *vend=='"') { |
200 |
|
|
vend--; |
201 |
|
|
vstart++; |
202 |
|
|
} |
203 |
|
|
attr_val=Gstrdup(vstart, vend); |
204 |
|
|
} |
205 |
gpertea |
2 |
bool operator==(GffAttr& d){ |
206 |
|
|
return (this==&d); |
207 |
|
|
} |
208 |
|
|
bool operator>(GffAttr& d){ |
209 |
|
|
return (this>&d); |
210 |
|
|
} |
211 |
|
|
bool operator<(GffAttr& d){ |
212 |
|
|
return (this<&d); |
213 |
|
|
} |
214 |
|
|
|
215 |
|
|
}; |
216 |
|
|
|
217 |
|
|
class GffNameList; |
218 |
|
|
class GffNames; |
219 |
|
|
|
220 |
|
|
class GffNameInfo { |
221 |
|
|
friend class GffNameList; |
222 |
|
|
protected: |
223 |
|
|
int idx; |
224 |
|
|
public: |
225 |
|
|
char* name; |
226 |
|
|
GffNameInfo() { name=NULL; idx=-1; } |
227 |
|
|
GffNameInfo(const char* n) { |
228 |
|
|
name=Gstrdup(n); |
229 |
|
|
} |
230 |
|
|
|
231 |
|
|
~GffNameInfo() { |
232 |
gpertea |
16 |
GFREE(name); |
233 |
|
|
} |
234 |
gpertea |
2 |
|
235 |
|
|
bool operator==(GffNameInfo& d){ |
236 |
|
|
return (strcmp(this->name, d.name)==0); |
237 |
|
|
} |
238 |
|
|
bool operator>(GffNameInfo& d){ |
239 |
|
|
return (strcmp(this->name, d.name)>0); |
240 |
|
|
} |
241 |
|
|
bool operator<(GffNameInfo& d){ |
242 |
|
|
return (strcmp(this->name, d.name)<0); |
243 |
|
|
} |
244 |
|
|
}; |
245 |
|
|
|
246 |
|
|
class GffNameList:public GList<GffNameInfo> { |
247 |
|
|
friend class GffNameInfo; |
248 |
|
|
friend class GffNames; |
249 |
|
|
protected: |
250 |
|
|
GHash<GffNameInfo> byName;//hash with shared keys |
251 |
|
|
int idlast; //fList index of last added/reused name |
252 |
|
|
void addStatic(const char* tname) {// fast add |
253 |
|
|
GffNameInfo* f=new GffNameInfo(tname); |
254 |
|
|
idlast=this->Add(f); |
255 |
|
|
f->idx=idlast; |
256 |
|
|
byName.shkAdd(f->name,f); |
257 |
|
|
} |
258 |
|
|
public: |
259 |
|
|
GffNameList():GList<GffNameInfo>(false,true,true), byName(false) { |
260 |
|
|
idlast=-1; |
261 |
|
|
} |
262 |
|
|
char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; } |
263 |
|
|
int lastNameId() { return idlast; } |
264 |
|
|
char* getName(int nid) { //retrieve name by its ID |
265 |
|
|
if (nid<0 || nid>=fCount) |
266 |
|
|
GError("GffNameList Error: invalid index (%d)\n",nid); |
267 |
|
|
return fList[nid]->name; |
268 |
|
|
} |
269 |
|
|
|
270 |
|
|
int addName(const char* tname) {//returns or create an id for the given name |
271 |
|
|
//check idlast first, chances are it's the same feature name checked |
272 |
|
|
if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) |
273 |
|
|
return idlast; |
274 |
|
|
GffNameInfo* f=byName.Find(tname); |
275 |
|
|
int fidx=-1; |
276 |
|
|
if (f!=NULL) fidx=f->idx; |
277 |
|
|
else {//add new entry |
278 |
|
|
f=new GffNameInfo(tname); |
279 |
|
|
fidx=this->Add(f); |
280 |
|
|
f->idx=fidx; |
281 |
|
|
byName.shkAdd(f->name,f); |
282 |
|
|
} |
283 |
|
|
idlast=fidx; |
284 |
|
|
return fidx; |
285 |
|
|
} |
286 |
gpertea |
16 |
|
287 |
|
|
int addNewName(const char* tname) { |
288 |
|
|
GffNameInfo* f=new GffNameInfo(tname); |
289 |
|
|
int fidx=this->Add(f); |
290 |
|
|
f->idx=fidx; |
291 |
|
|
byName.shkAdd(f->name,f); |
292 |
|
|
return fidx; |
293 |
|
|
} |
294 |
|
|
|
295 |
gpertea |
2 |
int getId(const char* tname) { //only returns a name id# if found |
296 |
|
|
GffNameInfo* f=byName.Find(tname); |
297 |
|
|
if (f==NULL) return -1; |
298 |
|
|
return f->idx; |
299 |
|
|
} |
300 |
gpertea |
16 |
int removeName() { |
301 |
gpertea |
2 |
GError("Error: removing names from GffNameList not allowed!\n"); |
302 |
|
|
return -1; |
303 |
|
|
} |
304 |
|
|
}; |
305 |
|
|
|
306 |
|
|
class GffNames { |
307 |
|
|
public: |
308 |
|
|
int numrefs; |
309 |
|
|
GffNameList tracks; |
310 |
|
|
GffNameList gseqs; |
311 |
|
|
GffNameList attrs; |
312 |
gpertea |
16 |
GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc. |
313 |
gpertea |
2 |
GffNames():tracks(),gseqs(),attrs(), feats() { |
314 |
|
|
numrefs=0; |
315 |
|
|
//the order below is critical! |
316 |
|
|
//has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS |
317 |
|
|
feats.addStatic("mRNA");//index 0=gff_fid_mRNA |
318 |
gpertea |
16 |
feats.addStatic("transcript");//index 1=gff_fid_transcript |
319 |
gpertea |
2 |
feats.addStatic("exon");//index 1=gff_fid_exon |
320 |
|
|
feats.addStatic("CDS"); //index 2=gff_fid_CDS |
321 |
|
|
} |
322 |
|
|
}; |
323 |
|
|
|
324 |
|
|
void gffnames_ref(GffNames* &n); |
325 |
|
|
void gffnames_unref(GffNames* &n); |
326 |
|
|
|
327 |
|
|
enum GffPrintMode { |
328 |
|
|
pgtfAny, //print record as read |
329 |
|
|
pgtfExon, |
330 |
|
|
pgtfCDS, |
331 |
|
|
pgffAny, //print record as read |
332 |
|
|
pgffExon, |
333 |
|
|
pgffCDS, |
334 |
|
|
pgffBoth, |
335 |
|
|
}; |
336 |
|
|
|
337 |
|
|
|
338 |
|
|
class GffAttrs:public GList<GffAttr> { |
339 |
|
|
public: |
340 |
|
|
GffAttrs():GList<GffAttr>(false,true,false) { } |
341 |
gpertea |
16 |
void add_or_update(GffNames* names, const char* attrname, const char* val) { |
342 |
|
|
int aid=names->attrs.getId(attrname); |
343 |
|
|
if (aid>=0) { |
344 |
|
|
//attribute found in the dictionary |
345 |
|
|
for (int i=0;i<Count();i++) { |
346 |
|
|
//do we have it? |
347 |
|
|
if (aid==Get(i)->attr_id) { |
348 |
|
|
//update the value |
349 |
|
|
Get(i)->setValue(val); |
350 |
|
|
return; |
351 |
|
|
} |
352 |
|
|
} |
353 |
|
|
} |
354 |
|
|
else { |
355 |
|
|
aid=names->attrs.addNewName(attrname); |
356 |
|
|
} |
357 |
|
|
this->Add(new GffAttr(aid, val)); |
358 |
|
|
} |
359 |
|
|
|
360 |
gpertea |
2 |
char* getAttr(GffNames* names, const char* attrname) { |
361 |
|
|
int aid=names->attrs.getId(attrname); |
362 |
|
|
if (aid>=0) |
363 |
|
|
for (int i=0;i<Count();i++) |
364 |
|
|
if (aid==Get(i)->attr_id) return Get(i)->attr_val; |
365 |
|
|
return NULL; |
366 |
|
|
} |
367 |
gpertea |
16 |
char* getAttr(int aid) { |
368 |
|
|
if (aid>=0) |
369 |
|
|
for (int i=0;i<Count();i++) |
370 |
|
|
if (aid==Get(i)->attr_id) return Get(i)->attr_val; |
371 |
|
|
return NULL; |
372 |
|
|
} |
373 |
gpertea |
2 |
}; |
374 |
|
|
|
375 |
|
|
|
376 |
|
|
class GffExon : public GSeg { |
377 |
|
|
public: |
378 |
gpertea |
16 |
void* uptr; //for later extensions |
379 |
gpertea |
2 |
GffAttrs* attrs; //other attributes kept for this exon |
380 |
|
|
double score; // gff score column |
381 |
|
|
char phase; //GFF phase column - for CDS segments only |
382 |
|
|
// '.' = undefined (UTR), '0','1','2' for CDS exons |
383 |
|
|
char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon" |
384 |
|
|
int qstart; // for mRNA/protein exon mappings: coordinates on query |
385 |
|
|
int qend; |
386 |
|
|
GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) { |
387 |
|
|
uptr=NULL; |
388 |
|
|
attrs=NULL; |
389 |
|
|
if (s<e) { |
390 |
|
|
start=s; |
391 |
|
|
end=e; |
392 |
|
|
} |
393 |
|
|
else { |
394 |
|
|
start=e; |
395 |
|
|
end=s; |
396 |
|
|
} |
397 |
|
|
if (qs<qe) { |
398 |
|
|
qstart=qs; |
399 |
|
|
qend=qe; |
400 |
|
|
} else { |
401 |
|
|
qstart=qe; |
402 |
|
|
qend=qs; |
403 |
|
|
} |
404 |
|
|
score=sc; |
405 |
|
|
phase=fr; |
406 |
|
|
exontype=et; |
407 |
|
|
} //constructor |
408 |
|
|
|
409 |
|
|
char* getAttr(GffNames* names, const char* atrname) { |
410 |
|
|
if (attrs==NULL || names==NULL || atrname==NULL) return NULL; |
411 |
|
|
return attrs->getAttr(names, atrname); |
412 |
|
|
} |
413 |
|
|
|
414 |
gpertea |
16 |
char* getAttr(int aid) { |
415 |
|
|
if (attrs==NULL) return NULL; |
416 |
|
|
return attrs->getAttr(aid); |
417 |
|
|
} |
418 |
|
|
|
419 |
gpertea |
2 |
~GffExon() { //destructor |
420 |
|
|
if (attrs!=NULL) delete attrs; |
421 |
|
|
} |
422 |
|
|
}; |
423 |
|
|
|
424 |
|
|
|
425 |
|
|
class GffCDSeg:public GSeg { |
426 |
|
|
public: |
427 |
|
|
char phase; |
428 |
|
|
int exonidx; |
429 |
|
|
}; |
430 |
|
|
//one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments |
431 |
|
|
class GffObj:public GSeg { |
432 |
|
|
//utility segment-merging function for addExon() |
433 |
|
|
void expandExon(int xovl, uint segstart, uint segend, |
434 |
|
|
char exontype, double sc, char fr, int qs, int qe); |
435 |
|
|
protected: |
436 |
|
|
//coordinate transformation data: |
437 |
|
|
uint xstart; //absolute genomic coordinates of reference region |
438 |
|
|
uint xend; |
439 |
|
|
char xstatus; //coordinate transform status: |
440 |
|
|
//0 : (start,end) coordinates are absolute |
441 |
|
|
//'+' : (start,end) coords are relative to xstart..xend region |
442 |
|
|
//'-' : (start,end) are relative to the reverse complement of xstart..xend region |
443 |
|
|
//-- |
444 |
|
|
char* gffID; // ID name for mRNA (parent) feature |
445 |
gpertea |
16 |
char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3) |
446 |
|
|
char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3) |
447 |
|
|
unsigned int flags; |
448 |
gpertea |
2 |
//-- friends: |
449 |
|
|
friend class GffReader; |
450 |
|
|
friend class GffExon; |
451 |
|
|
public: |
452 |
gpertea |
16 |
static GffNames* names; // dictionary storage that holds the various attribute names etc. |
453 |
gpertea |
2 |
int track_id; // index of track name in names->tracks |
454 |
|
|
int gseq_id; // index of genomic sequence name in names->gseqs |
455 |
|
|
int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value |
456 |
gpertea |
16 |
int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons") |
457 |
gpertea |
2 |
//if ftype_id==gff_fid_mRNA then this value is ignored |
458 |
|
|
GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id |
459 |
gpertea |
16 |
GPVec<GffObj> children; |
460 |
|
|
GffObj* parent; |
461 |
gpertea |
2 |
int udata; //user data, flags etc. |
462 |
|
|
void* uptr; //user pointer (to a parent object, cluster, locus etc.) |
463 |
|
|
GffObj* ulink; //link to another GffObj (user controlled field) |
464 |
|
|
// mRNA specific fields: |
465 |
|
|
bool isCDS; //just a CDS, no UTRs |
466 |
|
|
bool partial; //partial CDS |
467 |
|
|
uint CDstart; //CDS start coord |
468 |
|
|
uint CDend; //CDS end coord |
469 |
|
|
char CDphase; //initial phase for CDS start |
470 |
gpertea |
16 |
bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); } |
471 |
|
|
void hasErrors(bool v) { |
472 |
|
|
if (v) flags |= gfo_flag_HAS_ERRORS; |
473 |
|
|
else flags &= ~gfo_flag_HAS_ERRORS; |
474 |
|
|
} |
475 |
|
|
bool fromGff3() { return ((flags & gfo_flag_FROM_GFF3)!=0); } |
476 |
|
|
void fromGff3(bool v) { |
477 |
|
|
if (v) flags |= gfo_flag_FROM_GFF3; |
478 |
|
|
else flags &= ~gfo_flag_FROM_GFF3; |
479 |
|
|
} |
480 |
|
|
bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); } |
481 |
|
|
void createdByExon(bool v) { |
482 |
|
|
if (v) flags |= gfo_flag_BY_EXON; |
483 |
|
|
else flags &= ~gfo_flag_BY_EXON; |
484 |
|
|
} |
485 |
|
|
bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); } |
486 |
|
|
void isGene(bool v) { |
487 |
|
|
if (v) flags |= gfo_flag_IS_GENE; |
488 |
|
|
else flags &= ~gfo_flag_IS_GENE; |
489 |
|
|
} |
490 |
|
|
bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); } |
491 |
|
|
void isDiscarded(bool v) { |
492 |
|
|
if (v) flags |= gfo_flag_DISCARDED; |
493 |
|
|
else flags &= ~gfo_flag_DISCARDED; |
494 |
|
|
} |
495 |
|
|
|
496 |
|
|
bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); } |
497 |
|
|
void isUsed(bool v) { |
498 |
|
|
if (v) flags |= gfo_flag_LST_KEEP; |
499 |
|
|
else flags &= ~gfo_flag_LST_KEEP; |
500 |
|
|
} |
501 |
|
|
bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); } |
502 |
|
|
void isTranscript(bool v) { |
503 |
|
|
if (v) flags |= gfo_flag_IS_TRANSCRIPT; |
504 |
|
|
else flags &= ~gfo_flag_IS_TRANSCRIPT; |
505 |
|
|
} |
506 |
|
|
bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); } |
507 |
|
|
void promotedChildren(bool v) { |
508 |
|
|
if (v) flags |= gfo_flag_CHILDREN_PROMOTED; |
509 |
|
|
else flags &= ~gfo_flag_CHILDREN_PROMOTED; |
510 |
|
|
} |
511 |
|
|
void setLevel(byte v) { |
512 |
|
|
if (v==0) flags &= ~gfo_flag_LEVEL_MSK; |
513 |
|
|
else flags &= ~(((uint)v) << gfo_flagShift_LEVEL); |
514 |
|
|
} |
515 |
|
|
byte incLevel() { |
516 |
|
|
uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL); |
517 |
|
|
v++; |
518 |
|
|
flags &= ~(v << gfo_flagShift_LEVEL); |
519 |
|
|
return v; |
520 |
|
|
} |
521 |
|
|
byte getLevel() { |
522 |
|
|
return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL)); |
523 |
|
|
} |
524 |
gpertea |
2 |
|
525 |
gpertea |
16 |
bool isValidTranscript() { |
526 |
|
|
//return (ftype_id==gff_fid_mRNA && exons.Count()>0); |
527 |
|
|
return (isTranscript() && exons.Count()>0); |
528 |
|
|
} |
529 |
|
|
|
530 |
gpertea |
2 |
|
531 |
|
|
int addExon(uint segstart, uint segend, double sc=0, char fr='.', |
532 |
|
|
int qs=0, int qe=0, bool iscds=false, char exontype=0); |
533 |
|
|
|
534 |
gpertea |
16 |
int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true); |
535 |
|
|
|
536 |
gpertea |
2 |
void removeExon(int idx); |
537 |
gpertea |
16 |
void removeExon(GffExon* p); |
538 |
gpertea |
2 |
char strand; //true if features are on the reverse complement strand |
539 |
|
|
double gscore; |
540 |
|
|
double uscore; //custom, user-computed score, if needed |
541 |
|
|
int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) |
542 |
gpertea |
16 |
|
543 |
gpertea |
2 |
//--------- optional data: |
544 |
|
|
int qlen; //query length, start, end - if available |
545 |
|
|
int qstart; |
546 |
|
|
int qend; |
547 |
|
|
int qcov; //query coverage - percent |
548 |
|
|
GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature |
549 |
|
|
//constructor by gff line parsing: |
550 |
|
|
GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true); |
551 |
|
|
//if gfline->Parent!=NULL then this will also add the first sub-feature |
552 |
|
|
// otherwise, only the main feature is created |
553 |
|
|
void clearAttrs() { |
554 |
gpertea |
16 |
if (attrs!=NULL) { |
555 |
|
|
bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs); |
556 |
|
|
delete attrs; attrs=NULL; |
557 |
|
|
if (sharedattrs) exons[0]->attrs=NULL; |
558 |
|
|
} |
559 |
gpertea |
2 |
} |
560 |
gpertea |
16 |
GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) { |
561 |
|
|
//exons: sorted, free, non-unique |
562 |
gpertea |
2 |
gffID=NULL; |
563 |
|
|
uptr=NULL; |
564 |
|
|
ulink=NULL; |
565 |
gpertea |
16 |
flags=0; |
566 |
gpertea |
2 |
udata=0; |
567 |
gpertea |
16 |
parent=NULL; |
568 |
gpertea |
2 |
ftype_id=-1; |
569 |
gpertea |
16 |
exon_ftype_id=-1; |
570 |
gpertea |
2 |
if (anid!=NULL) gffID=Gstrdup(anid); |
571 |
|
|
gffnames_ref(names); |
572 |
|
|
qlen=0; |
573 |
|
|
qstart=0; |
574 |
|
|
qend=0; |
575 |
|
|
qcov=0; |
576 |
|
|
partial=true; |
577 |
|
|
isCDS=false; |
578 |
|
|
CDstart=0; // hasCDS <=> CDstart>0 |
579 |
|
|
CDend=0; |
580 |
|
|
CDphase=0; |
581 |
|
|
gseq_id=-1; |
582 |
|
|
track_id=-1; |
583 |
|
|
xstart=0; |
584 |
|
|
xend=0; |
585 |
|
|
xstatus=0; |
586 |
gpertea |
16 |
strand='.'; |
587 |
gpertea |
2 |
gscore=0; |
588 |
|
|
uscore=0; |
589 |
|
|
attrs=NULL; |
590 |
|
|
covlen=0; |
591 |
gpertea |
16 |
gene_name=NULL; |
592 |
|
|
geneID=NULL; |
593 |
gpertea |
2 |
} |
594 |
|
|
~GffObj() { |
595 |
|
|
GFREE(gffID); |
596 |
gpertea |
16 |
GFREE(gene_name); |
597 |
|
|
GFREE(geneID); |
598 |
|
|
clearAttrs(); |
599 |
gpertea |
2 |
gffnames_unref(names); |
600 |
|
|
} |
601 |
|
|
//-------------- |
602 |
gpertea |
16 |
GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false, |
603 |
|
|
bool keepAttrs=false, bool noExonAttr=true); |
604 |
|
|
//complete parsing: must be called in order to merge adjacent/close proximity subfeatures |
605 |
|
|
void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false); |
606 |
gpertea |
2 |
const char* getSubfName() { //returns the generic feature type of the entries in exons array |
607 |
gpertea |
16 |
int sid=exon_ftype_id; |
608 |
gpertea |
2 |
if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS; |
609 |
|
|
return names->feats.getName(sid); |
610 |
|
|
} |
611 |
gpertea |
16 |
bool monoFeature() { |
612 |
|
|
return (exons.Count()==0 || |
613 |
|
|
(exons.Count()==1 && exon_ftype_id==ftype_id)); |
614 |
|
|
} |
615 |
|
|
|
616 |
|
|
bool hasCDS() { return (CDstart>0); } |
617 |
|
|
|
618 |
gpertea |
2 |
const char* getFeatureName() { |
619 |
|
|
return names->feats.getName(ftype_id); |
620 |
|
|
} |
621 |
gpertea |
16 |
void setFeatureName(const char* feature); |
622 |
|
|
|
623 |
|
|
void addAttr(const char* attrname, const char* attrvalue); |
624 |
|
|
int removeAttr(const char* attrname, const char* attrval=NULL); |
625 |
|
|
int removeAttr(int aid, const char* attrval=NULL); |
626 |
|
|
int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL); |
627 |
|
|
int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL); |
628 |
gpertea |
2 |
const char* getAttrName(int i) { |
629 |
|
|
if (attrs==NULL) return NULL; |
630 |
|
|
return names->attrs.getName(attrs->Get(i)->attr_id); |
631 |
|
|
} |
632 |
gpertea |
16 |
char* getAttr(const char* attrname, bool checkFirstExon=false) { |
633 |
|
|
if (names==NULL || attrname==NULL) return NULL; |
634 |
|
|
char* r=NULL; |
635 |
|
|
if (attrs==NULL) { |
636 |
|
|
if (!checkFirstExon) return NULL; |
637 |
|
|
} |
638 |
|
|
else r=attrs->getAttr(names, attrname); |
639 |
|
|
if (r!=NULL) return r; |
640 |
|
|
if (checkFirstExon && exons.Count()>0) { |
641 |
|
|
r=exons[0]->getAttr(names, attrname); |
642 |
|
|
} |
643 |
|
|
return r; |
644 |
gpertea |
2 |
} |
645 |
|
|
|
646 |
gpertea |
16 |
char* getExonAttr(GffExon* exon, const char* attrname) { |
647 |
|
|
if (exon==NULL || attrname==NULL) return NULL; |
648 |
|
|
return exon->getAttr(names, attrname); |
649 |
|
|
} |
650 |
|
|
|
651 |
|
|
char* getExonAttr(int exonidx, const char* attrname) { |
652 |
|
|
if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL; |
653 |
|
|
return exons[exonidx]->getAttr(names, attrname); |
654 |
|
|
} |
655 |
|
|
|
656 |
gpertea |
2 |
char* getAttrValue(int i) { |
657 |
|
|
if (attrs==NULL) return NULL; |
658 |
|
|
return attrs->Get(i)->attr_val; |
659 |
|
|
} |
660 |
|
|
const char* getGSeqName() { |
661 |
|
|
return names->gseqs.getName(gseq_id); |
662 |
|
|
} |
663 |
gpertea |
16 |
|
664 |
|
|
const char* getRefName() { |
665 |
|
|
return names->gseqs.getName(gseq_id); |
666 |
|
|
} |
667 |
|
|
void setRefName(const char* newname); |
668 |
|
|
|
669 |
gpertea |
2 |
const char* getTrackName() { |
670 |
|
|
return names->tracks.getName(track_id); |
671 |
|
|
} |
672 |
gpertea |
16 |
bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment |
673 |
gpertea |
2 |
//ignores strand! |
674 |
|
|
if (s>e) swap(s,e); |
675 |
|
|
for (int i=0;i<exons.Count();i++) { |
676 |
|
|
if (exons[i]->overlap(s,e)) return true; |
677 |
|
|
} |
678 |
|
|
return false; |
679 |
|
|
} |
680 |
|
|
bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment |
681 |
|
|
//if (gseq_id!=m.gseq_id) return false; |
682 |
|
|
// ignores strand and gseq_id, must check in advance |
683 |
|
|
for (int i=0;i<exons.Count();i++) { |
684 |
|
|
for (int j=0;j<m.exons.Count();j++) { |
685 |
|
|
if (exons[i]->start>m.exons[j]->end) continue; |
686 |
|
|
if (m.exons[j]->start>exons[i]->end) break; |
687 |
|
|
//-- overlap if we are here: |
688 |
|
|
return true; |
689 |
|
|
} |
690 |
|
|
} |
691 |
|
|
return false; |
692 |
|
|
} |
693 |
gpertea |
16 |
|
694 |
gpertea |
2 |
int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) { |
695 |
gpertea |
16 |
//return the exons' index for the overlapping OR ADJACENT exon |
696 |
gpertea |
2 |
//ovlen, if given, will return the overlap length |
697 |
|
|
if (s>e) swap(s,e); |
698 |
gpertea |
16 |
s--;e++; //to also catch adjacent exons |
699 |
gpertea |
2 |
for (int i=0;i<exons.Count();i++) { |
700 |
|
|
if (exons[i]->start>e) break; |
701 |
|
|
if (s>exons[i]->end) continue; |
702 |
|
|
//-- overlap if we are here: |
703 |
|
|
if (ovlen!=NULL) { |
704 |
gpertea |
16 |
s++;e--; |
705 |
gpertea |
2 |
int ovlend= (exons[i]->end>e) ? e : exons[i]->end; |
706 |
|
|
*ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1; |
707 |
|
|
} |
708 |
|
|
return i; |
709 |
|
|
} //for each exon |
710 |
|
|
*ovlen=0; |
711 |
|
|
return -1; |
712 |
|
|
} |
713 |
gpertea |
16 |
|
714 |
gpertea |
2 |
int exonOverlapLen(GffObj& m) { |
715 |
|
|
if (start>m.end || m.start>end) return 0; |
716 |
|
|
int i=0; |
717 |
|
|
int j=0; |
718 |
|
|
int ovlen=0; |
719 |
|
|
while (i<exons.Count() && j<m.exons.Count()) { |
720 |
|
|
uint istart=exons[i]->start; |
721 |
|
|
uint iend=exons[i]->end; |
722 |
|
|
uint jstart=m.exons[j]->start; |
723 |
|
|
uint jend=m.exons[j]->end; |
724 |
|
|
if (istart>jend) { j++; continue; } |
725 |
|
|
if (jstart>iend) { i++; continue; } |
726 |
|
|
//exon overlap |
727 |
|
|
uint ovstart=GMAX(istart,jstart); |
728 |
|
|
if (iend<jend) { |
729 |
|
|
ovlen+=iend-ovstart+1; |
730 |
|
|
i++; |
731 |
|
|
} |
732 |
|
|
else { |
733 |
|
|
ovlen+=jend-ovstart+1; |
734 |
|
|
j++; |
735 |
|
|
} |
736 |
|
|
}//while comparing exons |
737 |
|
|
return ovlen; |
738 |
|
|
} |
739 |
|
|
|
740 |
|
|
bool exonOverlap(GffObj* m) { |
741 |
|
|
return exonOverlap(*m); |
742 |
|
|
} |
743 |
|
|
//---------- coordinate transformation |
744 |
|
|
void xcoord(uint grstart, uint grend, char xstrand='+') { |
745 |
|
|
//relative coordinate transform, and reverse-complement transform if xstrand is '-' |
746 |
|
|
//does nothing if xstatus is the same already |
747 |
|
|
if (xstatus) { |
748 |
|
|
if (xstatus==xstrand && grstart==xstart && grend==xend) return; |
749 |
|
|
unxcoord();//restore original coordinates |
750 |
|
|
} |
751 |
|
|
xstatus=xstrand; |
752 |
|
|
xstart=grstart; |
753 |
|
|
xend=grend; |
754 |
|
|
if (CDstart>0) xcoordseg(CDstart, CDend); |
755 |
|
|
for (int i=0;i<exons.Count();i++) { |
756 |
|
|
xcoordseg(exons[i]->start, exons[i]->end); |
757 |
|
|
} |
758 |
|
|
if (xstatus=='-') { |
759 |
|
|
exons.Reverse(); |
760 |
|
|
int flen=end-start; |
761 |
|
|
start=xend-end+1; |
762 |
|
|
end=start+flen; |
763 |
|
|
} |
764 |
|
|
else { |
765 |
|
|
start=start-xstart+1; |
766 |
|
|
end=end-xstart+1; |
767 |
|
|
} |
768 |
|
|
} |
769 |
|
|
|
770 |
|
|
//transform an arbitrary segment based on current xstatus/xstart-xend |
771 |
|
|
void xcoordseg(uint& segstart, uint &segend) { |
772 |
|
|
if (xstatus==0) return; |
773 |
|
|
if (xstatus=='-') { |
774 |
|
|
int flen=segend-segstart; |
775 |
|
|
segstart=xend-segend+1; |
776 |
|
|
segend=segstart+flen; |
777 |
|
|
return; |
778 |
|
|
} |
779 |
|
|
else { |
780 |
|
|
segstart=segstart-xstart+1; |
781 |
|
|
segend=segend-xstart+1; |
782 |
|
|
} |
783 |
|
|
} |
784 |
|
|
|
785 |
|
|
void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true |
786 |
|
|
if (xstatus==0) return; //nothing to do, no transformation appplied |
787 |
|
|
if (CDstart>0) unxcoordseg(CDstart, CDend); |
788 |
|
|
//restore all GffExon intervals too |
789 |
|
|
for (int i=0;i<exons.Count();i++) { |
790 |
|
|
unxcoordseg(exons[i]->start, exons[i]->end); |
791 |
|
|
} |
792 |
|
|
if (xstatus=='-') { |
793 |
|
|
exons.Reverse(); |
794 |
|
|
int flen=end-start; |
795 |
|
|
start=xend-end+1; |
796 |
|
|
end=start+flen; |
797 |
|
|
} |
798 |
|
|
else { |
799 |
|
|
start=start+xstart-1; |
800 |
|
|
end=end+xstart-1; |
801 |
|
|
} |
802 |
|
|
xstatus=0; |
803 |
|
|
} |
804 |
|
|
void unxcoordseg(uint& astart, uint &aend) { |
805 |
|
|
//restore an arbitrary interval -- does NOT change the transform state! |
806 |
|
|
if (xstatus==0) return; |
807 |
|
|
if (xstatus=='-') { |
808 |
|
|
int flen=aend-astart; |
809 |
|
|
astart=xend-aend+1; |
810 |
|
|
aend=astart+flen; |
811 |
|
|
} |
812 |
|
|
else { |
813 |
|
|
astart=astart+xstart-1; |
814 |
|
|
aend=aend+xstart-1; |
815 |
|
|
} |
816 |
|
|
} |
817 |
|
|
//--------------------- |
818 |
|
|
bool operator==(GffObj& d){ |
819 |
|
|
return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0); |
820 |
|
|
} |
821 |
|
|
bool operator>(GffObj& d){ |
822 |
|
|
if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); |
823 |
|
|
if (start==d.start) { |
824 |
gpertea |
16 |
if (getLevel()==d.getLevel()) { |
825 |
|
|
if (end==d.end) return (strcmp(gffID, d.gffID)>0); |
826 |
|
|
else return (end>d.end); |
827 |
|
|
} else return (getLevel()>d.getLevel()); |
828 |
gpertea |
2 |
} else return (start>d.start); |
829 |
|
|
} |
830 |
|
|
bool operator<(GffObj& d){ |
831 |
|
|
if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id); |
832 |
|
|
if (start==d.start) { |
833 |
gpertea |
16 |
if (getLevel()==d.getLevel()) { |
834 |
|
|
if (end==d.end) return strcmp(gffID, d.gffID)<0; |
835 |
gpertea |
2 |
else return end<d.end; |
836 |
gpertea |
16 |
} else return (getLevel()<d.getLevel()); |
837 |
gpertea |
2 |
} else return (start<d.start); |
838 |
|
|
} |
839 |
|
|
char* getID() { return gffID; } |
840 |
gpertea |
16 |
char* getGeneID() { return geneID; } |
841 |
|
|
char* getGeneName() { return gene_name; } |
842 |
|
|
void setGeneName(const char* gname) { |
843 |
|
|
GFREE(gene_name); |
844 |
|
|
if (gname) gene_name=Gstrdup(gname); |
845 |
|
|
} |
846 |
|
|
void setGeneID(const char* gene_id) { |
847 |
|
|
GFREE(geneID); |
848 |
|
|
if (gene_id) geneID=Gstrdup(gene_id); |
849 |
|
|
} |
850 |
gpertea |
2 |
int addSeg(GffLine* gfline); |
851 |
|
|
int addSeg(int fnid, GffLine* gfline); |
852 |
|
|
void getCDSegs(GArray<GffCDSeg>& cds); |
853 |
gpertea |
16 |
|
854 |
|
|
void updateExonPhase(); //for CDS-only features, updates GExon::phase |
855 |
|
|
|
856 |
|
|
void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, |
857 |
|
|
bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3); |
858 |
|
|
void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, |
859 |
|
|
const char* tlabel=NULL, const char* gfparent=NULL); |
860 |
|
|
void printGtf(FILE* fout, const char* tlabel=NULL) { |
861 |
gpertea |
2 |
printGxf(fout, pgtfAny, tlabel); |
862 |
|
|
} |
863 |
gpertea |
16 |
void printGff(FILE* fout, const char* tlabel=NULL, |
864 |
|
|
const char* gfparent=NULL) { |
865 |
|
|
printGxf(fout, pgffAny, tlabel, gfparent); |
866 |
gpertea |
2 |
} |
867 |
gpertea |
16 |
void printTranscriptGff(FILE* fout, char* tlabel=NULL, |
868 |
|
|
bool showCDS=false, const char* gfparent=NULL) { |
869 |
|
|
if (isValidTranscript()) |
870 |
|
|
printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent); |
871 |
gpertea |
2 |
} |
872 |
|
|
void printSummary(FILE* fout=NULL); |
873 |
|
|
void getCDS_ends(uint& cds_start, uint& cds_end); |
874 |
|
|
void mRNA_CDS_coords(uint& cds_start, uint& cds_end); |
875 |
|
|
char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, |
876 |
|
|
uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL); |
877 |
gpertea |
16 |
char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst); |
878 |
gpertea |
2 |
char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL); |
879 |
|
|
//bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? |
880 |
|
|
bool empty() { return (start==0); } |
881 |
|
|
}; |
882 |
|
|
|
883 |
|
|
typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); |
884 |
|
|
//user callback after parsing a mapping object: |
885 |
|
|
// Returns: "done with it" status: |
886 |
|
|
// TRUE if gobj is no longer needed so it's FREEd upon return |
887 |
|
|
// FALSE if the user needs the gobj pointer and is responsible for |
888 |
|
|
// collecting and freeing all GffObj objects |
889 |
|
|
|
890 |
|
|
|
891 |
|
|
//GSeqStat: collect basic stats about a common underlying genomic sequence |
892 |
|
|
// for multiple GffObj |
893 |
|
|
class GSeqStat { |
894 |
|
|
public: |
895 |
|
|
int gseqid; //gseq id in the global static pool of gseqs |
896 |
|
|
char* gseqname; //just a pointer to the name of gseq |
897 |
|
|
//int fcount;//number of features on this gseq |
898 |
|
|
uint mincoord; |
899 |
|
|
uint maxcoord; |
900 |
gpertea |
16 |
uint maxfeat_len; //maximum feature length on this genomic sequence |
901 |
|
|
GffObj* maxfeat; |
902 |
|
|
GSeqStat(int id=-1, char* name=NULL) { |
903 |
gpertea |
2 |
gseqid=id; |
904 |
|
|
gseqname=name; |
905 |
|
|
mincoord=MAXUINT; |
906 |
|
|
maxcoord=0; |
907 |
gpertea |
16 |
maxfeat_len=0; |
908 |
|
|
maxfeat=NULL; |
909 |
gpertea |
2 |
} |
910 |
|
|
bool operator>(GSeqStat& g) { |
911 |
|
|
return (gseqid>g.gseqid); |
912 |
|
|
} |
913 |
|
|
bool operator<(GSeqStat& g) { |
914 |
|
|
return (gseqid<g.gseqid); |
915 |
|
|
} |
916 |
|
|
bool operator==(GSeqStat& g) { |
917 |
|
|
return (gseqid==g.gseqid); |
918 |
|
|
} |
919 |
|
|
}; |
920 |
|
|
|
921 |
|
|
|
922 |
|
|
int gfo_cmpByLoc(const pointer p1, const pointer p2); |
923 |
|
|
|
924 |
|
|
class GfList: public GList<GffObj> { |
925 |
gpertea |
16 |
//just adding the option to sort by genomic sequence and coordinate |
926 |
|
|
bool mustSort; |
927 |
gpertea |
2 |
public: |
928 |
|
|
GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) { |
929 |
gpertea |
16 |
//GffObjs in this list are NOT deleted when the list is cleared |
930 |
|
|
//-- for deallocation of these objects, call freeAll() or freeUnused() as needed |
931 |
|
|
mustSort=sortbyloc; |
932 |
|
|
} |
933 |
|
|
void sortedByLoc(bool v=true) { |
934 |
|
|
bool prev=mustSort; |
935 |
|
|
mustSort=v; |
936 |
|
|
if (fCount>0 && mustSort && !prev) { |
937 |
|
|
this->setSorted((GCompareProc*)gfo_cmpByLoc); |
938 |
|
|
} |
939 |
|
|
} |
940 |
|
|
void finalize(GffReader* gfr, bool mergeCloseExons, |
941 |
|
|
bool keepAttrs=false, bool noExonAttr=true) { //if set, enforce sort by locus |
942 |
|
|
if (mustSort) { //force (re-)sorting |
943 |
|
|
this->setSorted(false); |
944 |
|
|
this->setSorted((GCompareProc*)gfo_cmpByLoc); |
945 |
|
|
} |
946 |
|
|
int delcount=0; |
947 |
|
|
for (int i=0;i<Count();i++) { |
948 |
|
|
//finish the parsing for each GffObj |
949 |
|
|
fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr); |
950 |
|
|
} |
951 |
|
|
if (delcount>0) this->Pack(); |
952 |
|
|
} |
953 |
|
|
void freeAll() { |
954 |
|
|
for (int i=0;i<fCount;i++) { |
955 |
|
|
delete fList[i]; |
956 |
|
|
fList[i]=NULL; |
957 |
|
|
} |
958 |
|
|
Clear(); |
959 |
|
|
} |
960 |
|
|
void freeUnused() { |
961 |
|
|
for (int i=0;i<fCount;i++) { |
962 |
|
|
if (fList[i]->isUsed()) continue; |
963 |
|
|
//inform the children |
964 |
|
|
for (int c=0;c<fList[i]->children.Count();c++) { |
965 |
|
|
fList[i]->children[c]->parent=NULL; |
966 |
|
|
} |
967 |
|
|
delete fList[i]; |
968 |
|
|
fList[i]=NULL; |
969 |
|
|
} |
970 |
|
|
Clear(); |
971 |
|
|
} |
972 |
|
|
|
973 |
gpertea |
2 |
}; |
974 |
|
|
|
975 |
gpertea |
16 |
class GfoHolder { |
976 |
|
|
public: |
977 |
|
|
int idx; //position in GffReader::gflst array |
978 |
|
|
GffObj* gffobj; |
979 |
|
|
GfoHolder(GffObj* gfo=NULL, int i=0) { |
980 |
|
|
idx=i; |
981 |
|
|
gffobj=gfo; |
982 |
|
|
} |
983 |
|
|
}; |
984 |
|
|
|
985 |
|
|
class CNonExon { //utility class used in subfeature promotion |
986 |
|
|
public: |
987 |
|
|
int idx; |
988 |
|
|
GffObj* parent; |
989 |
|
|
GffExon* exon; |
990 |
|
|
GffLine* gffline; |
991 |
|
|
CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) { |
992 |
|
|
parent=p; |
993 |
|
|
exon=e; |
994 |
|
|
idx=i; |
995 |
|
|
gffline=new GffLine(gl); |
996 |
|
|
} |
997 |
|
|
~CNonExon() { |
998 |
|
|
delete gffline; |
999 |
|
|
} |
1000 |
|
|
}; |
1001 |
|
|
|
1002 |
|
|
|
1003 |
gpertea |
2 |
class GffReader { |
1004 |
|
|
friend class GffObj; |
1005 |
|
|
friend class GffLine; |
1006 |
|
|
char* linebuf; |
1007 |
|
|
off_t fpos; |
1008 |
|
|
int buflen; |
1009 |
|
|
protected: |
1010 |
gpertea |
16 |
bool gff_warns; //warn about duplicate IDs, etc. even when they are on different chromosomes |
1011 |
gpertea |
2 |
FILE* fh; |
1012 |
|
|
char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader |
1013 |
|
|
GffNames* names; //just a pointer to the global static Gff names repository in GffObj |
1014 |
|
|
GffLine* gffline; |
1015 |
gpertea |
16 |
bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features |
1016 |
|
|
GHash<int> discarded_ids; //for transcriptsOnly mode, keep track |
1017 |
|
|
// of discarded parent IDs |
1018 |
|
|
GHash<GfoHolder> phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj] |
1019 |
|
|
GHash<int> tids; //transcript_id uniqueness |
1020 |
gpertea |
2 |
char* gfoBuildId(const char* id, const char* ctg); |
1021 |
|
|
void gfoRemove(const char* id, const char* ctg); |
1022 |
gpertea |
16 |
GfoHolder* gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx); |
1023 |
|
|
GfoHolder* gfoFind(const char* id, const char* ctg); |
1024 |
|
|
CNonExon* subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name); |
1025 |
|
|
void subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo); |
1026 |
|
|
GfoHolder* promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex, |
1027 |
|
|
bool keepAttr, bool noExonAttr); |
1028 |
gpertea |
2 |
public: |
1029 |
gpertea |
16 |
GfList gflst; //accumulate GffObjs being read |
1030 |
|
|
GfoHolder* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, |
1031 |
|
|
GffObj* parent=NULL, GffExon* pexon=NULL); |
1032 |
|
|
GfoHolder* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx); |
1033 |
|
|
GfoHolder* updateGffRec(GfoHolder* prevgfo, GffLine* gffline, |
1034 |
|
|
bool keepAttr); |
1035 |
|
|
GfoHolder* updateParent(GfoHolder* newgfh, GffObj* parent); |
1036 |
|
|
bool addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr); |
1037 |
gpertea |
2 |
GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats |
1038 |
gpertea |
16 |
GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true), |
1039 |
|
|
phash(true), tids(true), gflst(sortbyloc), gseqstats(true,true,true) { |
1040 |
|
|
gff_warns=gff_show_warnings; |
1041 |
gpertea |
2 |
names=NULL; |
1042 |
|
|
gffline=NULL; |
1043 |
gpertea |
16 |
transcriptsOnly=t_only; |
1044 |
gpertea |
2 |
fpos=0; |
1045 |
|
|
fname=NULL; |
1046 |
|
|
fh=f; |
1047 |
|
|
GMALLOC(linebuf, GFF_LINELEN); |
1048 |
|
|
buflen=GFF_LINELEN-1; |
1049 |
|
|
} |
1050 |
gpertea |
16 |
void init(FILE *f, bool t_only=false, bool sortbyloc=false) { |
1051 |
|
|
fname=NULL; |
1052 |
|
|
fh=f; |
1053 |
|
|
if (fh!=NULL) rewind(fh); |
1054 |
|
|
fpos=0; |
1055 |
|
|
transcriptsOnly=t_only; |
1056 |
|
|
gflst.sortedByLoc(sortbyloc); |
1057 |
|
|
} |
1058 |
|
|
GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true), |
1059 |
|
|
tids(true),gflst(sort),gseqstats(true,true,true) { |
1060 |
|
|
gff_warns=gff_show_warnings; |
1061 |
gpertea |
2 |
names=NULL; |
1062 |
|
|
fname=Gstrdup(fn); |
1063 |
gpertea |
16 |
transcriptsOnly=t_only; |
1064 |
gpertea |
2 |
fh=fopen(fname, "rb"); |
1065 |
|
|
fpos=0; |
1066 |
|
|
gffline=NULL; |
1067 |
|
|
GMALLOC(linebuf, GFF_LINELEN); |
1068 |
|
|
buflen=GFF_LINELEN-1; |
1069 |
|
|
} |
1070 |
|
|
|
1071 |
gpertea |
16 |
~GffReader() { |
1072 |
|
|
delete gffline; |
1073 |
gpertea |
2 |
gffline=NULL; |
1074 |
|
|
fpos=0; |
1075 |
gpertea |
16 |
gflst.freeUnused(); |
1076 |
gpertea |
2 |
gflst.Clear(); |
1077 |
gpertea |
16 |
discarded_ids.Clear(); |
1078 |
gpertea |
2 |
phash.Clear(); |
1079 |
|
|
gseqstats.Clear(); |
1080 |
|
|
GFREE(fname); |
1081 |
|
|
GFREE(linebuf); |
1082 |
|
|
} |
1083 |
|
|
|
1084 |
gpertea |
16 |
void showWarnings(bool v=true) { |
1085 |
|
|
gff_warns=v; |
1086 |
|
|
gff_show_warnings=v; |
1087 |
|
|
} |
1088 |
|
|
|
1089 |
gpertea |
2 |
GffLine* nextGffLine(); |
1090 |
|
|
|
1091 |
gpertea |
16 |
// load all subfeatures, re-group them: |
1092 |
|
|
void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); |
1093 |
gpertea |
2 |
|
1094 |
|
|
}; // end of GffReader |
1095 |
|
|
|
1096 |
|
|
#endif |