1 |
#ifndef GFF_H |
2 |
#define GFF_H |
3 |
|
4 |
#ifdef HAVE_CONFIG_H |
5 |
#include <config.h> |
6 |
#endif |
7 |
|
8 |
#include "GBase.h" |
9 |
#include "gdna.h" |
10 |
#include "codons.h" |
11 |
#include "GFaSeqGet.h" |
12 |
#include "GList.hh" |
13 |
#include "GHash.hh" |
14 |
|
15 |
/* |
16 |
const byte exMskMajSpliceL = 0x01; |
17 |
const byte exMskMajSpliceR = 0x02; |
18 |
const byte exMskMinSpliceL = 0x04; |
19 |
const byte exMskMinSpliceR = 0x08; |
20 |
const byte exMskTag = 0x80; |
21 |
*/ |
22 |
//reserved Gffnames::feats entries |
23 |
extern const int gff_fid_mRNA; |
24 |
extern const int gff_fid_exon; |
25 |
extern const int gff_fid_CDS; |
26 |
extern bool gff_warns; //show parser warnings |
27 |
|
28 |
extern const uint GFF_MAX_LOCUS; |
29 |
extern const uint GFF_MAX_EXON; |
30 |
extern const uint GFF_MAX_INTRON; |
31 |
|
32 |
#define GFF_LINELEN 2048 |
33 |
#define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" |
34 |
|
35 |
|
36 |
enum GffExonType { |
37 |
exgffNone=0, |
38 |
exgffStop, //from "stop_codon" feature |
39 |
exgffCDS, //from "CDS" feature |
40 |
exgffUTR, //from "UTR" feature |
41 |
exgffExon, //from "exon" feature |
42 |
}; |
43 |
|
44 |
class GffReader; |
45 |
|
46 |
class GffLine { |
47 |
public: |
48 |
char* line; |
49 |
char* gseqname; |
50 |
char* track; |
51 |
char* ftype; //feature name: mRNA/gene/exon/CDS |
52 |
uint fstart; |
53 |
uint fend; |
54 |
uint qstart; //overlap coords on query, if available |
55 |
uint qend; |
56 |
uint qlen; //query len, if given |
57 |
double score; |
58 |
char strand; |
59 |
bool skip; |
60 |
bool is_cds; //"cds" and "stop_codon" features |
61 |
bool is_exon; //"exon" and "utr" features |
62 |
char exontype; // gffExonType |
63 |
bool is_mrna; |
64 |
char phase; // '.' , '0', '1' or '2' |
65 |
char* gname; //gene_id or Name= value (or an otherwise parsed gene denominator |
66 |
//(for grouping isoforms) |
67 |
char* info; //the last, attributes' field, unparsed |
68 |
// |
69 |
char* Parent; // if a Parent=.. attribute was found |
70 |
char* ID; // if a ID=.. attribute was parsed |
71 |
GffLine(GffReader* reader, const char* l); //parse the line accordingly |
72 |
GffLine() { |
73 |
line=NULL; |
74 |
gseqname=NULL; |
75 |
track=NULL; |
76 |
ftype=NULL; |
77 |
fstart=0; |
78 |
fend=0; |
79 |
info=NULL; |
80 |
Parent=NULL; |
81 |
ID=NULL; |
82 |
gname=NULL; |
83 |
skip=true; |
84 |
qstart=0; |
85 |
qend=0; |
86 |
qlen=0; |
87 |
exontype=0; |
88 |
is_cds=false; |
89 |
is_mrna=false; |
90 |
is_exon=false; |
91 |
} |
92 |
~GffLine() { |
93 |
GFREE(line); |
94 |
GFREE(Parent); |
95 |
GFREE(ID); |
96 |
GFREE(gname); |
97 |
} |
98 |
}; |
99 |
|
100 |
class GffAttr { |
101 |
public: |
102 |
int attr_id; |
103 |
char* attr_val; |
104 |
GffAttr(int an_id, const char* av=NULL) { |
105 |
attr_id=an_id; |
106 |
attr_val=NULL; |
107 |
if (av!=NULL) { |
108 |
char* lastch = (char*)(av+strlen(av)-1); |
109 |
//remove spaces at the end: |
110 |
while (*lastch==' ' && lastch!=av) lastch--; |
111 |
lastch[1]=0; |
112 |
//practical use: if it doesn't have any spaces just strip those useless double quotes |
113 |
if (av[0]=='"' && strpbrk(av+1," ;")==NULL) { |
114 |
if (*lastch=='"') *lastch=0; |
115 |
attr_val=Gstrdup(av+1); |
116 |
} |
117 |
else attr_val=Gstrdup(av); |
118 |
} |
119 |
|
120 |
} |
121 |
~GffAttr() { |
122 |
GFREE(attr_val); |
123 |
} |
124 |
bool operator==(GffAttr& d){ |
125 |
return (this==&d); |
126 |
} |
127 |
bool operator>(GffAttr& d){ |
128 |
return (this>&d); |
129 |
} |
130 |
bool operator<(GffAttr& d){ |
131 |
return (this<&d); |
132 |
} |
133 |
|
134 |
}; |
135 |
|
136 |
class GffNameList; |
137 |
class GffNames; |
138 |
|
139 |
class GffNameInfo { |
140 |
friend class GffNameList; |
141 |
protected: |
142 |
//unsigned char shared; |
143 |
int idx; |
144 |
public: |
145 |
char* name; |
146 |
GffNameInfo() { name=NULL; idx=-1; } |
147 |
GffNameInfo(const char* n) { |
148 |
name=Gstrdup(n); |
149 |
} |
150 |
/* |
151 |
GffNameInfo(const char* n, bool strshared=false) { |
152 |
idx=-1; |
153 |
if (strshared) { |
154 |
shared=1; |
155 |
name=(char *)n; |
156 |
} |
157 |
else { |
158 |
shared=0; |
159 |
name = (n==NULL) ? NULL : Gstrdup(n); |
160 |
} |
161 |
} |
162 |
*/ |
163 |
|
164 |
~GffNameInfo() { |
165 |
//if (shared==0) |
166 |
GFREE(name); |
167 |
} |
168 |
|
169 |
bool operator==(GffNameInfo& d){ |
170 |
return (strcmp(this->name, d.name)==0); |
171 |
} |
172 |
bool operator>(GffNameInfo& d){ |
173 |
return (strcmp(this->name, d.name)>0); |
174 |
} |
175 |
bool operator<(GffNameInfo& d){ |
176 |
return (strcmp(this->name, d.name)<0); |
177 |
} |
178 |
}; |
179 |
|
180 |
/* |
181 |
int compareNameId(const pointer item1, const pointer item2) { |
182 |
GffNameInfo* ni1=(GffNameInfo*)item1; |
183 |
GffNameInfo* ni2=(GffNameInfo*)item2; |
184 |
return (ni1->id > ni2->id )? 1 : (ni1->id==ni2->id ? 0 : -1); |
185 |
} |
186 |
*/ |
187 |
|
188 |
class GffNameList:public GList<GffNameInfo> { |
189 |
friend class GffNameInfo; |
190 |
friend class GffNames; |
191 |
protected: |
192 |
GHash<GffNameInfo> byName;//hash with shared keys |
193 |
int idlast; //fList index of last added/reused name |
194 |
void addStatic(const char* tname) {// fast add |
195 |
GffNameInfo* f=new GffNameInfo(tname); |
196 |
idlast=this->Add(f); |
197 |
f->idx=idlast; |
198 |
byName.shkAdd(f->name,f); |
199 |
} |
200 |
public: |
201 |
GffNameList():GList<GffNameInfo>(false,true,true), byName(false) { |
202 |
idlast=-1; |
203 |
} |
204 |
char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; } |
205 |
int lastNameId() { return idlast; } |
206 |
char* getName(int nid) { //retrieve name by its ID |
207 |
if (nid<0 || nid>=fCount) |
208 |
GError("GffNameList Error: invalid index (%d)\n",nid); |
209 |
return fList[nid]->name; |
210 |
} |
211 |
|
212 |
int addName(const char* tname) {//returns or create an id for the given name |
213 |
//check idlast first, chances are it's the same feature name checked |
214 |
if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) |
215 |
return idlast; |
216 |
GffNameInfo* f=byName.Find(tname); |
217 |
int fidx=-1; |
218 |
if (f!=NULL) fidx=f->idx; |
219 |
else {//add new entry |
220 |
f=new GffNameInfo(tname); |
221 |
fidx=this->Add(f); |
222 |
f->idx=fidx; |
223 |
byName.shkAdd(f->name,f); |
224 |
} |
225 |
idlast=fidx; |
226 |
return fidx; |
227 |
} |
228 |
int getId(const char* tname) { //only returns a name id# if found |
229 |
GffNameInfo* f=byName.Find(tname); |
230 |
if (f==NULL) return -1; |
231 |
return f->idx; |
232 |
} |
233 |
int removeName(const char* tname) { |
234 |
GError("Error: removing names from GffNameList not allowed!\n"); |
235 |
return -1; |
236 |
} |
237 |
}; |
238 |
|
239 |
class GffNames { |
240 |
public: |
241 |
int numrefs; |
242 |
GffNameList tracks; |
243 |
GffNameList gseqs; |
244 |
GffNameList attrs; |
245 |
GffNameList feats; //feature names - anything except 'mRNA', 'exon', 'CDS' gets stored here |
246 |
GffNames():tracks(),gseqs(),attrs(), feats() { |
247 |
numrefs=0; |
248 |
//the order below is critical! |
249 |
//has to match: gff_fid_mRNA, gff_fid_exon, gff_fid_CDS |
250 |
feats.addStatic("mRNA");//index 0=gff_fid_mRNA |
251 |
feats.addStatic("exon");//index 1=gff_fid_exon |
252 |
feats.addStatic("CDS"); //index 2=gff_fid_CDS |
253 |
} |
254 |
}; |
255 |
|
256 |
void gffnames_ref(GffNames* &n); |
257 |
void gffnames_unref(GffNames* &n); |
258 |
|
259 |
enum GffPrintMode { |
260 |
pgtfAny, //print record as read |
261 |
pgtfExon, |
262 |
pgtfCDS, |
263 |
pgffAny, //print record as read |
264 |
pgffExon, |
265 |
pgffCDS, |
266 |
pgffBoth, |
267 |
}; |
268 |
|
269 |
|
270 |
class GffAttrs:public GList<GffAttr> { |
271 |
public: |
272 |
GffAttrs():GList<GffAttr>(false,true,false) { } |
273 |
char* getAttr(GffNames* names, const char* attrname) { |
274 |
int aid=names->attrs.getId(attrname); |
275 |
if (aid>=0) |
276 |
for (int i=0;i<Count();i++) |
277 |
if (aid==Get(i)->attr_id) return Get(i)->attr_val; |
278 |
return NULL; |
279 |
} |
280 |
}; |
281 |
|
282 |
|
283 |
class GffExon : public GSeg { |
284 |
public: |
285 |
void* uptr; //for later exensibility |
286 |
GffAttrs* attrs; //other attributes kept for this exon |
287 |
double score; // gff score column |
288 |
char phase; //GFF phase column - for CDS segments only |
289 |
// '.' = undefined (UTR), '0','1','2' for CDS exons |
290 |
char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon" |
291 |
int qstart; // for mRNA/protein exon mappings: coordinates on query |
292 |
int qend; |
293 |
GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) { |
294 |
uptr=NULL; |
295 |
attrs=NULL; |
296 |
if (s<e) { |
297 |
start=s; |
298 |
end=e; |
299 |
} |
300 |
else { |
301 |
start=e; |
302 |
end=s; |
303 |
} |
304 |
if (qs<qe) { |
305 |
qstart=qs; |
306 |
qend=qe; |
307 |
} else { |
308 |
qstart=qe; |
309 |
qend=qs; |
310 |
} |
311 |
score=sc; |
312 |
phase=fr; |
313 |
exontype=et; |
314 |
} //constructor |
315 |
|
316 |
char* getAttr(GffNames* names, const char* atrname) { |
317 |
if (attrs==NULL || names==NULL || atrname==NULL) return NULL; |
318 |
return attrs->getAttr(names, atrname); |
319 |
} |
320 |
|
321 |
~GffExon() { //destructor |
322 |
if (attrs!=NULL) delete attrs; |
323 |
} |
324 |
}; |
325 |
|
326 |
|
327 |
class GffCDSeg:public GSeg { |
328 |
public: |
329 |
char phase; |
330 |
int exonidx; |
331 |
}; |
332 |
//one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments |
333 |
class GffObj:public GSeg { |
334 |
//utility segment-merging function for addExon() |
335 |
void expandExon(int xovl, uint segstart, uint segend, |
336 |
char exontype, double sc, char fr, int qs, int qe); |
337 |
protected: |
338 |
//coordinate transformation data: |
339 |
uint xstart; //absolute genomic coordinates of reference region |
340 |
uint xend; |
341 |
char xstatus; //coordinate transform status: |
342 |
//0 : (start,end) coordinates are absolute |
343 |
//'+' : (start,end) coords are relative to xstart..xend region |
344 |
//'-' : (start,end) are relative to the reverse complement of xstart..xend region |
345 |
//-- |
346 |
char* gffID; // ID name for mRNA (parent) feature |
347 |
char* gname; // value of Name attribute (if given) |
348 |
//-- friends: |
349 |
friend class GffReader; |
350 |
friend class GffExon; |
351 |
public: |
352 |
bool hasErrors; //overlapping exons, or too short introns, etc. |
353 |
static GffNames* names; // common string storage that holds the various attribute names etc. |
354 |
int track_id; // index of track name in names->tracks |
355 |
int gseq_id; // index of genomic sequence name in names->gseqs |
356 |
int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value |
357 |
int subftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons") |
358 |
//if ftype_id==gff_fid_mRNA then this value is ignored |
359 |
GList<GffExon> exons; //for non-mRNA entries, these can be any subfeature of type subftype_id |
360 |
int udata; //user data, flags etc. |
361 |
void* uptr; //user pointer (to a parent object, cluster, locus etc.) |
362 |
GffObj* ulink; //link to another GffObj (user controlled field) |
363 |
// mRNA specific fields: |
364 |
bool isCDS; //just a CDS, no UTRs |
365 |
bool partial; //partial CDS |
366 |
uint CDstart; //CDS start coord |
367 |
uint CDend; //CDS end coord |
368 |
char CDphase; //initial phase for CDS start |
369 |
|
370 |
bool ismRNA() { return (ftype_id==gff_fid_mRNA); } |
371 |
|
372 |
int addExon(uint segstart, uint segend, double sc=0, char fr='.', |
373 |
int qs=0, int qe=0, bool iscds=false, char exontype=0); |
374 |
|
375 |
int addExon(GffLine* gl, bool keepAttr=false, bool noExonAttr=true); |
376 |
void removeExon(int idx); |
377 |
/* |
378 |
uint gstart; // global feature coordinates on genomic sequence |
379 |
uint gend; // ALWAYS gstart <= gend |
380 |
*/ |
381 |
char strand; //true if features are on the reverse complement strand |
382 |
double gscore; |
383 |
double uscore; //custom, user-computed score, if needed |
384 |
int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) |
385 |
|
386 |
//--------- optional data: |
387 |
int qlen; //query length, start, end - if available |
388 |
int qstart; |
389 |
int qend; |
390 |
int qcov; //query coverage - percent |
391 |
GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature |
392 |
//constructor by gff line parsing: |
393 |
GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true); |
394 |
//if gfline->Parent!=NULL then this will also add the first sub-feature |
395 |
// otherwise, only the main feature is created |
396 |
void clearAttrs() { |
397 |
if (attrs!=NULL) delete attrs; |
398 |
} |
399 |
GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,true) { //exons: sorted, free, unique |
400 |
gffID=NULL; |
401 |
uptr=NULL; |
402 |
ulink=NULL; |
403 |
udata=0; |
404 |
hasErrors=false; |
405 |
ftype_id=-1; |
406 |
subftype_id=-1; |
407 |
if (anid!=NULL) gffID=Gstrdup(anid); |
408 |
gffnames_ref(names); |
409 |
qlen=0; |
410 |
qstart=0; |
411 |
qend=0; |
412 |
qcov=0; |
413 |
partial=true; |
414 |
isCDS=false; |
415 |
CDstart=0; // hasCDS <=> CDstart>0 |
416 |
CDend=0; |
417 |
CDphase=0; |
418 |
gseq_id=-1; |
419 |
track_id=-1; |
420 |
/* |
421 |
gstart=0; |
422 |
gend=0; |
423 |
*/ |
424 |
xstart=0; |
425 |
xend=0; |
426 |
xstatus=0; |
427 |
strand=0; |
428 |
gscore=0; |
429 |
uscore=0; |
430 |
attrs=NULL; |
431 |
covlen=0; |
432 |
gname=NULL; |
433 |
} |
434 |
~GffObj() { |
435 |
GFREE(gffID); |
436 |
GFREE(gname); |
437 |
if (attrs!=NULL) delete attrs; |
438 |
gffnames_unref(names); |
439 |
} |
440 |
/* |
441 |
void addFeature(GffLine* gfline); |
442 |
int getFeatureId(int fnid); |
443 |
int getFeatureId(const char* fname); |
444 |
GFeature* hasFeature(char* parentID); |
445 |
void promote(GFeature* subf, GffLine* gfline); */ |
446 |
//-------------- |
447 |
GffObj* finalize(bool mergeCloseExons=false); //finalize parsing: must be called in order to merge adjacent/close proximity subfeatures |
448 |
void parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr=false); |
449 |
const char* getSubfName() { //returns the generic feature type of the entries in exons array |
450 |
int sid=subftype_id; |
451 |
if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS; |
452 |
return names->feats.getName(sid); |
453 |
} |
454 |
const char* getFeatureName() { |
455 |
return names->feats.getName(ftype_id); |
456 |
} |
457 |
void addAttr(const char* attrname, char* attrvalue); |
458 |
const char* getAttrName(int i) { |
459 |
if (attrs==NULL) return NULL; |
460 |
return names->attrs.getName(attrs->Get(i)->attr_id); |
461 |
} |
462 |
char* getAttr(const char* atrname) { |
463 |
if (attrs==NULL || names==NULL || atrname==NULL) return NULL; |
464 |
return attrs->getAttr(names, atrname); |
465 |
} |
466 |
|
467 |
char* getAttrValue(int i) { |
468 |
if (attrs==NULL) return NULL; |
469 |
return attrs->Get(i)->attr_val; |
470 |
} |
471 |
const char* getGSeqName() { |
472 |
return names->gseqs.getName(gseq_id); |
473 |
} |
474 |
const char* getTrackName() { |
475 |
return names->tracks.getName(track_id); |
476 |
} |
477 |
/* |
478 |
bool overlap(GffObj* mrna) { |
479 |
//basic overlap: just segment ends |
480 |
return overlap(*mrna); |
481 |
} |
482 |
bool overlap(GffObj& d) { |
483 |
//ignores strand and gseq_id -- the user must do that in advance |
484 |
// just rough locus overlap, exons may not overlap |
485 |
return (gstart<d.gstart ? (d.gstart<=gend) : (gstart<=d.gend)); |
486 |
} |
487 |
|
488 |
bool overlap(uint s, uint e) { |
489 |
if (s>e) swap(s,e); |
490 |
return (gstart<s ? (s<=gend) : (gstart<=e)); |
491 |
} |
492 |
*/ |
493 |
bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment |
494 |
//ignores strand! |
495 |
if (s>e) swap(s,e); |
496 |
for (int i=0;i<exons.Count();i++) { |
497 |
if (exons[i]->overlap(s,e)) return true; |
498 |
} |
499 |
return false; |
500 |
} |
501 |
bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment |
502 |
//if (gseq_id!=m.gseq_id) return false; |
503 |
// ignores strand and gseq_id, must check in advance |
504 |
for (int i=0;i<exons.Count();i++) { |
505 |
for (int j=0;j<m.exons.Count();j++) { |
506 |
if (exons[i]->start>m.exons[j]->end) continue; |
507 |
if (m.exons[j]->start>exons[i]->end) break; |
508 |
//-- overlap if we are here: |
509 |
// if (exons[i]->overlap(m.exons[j])) return true; |
510 |
return true; |
511 |
} |
512 |
} |
513 |
return false; |
514 |
} |
515 |
|
516 |
int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) { |
517 |
//return the exons' index for the overlapping exon |
518 |
//ovlen, if given, will return the overlap length |
519 |
if (s>e) swap(s,e); |
520 |
for (int i=0;i<exons.Count();i++) { |
521 |
if (exons[i]->start>e) break; |
522 |
if (s>exons[i]->end) continue; |
523 |
//-- overlap if we are here: |
524 |
if (ovlen!=NULL) { |
525 |
int ovlend= (exons[i]->end>e) ? e : exons[i]->end; |
526 |
*ovlen= ovlend - ((s>exons[i]->start)? s : exons[i]->start)+1; |
527 |
} |
528 |
return i; |
529 |
} //for each exon |
530 |
*ovlen=0; |
531 |
return -1; |
532 |
} |
533 |
|
534 |
int exonOverlapLen(GffObj& m) { |
535 |
if (start>m.end || m.start>end) return 0; |
536 |
int i=0; |
537 |
int j=0; |
538 |
int ovlen=0; |
539 |
while (i<exons.Count() && j<m.exons.Count()) { |
540 |
uint istart=exons[i]->start; |
541 |
uint iend=exons[i]->end; |
542 |
uint jstart=m.exons[j]->start; |
543 |
uint jend=m.exons[j]->end; |
544 |
if (istart>jend) { j++; continue; } |
545 |
if (jstart>iend) { i++; continue; } |
546 |
//exon overlap |
547 |
uint ovstart=GMAX(istart,jstart); |
548 |
if (iend<jend) { |
549 |
ovlen+=iend-ovstart+1; |
550 |
i++; |
551 |
} |
552 |
else { |
553 |
ovlen+=jend-ovstart+1; |
554 |
j++; |
555 |
} |
556 |
}//while comparing exons |
557 |
return ovlen; |
558 |
} |
559 |
|
560 |
bool exonOverlap(GffObj* m) { |
561 |
return exonOverlap(*m); |
562 |
} |
563 |
//---------- coordinate transformation |
564 |
void xcoord(uint grstart, uint grend, char xstrand='+') { |
565 |
//relative coordinate transform, and reverse-complement transform if xstrand is '-' |
566 |
//does nothing if xstatus is the same already |
567 |
if (xstatus) { |
568 |
if (xstatus==xstrand && grstart==xstart && grend==xend) return; |
569 |
unxcoord();//restore original coordinates |
570 |
} |
571 |
xstatus=xstrand; |
572 |
xstart=grstart; |
573 |
xend=grend; |
574 |
if (CDstart>0) xcoordseg(CDstart, CDend); |
575 |
for (int i=0;i<exons.Count();i++) { |
576 |
xcoordseg(exons[i]->start, exons[i]->end); |
577 |
} |
578 |
if (xstatus=='-') { |
579 |
exons.Reverse(); |
580 |
int flen=end-start; |
581 |
start=xend-end+1; |
582 |
end=start+flen; |
583 |
} |
584 |
else { |
585 |
start=start-xstart+1; |
586 |
end=end-xstart+1; |
587 |
} |
588 |
} |
589 |
|
590 |
//transform an arbitrary segment based on current xstatus/xstart-xend |
591 |
void xcoordseg(uint& segstart, uint &segend) { |
592 |
if (xstatus==0) return; |
593 |
if (xstatus=='-') { |
594 |
int flen=segend-segstart; |
595 |
segstart=xend-segend+1; |
596 |
segend=segstart+flen; |
597 |
return; |
598 |
} |
599 |
else { |
600 |
segstart=segstart-xstart+1; |
601 |
segend=segend-xstart+1; |
602 |
} |
603 |
} |
604 |
|
605 |
void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true |
606 |
if (xstatus==0) return; //nothing to do, no transformation appplied |
607 |
if (CDstart>0) unxcoordseg(CDstart, CDend); |
608 |
//restore all GffExon intervals too |
609 |
for (int i=0;i<exons.Count();i++) { |
610 |
unxcoordseg(exons[i]->start, exons[i]->end); |
611 |
} |
612 |
if (xstatus=='-') { |
613 |
exons.Reverse(); |
614 |
int flen=end-start; |
615 |
start=xend-end+1; |
616 |
end=start+flen; |
617 |
} |
618 |
else { |
619 |
start=start+xstart-1; |
620 |
end=end+xstart-1; |
621 |
} |
622 |
xstatus=0; |
623 |
} |
624 |
void unxcoordseg(uint& astart, uint &aend) { |
625 |
//restore an arbitrary interval -- does NOT change the transform state! |
626 |
if (xstatus==0) return; |
627 |
if (xstatus=='-') { |
628 |
int flen=aend-astart; |
629 |
astart=xend-aend+1; |
630 |
aend=astart+flen; |
631 |
} |
632 |
else { |
633 |
astart=astart+xstart-1; |
634 |
aend=aend+xstart-1; |
635 |
} |
636 |
} |
637 |
//--------------------- |
638 |
bool operator==(GffObj& d){ |
639 |
return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0); |
640 |
} |
641 |
bool operator>(GffObj& d){ |
642 |
if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); |
643 |
if (start==d.start) { |
644 |
if (end==d.end) return strcmp(gffID, d.gffID)>0; |
645 |
else return end>d.end; |
646 |
} else return (start>d.start); |
647 |
} |
648 |
bool operator<(GffObj& d){ |
649 |
if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id); |
650 |
if (start==d.start) { |
651 |
if (end==d.end) return strcmp(gffID, d.gffID)<0; |
652 |
else return end<d.end; |
653 |
} else return (start<d.start); |
654 |
} |
655 |
char* getID() { return gffID; } |
656 |
char* getGene() { return gname; } |
657 |
//void calcScore(); |
658 |
/*int exonCount() { return exoncount; } |
659 |
GffExon* getExonSegs() { return exons; } |
660 |
int cdsCount() { return cdscount; } |
661 |
GffExon* getCDSegs() { return cds; } */ |
662 |
|
663 |
/* bool validateMapping(int qrylen, bool pmap_parsing, int minpid, |
664 |
int mincov, int maxintron);*/ |
665 |
/* int addSeg(char* feat, int nstart, int nend, int sc=0, char fr='.', |
666 |
char* tid=NULL, char* tinfo=NULL);*/ |
667 |
int addSeg(GffLine* gfline); |
668 |
int addSeg(int fnid, GffLine* gfline); |
669 |
// (int fnid, char* feat, int nstart, int nend, int sc=0, |
670 |
// char fr='.', char* tid=NULL, char* tinfo=NULL); |
671 |
void getCDSegs(GArray<GffCDSeg>& cds); |
672 |
void printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds, |
673 |
uint segstart, uint segend, int exidx, char phase, bool gff3); |
674 |
void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, char* tlabel=NULL); |
675 |
void printGtf(FILE* fout, char* tlabel=NULL) { |
676 |
printGxf(fout, pgtfAny, tlabel); |
677 |
} |
678 |
void printGff(FILE* fout, char* tlabel=NULL) { |
679 |
printGxf(fout, pgffAny, tlabel); |
680 |
} |
681 |
void print_mRNAGff(FILE* fout, char* tlabel=NULL, bool showCDS=false) { |
682 |
if (ismRNA()) |
683 |
printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel); |
684 |
} |
685 |
void printSummary(FILE* fout=NULL); |
686 |
void getCDS_ends(uint& cds_start, uint& cds_end); |
687 |
void mRNA_CDS_coords(uint& cds_start, uint& cds_end); |
688 |
char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, |
689 |
uint* cds_start=NULL, uint* cds_end=NULL, GList<GSeg>* seglst=NULL); |
690 |
char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL); |
691 |
//bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? |
692 |
bool empty() { return (start==0); } |
693 |
}; |
694 |
|
695 |
//int cmpGMapScore(const pointer a, const pointer b); |
696 |
|
697 |
typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); |
698 |
//user callback after parsing a mapping object: |
699 |
// Returns: "done with it" status: |
700 |
// TRUE if gobj is no longer needed so it's FREEd upon return |
701 |
// FALSE if the user needs the gobj pointer and is responsible for |
702 |
// collecting and freeing all GffObj objects |
703 |
|
704 |
|
705 |
//GSeqStat: collect basic stats about a common underlying genomic sequence |
706 |
// for multiple GffObj |
707 |
class GSeqStat { |
708 |
public: |
709 |
int gseqid; //gseq id in the global static pool of gseqs |
710 |
char* gseqname; //just a pointer to the name of gseq |
711 |
//int fcount;//number of features on this gseq |
712 |
uint mincoord; |
713 |
uint maxcoord; |
714 |
GList<GffObj> gflst; |
715 |
GSeqStat(int id=-1, char* name=NULL):gflst(true,false,false) { |
716 |
gseqid=id; |
717 |
gseqname=name; |
718 |
mincoord=MAXUINT; |
719 |
maxcoord=0; |
720 |
} |
721 |
bool operator>(GSeqStat& g) { |
722 |
return (gseqid>g.gseqid); |
723 |
} |
724 |
bool operator<(GSeqStat& g) { |
725 |
return (gseqid<g.gseqid); |
726 |
} |
727 |
bool operator==(GSeqStat& g) { |
728 |
return (gseqid==g.gseqid); |
729 |
} |
730 |
}; |
731 |
|
732 |
|
733 |
int gfo_cmpByLoc(const pointer p1, const pointer p2); |
734 |
|
735 |
class GfList: public GList<GffObj> { |
736 |
//just adding the option to sort by genomic sequence and coordinate |
737 |
public: |
738 |
GfList(bool sortbyloc=false):GList<GffObj>(false,false,false) { |
739 |
if (sortbyloc) this->setSorted((GCompareProc*)gfo_cmpByLoc); |
740 |
} |
741 |
}; |
742 |
|
743 |
class GffReader { |
744 |
friend class GffObj; |
745 |
friend class GffLine; |
746 |
char* linebuf; |
747 |
off_t fpos; |
748 |
int buflen; |
749 |
protected: |
750 |
FILE* fh; |
751 |
char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader |
752 |
GffNames* names; //just a pointer to the global static Gff names repository in GffObj |
753 |
GffLine* gffline; |
754 |
bool mrnaOnly; //read only mRNAs ? (exon/CDS features only) |
755 |
bool sortbyloc; //sort by location: genomic sequence and start coordinate |
756 |
GHash<GffObj> phash; //transcript_id (Parent~Contig) => GffObj pointer |
757 |
char* gfoBuildId(const char* id, const char* ctg); |
758 |
void gfoRemove(const char* id, const char* ctg); |
759 |
void gfoAdd(const char* id, const char* ctg, GffObj* gfo); |
760 |
GffObj* gfoFind(const char* id, const char* ctg); |
761 |
public: |
762 |
GfList gflst; //all read gflst |
763 |
GList<GSeqStat> gseqstats; //list of all genomic sequences seen by this reader, accumulates stats |
764 |
GffReader(FILE* f, bool justmrna=false, bool sort=false):phash(false),gflst(sort), gseqstats(true,true,true) { |
765 |
names=NULL; |
766 |
gffline=NULL; |
767 |
mrnaOnly=justmrna; |
768 |
sortbyloc=sort; |
769 |
fpos=0; |
770 |
fname=NULL; |
771 |
fh=f; |
772 |
GMALLOC(linebuf, GFF_LINELEN); |
773 |
buflen=GFF_LINELEN-1; |
774 |
} |
775 |
GffReader(char* fn, bool justmrna=false, bool sort=false):phash(false),gflst(sort) { |
776 |
names=NULL; |
777 |
fname=Gstrdup(fn); |
778 |
mrnaOnly=justmrna; |
779 |
sortbyloc=sort; |
780 |
fh=fopen(fname, "rb"); |
781 |
fpos=0; |
782 |
gffline=NULL; |
783 |
GMALLOC(linebuf, GFF_LINELEN); |
784 |
buflen=GFF_LINELEN-1; |
785 |
} |
786 |
|
787 |
virtual ~GffReader() { |
788 |
gffline=NULL; |
789 |
fpos=0; |
790 |
delete gffline; |
791 |
gflst.Clear(); |
792 |
phash.Clear(); |
793 |
gseqstats.Clear(); |
794 |
GFREE(fname); |
795 |
GFREE(linebuf); |
796 |
} |
797 |
|
798 |
GffLine* nextGffLine(); |
799 |
|
800 |
// parse -> block parsing functions -- do not use, |
801 |
// they always assume that subfeatures (exons) are grouped together by parent |
802 |
GffObj* parse(bool keepAttr=false, bool noExonAttr=true); |
803 |
void parseAll(GffRecFunc* gproc, bool keepAttr=false, bool noExonAttr=true, void* userptr1=NULL, void* userptr2=NULL); |
804 |
|
805 |
// use this instead of parse: load all subfeatures, re-group them in memory: |
806 |
void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); //just reads all gff records into gflst GList |
807 |
|
808 |
}; // end of GffReader |
809 |
|
810 |
#endif |