1 |
gpertea |
2 |
#include "gff.h" |
2 |
|
|
|
3 |
|
|
//GffNames* GffReader::names=NULL; |
4 |
|
|
GffNames* GffObj::names=NULL; |
5 |
|
|
//global set of feature names, attribute names etc. |
6 |
|
|
// -- common for all GffObjs in current application! |
7 |
|
|
|
8 |
gpertea |
16 |
const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M |
9 |
|
|
const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K |
10 |
gpertea |
115 |
const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron |
11 |
gpertea |
16 |
bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings() |
12 |
gpertea |
2 |
const int gff_fid_mRNA=0; |
13 |
gpertea |
16 |
const int gff_fid_transcript=1; |
14 |
|
|
const int gff_fid_exon=2; |
15 |
|
|
const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id |
16 |
|
|
const uint gfo_flag_HAS_ERRORS = 0x00000001; |
17 |
|
|
const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002; |
18 |
|
|
const uint gfo_flag_IS_GENE = 0x00000004; |
19 |
|
|
const uint gfo_flag_IS_TRANSCRIPT = 0x00000008; |
20 |
|
|
const uint gfo_flag_FROM_GFF3 = 0x00000010; |
21 |
|
|
const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly |
22 |
|
|
const uint gfo_flag_DISCARDED = 0x00000100; |
23 |
|
|
const uint gfo_flag_LST_KEEP = 0x00000200; |
24 |
|
|
const uint gfo_flag_LEVEL_MSK = 0x00FF0000; |
25 |
|
|
const byte gfo_flagShift_LEVEL = 16; |
26 |
gpertea |
2 |
|
27 |
|
|
void gffnames_ref(GffNames* &n) { |
28 |
|
|
if (n==NULL) n=new GffNames(); |
29 |
|
|
n->numrefs++; |
30 |
|
|
} |
31 |
|
|
|
32 |
|
|
void gffnames_unref(GffNames* &n) { |
33 |
|
|
if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n"); |
34 |
|
|
n->numrefs--; |
35 |
|
|
if (n->numrefs==0) { delete n; n=NULL; } |
36 |
|
|
} |
37 |
|
|
|
38 |
|
|
int gfo_cmpByLoc(const pointer p1, const pointer p2) { |
39 |
|
|
|
40 |
|
|
GffObj& g1=*((GffObj*)p1); |
41 |
|
|
GffObj& g2=*((GffObj*)p2); |
42 |
gpertea |
16 |
if (g1.gseq_id==g2.gseq_id) { |
43 |
|
|
if (g1.start!=g2.start) |
44 |
|
|
return (int)(g1.start-g2.start); |
45 |
|
|
else if (g1.getLevel()!=g2.getLevel()) |
46 |
|
|
return (int)(g1.getLevel()-g2.getLevel()); |
47 |
|
|
else |
48 |
|
|
if (g1.end!=g2.end) |
49 |
|
|
return (int)(g1.end-g2.end); |
50 |
|
|
else return strcmp(g1.getID(), g2.getID()); |
51 |
|
|
} |
52 |
|
|
else return (int)(g1.gseq_id-g2.gseq_id); |
53 |
gpertea |
2 |
} |
54 |
|
|
|
55 |
gpertea |
150 |
char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) { |
56 |
gpertea |
16 |
//parse a key attribute and remove it from the info string |
57 |
|
|
//(only works for attributes that have values following them after ' ' or '=') |
58 |
|
|
static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n"; |
59 |
gpertea |
150 |
int attrlen=strlen(attr); |
60 |
|
|
char cend=attr[attrlen-1]; |
61 |
|
|
//char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr); |
62 |
|
|
//must make sure attr is not found in quoted text |
63 |
|
|
char* pos=info; |
64 |
|
|
char prevch=0; |
65 |
|
|
bool in_str=false; |
66 |
|
|
bool notfound=true; |
67 |
|
|
int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp; |
68 |
|
|
while (notfound && *pos) { |
69 |
|
|
char ch=*pos; |
70 |
|
|
if (ch=='"') { |
71 |
|
|
in_str=!in_str; |
72 |
|
|
pos++; |
73 |
|
|
prevch=ch; |
74 |
|
|
continue; |
75 |
|
|
} |
76 |
|
|
if (!in_str && (prevch==0 || prevch==' ' || prevch == ';') |
77 |
|
|
&& strcmpfn(attr, pos, attrlen)==0) { |
78 |
|
|
//attr match found |
79 |
|
|
//check for word boundary on right |
80 |
|
|
char* epos=pos+attrlen; |
81 |
|
|
if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') { |
82 |
|
|
notfound=false; |
83 |
|
|
break; |
84 |
|
|
} |
85 |
|
|
//not a perfect match, move on |
86 |
|
|
pos=epos; |
87 |
|
|
prevch=*(pos-1); |
88 |
|
|
continue; |
89 |
|
|
} |
90 |
|
|
//not a match or in_str |
91 |
|
|
prevch=ch; |
92 |
|
|
pos++; |
93 |
|
|
} |
94 |
|
|
if (notfound) return NULL; |
95 |
|
|
char* vp=pos+attrlen; |
96 |
gpertea |
16 |
while (*vp==' ') vp++; |
97 |
|
|
if (*vp==';' || *vp==0) |
98 |
gpertea |
150 |
GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline); |
99 |
gpertea |
16 |
bool dq_enclosed=false; //value string enclosed by double quotes |
100 |
|
|
if (*vp=='"') { |
101 |
|
|
dq_enclosed=true; |
102 |
|
|
vp++; |
103 |
|
|
} |
104 |
|
|
if (enforce_GTF2 && !dq_enclosed) |
105 |
gpertea |
150 |
GError(GTF2_ERR,attr, dupline); |
106 |
gpertea |
16 |
char* vend=vp; |
107 |
|
|
if (dq_enclosed) { |
108 |
|
|
while (*vend!='"' && *vend!=';' && *vend!=0) vend++; |
109 |
|
|
} |
110 |
|
|
else { |
111 |
|
|
while (*vend!=';' && *vend!=0) vend++; |
112 |
|
|
} |
113 |
|
|
if (enforce_GTF2 && *vend!='"') |
114 |
gpertea |
150 |
GError(GTF2_ERR, attr, dupline); |
115 |
gpertea |
16 |
char *r=Gstrdup(vp, vend-1); |
116 |
|
|
//-- now remove this attribute from the info string |
117 |
|
|
while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++; |
118 |
|
|
if (*vend==0) vend--; |
119 |
|
|
for (char *src=vend, *dest=pos;;src++,dest++) { |
120 |
|
|
*dest=*src; |
121 |
|
|
if (*src==0) break; |
122 |
|
|
} |
123 |
|
|
return r; |
124 |
|
|
} |
125 |
gpertea |
2 |
|
126 |
gpertea |
16 |
static char fnamelc[128]; |
127 |
|
|
|
128 |
gpertea |
2 |
GffLine::GffLine(GffReader* reader, const char* l) { |
129 |
gpertea |
16 |
llen=strlen(l); |
130 |
|
|
GMALLOC(line,llen+1); |
131 |
|
|
memcpy(line, l, llen+1); |
132 |
|
|
GMALLOC(dupline, llen+1); |
133 |
|
|
memcpy(dupline, l, llen+1); |
134 |
gpertea |
2 |
skip=true; |
135 |
|
|
gseqname=NULL; |
136 |
|
|
track=NULL; |
137 |
|
|
ftype=NULL; |
138 |
|
|
info=NULL; |
139 |
gpertea |
16 |
_parents=NULL; |
140 |
|
|
_parents_len=0; |
141 |
|
|
num_parents=0; |
142 |
|
|
parents=NULL; |
143 |
|
|
is_gff3=false; |
144 |
gpertea |
2 |
is_cds=false; |
145 |
gpertea |
16 |
is_transcript=false; |
146 |
gpertea |
2 |
is_exon=false; |
147 |
gpertea |
16 |
is_gene=false; |
148 |
gpertea |
2 |
exontype=0; |
149 |
gpertea |
16 |
gene_id=NULL; |
150 |
|
|
gene_name=NULL; |
151 |
gpertea |
2 |
qstart=0; |
152 |
|
|
qend=0; |
153 |
|
|
qlen=0; |
154 |
|
|
ID=NULL; |
155 |
|
|
char* t[9]; |
156 |
|
|
int i=0; |
157 |
|
|
int tidx=1; |
158 |
|
|
t[0]=line; |
159 |
|
|
|
160 |
|
|
while (line[i]!=0) { |
161 |
|
|
if (line[i]=='\t') { |
162 |
|
|
line[i]=0; |
163 |
|
|
t[tidx]=line+i+1; |
164 |
|
|
tidx++; |
165 |
|
|
if (tidx>8) break; |
166 |
|
|
} |
167 |
|
|
i++; |
168 |
|
|
} |
169 |
|
|
|
170 |
|
|
if (tidx<8) { // ignore non-GFF lines |
171 |
|
|
// GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l); |
172 |
|
|
return; |
173 |
|
|
} |
174 |
|
|
gseqname=t[0]; |
175 |
|
|
track=t[1]; |
176 |
|
|
ftype=t[2]; |
177 |
|
|
info=t[8]; |
178 |
|
|
char* p=t[3]; |
179 |
gpertea |
66 |
if (!parseUInt(p,fstart)) { |
180 |
|
|
//FIXME: chromosome_band entries in Flybase |
181 |
|
|
GMessage("Warning: invalid start coordinate at line:\n%s\n",l); |
182 |
|
|
return; |
183 |
|
|
} |
184 |
gpertea |
2 |
p=t[4]; |
185 |
gpertea |
66 |
if (!parseUInt(p,fend)) { |
186 |
|
|
GMessage("Warning: invalid end coordinate at line:\n%s\n",l); |
187 |
|
|
return; |
188 |
|
|
} |
189 |
gpertea |
144 |
if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always |
190 |
gpertea |
2 |
p=t[5]; |
191 |
|
|
if (p[0]=='.' && p[1]==0) { |
192 |
|
|
score=0; |
193 |
|
|
} |
194 |
|
|
else { |
195 |
|
|
if (!parseDouble(p,score)) |
196 |
|
|
GError("Error parsing feature score from GFF line:\n%s\n",l); |
197 |
|
|
} |
198 |
|
|
strand=*t[6]; |
199 |
|
|
if (strand!='+' && strand!='-' && strand!='.') |
200 |
|
|
GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l); |
201 |
gpertea |
16 |
phase=*t[7]; // must be '.', '0', '1' or '2' |
202 |
gpertea |
2 |
ID=NULL; |
203 |
|
|
// exon/CDS/mrna filter |
204 |
gpertea |
16 |
strncpy(fnamelc, ftype, 127); |
205 |
|
|
fnamelc[127]=0; |
206 |
gpertea |
2 |
strlower(fnamelc); //convert to lower case |
207 |
gpertea |
16 |
bool is_t_data=false; |
208 |
gpertea |
2 |
if (strstr(fnamelc, "utr")!=NULL) { |
209 |
|
|
exontype=exgffUTR; |
210 |
|
|
is_exon=true; |
211 |
gpertea |
16 |
is_t_data=true; |
212 |
gpertea |
2 |
} |
213 |
gpertea |
66 |
else if (endsWith(fnamelc, "exon")) { |
214 |
gpertea |
2 |
exontype=exgffExon; |
215 |
|
|
is_exon=true; |
216 |
gpertea |
16 |
is_t_data=true; |
217 |
gpertea |
2 |
} |
218 |
gpertea |
16 |
else if (strstr(fnamelc, "stop") && |
219 |
|
|
(strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){ |
220 |
gpertea |
2 |
exontype=exgffStop; |
221 |
gpertea |
16 |
is_cds=true; //though some place it outside the last CDS segment |
222 |
|
|
is_t_data=true; |
223 |
|
|
} |
224 |
|
|
else if (strstr(fnamelc, "start") && |
225 |
|
|
((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){ |
226 |
|
|
exontype=exgffStart; |
227 |
gpertea |
2 |
is_cds=true; |
228 |
gpertea |
16 |
is_t_data=true; |
229 |
gpertea |
2 |
} |
230 |
|
|
else if (strcmp(fnamelc, "cds")==0) { |
231 |
|
|
exontype=exgffCDS; |
232 |
|
|
is_cds=true; |
233 |
gpertea |
16 |
is_t_data=true; |
234 |
gpertea |
2 |
} |
235 |
gpertea |
16 |
else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) { |
236 |
|
|
is_gene=true; |
237 |
|
|
is_t_data=true; //because its name will be attached to parented transcripts |
238 |
gpertea |
2 |
} |
239 |
gpertea |
16 |
else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) { |
240 |
|
|
is_transcript=true; |
241 |
|
|
is_t_data=true; |
242 |
|
|
} |
243 |
gpertea |
2 |
|
244 |
gpertea |
16 |
if (reader->transcriptsOnly && !is_t_data) { |
245 |
|
|
char* id=extractAttr("ID="); |
246 |
|
|
if (id==NULL) id=extractAttr("transcript_id"); |
247 |
|
|
//GMessage("Discarding non-transcript line:\n%s\n",l); |
248 |
|
|
if (id!=NULL) { |
249 |
|
|
reader->discarded_ids.Add(id, new int(1)); |
250 |
|
|
GFREE(id); |
251 |
|
|
} |
252 |
|
|
return; //skip this line, unwanted feature name |
253 |
gpertea |
2 |
} |
254 |
gpertea |
150 |
ID=extractAttr("ID=",true); |
255 |
|
|
char* Parent=extractAttr("Parent=",true); |
256 |
gpertea |
16 |
is_gff3=(ID!=NULL || Parent!=NULL); |
257 |
|
|
if (is_gff3) { |
258 |
|
|
//parse as GFF3 |
259 |
|
|
if (ID!=NULL) { |
260 |
|
|
//has ID attr so it's likely to be a parent feature |
261 |
|
|
//look for explicit gene name |
262 |
gpertea |
150 |
gene_name=extractAttr("gene_name="); |
263 |
gpertea |
16 |
if (gene_name==NULL) { |
264 |
gpertea |
150 |
gene_name=extractAttr("geneName="); |
265 |
gpertea |
16 |
if (gene_name==NULL) { |
266 |
gpertea |
150 |
gene_name=extractAttr("gene_sym="); |
267 |
gpertea |
16 |
if (gene_name==NULL) { |
268 |
gpertea |
150 |
gene_name=extractAttr("gene="); |
269 |
gpertea |
16 |
} |
270 |
|
|
} |
271 |
|
|
} |
272 |
gpertea |
150 |
gene_id=extractAttr("geneID="); |
273 |
gpertea |
16 |
if (gene_id==NULL) { |
274 |
gpertea |
150 |
gene_id=extractAttr("gene_id="); |
275 |
gpertea |
16 |
} |
276 |
|
|
if (is_gene) { |
277 |
|
|
//special case: keep the Name and ID attributes of the gene feature |
278 |
|
|
if (gene_name==NULL) |
279 |
|
|
gene_name=extractAttr("Name="); |
280 |
|
|
if (gene_id==NULL) //the ID is also gene_id in this case |
281 |
|
|
gene_id=Gstrdup(ID); |
282 |
|
|
//skip=false; |
283 |
|
|
//return; |
284 |
|
|
GFREE(Parent); //TMI, we really don't care about gene Parents? |
285 |
|
|
} //gene feature |
286 |
|
|
}// has GFF3 ID |
287 |
|
|
if (Parent!=NULL) { |
288 |
|
|
//keep Parent attr |
289 |
|
|
//parse multiple parents |
290 |
|
|
num_parents=1; |
291 |
|
|
p=Parent; |
292 |
|
|
int last_delim_pos=-1; |
293 |
|
|
while (*p!=';' && *p!=0) { |
294 |
|
|
if (*p==',' && *(p+1)!=0 && *(p+1)!=';') { |
295 |
|
|
num_parents++; |
296 |
|
|
last_delim_pos=(p-Parent); |
297 |
|
|
} |
298 |
|
|
p++; |
299 |
|
|
} |
300 |
|
|
_parents_len=p-Parent+1; |
301 |
|
|
_parents=Parent; |
302 |
|
|
GMALLOC(parents, num_parents*sizeof(char*)); |
303 |
|
|
parents[0]=_parents; |
304 |
|
|
int i=1; |
305 |
|
|
if (last_delim_pos>0) { |
306 |
|
|
for (p=_parents+1;p<=_parents+last_delim_pos;p++) { |
307 |
|
|
if (*p==',') { |
308 |
|
|
char* ep=p-1; |
309 |
|
|
while (*ep==' ' && ep>_parents) ep--; |
310 |
|
|
*(ep+1)=0; //end the string there |
311 |
|
|
parents[i]=p+1; |
312 |
|
|
i++; |
313 |
|
|
} |
314 |
|
|
} |
315 |
|
|
} |
316 |
|
|
} //has Parent field |
317 |
|
|
} //GFF3 |
318 |
|
|
else { // GTF-like expected |
319 |
gpertea |
150 |
Parent=extractAttr("transcript_id",true); |
320 |
gpertea |
16 |
if (Parent!=NULL) { //GTF2 format detected |
321 |
|
|
if (is_transcript) { |
322 |
|
|
// atypical GTF with a parent transcript line declared |
323 |
|
|
ID=Parent; |
324 |
|
|
Parent=NULL; |
325 |
|
|
} |
326 |
|
|
gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID |
327 |
gpertea |
150 |
if (gene_id==NULL) |
328 |
|
|
gene_id=extractAttr("geneid"); |
329 |
gpertea |
16 |
gene_name=extractAttr("gene_name"); |
330 |
|
|
if (gene_name==NULL) { |
331 |
gpertea |
150 |
|
332 |
gpertea |
16 |
gene_name=extractAttr("gene_sym"); |
333 |
gpertea |
150 |
if (gene_name==NULL) { |
334 |
gpertea |
16 |
gene_name=extractAttr("gene"); |
335 |
gpertea |
150 |
if (gene_name==NULL) |
336 |
|
|
gene_name=extractAttr("genesymbol"); |
337 |
|
|
} |
338 |
gpertea |
16 |
} |
339 |
gpertea |
2 |
//prepare for parseAttr by adding '=' character instead of spaces for all attributes |
340 |
|
|
//after the attribute name |
341 |
|
|
p=info; |
342 |
|
|
bool noed=true; //not edited after the last delim |
343 |
|
|
bool nsp=false; //non-space found after last delim |
344 |
|
|
while (*p!=0) { |
345 |
gpertea |
16 |
if (*p==' ') { |
346 |
|
|
if (nsp && noed) { |
347 |
|
|
*p='='; |
348 |
|
|
noed=false; |
349 |
|
|
p++; |
350 |
|
|
continue; |
351 |
|
|
} |
352 |
|
|
} |
353 |
|
|
else nsp=true; //non-space |
354 |
|
|
if (*p==';') { noed=true; nsp=false; } |
355 |
|
|
p++; |
356 |
|
|
} |
357 |
|
|
} //GTF2 detected (no parent line) |
358 |
|
|
else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format |
359 |
|
|
//char* fexon=strstr(fnamelc, "exon"); |
360 |
|
|
//if (fexon!=NULL) { |
361 |
|
|
if (exontype==exgffExon) { |
362 |
gpertea |
2 |
if (startsWith(track,"jigsaw")) { |
363 |
gpertea |
16 |
is_cds=true; |
364 |
|
|
strcpy(track,"jigsaw"); |
365 |
|
|
p=strchr(info,';'); |
366 |
|
|
if (p==NULL) { Parent=Gstrdup(info); info=NULL; } |
367 |
|
|
else { Parent=Gstrdup(info,p-1); |
368 |
|
|
info=p+1; |
369 |
|
|
} |
370 |
|
|
} |
371 |
|
|
} //exon feature? |
372 |
|
|
if (Parent==NULL && exontype>=exgffCDS && |
373 |
|
|
(i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) { |
374 |
|
|
//one word ID ? really desperate attempt to parse it here |
375 |
gpertea |
2 |
Parent=Gstrdup(info,info+i-1); |
376 |
gpertea |
16 |
info=NULL; //discard anything else on the line |
377 |
|
|
} |
378 |
|
|
} |
379 |
|
|
if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature |
380 |
|
|
_parents=Parent; |
381 |
|
|
GMALLOC(parents,sizeof(char*)); |
382 |
|
|
num_parents=1; |
383 |
|
|
parents[0]=_parents; |
384 |
|
|
} |
385 |
|
|
} //GTF-like |
386 |
gpertea |
2 |
|
387 |
gpertea |
16 |
//parse other potentially useful features |
388 |
|
|
if (is_gff3) { |
389 |
|
|
if ((p=strstr(info,"Target="))!=NULL) { //has Target attr |
390 |
|
|
p+=7; |
391 |
|
|
while (*p!=';' && *p!=0 && *p!=' ') p++; |
392 |
|
|
if (*p!=' ') { |
393 |
|
|
GError("Error parsing target coordinates from GFF line:\n%s\n",l); |
394 |
|
|
} |
395 |
|
|
if (!parseUInt(p,qstart)) |
396 |
|
|
GError("Error parsing target start coordinate from GFF line:\n%s\n",l); |
397 |
|
|
if (*p!=' ') { |
398 |
|
|
GError("Error parsing next target coordinate from GFF line:\n%s\n",l); |
399 |
|
|
} |
400 |
|
|
p++; |
401 |
|
|
if (!parseUInt(p,qend)) |
402 |
|
|
GError("Error parsing target end coordinate from GFF line:\n%s\n",l); |
403 |
gpertea |
2 |
} |
404 |
gpertea |
16 |
if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr |
405 |
|
|
p+=5; |
406 |
|
|
if (!parseUInt(p,qstart)) |
407 |
|
|
GError("Error parsing target start coordinate from GFF line:\n%s\n",l); |
408 |
|
|
if (*p!='-') { |
409 |
|
|
GError("Error parsing next target coordinate from GFF line:\n%s\n",l); |
410 |
|
|
} |
411 |
|
|
p++; |
412 |
|
|
if (!parseUInt(p,qend)) |
413 |
|
|
GError("Error parsing target end coordinate from GFF line:\n%s\n",l); |
414 |
|
|
if (*p=='|' || *p==':') { |
415 |
|
|
p++; |
416 |
|
|
if (!parseUInt(p,qlen)) |
417 |
|
|
GError("Error parsing target length from GFF Qreg|: \n%s\n",l); |
418 |
|
|
} |
419 |
|
|
}//has Qreg attr |
420 |
|
|
if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) { |
421 |
|
|
p+=5; |
422 |
|
|
if (!parseUInt(p,qlen)) |
423 |
|
|
GError("Error parsing target length from GFF Qlen:\n%s\n",l); |
424 |
gpertea |
2 |
} |
425 |
gpertea |
16 |
}//parsing some useful attributes in GFF3 records |
426 |
|
|
if (ID==NULL && parents==NULL) { |
427 |
|
|
if (reader->gff_warns) |
428 |
|
|
GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline); |
429 |
|
|
return; //skip |
430 |
gpertea |
2 |
} |
431 |
|
|
skip=false; |
432 |
|
|
} |
433 |
|
|
|
434 |
gpertea |
55 |
|
435 |
|
|
void GffObj::addCDS(uint cd_start, uint cd_end, char phase) { |
436 |
|
|
if (cd_start>=this->start) { |
437 |
|
|
this->CDstart=cd_start; |
438 |
|
|
if (strand=='+') this->CDphase=phase; |
439 |
|
|
} |
440 |
|
|
else this->CDstart=this->start; |
441 |
|
|
if (cd_end<=this->end) { |
442 |
|
|
this->CDend=cd_end; |
443 |
|
|
if (strand=='-') this->CDphase=phase; |
444 |
|
|
} |
445 |
|
|
else this->CDend=this->end; |
446 |
|
|
isTranscript(true); |
447 |
|
|
exon_ftype_id=gff_fid_exon; |
448 |
|
|
if (monoFeature()) { |
449 |
|
|
if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon); |
450 |
|
|
else exons[0]->exontype=exgffExon; |
451 |
|
|
} |
452 |
|
|
} |
453 |
|
|
|
454 |
gpertea |
16 |
int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) { |
455 |
gpertea |
2 |
//this will make sure we have the right subftype_id! |
456 |
gpertea |
55 |
//int subf_id=-1; |
457 |
|
|
if (!isTranscript() && gl->is_cds) { |
458 |
gpertea |
50 |
isTranscript(true); |
459 |
|
|
exon_ftype_id=gff_fid_exon; |
460 |
|
|
if (exons.Count()==1) exons[0]->exontype=exgffExon; |
461 |
|
|
} |
462 |
gpertea |
16 |
if (isTranscript()) { |
463 |
|
|
if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon; |
464 |
|
|
if (gl->exontype>0) exon_ftype_id=gff_fid_exon; |
465 |
|
|
else exon_ftype_id=names->feats.addName(gl->ftype); |
466 |
|
|
} |
467 |
gpertea |
2 |
//any recognized mRNA segment gets the generic "exon" type (also applies to CDS) |
468 |
gpertea |
16 |
if (gl->exontype==0 && !gl->is_transcript) { |
469 |
|
|
//extraneous mRNA feature, discard |
470 |
|
|
if (reader->gff_warns) |
471 |
|
|
GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n", |
472 |
|
|
gl->ftype, gffID); |
473 |
gpertea |
2 |
return -1; |
474 |
gpertea |
16 |
} |
475 |
gpertea |
2 |
} |
476 |
gpertea |
16 |
else { //non-mRNA parent feature, check this subf type |
477 |
gpertea |
55 |
int subf_id=names->feats.addName(gl->ftype); |
478 |
gpertea |
16 |
if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added) |
479 |
|
|
exon_ftype_id=subf_id; |
480 |
|
|
else { |
481 |
|
|
if (exon_ftype_id!=subf_id) { |
482 |
gpertea |
50 |
// |
483 |
gpertea |
16 |
if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) { |
484 |
|
|
//the existing exon was just a dummy one created by default, discard it |
485 |
|
|
exons.Clear(); |
486 |
|
|
covlen=0; |
487 |
|
|
exon_ftype_id=subf_id; //allow the new subfeature to completely takeover |
488 |
|
|
} |
489 |
|
|
else { //multiple subfeatures, prefer those with |
490 |
|
|
if (reader->gff_warns) |
491 |
|
|
GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ", |
492 |
|
|
names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID); |
493 |
|
|
if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures |
494 |
|
|
if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id)); |
495 |
|
|
exon_ftype_id=subf_id; |
496 |
|
|
exons.Clear(); |
497 |
|
|
covlen=0; |
498 |
|
|
} |
499 |
|
|
else { //discard new feature |
500 |
|
|
if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id)); |
501 |
|
|
return -1; //skip this 2nd subfeature type for this parent! |
502 |
|
|
} |
503 |
|
|
} |
504 |
|
|
} //incoming subfeature is of different type |
505 |
|
|
} //new subfeature type |
506 |
|
|
} //non-mRNA parent |
507 |
gpertea |
2 |
int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase, |
508 |
|
|
gl->qstart,gl->qend, gl->is_cds, gl->exontype); |
509 |
gpertea |
16 |
if (eidx<0) return eidx; //this should never happen |
510 |
|
|
if (keepAttr) { |
511 |
|
|
if (noExonAttr) { |
512 |
|
|
if (attrs==NULL) //place the parsed attributes directly at transcript level |
513 |
|
|
parseAttrs(attrs, gl->info); |
514 |
|
|
} |
515 |
|
|
else { //need all exon-level attributes |
516 |
|
|
parseAttrs(exons[eidx]->attrs, gl->info, true); |
517 |
|
|
} |
518 |
gpertea |
2 |
} |
519 |
|
|
return eidx; |
520 |
|
|
} |
521 |
|
|
|
522 |
|
|
|
523 |
|
|
int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) { |
524 |
|
|
if (exons.Count()==0) { |
525 |
|
|
if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS |
526 |
gpertea |
16 |
if (exon_ftype_id<0) { |
527 |
|
|
exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id; |
528 |
gpertea |
2 |
} |
529 |
|
|
} |
530 |
gpertea |
16 |
//special treatment of start/stop codon features, they might be broken/split between exons |
531 |
|
|
//and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC) |
532 |
|
|
//so we should not trust the end coordinate for such features |
533 |
|
|
if (exontype==exgffStart || exontype==exgffStop) { |
534 |
|
|
if (strand=='-') segstart=segend; |
535 |
|
|
else segend=segstart; |
536 |
|
|
if (exontype==exgffStart) { |
537 |
|
|
if (CDstart==0 || segstart<CDstart) CDstart=segstart; |
538 |
|
|
} |
539 |
|
|
else { |
540 |
|
|
if (segstart>CDend) CDend=segstart; |
541 |
|
|
} |
542 |
|
|
} |
543 |
|
|
else if (iscds) { //update CDS anchors: |
544 |
gpertea |
2 |
if (CDstart==0 || segstart<CDstart) { |
545 |
|
|
CDstart=segstart; |
546 |
|
|
if (exontype==exgffCDS && strand=='+') CDphase=fr; |
547 |
|
|
} |
548 |
|
|
if (segend>CDend) { |
549 |
|
|
if (exontype==exgffCDS && strand=='-') CDphase=fr; |
550 |
|
|
CDend=segend; |
551 |
|
|
} |
552 |
|
|
} |
553 |
gpertea |
16 |
else { // not a CDS/start/stop |
554 |
gpertea |
2 |
isCDS=false; |
555 |
|
|
} |
556 |
|
|
if (qs || qe) { |
557 |
gpertea |
144 |
if (qs>qe) Gswap(qs,qe); |
558 |
gpertea |
2 |
if (qs==0) qs=1; |
559 |
gpertea |
16 |
} |
560 |
gpertea |
2 |
int ovlen=0; |
561 |
gpertea |
16 |
if (exontype>0) { //check for overlaps between exon-type segments |
562 |
|
|
int oi=exonOverlapIdx(segstart, segend, &ovlen); |
563 |
|
|
if (oi>=0) { //overlap existing segment |
564 |
|
|
if (ovlen==0) { |
565 |
|
|
//adjacent segments will be merged |
566 |
|
|
//e.g. CDS to (UTR|exon) |
567 |
|
|
if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) || |
568 |
|
|
(exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) { |
569 |
|
|
expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe); |
570 |
|
|
return oi; |
571 |
|
|
} |
572 |
|
|
//CDS adjacent to stop_codon: UCSC does (did?) this |
573 |
|
|
if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) || |
574 |
|
|
(exons[oi]->exontype==exgffCDS && exontype==exgffStop)) { |
575 |
|
|
expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe); |
576 |
|
|
return oi; |
577 |
|
|
} |
578 |
|
|
} |
579 |
|
|
//only allow this for CDS within exon, stop_codon within (CDS|UTR|exon), |
580 |
|
|
// start_codon within (CDS|exon) |
581 |
|
|
if (exons[oi]->exontype>exontype && |
582 |
|
|
exons[oi]->start<=segstart && exons[oi]->end>=segend && |
583 |
|
|
!(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) { |
584 |
|
|
//larger segment given first, now the smaller included one is redundant |
585 |
|
|
return oi; //only used to store attributes from current GffLine |
586 |
|
|
} |
587 |
|
|
if (exontype>exons[oi]->exontype && |
588 |
|
|
segstart<=exons[oi]->start && segend>=exons[oi]->end && |
589 |
|
|
!(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) { |
590 |
|
|
//smaller segment given first, so we have to enlarge it |
591 |
|
|
expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe); |
592 |
|
|
//this should also check for overlapping next exon (oi+1) ? |
593 |
|
|
return oi; |
594 |
|
|
} |
595 |
|
|
//there is also the special case of "ribosomal slippage exception" (programmed frameshift) |
596 |
|
|
//where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon |
597 |
|
|
//if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) { |
598 |
|
|
// had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments |
599 |
|
|
/* |
600 |
|
|
if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) { |
601 |
|
|
if (gff_show_warnings) |
602 |
|
|
GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n", |
603 |
|
|
segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName()); |
604 |
|
|
hasErrors(true); |
605 |
|
|
return -1; //segment NOT added |
606 |
|
|
} |
607 |
|
|
*/ |
608 |
gpertea |
2 |
|
609 |
gpertea |
16 |
if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) { |
610 |
|
|
if (gff_show_warnings) |
611 |
|
|
GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n", |
612 |
|
|
segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName()); |
613 |
|
|
expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe); |
614 |
|
|
return oi; |
615 |
|
|
} |
616 |
|
|
// else add the segment if the overlap is small and between two CDS segments |
617 |
|
|
//TODO: we might want to add an attribute here with the slippage coordinate and size? |
618 |
|
|
covlen-=ovlen; |
619 |
|
|
}//overlap or adjacent to existing segment |
620 |
|
|
} //check for overlap |
621 |
gpertea |
2 |
// --- no overlap, or accepted micro-overlap (ribosomal slippage) |
622 |
|
|
// create & add the new segment |
623 |
gpertea |
51 |
/* |
624 |
|
|
if (start>0 && exontype==exgffCDS && exons.Count()==0) { |
625 |
|
|
//adding a CDS directly as the first subfeature of a declared parent |
626 |
|
|
segstart=start; |
627 |
|
|
segend=end; |
628 |
|
|
} |
629 |
|
|
*/ |
630 |
gpertea |
2 |
GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype); |
631 |
|
|
int eidx=exons.Add(enew); |
632 |
|
|
if (eidx<0) { |
633 |
gpertea |
16 |
//this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms |
634 |
|
|
if (gff_show_warnings) |
635 |
|
|
GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n", |
636 |
gpertea |
2 |
segstart, segend, gffID); |
637 |
gpertea |
16 |
delete enew; |
638 |
|
|
hasErrors(true); |
639 |
gpertea |
2 |
return -1; |
640 |
|
|
} |
641 |
|
|
covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1; |
642 |
gpertea |
55 |
//adjust parent feature coordinates to contain this exon |
643 |
|
|
if (start==0 || start>exons.First()->start) { |
644 |
gpertea |
51 |
start=exons.First()->start; |
645 |
|
|
} |
646 |
gpertea |
55 |
if (end<exons.Last()->end) end=exons.Last()->end; |
647 |
|
|
|
648 |
gpertea |
2 |
if (uptr!=NULL) { //collect stats about the underlying genomic sequence |
649 |
|
|
GSeqStat* gsd=(GSeqStat*)uptr; |
650 |
|
|
if (start<gsd->mincoord) gsd->mincoord=start; |
651 |
|
|
if (end>gsd->maxcoord) gsd->maxcoord=end; |
652 |
gpertea |
16 |
if (this->len()>gsd->maxfeat_len) { |
653 |
|
|
gsd->maxfeat_len=this->len(); |
654 |
|
|
gsd->maxfeat=this; |
655 |
|
|
} |
656 |
gpertea |
2 |
} |
657 |
|
|
return eidx; |
658 |
|
|
} |
659 |
|
|
|
660 |
gpertea |
16 |
void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) { |
661 |
|
|
//oi is the index of the *first* overlapping segment found that must be enlarged |
662 |
|
|
covlen-=exons[oi]->len(); |
663 |
|
|
if (segstart<exons[oi]->start) |
664 |
|
|
exons[oi]->start=segstart; |
665 |
|
|
if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs; |
666 |
|
|
if (segend>exons[oi]->end) |
667 |
|
|
exons[oi]->end=segend; |
668 |
|
|
if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe; |
669 |
|
|
//warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse |
670 |
|
|
if (sc!=0) exons[oi]->score=sc; |
671 |
|
|
covlen+=exons[oi]->len(); |
672 |
|
|
//if (exons[oi]->exontype< exontype) -- always true |
673 |
|
|
exons[oi]->exontype = exontype; |
674 |
|
|
if (exontype==exgffCDS) exons[oi]->phase=fr; |
675 |
|
|
//we must check if any more exons are also overlapping this |
676 |
|
|
int ni=oi+1; //next exon index after oi |
677 |
|
|
while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment |
678 |
|
|
//only allow this if next segment is fully included, and a subordinate |
679 |
|
|
if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) { |
680 |
|
|
/* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out |
681 |
|
|
chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + . |
682 |
|
|
chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0 |
683 |
|
|
chr1 hg18_knownGene exon 69805456 69806912 0.000000 + . |
684 |
|
|
*/ |
685 |
|
|
if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart; |
686 |
|
|
if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend; |
687 |
|
|
exons.Delete(ni); |
688 |
|
|
} |
689 |
|
|
else { |
690 |
|
|
if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n", |
691 |
|
|
exons[ni]->start, exons[ni]->end, segstart, segend, gffID); |
692 |
|
|
//hasErrors(true); |
693 |
|
|
break; |
694 |
|
|
} |
695 |
|
|
} |
696 |
|
|
// -- make sure any other related boundaries are updated: |
697 |
|
|
start=exons.First()->start; |
698 |
|
|
end=exons.Last()->end; |
699 |
|
|
if (uptr!=NULL) { //collect stats about the underlying genomic sequence |
700 |
|
|
GSeqStat* gsd=(GSeqStat*)uptr; |
701 |
|
|
if (start<gsd->mincoord) gsd->mincoord=start; |
702 |
|
|
if (end>gsd->maxcoord) gsd->maxcoord=end; |
703 |
|
|
if (this->len()>gsd->maxfeat_len) { |
704 |
|
|
gsd->maxfeat_len=this->len(); |
705 |
|
|
gsd->maxfeat=this; |
706 |
|
|
} |
707 |
|
|
} |
708 |
|
|
} |
709 |
|
|
|
710 |
gpertea |
2 |
void GffObj::removeExon(int idx) { |
711 |
|
|
/* |
712 |
|
|
if (idx==0 && segs[0].start==gstart) |
713 |
|
|
gstart=segs[1].start; |
714 |
|
|
if (idx==segcount && segs[segcount].end==gend) |
715 |
|
|
gend=segs[segcount-1].end; |
716 |
|
|
*/ |
717 |
|
|
if (idx<0 || idx>=exons.Count()) return; |
718 |
|
|
int segstart=exons[idx]->start; |
719 |
|
|
int segend=exons[idx]->end; |
720 |
|
|
exons.Delete(idx); |
721 |
|
|
covlen -= (int)(segend-segstart)+1; |
722 |
|
|
start=exons.First()->start; |
723 |
|
|
end=exons.Last()->end; |
724 |
|
|
if (isCDS) { CDstart=start; CDend=end; } |
725 |
|
|
} |
726 |
|
|
|
727 |
gpertea |
16 |
void GffObj::removeExon(GffExon* p) { |
728 |
|
|
for (int idx=0;idx<exons.Count();idx++) { |
729 |
|
|
if (exons[idx]==p) { |
730 |
|
|
int segstart=exons[idx]->start; |
731 |
|
|
int segend=exons[idx]->end; |
732 |
|
|
exons.Delete(idx); |
733 |
|
|
covlen -= (int)(segend-segstart)+1; |
734 |
|
|
start=exons.First()->start; |
735 |
|
|
end=exons.Last()->end; |
736 |
|
|
if (isCDS) { CDstart=start; CDend=end; } |
737 |
|
|
return; |
738 |
|
|
} |
739 |
|
|
} |
740 |
|
|
} |
741 |
|
|
|
742 |
|
|
|
743 |
|
|
|
744 |
gpertea |
2 |
GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr): |
745 |
gpertea |
16 |
GSeg(0,0), exons(true,true,false), children(1,false) { |
746 |
|
|
xstart=0; |
747 |
|
|
xend=0; |
748 |
|
|
xstatus=0; |
749 |
|
|
partial=false; |
750 |
|
|
isCDS=false; |
751 |
|
|
uptr=NULL; |
752 |
|
|
ulink=NULL; |
753 |
|
|
parent=NULL; |
754 |
|
|
udata=0; |
755 |
|
|
flags=0; |
756 |
|
|
CDstart=0; |
757 |
|
|
CDend=0; |
758 |
|
|
CDphase=0; |
759 |
|
|
geneID=NULL; |
760 |
|
|
gene_name=NULL; |
761 |
|
|
attrs=NULL; |
762 |
|
|
gffID=NULL; |
763 |
|
|
track_id=-1; |
764 |
|
|
gseq_id=-1; |
765 |
|
|
ftype_id=-1; |
766 |
|
|
exon_ftype_id=-1; |
767 |
|
|
strand='.'; |
768 |
|
|
if (gfrd==NULL) |
769 |
gpertea |
2 |
GError("Cannot use this GffObj constructor with a NULL GffReader!\n"); |
770 |
gpertea |
16 |
gffnames_ref(names); |
771 |
|
|
if (gfrd->names==NULL) gfrd->names=names; |
772 |
|
|
//qlen=0;qstart=0;qend=0; |
773 |
|
|
gscore=0; |
774 |
|
|
uscore=0; |
775 |
|
|
covlen=0; |
776 |
|
|
qcov=0; |
777 |
|
|
start=gffline->fstart; |
778 |
|
|
end=gffline->fend; |
779 |
|
|
gseq_id=names->gseqs.addName(gffline->gseqname); |
780 |
|
|
track_id=names->tracks.addName(gffline->track); |
781 |
|
|
strand=gffline->strand; |
782 |
|
|
qlen=gffline->qlen; |
783 |
|
|
qstart=gffline->qstart; |
784 |
|
|
qend=gffline->qend; |
785 |
|
|
//setup flags from gffline |
786 |
|
|
isCDS=gffline->is_cds; //for now |
787 |
|
|
isGene(gffline->is_gene); |
788 |
|
|
isTranscript(gffline->is_transcript || gffline->exontype!=0); |
789 |
|
|
fromGff3(gffline->is_gff3); |
790 |
|
|
|
791 |
|
|
if (gffline->parents!=NULL) { |
792 |
|
|
//GTF style -- create a GffObj directly by subfeature |
793 |
|
|
//(also possible orphan GFF3 exon line, or an exon given before its parent (chado)) |
794 |
|
|
if (gffline->exontype!=0) { //recognized exon-like feature |
795 |
|
|
ftype_id=gff_fid_transcript; //so this is some sort of transcript |
796 |
|
|
exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons |
797 |
|
|
} |
798 |
|
|
else {//unrecognized subfeatures |
799 |
|
|
//make this GffObj of the same feature type |
800 |
|
|
ftype_id=names->feats.addName(gffline->ftype); |
801 |
|
|
} |
802 |
|
|
if (gffline->ID==NULL) { //typical GTF |
803 |
|
|
gffID=Gstrdup(gffline->parents[0]); |
804 |
|
|
this->createdByExon(true); |
805 |
|
|
//this is likely the first exon/segment of the feature |
806 |
|
|
addExon(gfrd, gffline, keepAttr, noExonAttr); |
807 |
gpertea |
2 |
} |
808 |
gpertea |
16 |
else { //a parented feature with an ID -- probably an orphan GFF3 line |
809 |
|
|
if (gffline->is_gff3 && gffline->exontype!=0) { |
810 |
|
|
//premature exon given before its parent transcript |
811 |
|
|
//create the transcript entry here |
812 |
|
|
gffID=Gstrdup(gffline->parents[0]); |
813 |
|
|
this->createdByExon(true); |
814 |
|
|
//this is the first exon/segment of the transcript |
815 |
|
|
addExon(gfrd, gffline, keepAttr, noExonAttr); |
816 |
|
|
} |
817 |
|
|
else { //unrecognized non-exon feature ? use the ID instead |
818 |
|
|
gffID=Gstrdup(gffline->ID); |
819 |
|
|
if (keepAttr) this->parseAttrs(attrs, gffline->info); |
820 |
|
|
} |
821 |
gpertea |
2 |
} |
822 |
gpertea |
16 |
} //subfeature given directly |
823 |
|
|
else { //gffline->parents==NULL |
824 |
|
|
//create a parent feature in its own right |
825 |
gpertea |
2 |
gscore=gffline->score; |
826 |
|
|
if (gffline->ID==NULL || gffline->ID[0]==0) |
827 |
gpertea |
16 |
GError("Error: no ID found for GFF record start\n"); |
828 |
|
|
gffID=Gstrdup(gffline->ID); //there must be an ID here |
829 |
|
|
//if (gffline->is_transcript) ftype_id=gff_fid_mRNA; |
830 |
|
|
//else |
831 |
|
|
ftype_id=names->feats.addName(gffline->ftype); |
832 |
|
|
if (gffline->is_transcript) |
833 |
|
|
exon_ftype_id=gff_fid_exon; |
834 |
|
|
|
835 |
|
|
if (keepAttr) this->parseAttrs(attrs, gffline->info); |
836 |
|
|
}//no parent |
837 |
|
|
|
838 |
|
|
if (gffline->gene_name!=NULL) { |
839 |
|
|
gene_name=Gstrdup(gffline->gene_name); |
840 |
|
|
} |
841 |
|
|
if (gffline->gene_id!=NULL) { |
842 |
|
|
geneID=Gstrdup(gffline->gene_id); |
843 |
|
|
} |
844 |
|
|
|
845 |
|
|
GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true); |
846 |
|
|
uptr=gsd; |
847 |
|
|
if (start<gsd->mincoord) gsd->mincoord=start; |
848 |
|
|
if (end>gsd->maxcoord) gsd->maxcoord=end; |
849 |
|
|
if (this->len()>gsd->maxfeat_len) { |
850 |
|
|
gsd->maxfeat_len=this->len(); |
851 |
|
|
gsd->maxfeat=this; |
852 |
gpertea |
2 |
} |
853 |
|
|
} |
854 |
|
|
|
855 |
|
|
GffLine* GffReader::nextGffLine() { |
856 |
|
|
if (gffline!=NULL) return gffline; //caller should free gffline after processing |
857 |
|
|
while (gffline==NULL) { |
858 |
|
|
int llen=0; |
859 |
|
|
buflen=GFF_LINELEN-1; |
860 |
|
|
char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); |
861 |
|
|
if (l==NULL) { |
862 |
|
|
return NULL; //end of file |
863 |
|
|
} |
864 |
|
|
int ns=0; //first nonspace position |
865 |
|
|
while (l[ns]!=0 && isspace(l[ns])) ns++; |
866 |
|
|
if (l[ns]=='#' || llen<10) continue; |
867 |
|
|
gffline=new GffLine(this, l); |
868 |
|
|
if (gffline->skip) { |
869 |
|
|
delete gffline; |
870 |
|
|
gffline=NULL; |
871 |
gpertea |
16 |
continue; |
872 |
gpertea |
2 |
} |
873 |
gpertea |
16 |
if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID |
874 |
|
|
//this might not be needed, already checked in the GffLine constructor |
875 |
|
|
if (gff_warns) |
876 |
|
|
GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n"); |
877 |
|
|
delete gffline; |
878 |
|
|
gffline=NULL; |
879 |
|
|
//continue; |
880 |
|
|
} |
881 |
gpertea |
2 |
} |
882 |
|
|
return gffline; |
883 |
|
|
} |
884 |
|
|
|
885 |
gpertea |
151 |
|
886 |
gpertea |
2 |
char* GffReader::gfoBuildId(const char* id, const char* ctg) { |
887 |
|
|
//caller must free the returned pointer |
888 |
|
|
char* buf=NULL; |
889 |
|
|
int idlen=strlen(id); |
890 |
|
|
GMALLOC(buf, idlen+strlen(ctg)+2); |
891 |
|
|
strcpy(buf, id); |
892 |
|
|
buf[idlen]='~'; |
893 |
|
|
strcpy(buf+idlen+1, ctg); |
894 |
|
|
return buf; |
895 |
|
|
} |
896 |
gpertea |
151 |
/* |
897 |
gpertea |
2 |
void GffReader::gfoRemove(const char* id, const char* ctg) { |
898 |
|
|
char* buf=gfoBuildId(id,ctg); |
899 |
|
|
phash.Remove(buf); |
900 |
|
|
GFREE(buf); |
901 |
|
|
} |
902 |
gpertea |
151 |
*/ |
903 |
gpertea |
16 |
//Warning: if gflst gets altered, idx becomes obsolete |
904 |
gpertea |
151 |
GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) { |
905 |
|
|
GVec<GfoHolder>* glst=new GVec<GfoHolder>(1); |
906 |
|
|
GfoHolder gh(gfo,idx); |
907 |
|
|
int i=glst->Add(gh); |
908 |
|
|
phash.Add(gfo->gffID, glst); |
909 |
|
|
return &(glst->Get(i)); |
910 |
gpertea |
2 |
} |
911 |
gpertea |
16 |
|
912 |
gpertea |
151 |
GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) { |
913 |
|
|
GfoHolder gh(gfo,idx); |
914 |
|
|
int i=glst.Add(gh); |
915 |
|
|
return &(glst[i]); |
916 |
gpertea |
2 |
} |
917 |
|
|
|
918 |
gpertea |
151 |
GfoHolder* GffReader::gfoFind(const char* id, const char* ctg, char strand, uint start, GVec<GfoHolder>** glst) { |
919 |
|
|
GVec<GfoHolder>* gl=phash.Find(id); |
920 |
|
|
GfoHolder* gh=NULL; |
921 |
|
|
if (gl) { |
922 |
|
|
for (int i=0;i<gl->Count();i++) { |
923 |
|
|
GfoHolder& gfo = gl->Get(i); |
924 |
|
|
if (ctg!=NULL && strcmp(ctg,gfo.gffobj->getGSeqName())!=0) |
925 |
|
|
continue; |
926 |
|
|
if (strand && strand != gfo.gffobj->strand) |
927 |
|
|
continue; |
928 |
|
|
if (start>0 && abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS) |
929 |
|
|
continue; |
930 |
|
|
//must be the same transcript, according to given comparison criteria |
931 |
|
|
gh=&gfo; |
932 |
|
|
break; |
933 |
|
|
} |
934 |
|
|
} |
935 |
|
|
if (glst) *glst=gl; |
936 |
|
|
return gh; |
937 |
|
|
} |
938 |
|
|
|
939 |
gpertea |
16 |
GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) { |
940 |
|
|
GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr); |
941 |
|
|
GfoHolder* r=NULL; |
942 |
|
|
if (replaceidx>=0) { |
943 |
|
|
gflst.Put(replaceidx,newgfo); |
944 |
gpertea |
151 |
r=gfoAdd(newgfo, replaceidx); |
945 |
gpertea |
16 |
} |
946 |
|
|
else { |
947 |
|
|
int gfoidx=gflst.Add(newgfo); |
948 |
gpertea |
151 |
r=gfoAdd(newgfo, gfoidx); |
949 |
gpertea |
16 |
} |
950 |
|
|
if (gff_warns) { |
951 |
|
|
int* pcount=tids.Find(newgfo->gffID); |
952 |
|
|
if (pcount!=NULL) { |
953 |
gpertea |
151 |
if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID); |
954 |
gpertea |
16 |
(*pcount)++; |
955 |
|
|
} |
956 |
|
|
else { |
957 |
|
|
tids.Add(newgfo->gffID,new int(1)); |
958 |
|
|
} |
959 |
|
|
} |
960 |
|
|
return r; |
961 |
|
|
} |
962 |
gpertea |
2 |
|
963 |
gpertea |
16 |
GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) { |
964 |
|
|
//assert(parent); |
965 |
|
|
//assert(newgfo); |
966 |
|
|
parent->children.Add(newgfh->gffobj); |
967 |
|
|
if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent; |
968 |
|
|
newgfh->gffobj->setLevel(parent->getLevel()+1); |
969 |
|
|
if (parent->isGene()) { |
970 |
|
|
if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL) |
971 |
|
|
newgfh->gffobj->gene_name=Gstrdup(parent->gene_name); |
972 |
|
|
if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL) |
973 |
|
|
newgfh->gffobj->geneID=Gstrdup(parent->geneID); |
974 |
|
|
} |
975 |
|
|
|
976 |
|
|
return newgfh; |
977 |
gpertea |
2 |
} |
978 |
|
|
|
979 |
gpertea |
16 |
GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, |
980 |
gpertea |
151 |
GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) { |
981 |
gpertea |
16 |
GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr); |
982 |
|
|
GfoHolder* r=NULL; |
983 |
|
|
int gfoidx=gflst.Add(newgfo); |
984 |
gpertea |
151 |
r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) :gfoAdd(newgfo, gfoidx); |
985 |
gpertea |
16 |
if (parent!=NULL) { |
986 |
|
|
updateParent(r, parent); |
987 |
|
|
if (pexon!=NULL) parent->removeExon(pexon); |
988 |
|
|
} |
989 |
|
|
if (gff_warns) { |
990 |
|
|
int* pcount=tids.Find(newgfo->gffID); |
991 |
|
|
if (pcount!=NULL) { |
992 |
|
|
if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID); |
993 |
|
|
(*pcount)++; |
994 |
|
|
} |
995 |
|
|
else { |
996 |
|
|
tids.Add(newgfo->gffID,new int(1)); |
997 |
|
|
} |
998 |
|
|
} |
999 |
|
|
return r; |
1000 |
|
|
} |
1001 |
gpertea |
2 |
|
1002 |
gpertea |
16 |
GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline, |
1003 |
|
|
bool keepAttr) { |
1004 |
|
|
if (prevgfo==NULL) return NULL; |
1005 |
|
|
prevgfo->gffobj->createdByExon(false); |
1006 |
|
|
prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype); |
1007 |
|
|
prevgfo->gffobj->start=gffline->fstart; |
1008 |
|
|
prevgfo->gffobj->end=gffline->fend; |
1009 |
|
|
prevgfo->gffobj->isGene(gffline->is_gene); |
1010 |
|
|
prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0); |
1011 |
|
|
prevgfo->gffobj->fromGff3(gffline->is_gff3); |
1012 |
|
|
if (keepAttr) { |
1013 |
|
|
if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear(); |
1014 |
|
|
prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info); |
1015 |
|
|
} |
1016 |
|
|
return prevgfo; |
1017 |
|
|
} |
1018 |
|
|
|
1019 |
|
|
|
1020 |
|
|
bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) { |
1021 |
|
|
bool r=true; |
1022 |
|
|
if (gffline->strand!=prevgfo->gffobj->strand) { |
1023 |
gpertea |
115 |
//TODO: add support for trans-splicing and even inter-chromosomal fusions |
1024 |
|
|
if (prevgfo->gffobj->strand=='.') { |
1025 |
|
|
prevgfo->gffobj->strand=gffline->strand; |
1026 |
|
|
} |
1027 |
|
|
else { |
1028 |
|
|
GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n", |
1029 |
|
|
prevgfo->gffobj->gffID, prevgfo->gffobj->strand, |
1030 |
|
|
gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName()); |
1031 |
|
|
//r=false; |
1032 |
|
|
return true; //FIXME: split trans-spliced mRNAs by strand |
1033 |
|
|
} |
1034 |
|
|
} |
1035 |
gpertea |
16 |
int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end : |
1036 |
|
|
((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend : |
1037 |
|
|
0 ); |
1038 |
|
|
if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID |
1039 |
|
|
GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID); |
1040 |
|
|
//validation_errors = true; |
1041 |
|
|
r=false; |
1042 |
|
|
if (!gff_warns) exit(1); |
1043 |
|
|
} |
1044 |
|
|
int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr); |
1045 |
|
|
if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0) |
1046 |
|
|
subfPoolAdd(pex, prevgfo); |
1047 |
|
|
return r; |
1048 |
|
|
} |
1049 |
|
|
|
1050 |
|
|
CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) { |
1051 |
|
|
CNonExon* subp=NULL; |
1052 |
|
|
subp_name=NULL; |
1053 |
|
|
for (int i=0;i<gffline->num_parents;i++) { |
1054 |
|
|
if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL) |
1055 |
|
|
continue; |
1056 |
|
|
subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name |
1057 |
|
|
subp=pex.Find(subp_name); |
1058 |
|
|
if (subp!=NULL) |
1059 |
|
|
return subp; |
1060 |
|
|
GFREE(subp_name); |
1061 |
|
|
} |
1062 |
|
|
return NULL; |
1063 |
|
|
} |
1064 |
|
|
|
1065 |
|
|
void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) { |
1066 |
|
|
//this might become a parent feature later |
1067 |
|
|
if (newgfo->gffobj->exons.Count()>0) { |
1068 |
|
|
char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname); |
1069 |
|
|
pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj, |
1070 |
|
|
newgfo->gffobj->exons[0], gffline)); |
1071 |
|
|
GFREE(xbuf); |
1072 |
|
|
} |
1073 |
|
|
} |
1074 |
|
|
|
1075 |
|
|
GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex, |
1076 |
|
|
bool keepAttr, bool noExonAttr) { |
1077 |
|
|
GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene) |
1078 |
|
|
if (prevp!=gflst[subp->idx]) |
1079 |
|
|
GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID); |
1080 |
|
|
subp->gffline->discardParent(); |
1081 |
|
|
GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon); |
1082 |
|
|
pex.Remove(subp_name); //no longer a potential parent, moved it to phash already |
1083 |
|
|
prevp->promotedChildren(true); |
1084 |
|
|
return gfoh; //returns the holder of newly promoted feature |
1085 |
|
|
} |
1086 |
|
|
|
1087 |
|
|
//have to parse the whole file because exons can be scattered all over |
1088 |
gpertea |
2 |
void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) { |
1089 |
gpertea |
16 |
bool validation_errors = false; |
1090 |
|
|
//loc_debug=false; |
1091 |
|
|
GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID |
1092 |
|
|
//and thus could become promoted to parent features |
1093 |
gpertea |
2 |
while (nextGffLine()!=NULL) { |
1094 |
gpertea |
16 |
//seen this gff ID before? |
1095 |
|
|
GfoHolder* prevseen=NULL; |
1096 |
gpertea |
152 |
if (gffline->ID && gffline->exontype==0) //GFF3 parent-like feature (mRNA, gene, etc.) |
1097 |
gpertea |
16 |
prevseen=gfoFind(gffline->ID, gffline->gseqname); |
1098 |
|
|
if (prevseen!=NULL) { |
1099 |
|
|
if (prevseen->gffobj->createdByExon()) { |
1100 |
gpertea |
151 |
//just in case the exon was found before (shouldn't happen) |
1101 |
gpertea |
16 |
updateGffRec(prevseen, gffline, keepAttr); |
1102 |
|
|
} |
1103 |
|
|
else { |
1104 |
|
|
GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID); |
1105 |
|
|
validation_errors = true; |
1106 |
|
|
if (gff_warns) { |
1107 |
|
|
delete gffline; gffline=NULL; continue; |
1108 |
|
|
} |
1109 |
|
|
else exit(1); |
1110 |
|
|
} |
1111 |
gpertea |
2 |
} |
1112 |
gpertea |
16 |
if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene) |
1113 |
|
|
if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr); |
1114 |
gpertea |
2 |
} |
1115 |
gpertea |
151 |
else { //--- it's a parented feature (exon/CDS, but might still be a mRNA) |
1116 |
gpertea |
16 |
bool found_parent=false; |
1117 |
|
|
GfoHolder* newgfo=prevseen; |
1118 |
gpertea |
151 |
GVec<GfoHolder>* newgflst=NULL; |
1119 |
gpertea |
16 |
for (int i=0;i<gffline->num_parents;i++) { |
1120 |
|
|
if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL) |
1121 |
|
|
continue; //skipping discarded parent feature |
1122 |
gpertea |
151 |
GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname, gffline->strand, gffline->fstart, &newgflst); |
1123 |
gpertea |
16 |
if (parentgfo!=NULL) { //parent GffObj parsed earlier |
1124 |
|
|
found_parent=true; |
1125 |
|
|
if (parentgfo->gffobj->isGene() && gffline->is_transcript |
1126 |
|
|
&& gffline->exontype==0) { |
1127 |
|
|
//not an exon, but a transcript parented by a gene |
1128 |
|
|
if (newgfo) { |
1129 |
|
|
updateParent(newgfo, parentgfo->gffobj); |
1130 |
|
|
} |
1131 |
|
|
else { |
1132 |
|
|
newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj); |
1133 |
|
|
} |
1134 |
|
|
} |
1135 |
|
|
else { //potential exon subfeature |
1136 |
|
|
if (!addExonFeature(parentgfo, gffline, pex, noExonAttr)) |
1137 |
|
|
validation_errors=true; |
1138 |
|
|
} |
1139 |
gpertea |
2 |
} |
1140 |
gpertea |
16 |
} //for each parsed parent Id |
1141 |
|
|
if (!found_parent) { //new GTF-like record starting here with a subfeature directly |
1142 |
|
|
//or it could be some chado GFF3 barf with exons declared BEFORE their parent :( |
1143 |
|
|
//check if this feature isn't parented by a previously stored "exon" subfeature |
1144 |
|
|
char* subp_name=NULL; |
1145 |
|
|
CNonExon* subp=subfPoolCheck(gffline, pex, subp_name); |
1146 |
|
|
if (subp!=NULL) { //found a subfeature that is the parent of this gffline |
1147 |
|
|
//promote that subfeature to a full GffObj |
1148 |
|
|
GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr); |
1149 |
|
|
//add current gffline as an exon of the newly promoted subfeature |
1150 |
|
|
if (!addExonFeature(gfoh, gffline, pex, noExonAttr)) |
1151 |
|
|
validation_errors=true; |
1152 |
|
|
} |
1153 |
|
|
else { //no parent seen before, create one directly with this exon |
1154 |
|
|
//loc_debug=true; |
1155 |
gpertea |
151 |
GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst); |
1156 |
gpertea |
16 |
if (gffline->ID!=NULL && gffline->exontype==0) |
1157 |
|
|
subfPoolAdd(pex, newgfo); |
1158 |
|
|
//even those with errors will be added here! |
1159 |
|
|
} |
1160 |
|
|
GFREE(subp_name); |
1161 |
|
|
} //no previous parent found |
1162 |
|
|
} //parented feature |
1163 |
|
|
//-- |
1164 |
|
|
delete gffline; |
1165 |
|
|
gffline=NULL; |
1166 |
|
|
}//while gff lines |
1167 |
|
|
gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed |
1168 |
gpertea |
2 |
// all gff records are now loaded in GList gflst |
1169 |
|
|
// so we can free the hash |
1170 |
|
|
phash.Clear(); |
1171 |
gpertea |
16 |
tids.Clear(); |
1172 |
|
|
if (validation_errors) { |
1173 |
|
|
exit(1); |
1174 |
|
|
} |
1175 |
gpertea |
2 |
} |
1176 |
|
|
|
1177 |
gpertea |
16 |
GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) { |
1178 |
|
|
//merge |
1179 |
|
|
//always merge adjacent or overlapping segments |
1180 |
|
|
//but if mergeCloseExons then merge even when distance is up to 5 bases |
1181 |
gpertea |
2 |
udata=0; |
1182 |
|
|
uptr=NULL; |
1183 |
gpertea |
16 |
if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) { |
1184 |
|
|
isDiscarded(true); |
1185 |
|
|
} |
1186 |
|
|
if (ftype_id==gff_fid_transcript && CDstart>0) { |
1187 |
|
|
ftype_id=gff_fid_mRNA; |
1188 |
|
|
//exon_ftype_id=gff_fid_exon; |
1189 |
|
|
} |
1190 |
|
|
//if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) { |
1191 |
|
|
if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) { |
1192 |
gpertea |
2 |
int mindist=mergeCloseExons ? 5:1; |
1193 |
|
|
for (int i=0;i<exons.Count()-1;i++) { |
1194 |
|
|
int ni=i+1; |
1195 |
|
|
uint mend=exons[i]->end; |
1196 |
|
|
while (ni<exons.Count()) { |
1197 |
|
|
int dist=(int)(exons[ni]->start-mend); |
1198 |
gpertea |
16 |
if (dist>mindist) break; //no merging with next segment |
1199 |
|
|
if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) { |
1200 |
|
|
GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n", |
1201 |
|
|
gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end); |
1202 |
|
|
} |
1203 |
gpertea |
2 |
mend=exons[ni]->end; |
1204 |
gpertea |
16 |
covlen-=exons[i]->len(); |
1205 |
gpertea |
2 |
exons[i]->end=mend; |
1206 |
gpertea |
16 |
covlen+=exons[i]->len(); |
1207 |
|
|
covlen-=exons[ni]->len(); |
1208 |
gpertea |
2 |
if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL || |
1209 |
|
|
exons[i]->attrs->Count()<exons[ni]->attrs->Count())) { |
1210 |
|
|
//use the other exon attributes, if more |
1211 |
|
|
delete(exons[i]->attrs); |
1212 |
|
|
exons[i]->attrs=exons[ni]->attrs; |
1213 |
|
|
exons[ni]->attrs=NULL; |
1214 |
|
|
} |
1215 |
|
|
exons.Delete(ni); |
1216 |
|
|
} //check for merge with next exon |
1217 |
|
|
} //for each exon |
1218 |
|
|
} |
1219 |
gpertea |
16 |
//attribute reduction for GTF records |
1220 |
|
|
if (keepAttrs && !noExonAttr && !fromGff3() |
1221 |
|
|
&& exons.Count()>0 && exons[0]->attrs!=NULL) { |
1222 |
|
|
bool attrs_discarded=false; |
1223 |
|
|
for (int a=0;a<exons[0]->attrs->Count();a++) { |
1224 |
|
|
int attr_name_id=exons[0]->attrs->Get(a)->attr_id; |
1225 |
|
|
char* attr_name=names->attrs.getName(attr_name_id); |
1226 |
|
|
char* attr_val =exons[0]->attrs->Get(a)->attr_val; |
1227 |
|
|
bool sameExonAttr=true; |
1228 |
|
|
for (int i=1;i<exons.Count();i++) { |
1229 |
|
|
char* ov=exons[i]->getAttr(attr_name_id); |
1230 |
|
|
if (ov==NULL || (strcmp(ov,attr_val)!=0)) { |
1231 |
|
|
sameExonAttr=false; |
1232 |
|
|
break; |
1233 |
|
|
} |
1234 |
|
|
} |
1235 |
|
|
if (sameExonAttr) { |
1236 |
|
|
//delete this attribute from exons level |
1237 |
|
|
attrs_discarded=true; |
1238 |
|
|
this->addAttr(attr_name, attr_val); |
1239 |
|
|
for (int i=1;i<exons.Count();i++) { |
1240 |
|
|
removeExonAttr(*(exons[i]), attr_name_id); |
1241 |
|
|
} |
1242 |
|
|
exons[0]->attrs->freeItem(a); |
1243 |
|
|
} |
1244 |
|
|
} |
1245 |
|
|
if (attrs_discarded) exons[0]->attrs->Pack(); |
1246 |
|
|
} |
1247 |
gpertea |
2 |
return this; |
1248 |
|
|
} |
1249 |
|
|
|
1250 |
gpertea |
16 |
void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) { |
1251 |
gpertea |
2 |
if (names==NULL) |
1252 |
|
|
GError(ERR_NULL_GFNAMES, "parseAttrs()"); |
1253 |
|
|
if (atrlist==NULL) |
1254 |
|
|
atrlist=new GffAttrs(); |
1255 |
|
|
char* endinfo=info+strlen(info); |
1256 |
|
|
char* start=info; |
1257 |
|
|
char* pch=start; |
1258 |
|
|
while (start<endinfo) { |
1259 |
|
|
//skip spaces |
1260 |
|
|
while (*start==' ' && start<endinfo) start++; |
1261 |
|
|
pch=strchr(start, ';'); |
1262 |
|
|
if (pch==NULL) pch=endinfo; |
1263 |
|
|
else { |
1264 |
|
|
*pch='\0'; |
1265 |
|
|
pch++; |
1266 |
|
|
} |
1267 |
|
|
char* ech=strchr(start,'='); |
1268 |
|
|
if (ech!=NULL) { // attr=value format found |
1269 |
|
|
*ech='\0'; |
1270 |
gpertea |
16 |
//if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; } |
1271 |
|
|
if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 || |
1272 |
|
|
strcmp(start, "exon_id")==0) |
1273 |
|
|
{ start=pch; continue; } |
1274 |
gpertea |
2 |
ech++; |
1275 |
|
|
while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '=' |
1276 |
gpertea |
16 |
//atrlist->Add(new GffAttr(names->attrs.addName(start),ech)); |
1277 |
|
|
//make sure we don't add the same attribute more than once |
1278 |
|
|
if (isExon && (strcmp(start, "protein_id")==0)) { |
1279 |
|
|
//Ensembl special case |
1280 |
|
|
this->addAttr(start, ech); |
1281 |
|
|
start=pch; |
1282 |
|
|
continue; |
1283 |
|
|
} |
1284 |
|
|
atrlist->add_or_update(names, start, ech); |
1285 |
gpertea |
2 |
} |
1286 |
|
|
/* |
1287 |
|
|
else { //not an attr=value format |
1288 |
|
|
atrlist->Add(new GffAttr(names->attrs.addName(start),"1")); |
1289 |
|
|
} |
1290 |
|
|
*/ |
1291 |
|
|
start=pch; |
1292 |
|
|
} |
1293 |
|
|
if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; } |
1294 |
|
|
} |
1295 |
|
|
|
1296 |
gpertea |
16 |
void GffObj::addAttr(const char* attrname, const char* attrvalue) { |
1297 |
gpertea |
2 |
if (this->attrs==NULL) |
1298 |
|
|
this->attrs=new GffAttrs(); |
1299 |
gpertea |
16 |
//this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue)); |
1300 |
|
|
this->attrs->add_or_update(names, attrname, attrvalue); |
1301 |
gpertea |
2 |
} |
1302 |
|
|
|
1303 |
gpertea |
16 |
|
1304 |
|
|
void GffObj::setFeatureName(const char* feature) { |
1305 |
|
|
//change the feature name/type for a transcript |
1306 |
|
|
int fid=names->feats.addName(feature); |
1307 |
|
|
if (monoFeature() && exons.Count()>0) |
1308 |
|
|
this->exon_ftype_id=fid; |
1309 |
|
|
this->ftype_id=fid; |
1310 |
|
|
} |
1311 |
|
|
|
1312 |
|
|
void GffObj::setRefName(const char* newname) { |
1313 |
|
|
//change the feature name/type for a transcript |
1314 |
|
|
int rid=names->gseqs.addName(newname); |
1315 |
|
|
this->gseq_id=rid; |
1316 |
|
|
} |
1317 |
|
|
|
1318 |
|
|
|
1319 |
|
|
|
1320 |
|
|
int GffObj::removeAttr(const char* attrname, const char* attrval) { |
1321 |
|
|
if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0; |
1322 |
|
|
int aid=this->names->attrs.getId(attrname); |
1323 |
|
|
if (aid<0) return 0; |
1324 |
|
|
int delcount=0; //could be more than one ? |
1325 |
|
|
for (int i=0;i<this->attrs->Count();i++) { |
1326 |
|
|
if (aid==this->attrs->Get(i)->attr_id) { |
1327 |
|
|
if (attrval==NULL || |
1328 |
|
|
strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { |
1329 |
|
|
delcount++; |
1330 |
|
|
this->attrs->freeItem(i); |
1331 |
|
|
} |
1332 |
|
|
} |
1333 |
|
|
} |
1334 |
|
|
if (delcount>0) this->attrs->Pack(); |
1335 |
|
|
return delcount; |
1336 |
|
|
} |
1337 |
|
|
|
1338 |
|
|
int GffObj::removeAttr(int aid, const char* attrval) { |
1339 |
|
|
if (this->attrs==NULL || aid<0) return 0; |
1340 |
|
|
int delcount=0; //could be more than one ? |
1341 |
|
|
for (int i=0;i<this->attrs->Count();i++) { |
1342 |
|
|
if (aid==this->attrs->Get(i)->attr_id) { |
1343 |
|
|
if (attrval==NULL || |
1344 |
|
|
strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { |
1345 |
|
|
delcount++; |
1346 |
|
|
this->attrs->freeItem(i); |
1347 |
|
|
} |
1348 |
|
|
} |
1349 |
|
|
} |
1350 |
|
|
if (delcount>0) this->attrs->Pack(); |
1351 |
|
|
return delcount; |
1352 |
|
|
} |
1353 |
|
|
|
1354 |
|
|
|
1355 |
|
|
int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) { |
1356 |
|
|
if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0; |
1357 |
|
|
int aid=this->names->attrs.getId(attrname); |
1358 |
|
|
if (aid<0) return 0; |
1359 |
|
|
int delcount=0; //could be more than one |
1360 |
|
|
for (int i=0;i<exon.attrs->Count();i++) { |
1361 |
|
|
if (aid==exon.attrs->Get(i)->attr_id) { |
1362 |
|
|
if (attrval==NULL || |
1363 |
|
|
strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { |
1364 |
|
|
delcount++; |
1365 |
|
|
exon.attrs->freeItem(i); |
1366 |
|
|
} |
1367 |
|
|
} |
1368 |
|
|
} |
1369 |
|
|
if (delcount>0) exon.attrs->Pack(); |
1370 |
|
|
return delcount; |
1371 |
|
|
} |
1372 |
|
|
|
1373 |
|
|
int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) { |
1374 |
|
|
if (exon.attrs==NULL || aid<0) return 0; |
1375 |
|
|
int delcount=0; //could be more than one |
1376 |
|
|
for (int i=0;i<exon.attrs->Count();i++) { |
1377 |
|
|
if (aid==exon.attrs->Get(i)->attr_id) { |
1378 |
|
|
if (attrval==NULL || |
1379 |
|
|
strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { |
1380 |
|
|
delcount++; |
1381 |
|
|
exon.attrs->freeItem(i); |
1382 |
|
|
} |
1383 |
|
|
} |
1384 |
|
|
} |
1385 |
|
|
if (delcount>0) exon.attrs->Pack(); |
1386 |
|
|
return delcount; |
1387 |
|
|
} |
1388 |
|
|
|
1389 |
|
|
|
1390 |
gpertea |
2 |
void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) { |
1391 |
|
|
cds_start=0; |
1392 |
|
|
cds_end=0; |
1393 |
|
|
if (CDstart==0 || CDend==0) return; //no CDS info |
1394 |
|
|
int cdsadj=0; |
1395 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1396 |
|
|
cdsadj=CDphase-'0'; |
1397 |
|
|
} |
1398 |
|
|
cds_start=CDstart; |
1399 |
|
|
cds_end=CDend; |
1400 |
|
|
if (strand=='-') cds_end-=cdsadj; |
1401 |
|
|
else cds_start+=cdsadj; |
1402 |
|
|
} |
1403 |
|
|
|
1404 |
|
|
void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) { |
1405 |
|
|
//sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript |
1406 |
|
|
cds_mstart=0; |
1407 |
|
|
cds_mend=0; |
1408 |
|
|
if (CDstart==0 || CDend==0) return; //no CDS info |
1409 |
|
|
//restore normal coordinates, just in case |
1410 |
|
|
unxcoord(); |
1411 |
|
|
int cdsadj=0; |
1412 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1413 |
|
|
cdsadj=CDphase-'0'; |
1414 |
|
|
} |
1415 |
|
|
/* |
1416 |
|
|
uint seqstart=CDstart; |
1417 |
|
|
uint seqend=CDend; |
1418 |
|
|
*/ |
1419 |
|
|
uint seqstart=exons.First()->start; |
1420 |
|
|
uint seqend=exons.Last()->end; |
1421 |
|
|
int s=0; //resulting nucleotide counter |
1422 |
|
|
if (strand=='-') { |
1423 |
|
|
for (int x=exons.Count()-1;x>=0;x--) { |
1424 |
|
|
uint sgstart=exons[x]->start; |
1425 |
|
|
uint sgend=exons[x]->end; |
1426 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1427 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1428 |
|
|
sgstart=seqstart; //seqstart within this segment |
1429 |
|
|
if (seqend>=sgstart && seqend<=sgend) |
1430 |
|
|
sgend=seqend; //seqend within this segment |
1431 |
|
|
s+=(int)(sgend-sgstart)+1; |
1432 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) { |
1433 |
|
|
//CDstart in this segment |
1434 |
|
|
//and we are getting the whole transcript |
1435 |
|
|
cds_mend=s-(int)(CDstart-sgstart); |
1436 |
|
|
} |
1437 |
|
|
if (CDend>=sgstart && CDend<=sgend) { |
1438 |
|
|
//CDstart in this segment |
1439 |
|
|
//and we are getting the whole transcript |
1440 |
|
|
cds_mstart=s-(int)(CDend-cdsadj-sgstart); |
1441 |
|
|
} |
1442 |
|
|
} //for each exon |
1443 |
|
|
} // - strand |
1444 |
|
|
else { // + strand |
1445 |
|
|
for (int x=0;x<exons.Count();x++) { |
1446 |
|
|
uint sgstart=exons[x]->start; |
1447 |
|
|
uint sgend=exons[x]->end; |
1448 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1449 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1450 |
|
|
sgstart=seqstart; //seqstart within this segment |
1451 |
|
|
if (seqend>=sgstart && seqend<=sgend) |
1452 |
|
|
sgend=seqend; //seqend within this segment |
1453 |
|
|
s+=(int)(sgend-sgstart)+1; |
1454 |
|
|
/* for (uint i=sgstart;i<=sgend;i++) { |
1455 |
|
|
spliced[s]=gsubseq[i-gstart]; |
1456 |
|
|
s++; |
1457 |
|
|
}//for each nt |
1458 |
|
|
*/ |
1459 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) { |
1460 |
|
|
//CDstart in this segment |
1461 |
|
|
cds_mstart=s-(int)(sgend-CDstart-cdsadj); |
1462 |
|
|
} |
1463 |
|
|
if (CDend>=sgstart && CDend<=sgend) { |
1464 |
|
|
//CDend in this segment |
1465 |
|
|
cds_mend=s-(int)(sgend-CDend); |
1466 |
|
|
} |
1467 |
|
|
} //for each exon |
1468 |
|
|
} // + strand |
1469 |
|
|
//spliced[s]=0; |
1470 |
|
|
//if (rlen!=NULL) *rlen=s; |
1471 |
|
|
//return spliced; |
1472 |
|
|
} |
1473 |
|
|
|
1474 |
gpertea |
16 |
char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst) |
1475 |
|
|
{ |
1476 |
|
|
if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n"); |
1477 |
|
|
return NULL; |
1478 |
|
|
} |
1479 |
|
|
//restore normal coordinates: |
1480 |
|
|
unxcoord(); |
1481 |
|
|
if (exons.Count()==0) return NULL; |
1482 |
|
|
int fspan=end-start+1; |
1483 |
|
|
const char* gsubseq=faseq->subseq(start, fspan); |
1484 |
|
|
if (gsubseq==NULL) { |
1485 |
|
|
GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); |
1486 |
|
|
} |
1487 |
|
|
char* unspliced=NULL; |
1488 |
|
|
|
1489 |
|
|
int seqstart=exons.First()->start; |
1490 |
|
|
int seqend=exons.Last()->end; |
1491 |
|
|
|
1492 |
|
|
int unsplicedlen = 0; |
1493 |
|
|
|
1494 |
|
|
unsplicedlen += seqend - seqstart + 1; |
1495 |
|
|
|
1496 |
|
|
GMALLOC(unspliced, unsplicedlen+1); //allocate more here |
1497 |
|
|
//uint seqstart, seqend; |
1498 |
|
|
|
1499 |
|
|
int s = 0; //resulting nucleotide counter |
1500 |
|
|
if (strand=='-') |
1501 |
|
|
{ |
1502 |
|
|
if (seglst!=NULL) |
1503 |
|
|
seglst->Add(new GSeg(s+1,s+1+seqend-seqstart)); |
1504 |
|
|
for (int i=seqend;i>=seqstart;i--) |
1505 |
|
|
{ |
1506 |
|
|
unspliced[s] = ntComplement(gsubseq[i-start]); |
1507 |
|
|
s++; |
1508 |
|
|
}//for each nt |
1509 |
|
|
} // - strand |
1510 |
|
|
else |
1511 |
|
|
{ // + strand |
1512 |
|
|
if (seglst!=NULL) |
1513 |
|
|
seglst->Add(new GSeg(s+1,s+1+seqend-seqstart)); |
1514 |
|
|
for (int i=seqstart;i<=seqend;i++) |
1515 |
|
|
{ |
1516 |
|
|
unspliced[s]=gsubseq[i-start]; |
1517 |
|
|
s++; |
1518 |
|
|
}//for each nt |
1519 |
|
|
} // + strand |
1520 |
|
|
//assert(s <= unsplicedlen); |
1521 |
|
|
unspliced[s]=0; |
1522 |
|
|
if (rlen!=NULL) *rlen=s; |
1523 |
|
|
return unspliced; |
1524 |
|
|
} |
1525 |
|
|
|
1526 |
gpertea |
2 |
char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end, |
1527 |
|
|
GList<GSeg>* seglst) { |
1528 |
|
|
if (CDSonly && CDstart==0) return NULL; |
1529 |
|
|
if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n"); |
1530 |
|
|
return NULL; |
1531 |
|
|
} |
1532 |
|
|
//restore normal coordinates: |
1533 |
|
|
unxcoord(); |
1534 |
|
|
if (exons.Count()==0) return NULL; |
1535 |
|
|
int fspan=end-start+1; |
1536 |
|
|
const char* gsubseq=faseq->subseq(start, fspan); |
1537 |
|
|
if (gsubseq==NULL) { |
1538 |
|
|
GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); |
1539 |
|
|
} |
1540 |
gpertea |
16 |
if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust |
1541 |
|
|
int endadj=end-start+1-fspan; |
1542 |
|
|
uint prevend=end; |
1543 |
|
|
end-=endadj; |
1544 |
|
|
if (CDend>end) CDend=end; |
1545 |
|
|
if (exons.Last()->end>end) { |
1546 |
|
|
exons.Last()->end=end; //this could get us into trouble if exon start is also > end |
1547 |
|
|
if (exons.Last()->start>exons.Last()->end) { |
1548 |
|
|
GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n", |
1549 |
|
|
prevend,getGSeqName(), getID()); |
1550 |
|
|
} |
1551 |
|
|
covlen-=endadj; |
1552 |
|
|
} |
1553 |
|
|
} |
1554 |
gpertea |
2 |
char* spliced=NULL; |
1555 |
|
|
GMALLOC(spliced, covlen+1); //allocate more here |
1556 |
|
|
uint seqstart, seqend; |
1557 |
|
|
int cdsadj=0; |
1558 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1559 |
|
|
cdsadj=CDphase-'0'; |
1560 |
|
|
} |
1561 |
|
|
if (CDSonly) { |
1562 |
|
|
seqstart=CDstart; |
1563 |
|
|
seqend=CDend; |
1564 |
|
|
if (strand=='-') seqend-=cdsadj; |
1565 |
|
|
else seqstart+=cdsadj; |
1566 |
|
|
} |
1567 |
|
|
else { |
1568 |
|
|
seqstart=exons.First()->start; |
1569 |
|
|
seqend=exons.Last()->end; |
1570 |
|
|
} |
1571 |
|
|
int s=0; //resulting nucleotide counter |
1572 |
|
|
if (strand=='-') { |
1573 |
|
|
for (int x=exons.Count()-1;x>=0;x--) { |
1574 |
|
|
uint sgstart=exons[x]->start; |
1575 |
|
|
uint sgend=exons[x]->end; |
1576 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1577 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1578 |
|
|
sgstart=seqstart; //seqstart within this segment |
1579 |
|
|
if (seqend>=sgstart && seqend<=sgend) |
1580 |
|
|
sgend=seqend; //seqend within this segment |
1581 |
|
|
if (seglst!=NULL) |
1582 |
|
|
seglst->Add(new GSeg(s+1,s+1+sgend-sgstart)); |
1583 |
|
|
for (uint i=sgend;i>=sgstart;i--) { |
1584 |
|
|
spliced[s] = ntComplement(gsubseq[i-start]); |
1585 |
|
|
s++; |
1586 |
|
|
}//for each nt |
1587 |
|
|
|
1588 |
|
|
if (!CDSonly && cds_start!=NULL && CDstart>0) { |
1589 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) { |
1590 |
|
|
//CDstart in this segment |
1591 |
|
|
//and we are getting the whole transcript |
1592 |
|
|
*cds_end=s-(CDstart-sgstart); |
1593 |
|
|
} |
1594 |
|
|
if (CDend>=sgstart && CDend<=sgend) { |
1595 |
|
|
//CDstart in this segment |
1596 |
|
|
//and we are getting the whole transcript |
1597 |
|
|
*cds_start=s-(CDend-cdsadj-sgstart); |
1598 |
|
|
} |
1599 |
|
|
}//update local CDS coordinates |
1600 |
|
|
} //for each exon |
1601 |
|
|
} // - strand |
1602 |
|
|
else { // + strand |
1603 |
|
|
for (int x=0;x<exons.Count();x++) { |
1604 |
|
|
uint sgstart=exons[x]->start; |
1605 |
|
|
uint sgend=exons[x]->end; |
1606 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1607 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1608 |
|
|
sgstart=seqstart; //seqstart within this segment |
1609 |
|
|
if (seqend>=sgstart && seqend<=sgend) |
1610 |
|
|
sgend=seqend; //seqend within this segment |
1611 |
|
|
if (seglst!=NULL) |
1612 |
|
|
seglst->Add(new GSeg(s+1,s+1+sgend-sgstart)); |
1613 |
|
|
for (uint i=sgstart;i<=sgend;i++) { |
1614 |
|
|
spliced[s]=gsubseq[i-start]; |
1615 |
|
|
s++; |
1616 |
|
|
}//for each nt |
1617 |
|
|
if (!CDSonly && cds_start!=NULL && CDstart>0) { |
1618 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) { |
1619 |
|
|
//CDstart in this segment |
1620 |
|
|
//and we are getting the whole transcript |
1621 |
|
|
*cds_start=s-(sgend-CDstart-cdsadj); |
1622 |
|
|
} |
1623 |
|
|
if (CDend>=sgstart && CDend<=sgend) { |
1624 |
|
|
//CDstart in this segment |
1625 |
|
|
//and we are getting the whole transcript |
1626 |
|
|
*cds_end=s-(sgend-CDend); |
1627 |
|
|
} |
1628 |
|
|
}//update local CDS coordinates |
1629 |
|
|
} //for each exon |
1630 |
|
|
} // + strand |
1631 |
|
|
spliced[s]=0; |
1632 |
|
|
if (rlen!=NULL) *rlen=s; |
1633 |
|
|
return spliced; |
1634 |
|
|
} |
1635 |
|
|
|
1636 |
|
|
char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) { |
1637 |
|
|
if (CDSonly && CDstart==0) return NULL; |
1638 |
|
|
//restore normal coordinates: |
1639 |
|
|
unxcoord(); |
1640 |
|
|
if (exons.Count()==0) return NULL; |
1641 |
|
|
int fspan=end-start+1; |
1642 |
|
|
const char* gsubseq=faseq->subseq(start, fspan); |
1643 |
|
|
if (gsubseq==NULL) { |
1644 |
|
|
GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); |
1645 |
|
|
} |
1646 |
|
|
|
1647 |
|
|
char* translation=NULL; |
1648 |
|
|
GMALLOC(translation, (int)(covlen/3)+1); |
1649 |
|
|
uint seqstart, seqend; |
1650 |
|
|
int cdsadj=0; |
1651 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1652 |
|
|
cdsadj=CDphase-'0'; |
1653 |
|
|
} |
1654 |
|
|
if (CDSonly) { |
1655 |
|
|
seqstart=CDstart; |
1656 |
|
|
seqend=CDend; |
1657 |
|
|
if (strand=='-') seqend-=cdsadj; |
1658 |
|
|
else seqstart+=cdsadj; |
1659 |
|
|
} |
1660 |
|
|
else { |
1661 |
|
|
seqstart=exons.First()->start; |
1662 |
|
|
seqend=exons.Last()->end; |
1663 |
|
|
} |
1664 |
|
|
Codon codon; |
1665 |
|
|
int nt=0; //codon nucleotide counter (0..2) |
1666 |
|
|
int aa=0; //aminoacid count |
1667 |
|
|
if (strand=='-') { |
1668 |
|
|
for (int x=exons.Count()-1;x>=0;x--) { |
1669 |
|
|
uint sgstart=exons[x]->start; |
1670 |
|
|
uint sgend=exons[x]->end; |
1671 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1672 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1673 |
|
|
sgstart=seqstart; //seqstart within this segment |
1674 |
|
|
if (seqend>=sgstart && seqend<=sgend) { |
1675 |
|
|
sgend=seqend; //seqend within this segment |
1676 |
|
|
} |
1677 |
|
|
for (uint i=sgend;i>=sgstart;i--) { |
1678 |
|
|
codon.nuc[nt]=ntComplement(gsubseq[i-start]); |
1679 |
|
|
nt++; |
1680 |
|
|
if (nt==3) { |
1681 |
|
|
nt=0; |
1682 |
|
|
translation[aa]=codon.translate(); |
1683 |
|
|
aa++; |
1684 |
|
|
} |
1685 |
|
|
}//for each nt |
1686 |
|
|
} //for each exon |
1687 |
|
|
} // - strand |
1688 |
|
|
else { // + strand |
1689 |
|
|
for (int x=0;x<exons.Count();x++) { |
1690 |
|
|
uint sgstart=exons[x]->start; |
1691 |
|
|
uint sgend=exons[x]->end; |
1692 |
|
|
if (seqend<sgstart || seqstart>sgend) continue; |
1693 |
|
|
if (seqstart>=sgstart && seqstart<=sgend) |
1694 |
|
|
sgstart=seqstart; //seqstart within this segment |
1695 |
|
|
if (seqend>=sgstart && seqend<=sgend) |
1696 |
|
|
sgend=seqend; //seqend within this segment |
1697 |
|
|
for (uint i=sgstart;i<=sgend;i++) { |
1698 |
|
|
codon.nuc[nt]=gsubseq[i-start]; |
1699 |
|
|
nt++; |
1700 |
|
|
if (nt==3) { |
1701 |
|
|
nt=0; |
1702 |
|
|
translation[aa]=codon.translate(); |
1703 |
|
|
aa++; |
1704 |
|
|
} |
1705 |
|
|
}//for each nt |
1706 |
|
|
} //for each exon |
1707 |
|
|
} // + strand |
1708 |
|
|
translation[aa]=0; |
1709 |
|
|
if (rlen!=NULL) *rlen=aa; |
1710 |
|
|
return translation; |
1711 |
|
|
} |
1712 |
|
|
|
1713 |
|
|
void GffObj::printSummary(FILE* fout) { |
1714 |
|
|
if (fout==NULL) fout=stdout; |
1715 |
|
|
fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID, |
1716 |
|
|
strand, start, end, gscore, (float)qcov/10.0); |
1717 |
|
|
} |
1718 |
|
|
|
1719 |
gpertea |
16 |
void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, |
1720 |
gpertea |
2 |
uint segstart, uint segend, int exidx, char phase, bool gff3) { |
1721 |
|
|
static char scorestr[14]; |
1722 |
|
|
strcpy(scorestr,"."); |
1723 |
|
|
GffAttrs* xattrs=NULL; |
1724 |
|
|
if (exidx>=0) { |
1725 |
|
|
if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score); |
1726 |
|
|
xattrs=exons[exidx]->attrs; |
1727 |
|
|
} |
1728 |
|
|
if (phase==0 || !iscds) phase='.'; |
1729 |
|
|
const char* ftype=iscds ? "CDS" : getSubfName(); |
1730 |
|
|
if (gff3) { |
1731 |
|
|
fprintf(fout, |
1732 |
|
|
"%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s", |
1733 |
|
|
gseqname, tlabel, ftype, segstart, segend, scorestr, strand, |
1734 |
|
|
phase, gffID); |
1735 |
|
|
if (xattrs!=NULL) { |
1736 |
|
|
for (int i=0;i<xattrs->Count();i++) |
1737 |
|
|
fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id), |
1738 |
|
|
xattrs->Get(i)->attr_val); |
1739 |
|
|
} |
1740 |
|
|
fprintf(fout, "\n"); |
1741 |
|
|
} //GFF |
1742 |
gpertea |
16 |
else {//for GTF -- we print only transcripts |
1743 |
|
|
//if (isValidTranscript()) |
1744 |
|
|
fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";", |
1745 |
|
|
gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID); |
1746 |
|
|
//char* geneid=(geneID!=NULL)? geneID : gffID; |
1747 |
|
|
if (geneID) |
1748 |
|
|
fprintf(fout," gene_id \"%s\";",geneID); |
1749 |
|
|
if (gene_name!=NULL) { |
1750 |
|
|
//fprintf(fout, " gene_name "); |
1751 |
|
|
//if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name); |
1752 |
|
|
// else fprintf(fout, "\"%s\";",gene_name); |
1753 |
|
|
fprintf(fout," gene_name \"%s\";",gene_name); |
1754 |
|
|
} |
1755 |
gpertea |
2 |
if (xattrs!=NULL) { |
1756 |
gpertea |
16 |
for (int i=0;i<xattrs->Count();i++) { |
1757 |
|
|
if (xattrs->Get(i)->attr_val==NULL) continue; |
1758 |
|
|
const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id); |
1759 |
|
|
fprintf(fout, " %s ",attrname); |
1760 |
|
|
if (xattrs->Get(i)->attr_val[0]=='"') |
1761 |
|
|
fprintf(fout, "%s;",xattrs->Get(i)->attr_val); |
1762 |
|
|
else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val); |
1763 |
|
|
} |
1764 |
gpertea |
2 |
} |
1765 |
gpertea |
16 |
//for GTF, also append the GffObj attributes to each exon line |
1766 |
|
|
if ((xattrs=this->attrs)!=NULL) { |
1767 |
|
|
for (int i=0;i<xattrs->Count();i++) { |
1768 |
|
|
if (xattrs->Get(i)->attr_val==NULL) continue; |
1769 |
|
|
const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id); |
1770 |
|
|
fprintf(fout, " %s ",attrname); |
1771 |
|
|
if (xattrs->Get(i)->attr_val[0]=='"') |
1772 |
|
|
fprintf(fout, "%s;",xattrs->Get(i)->attr_val); |
1773 |
|
|
else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val); |
1774 |
|
|
} |
1775 |
|
|
} |
1776 |
gpertea |
2 |
fprintf(fout, "\n"); |
1777 |
|
|
}//GTF |
1778 |
|
|
} |
1779 |
|
|
|
1780 |
gpertea |
16 |
void GffObj::printGxf(FILE* fout, GffPrintMode gffp, |
1781 |
|
|
const char* tlabel, const char* gfparent) { |
1782 |
gpertea |
2 |
static char tmpstr[255]; |
1783 |
|
|
if (tlabel==NULL) { |
1784 |
|
|
tlabel=track_id>=0 ? names->tracks.Get(track_id)->name : |
1785 |
|
|
(char*)"gffobj" ; |
1786 |
|
|
} |
1787 |
|
|
unxcoord(); |
1788 |
gpertea |
16 |
//if (exons.Count()==0) return; |
1789 |
|
|
const char* gseqname=names->gseqs.Get(gseq_id)->name; |
1790 |
gpertea |
2 |
bool gff3 = (gffp>=pgffAny); |
1791 |
|
|
bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth); |
1792 |
|
|
bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth); |
1793 |
|
|
if (gff3) { |
1794 |
|
|
//print GFF3 mRNA line: |
1795 |
|
|
if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore); |
1796 |
|
|
else strcpy(tmpstr,"."); |
1797 |
|
|
uint pstart, pend; |
1798 |
|
|
if (gffp==pgffCDS) { |
1799 |
|
|
pstart=CDstart; |
1800 |
|
|
pend=CDend; |
1801 |
|
|
} |
1802 |
|
|
else { pstart=start;pend=end; } |
1803 |
gpertea |
16 |
//const char* ftype=isTranscript() ? "mRNA" : getFeatureName(); |
1804 |
|
|
const char* ftype=getFeatureName(); |
1805 |
gpertea |
2 |
fprintf(fout, |
1806 |
|
|
"%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s", |
1807 |
|
|
gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID); |
1808 |
gpertea |
16 |
if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend); |
1809 |
|
|
if (gfparent!=NULL) { |
1810 |
|
|
//parent override |
1811 |
|
|
fprintf(fout, ";Parent=%s",gfparent); |
1812 |
|
|
} |
1813 |
|
|
else { |
1814 |
|
|
if (parent!=NULL && !parent->isDiscarded()) |
1815 |
|
|
fprintf(fout, ";Parent=%s",parent->getID()); |
1816 |
|
|
} |
1817 |
|
|
if (geneID!=NULL) |
1818 |
|
|
fprintf(fout, ";geneID=%s",geneID); |
1819 |
|
|
if (gene_name!=NULL) |
1820 |
|
|
fprintf(fout, ";gene_name=%s",gene_name); |
1821 |
gpertea |
2 |
if (attrs!=NULL) { |
1822 |
|
|
for (int i=0;i<attrs->Count();i++) { |
1823 |
gpertea |
16 |
const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); |
1824 |
|
|
fprintf(fout,";%s=%s", attrname, |
1825 |
gpertea |
2 |
attrs->Get(i)->attr_val); |
1826 |
|
|
} |
1827 |
|
|
} |
1828 |
|
|
fprintf(fout,"\n"); |
1829 |
|
|
}// gff3 mRNA line |
1830 |
|
|
if (showExon) { |
1831 |
|
|
//print exons |
1832 |
gpertea |
16 |
if (isCDS && exons.Count()>0 && |
1833 |
|
|
((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0'))) |
1834 |
|
|
updateExonPhase(); |
1835 |
|
|
|
1836 |
|
|
for (int i=0;i<exons.Count();i++) { |
1837 |
|
|
printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3); |
1838 |
|
|
} |
1839 |
|
|
}//printing exons |
1840 |
gpertea |
2 |
if (showCDS && !isCDS && CDstart>0) { |
1841 |
|
|
GArray<GffCDSeg> cds(true,true); |
1842 |
|
|
getCDSegs(cds); |
1843 |
|
|
for (int i=0;i<cds.Count();i++) { |
1844 |
|
|
printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3); |
1845 |
|
|
} |
1846 |
|
|
} //showCDS |
1847 |
|
|
} |
1848 |
|
|
|
1849 |
gpertea |
16 |
void GffObj::updateExonPhase() { |
1850 |
|
|
if (!isCDS) return; |
1851 |
|
|
int cdsacc=0; |
1852 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1853 |
|
|
cdsacc+= 3-(CDphase-'0'); |
1854 |
|
|
} |
1855 |
|
|
if (strand=='-') { //reverse strand |
1856 |
|
|
for (int i=exons.Count()-1;i>=0;i--) { |
1857 |
|
|
exons[i]->phase='0'+ (3-cdsacc%3)%3; |
1858 |
|
|
cdsacc+=exons[i]->end-exons[i]->start+1; |
1859 |
|
|
} |
1860 |
|
|
} |
1861 |
|
|
else { //forward strand |
1862 |
|
|
for (int i=0;i<exons.Count();i++) { |
1863 |
|
|
exons[i]->phase='0'+ (3-cdsacc%3)%3; |
1864 |
|
|
cdsacc+=exons[i]->end-exons[i]->start+1; |
1865 |
|
|
} |
1866 |
|
|
} |
1867 |
|
|
} |
1868 |
gpertea |
2 |
|
1869 |
gpertea |
16 |
|
1870 |
gpertea |
2 |
void GffObj::getCDSegs(GArray<GffCDSeg>& cds) { |
1871 |
|
|
GffCDSeg cdseg; |
1872 |
|
|
int cdsacc=0; |
1873 |
|
|
if (CDphase=='1' || CDphase=='2') { |
1874 |
|
|
cdsacc+= 3-(CDphase-'0'); |
1875 |
|
|
} |
1876 |
|
|
if (strand=='-') { |
1877 |
|
|
for (int x=exons.Count()-1;x>=0;x--) { |
1878 |
|
|
uint sgstart=exons[x]->start; |
1879 |
|
|
uint sgend=exons[x]->end; |
1880 |
|
|
if (CDend<sgstart || CDstart>sgend) continue; |
1881 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) |
1882 |
|
|
sgstart=CDstart; //cdstart within this segment |
1883 |
|
|
if (CDend>=sgstart && CDend<=sgend) |
1884 |
|
|
sgend=CDend; //cdend within this segment |
1885 |
|
|
cdseg.start=sgstart; |
1886 |
|
|
cdseg.end=sgend; |
1887 |
|
|
cdseg.exonidx=x; |
1888 |
|
|
//cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); |
1889 |
|
|
cdseg.phase='0'+ (3-cdsacc%3)%3; |
1890 |
|
|
cdsacc+=sgend-sgstart+1; |
1891 |
|
|
cds.Add(cdseg); |
1892 |
|
|
} //for each exon |
1893 |
|
|
} // - strand |
1894 |
|
|
else { // + strand |
1895 |
|
|
for (int x=0;x<exons.Count();x++) { |
1896 |
|
|
uint sgstart=exons[x]->start; |
1897 |
|
|
uint sgend=exons[x]->end; |
1898 |
|
|
if (CDend<sgstart || CDstart>sgend) continue; |
1899 |
|
|
if (CDstart>=sgstart && CDstart<=sgend) |
1900 |
|
|
sgstart=CDstart; //seqstart within this segment |
1901 |
|
|
if (CDend>=sgstart && CDend<=sgend) |
1902 |
|
|
sgend=CDend; //seqend within this segment |
1903 |
|
|
cdseg.start=sgstart; |
1904 |
|
|
cdseg.end=sgend; |
1905 |
|
|
cdseg.exonidx=x; |
1906 |
|
|
//cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); |
1907 |
|
|
cdseg.phase='0' + (3-cdsacc%3)%3 ; |
1908 |
|
|
cdsacc+=sgend-sgstart+1; |
1909 |
|
|
cds.Add(cdseg); |
1910 |
|
|
} //for each exon |
1911 |
|
|
} // + strand |
1912 |
|
|
} |