ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 66
Committed: Fri Sep 16 18:33:54 2011 UTC (7 years, 10 months ago) by gpertea
File size: 65800 byte(s)
Log Message:
made some Flybase GFF3 "errors" non-fatal

Line File contents
1 #include "gff.h"
2
3 //GffNames* GffReader::names=NULL;
4 GffNames* GffObj::names=NULL;
5 //global set of feature names, attribute names etc.
6 // -- common for all GffObjs in current application!
7
8 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10 const uint GFF_MAX_INTRON= 6000000;
11 bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 const int gff_fid_mRNA=0;
13 const int gff_fid_transcript=1;
14 const int gff_fid_exon=2;
15 const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 const uint gfo_flag_HAS_ERRORS = 0x00000001;
17 const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 const uint gfo_flag_IS_GENE = 0x00000004;
19 const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20 const uint gfo_flag_FROM_GFF3 = 0x00000010;
21 const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22 const uint gfo_flag_DISCARDED = 0x00000100;
23 const uint gfo_flag_LST_KEEP = 0x00000200;
24 const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25 const byte gfo_flagShift_LEVEL = 16;
26
27 void gffnames_ref(GffNames* &n) {
28 if (n==NULL) n=new GffNames();
29 n->numrefs++;
30 }
31
32 void gffnames_unref(GffNames* &n) {
33 if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34 n->numrefs--;
35 if (n->numrefs==0) { delete n; n=NULL; }
36 }
37
38 int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39
40 GffObj& g1=*((GffObj*)p1);
41 GffObj& g2=*((GffObj*)p2);
42 if (g1.gseq_id==g2.gseq_id) {
43 if (g1.start!=g2.start)
44 return (int)(g1.start-g2.start);
45 else if (g1.getLevel()!=g2.getLevel())
46 return (int)(g1.getLevel()-g2.getLevel());
47 else
48 if (g1.end!=g2.end)
49 return (int)(g1.end-g2.end);
50 else return strcmp(g1.getID(), g2.getID());
51 }
52 else return (int)(g1.gseq_id-g2.gseq_id);
53 }
54
55 char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56 //parse a key attribute and remove it from the info string
57 //(only works for attributes that have values following them after ' ' or '=')
58 static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 int lpre=strlen(pre);
60 char cend=pre[lpre-1];
61 char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 if (pos==NULL) return NULL;
63 char* findstart=info;
64 //require word boundary on the left:
65 while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 findstart=pos+lpre;
67 pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 }
69 if (pos==NULL) return NULL;
70 if (cend!=' ' && cend!='=') {
71 //require word boundary on the right:
72 while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 findstart=pos+lpre;
74 pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 }
76 }
77 if (pos==NULL) return NULL;
78 char* vp=pos+lpre;
79 while (*vp==' ') vp++;
80 if (*vp==';' || *vp==0)
81 GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82 bool dq_enclosed=false; //value string enclosed by double quotes
83 if (*vp=='"') {
84 dq_enclosed=true;
85 vp++;
86 }
87 if (enforce_GTF2 && !dq_enclosed)
88 GError(GTF2_ERR,pre, dupline);
89 char* vend=vp;
90 if (dq_enclosed) {
91 while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92 }
93 else {
94 while (*vend!=';' && *vend!=0) vend++;
95 }
96 if (enforce_GTF2 && *vend!='"')
97 GError(GTF2_ERR, pre, dupline);
98 char *r=Gstrdup(vp, vend-1);
99 //-- now remove this attribute from the info string
100 while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101 if (*vend==0) vend--;
102 for (char *src=vend, *dest=pos;;src++,dest++) {
103 *dest=*src;
104 if (*src==0) break;
105 }
106 return r;
107 }
108
109 static char fnamelc[128];
110
111 GffLine::GffLine(GffReader* reader, const char* l) {
112 llen=strlen(l);
113 GMALLOC(line,llen+1);
114 memcpy(line, l, llen+1);
115 GMALLOC(dupline, llen+1);
116 memcpy(dupline, l, llen+1);
117 skip=true;
118 gseqname=NULL;
119 track=NULL;
120 ftype=NULL;
121 info=NULL;
122 _parents=NULL;
123 _parents_len=0;
124 num_parents=0;
125 parents=NULL;
126 is_gff3=false;
127 is_cds=false;
128 is_transcript=false;
129 is_exon=false;
130 is_gene=false;
131 exontype=0;
132 gene_id=NULL;
133 gene_name=NULL;
134 qstart=0;
135 qend=0;
136 qlen=0;
137 ID=NULL;
138 char* t[9];
139 int i=0;
140 int tidx=1;
141 t[0]=line;
142
143 while (line[i]!=0) {
144 if (line[i]=='\t') {
145 line[i]=0;
146 t[tidx]=line+i+1;
147 tidx++;
148 if (tidx>8) break;
149 }
150 i++;
151 }
152
153 if (tidx<8) { // ignore non-GFF lines
154 // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
155 return;
156 }
157 gseqname=t[0];
158 track=t[1];
159 ftype=t[2];
160 info=t[8];
161 char* p=t[3];
162 if (!parseUInt(p,fstart)) {
163 //FIXME: chromosome_band entries in Flybase
164 GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
165 return;
166 }
167 p=t[4];
168 if (!parseUInt(p,fend)) {
169 GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
170 return;
171 }
172 if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
173 p=t[5];
174 if (p[0]=='.' && p[1]==0) {
175 score=0;
176 }
177 else {
178 if (!parseDouble(p,score))
179 GError("Error parsing feature score from GFF line:\n%s\n",l);
180 }
181 strand=*t[6];
182 if (strand!='+' && strand!='-' && strand!='.')
183 GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
184 phase=*t[7]; // must be '.', '0', '1' or '2'
185 ID=NULL;
186 // exon/CDS/mrna filter
187 strncpy(fnamelc, ftype, 127);
188 fnamelc[127]=0;
189 strlower(fnamelc); //convert to lower case
190 bool is_t_data=false;
191 if (strstr(fnamelc, "utr")!=NULL) {
192 exontype=exgffUTR;
193 is_exon=true;
194 is_t_data=true;
195 }
196 else if (endsWith(fnamelc, "exon")) {
197 exontype=exgffExon;
198 is_exon=true;
199 is_t_data=true;
200 }
201 else if (strstr(fnamelc, "stop") &&
202 (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
203 exontype=exgffStop;
204 is_cds=true; //though some place it outside the last CDS segment
205 is_t_data=true;
206 }
207 else if (strstr(fnamelc, "start") &&
208 ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
209 exontype=exgffStart;
210 is_cds=true;
211 is_t_data=true;
212 }
213 else if (strcmp(fnamelc, "cds")==0) {
214 exontype=exgffCDS;
215 is_cds=true;
216 is_t_data=true;
217 }
218 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
219 is_gene=true;
220 is_t_data=true; //because its name will be attached to parented transcripts
221 }
222 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
223 is_transcript=true;
224 is_t_data=true;
225 }
226
227 if (reader->transcriptsOnly && !is_t_data) {
228 char* id=extractAttr("ID=");
229 if (id==NULL) id=extractAttr("transcript_id");
230 //GMessage("Discarding non-transcript line:\n%s\n",l);
231 if (id!=NULL) {
232 reader->discarded_ids.Add(id, new int(1));
233 GFREE(id);
234 }
235 return; //skip this line, unwanted feature name
236 }
237 ID=extractAttr("ID=");
238 char* Parent=extractAttr("Parent=");
239 is_gff3=(ID!=NULL || Parent!=NULL);
240 if (is_gff3) {
241 //parse as GFF3
242 if (ID!=NULL) {
243 //has ID attr so it's likely to be a parent feature
244 //look for explicit gene name
245 gene_name=extractAttr("gene_name=",false);
246 if (gene_name==NULL) {
247 gene_name=extractAttr("geneName=",false);
248 if (gene_name==NULL) {
249 gene_name=extractAttr("gene_sym=",false);
250 if (gene_name==NULL) {
251 gene_name=extractAttr("gene=",false);
252 }
253 }
254 }
255 gene_id=extractAttr("geneID=",false);
256 if (gene_id==NULL) {
257 gene_id=extractAttr("gene_id=",false);
258 }
259 if (is_gene) {
260 //special case: keep the Name and ID attributes of the gene feature
261 if (gene_name==NULL)
262 gene_name=extractAttr("Name=");
263 if (gene_id==NULL) //the ID is also gene_id in this case
264 gene_id=Gstrdup(ID);
265 //skip=false;
266 //return;
267 GFREE(Parent); //TMI, we really don't care about gene Parents?
268 } //gene feature
269 }// has GFF3 ID
270 if (Parent!=NULL) {
271 //keep Parent attr
272 //parse multiple parents
273 num_parents=1;
274 p=Parent;
275 int last_delim_pos=-1;
276 while (*p!=';' && *p!=0) {
277 if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
278 num_parents++;
279 last_delim_pos=(p-Parent);
280 }
281 p++;
282 }
283 _parents_len=p-Parent+1;
284 _parents=Parent;
285 GMALLOC(parents, num_parents*sizeof(char*));
286 parents[0]=_parents;
287 int i=1;
288 if (last_delim_pos>0) {
289 for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
290 if (*p==',') {
291 char* ep=p-1;
292 while (*ep==' ' && ep>_parents) ep--;
293 *(ep+1)=0; //end the string there
294 parents[i]=p+1;
295 i++;
296 }
297 }
298 }
299 } //has Parent field
300 } //GFF3
301 else { // GTF-like expected
302 Parent=extractAttr("transcript_id");
303 if (Parent!=NULL) { //GTF2 format detected
304 if (is_transcript) {
305 // atypical GTF with a parent transcript line declared
306 ID=Parent;
307 Parent=NULL;
308 }
309 gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
310 gene_name=extractAttr("gene_name");
311 if (gene_name==NULL) {
312 gene_name=extractAttr("gene_sym");
313 if (gene_name==NULL)
314 gene_name=extractAttr("gene");
315 }
316 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
317 //after the attribute name
318 p=info;
319 bool noed=true; //not edited after the last delim
320 bool nsp=false; //non-space found after last delim
321 while (*p!=0) {
322 if (*p==' ') {
323 if (nsp && noed) {
324 *p='=';
325 noed=false;
326 p++;
327 continue;
328 }
329 }
330 else nsp=true; //non-space
331 if (*p==';') { noed=true; nsp=false; }
332 p++;
333 }
334 } //GTF2 detected (no parent line)
335 else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
336 //char* fexon=strstr(fnamelc, "exon");
337 //if (fexon!=NULL) {
338 if (exontype==exgffExon) {
339 if (startsWith(track,"jigsaw")) {
340 is_cds=true;
341 strcpy(track,"jigsaw");
342 p=strchr(info,';');
343 if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
344 else { Parent=Gstrdup(info,p-1);
345 info=p+1;
346 }
347 }
348 } //exon feature?
349 if (Parent==NULL && exontype>=exgffCDS &&
350 (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
351 //one word ID ? really desperate attempt to parse it here
352 Parent=Gstrdup(info,info+i-1);
353 info=NULL; //discard anything else on the line
354 }
355 }
356 if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
357 _parents=Parent;
358 GMALLOC(parents,sizeof(char*));
359 num_parents=1;
360 parents[0]=_parents;
361 }
362 } //GTF-like
363
364 //parse other potentially useful features
365 if (is_gff3) {
366 if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
367 p+=7;
368 while (*p!=';' && *p!=0 && *p!=' ') p++;
369 if (*p!=' ') {
370 GError("Error parsing target coordinates from GFF line:\n%s\n",l);
371 }
372 if (!parseUInt(p,qstart))
373 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
374 if (*p!=' ') {
375 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
376 }
377 p++;
378 if (!parseUInt(p,qend))
379 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
380 }
381 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
382 p+=5;
383 if (!parseUInt(p,qstart))
384 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
385 if (*p!='-') {
386 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
387 }
388 p++;
389 if (!parseUInt(p,qend))
390 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
391 if (*p=='|' || *p==':') {
392 p++;
393 if (!parseUInt(p,qlen))
394 GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
395 }
396 }//has Qreg attr
397 if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
398 p+=5;
399 if (!parseUInt(p,qlen))
400 GError("Error parsing target length from GFF Qlen:\n%s\n",l);
401 }
402 }//parsing some useful attributes in GFF3 records
403 if (ID==NULL && parents==NULL) {
404 if (reader->gff_warns)
405 GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
406 return; //skip
407 }
408 skip=false;
409 }
410
411
412 void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
413 if (cd_start>=this->start) {
414 this->CDstart=cd_start;
415 if (strand=='+') this->CDphase=phase;
416 }
417 else this->CDstart=this->start;
418 if (cd_end<=this->end) {
419 this->CDend=cd_end;
420 if (strand=='-') this->CDphase=phase;
421 }
422 else this->CDend=this->end;
423 isTranscript(true);
424 exon_ftype_id=gff_fid_exon;
425 if (monoFeature()) {
426 if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
427 else exons[0]->exontype=exgffExon;
428 }
429 }
430
431 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
432 //this will make sure we have the right subftype_id!
433 //int subf_id=-1;
434 if (!isTranscript() && gl->is_cds) {
435 isTranscript(true);
436 exon_ftype_id=gff_fid_exon;
437 if (exons.Count()==1) exons[0]->exontype=exgffExon;
438 }
439 if (isTranscript()) {
440 if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
441 if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
442 else exon_ftype_id=names->feats.addName(gl->ftype);
443 }
444 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
445 if (gl->exontype==0 && !gl->is_transcript) {
446 //extraneous mRNA feature, discard
447 if (reader->gff_warns)
448 GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
449 gl->ftype, gffID);
450 return -1;
451 }
452 }
453 else { //non-mRNA parent feature, check this subf type
454 int subf_id=names->feats.addName(gl->ftype);
455 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
456 exon_ftype_id=subf_id;
457 else {
458 if (exon_ftype_id!=subf_id) {
459 //
460 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
461 //the existing exon was just a dummy one created by default, discard it
462 exons.Clear();
463 covlen=0;
464 exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
465 }
466 else { //multiple subfeatures, prefer those with
467 if (reader->gff_warns)
468 GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
469 names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
470 if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
471 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
472 exon_ftype_id=subf_id;
473 exons.Clear();
474 covlen=0;
475 }
476 else { //discard new feature
477 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
478 return -1; //skip this 2nd subfeature type for this parent!
479 }
480 }
481 } //incoming subfeature is of different type
482 } //new subfeature type
483 } //non-mRNA parent
484 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
485 gl->qstart,gl->qend, gl->is_cds, gl->exontype);
486 if (eidx<0) return eidx; //this should never happen
487 if (keepAttr) {
488 if (noExonAttr) {
489 if (attrs==NULL) //place the parsed attributes directly at transcript level
490 parseAttrs(attrs, gl->info);
491 }
492 else { //need all exon-level attributes
493 parseAttrs(exons[eidx]->attrs, gl->info, true);
494 }
495 }
496 return eidx;
497 }
498
499
500 int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
501 if (exons.Count()==0) {
502 if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
503 if (exon_ftype_id<0) {
504 exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
505 }
506 }
507 //special treatment of start/stop codon features, they might be broken/split between exons
508 //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
509 //so we should not trust the end coordinate for such features
510 if (exontype==exgffStart || exontype==exgffStop) {
511 if (strand=='-') segstart=segend;
512 else segend=segstart;
513 if (exontype==exgffStart) {
514 if (CDstart==0 || segstart<CDstart) CDstart=segstart;
515 }
516 else {
517 if (segstart>CDend) CDend=segstart;
518 }
519 }
520 else if (iscds) { //update CDS anchors:
521 if (CDstart==0 || segstart<CDstart) {
522 CDstart=segstart;
523 if (exontype==exgffCDS && strand=='+') CDphase=fr;
524 }
525 if (segend>CDend) {
526 if (exontype==exgffCDS && strand=='-') CDphase=fr;
527 CDend=segend;
528 }
529 }
530 else { // not a CDS/start/stop
531 isCDS=false;
532 }
533 if (qs || qe) {
534 if (qs>qe) swap(qs,qe);
535 if (qs==0) qs=1;
536 }
537 int ovlen=0;
538 if (exontype>0) { //check for overlaps between exon-type segments
539 int oi=exonOverlapIdx(segstart, segend, &ovlen);
540 if (oi>=0) { //overlap existing segment
541 if (ovlen==0) {
542 //adjacent segments will be merged
543 //e.g. CDS to (UTR|exon)
544 if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
545 (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
546 expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
547 return oi;
548 }
549 //CDS adjacent to stop_codon: UCSC does (did?) this
550 if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
551 (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
552 expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
553 return oi;
554 }
555 }
556 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
557 // start_codon within (CDS|exon)
558 if (exons[oi]->exontype>exontype &&
559 exons[oi]->start<=segstart && exons[oi]->end>=segend &&
560 !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
561 //larger segment given first, now the smaller included one is redundant
562 return oi; //only used to store attributes from current GffLine
563 }
564 if (exontype>exons[oi]->exontype &&
565 segstart<=exons[oi]->start && segend>=exons[oi]->end &&
566 !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
567 //smaller segment given first, so we have to enlarge it
568 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
569 //this should also check for overlapping next exon (oi+1) ?
570 return oi;
571 }
572 //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
573 //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
574 //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
575 // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
576 /*
577 if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
578 if (gff_show_warnings)
579 GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
580 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
581 hasErrors(true);
582 return -1; //segment NOT added
583 }
584 */
585
586 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
587 if (gff_show_warnings)
588 GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
589 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
590 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
591 return oi;
592 }
593 // else add the segment if the overlap is small and between two CDS segments
594 //TODO: we might want to add an attribute here with the slippage coordinate and size?
595 covlen-=ovlen;
596 }//overlap or adjacent to existing segment
597 } //check for overlap
598 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
599 // create & add the new segment
600 /*
601 if (start>0 && exontype==exgffCDS && exons.Count()==0) {
602 //adding a CDS directly as the first subfeature of a declared parent
603 segstart=start;
604 segend=end;
605 }
606 */
607 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
608 int eidx=exons.Add(enew);
609 if (eidx<0) {
610 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
611 if (gff_show_warnings)
612 GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
613 segstart, segend, gffID);
614 delete enew;
615 hasErrors(true);
616 return -1;
617 }
618 covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
619 //adjust parent feature coordinates to contain this exon
620 if (start==0 || start>exons.First()->start) {
621 start=exons.First()->start;
622 }
623 if (end<exons.Last()->end) end=exons.Last()->end;
624
625 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
626 GSeqStat* gsd=(GSeqStat*)uptr;
627 if (start<gsd->mincoord) gsd->mincoord=start;
628 if (end>gsd->maxcoord) gsd->maxcoord=end;
629 if (this->len()>gsd->maxfeat_len) {
630 gsd->maxfeat_len=this->len();
631 gsd->maxfeat=this;
632 }
633 }
634 return eidx;
635 }
636
637 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
638 //oi is the index of the *first* overlapping segment found that must be enlarged
639 covlen-=exons[oi]->len();
640 if (segstart<exons[oi]->start)
641 exons[oi]->start=segstart;
642 if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
643 if (segend>exons[oi]->end)
644 exons[oi]->end=segend;
645 if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
646 //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
647 if (sc!=0) exons[oi]->score=sc;
648 covlen+=exons[oi]->len();
649 //if (exons[oi]->exontype< exontype) -- always true
650 exons[oi]->exontype = exontype;
651 if (exontype==exgffCDS) exons[oi]->phase=fr;
652 //we must check if any more exons are also overlapping this
653 int ni=oi+1; //next exon index after oi
654 while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
655 //only allow this if next segment is fully included, and a subordinate
656 if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
657 /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
658 chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
659 chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
660 chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
661 */
662 if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
663 if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
664 exons.Delete(ni);
665 }
666 else {
667 if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
668 exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
669 //hasErrors(true);
670 break;
671 }
672 }
673 // -- make sure any other related boundaries are updated:
674 start=exons.First()->start;
675 end=exons.Last()->end;
676 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
677 GSeqStat* gsd=(GSeqStat*)uptr;
678 if (start<gsd->mincoord) gsd->mincoord=start;
679 if (end>gsd->maxcoord) gsd->maxcoord=end;
680 if (this->len()>gsd->maxfeat_len) {
681 gsd->maxfeat_len=this->len();
682 gsd->maxfeat=this;
683 }
684 }
685 }
686
687 void GffObj::removeExon(int idx) {
688 /*
689 if (idx==0 && segs[0].start==gstart)
690 gstart=segs[1].start;
691 if (idx==segcount && segs[segcount].end==gend)
692 gend=segs[segcount-1].end;
693 */
694 if (idx<0 || idx>=exons.Count()) return;
695 int segstart=exons[idx]->start;
696 int segend=exons[idx]->end;
697 exons.Delete(idx);
698 covlen -= (int)(segend-segstart)+1;
699 start=exons.First()->start;
700 end=exons.Last()->end;
701 if (isCDS) { CDstart=start; CDend=end; }
702 }
703
704 void GffObj::removeExon(GffExon* p) {
705 for (int idx=0;idx<exons.Count();idx++) {
706 if (exons[idx]==p) {
707 int segstart=exons[idx]->start;
708 int segend=exons[idx]->end;
709 exons.Delete(idx);
710 covlen -= (int)(segend-segstart)+1;
711 start=exons.First()->start;
712 end=exons.Last()->end;
713 if (isCDS) { CDstart=start; CDend=end; }
714 return;
715 }
716 }
717 }
718
719
720
721 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
722 GSeg(0,0), exons(true,true,false), children(1,false) {
723 xstart=0;
724 xend=0;
725 xstatus=0;
726 partial=false;
727 isCDS=false;
728 uptr=NULL;
729 ulink=NULL;
730 parent=NULL;
731 udata=0;
732 flags=0;
733 CDstart=0;
734 CDend=0;
735 CDphase=0;
736 geneID=NULL;
737 gene_name=NULL;
738 attrs=NULL;
739 gffID=NULL;
740 track_id=-1;
741 gseq_id=-1;
742 ftype_id=-1;
743 exon_ftype_id=-1;
744 strand='.';
745 if (gfrd==NULL)
746 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
747 gffnames_ref(names);
748 if (gfrd->names==NULL) gfrd->names=names;
749 //qlen=0;qstart=0;qend=0;
750 gscore=0;
751 uscore=0;
752 covlen=0;
753 qcov=0;
754 start=gffline->fstart;
755 end=gffline->fend;
756 gseq_id=names->gseqs.addName(gffline->gseqname);
757 track_id=names->tracks.addName(gffline->track);
758 strand=gffline->strand;
759 qlen=gffline->qlen;
760 qstart=gffline->qstart;
761 qend=gffline->qend;
762 //setup flags from gffline
763 isCDS=gffline->is_cds; //for now
764 isGene(gffline->is_gene);
765 isTranscript(gffline->is_transcript || gffline->exontype!=0);
766 fromGff3(gffline->is_gff3);
767
768 if (gffline->parents!=NULL) {
769 //GTF style -- create a GffObj directly by subfeature
770 //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
771 if (gffline->exontype!=0) { //recognized exon-like feature
772 ftype_id=gff_fid_transcript; //so this is some sort of transcript
773 exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
774 }
775 else {//unrecognized subfeatures
776 //make this GffObj of the same feature type
777 ftype_id=names->feats.addName(gffline->ftype);
778 }
779 if (gffline->ID==NULL) { //typical GTF
780 gffID=Gstrdup(gffline->parents[0]);
781 this->createdByExon(true);
782 //this is likely the first exon/segment of the feature
783 addExon(gfrd, gffline, keepAttr, noExonAttr);
784 }
785 else { //a parented feature with an ID -- probably an orphan GFF3 line
786 if (gffline->is_gff3 && gffline->exontype!=0) {
787 //premature exon given before its parent transcript
788 //create the transcript entry here
789 gffID=Gstrdup(gffline->parents[0]);
790 this->createdByExon(true);
791 //this is the first exon/segment of the transcript
792 addExon(gfrd, gffline, keepAttr, noExonAttr);
793 }
794 else { //unrecognized non-exon feature ? use the ID instead
795 gffID=Gstrdup(gffline->ID);
796 if (keepAttr) this->parseAttrs(attrs, gffline->info);
797 }
798 }
799 } //subfeature given directly
800 else { //gffline->parents==NULL
801 //create a parent feature in its own right
802 gscore=gffline->score;
803 if (gffline->ID==NULL || gffline->ID[0]==0)
804 GError("Error: no ID found for GFF record start\n");
805 gffID=Gstrdup(gffline->ID); //there must be an ID here
806 //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
807 //else
808 ftype_id=names->feats.addName(gffline->ftype);
809 if (gffline->is_transcript)
810 exon_ftype_id=gff_fid_exon;
811
812 if (keepAttr) this->parseAttrs(attrs, gffline->info);
813 }//no parent
814
815 if (gffline->gene_name!=NULL) {
816 gene_name=Gstrdup(gffline->gene_name);
817 }
818 if (gffline->gene_id!=NULL) {
819 geneID=Gstrdup(gffline->gene_id);
820 }
821
822 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
823 uptr=gsd;
824 if (start<gsd->mincoord) gsd->mincoord=start;
825 if (end>gsd->maxcoord) gsd->maxcoord=end;
826 if (this->len()>gsd->maxfeat_len) {
827 gsd->maxfeat_len=this->len();
828 gsd->maxfeat=this;
829 }
830 }
831
832 GffLine* GffReader::nextGffLine() {
833 if (gffline!=NULL) return gffline; //caller should free gffline after processing
834 while (gffline==NULL) {
835 int llen=0;
836 buflen=GFF_LINELEN-1;
837 char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
838 if (l==NULL) {
839 return NULL; //end of file
840 }
841 int ns=0; //first nonspace position
842 while (l[ns]!=0 && isspace(l[ns])) ns++;
843 if (l[ns]=='#' || llen<10) continue;
844 gffline=new GffLine(this, l);
845 if (gffline->skip) {
846 delete gffline;
847 gffline=NULL;
848 continue;
849 }
850 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
851 //this might not be needed, already checked in the GffLine constructor
852 if (gff_warns)
853 GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
854 delete gffline;
855 gffline=NULL;
856 //continue;
857 }
858 }
859 return gffline;
860 }
861
862 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
863 //caller must free the returned pointer
864 char* buf=NULL;
865 int idlen=strlen(id);
866 GMALLOC(buf, idlen+strlen(ctg)+2);
867 strcpy(buf, id);
868 buf[idlen]='~';
869 strcpy(buf+idlen+1, ctg);
870 return buf;
871 }
872
873 void GffReader::gfoRemove(const char* id, const char* ctg) {
874 char* buf=gfoBuildId(id,ctg);
875 phash.Remove(buf);
876 GFREE(buf);
877 }
878
879 //Warning: if gflst gets altered, idx becomes obsolete
880 GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
881 char* buf=gfoBuildId(id,ctg);
882 GfoHolder* r=new GfoHolder(gfo,idx);
883 phash.Add(buf, r);
884 GFREE(buf);
885 return r;
886 }
887
888 GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
889 char* buf=gfoBuildId(id,ctg);
890 GfoHolder* r=phash.Find(buf);
891 GFREE(buf);
892 return r;
893 }
894
895 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
896 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
897 GfoHolder* r=NULL;
898 if (replaceidx>=0) {
899 gflst.Put(replaceidx,newgfo);
900 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
901 }
902 else {
903 int gfoidx=gflst.Add(newgfo);
904 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
905 }
906 if (gff_warns) {
907 int* pcount=tids.Find(newgfo->gffID);
908 if (pcount!=NULL) {
909 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
910 (*pcount)++;
911 }
912 else {
913 tids.Add(newgfo->gffID,new int(1));
914 }
915 }
916 return r;
917 }
918
919 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
920 //assert(parent);
921 //assert(newgfo);
922 parent->children.Add(newgfh->gffobj);
923 if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
924 newgfh->gffobj->setLevel(parent->getLevel()+1);
925 if (parent->isGene()) {
926 if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
927 newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
928 if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
929 newgfh->gffobj->geneID=Gstrdup(parent->geneID);
930 }
931
932 return newgfh;
933 }
934
935 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
936 GffObj* parent, GffExon* pexon) {
937 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
938 GfoHolder* r=NULL;
939 int gfoidx=gflst.Add(newgfo);
940 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
941 if (parent!=NULL) {
942 updateParent(r, parent);
943 if (pexon!=NULL) parent->removeExon(pexon);
944 }
945 if (gff_warns) {
946 int* pcount=tids.Find(newgfo->gffID);
947 if (pcount!=NULL) {
948 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
949 (*pcount)++;
950 }
951 else {
952 tids.Add(newgfo->gffID,new int(1));
953 }
954 }
955 return r;
956 }
957
958 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
959 bool keepAttr) {
960 if (prevgfo==NULL) return NULL;
961 prevgfo->gffobj->createdByExon(false);
962 prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
963 prevgfo->gffobj->start=gffline->fstart;
964 prevgfo->gffobj->end=gffline->fend;
965 prevgfo->gffobj->isGene(gffline->is_gene);
966 prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
967 prevgfo->gffobj->fromGff3(gffline->is_gff3);
968 if (keepAttr) {
969 if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
970 prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
971 }
972 return prevgfo;
973 }
974
975
976 bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
977 bool r=true;
978 if (gffline->strand!=prevgfo->gffobj->strand) {
979 if (prevgfo->gffobj->strand=='.') {
980 prevgfo->gffobj->strand=gffline->strand;
981 }
982 else {
983 GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
984 prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
985 gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
986 //r=false;
987 return true; //FIXME: split trans-spliced mRNAs by strand
988 }
989 }
990 int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
991 ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
992 0 );
993 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
994 GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
995 //validation_errors = true;
996 r=false;
997 if (!gff_warns) exit(1);
998 }
999 int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
1000 if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1001 subfPoolAdd(pex, prevgfo);
1002 return r;
1003 }
1004
1005 CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1006 CNonExon* subp=NULL;
1007 subp_name=NULL;
1008 for (int i=0;i<gffline->num_parents;i++) {
1009 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1010 continue;
1011 subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1012 subp=pex.Find(subp_name);
1013 if (subp!=NULL)
1014 return subp;
1015 GFREE(subp_name);
1016 }
1017 return NULL;
1018 }
1019
1020 void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1021 //this might become a parent feature later
1022 if (newgfo->gffobj->exons.Count()>0) {
1023 char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1024 pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1025 newgfo->gffobj->exons[0], gffline));
1026 GFREE(xbuf);
1027 }
1028 }
1029
1030 GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1031 bool keepAttr, bool noExonAttr) {
1032 GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1033 if (prevp!=gflst[subp->idx])
1034 GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1035 subp->gffline->discardParent();
1036 GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1037 pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1038 prevp->promotedChildren(true);
1039 return gfoh; //returns the holder of newly promoted feature
1040 }
1041
1042 //have to parse the whole file because exons can be scattered all over
1043 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1044 bool validation_errors = false;
1045 //loc_debug=false;
1046 GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1047 //and thus could become promoted to parent features
1048 while (nextGffLine()!=NULL) {
1049 //seen this gff ID before?
1050 GfoHolder* prevseen=NULL;
1051 if (gffline->ID) //GFF3
1052 prevseen=gfoFind(gffline->ID, gffline->gseqname);
1053 if (prevseen!=NULL) {
1054 if (prevseen->gffobj->createdByExon()) {
1055 updateGffRec(prevseen, gffline, keepAttr);
1056 }
1057 else {
1058 GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1059 validation_errors = true;
1060 if (gff_warns) {
1061 delete gffline; gffline=NULL; continue;
1062 }
1063 else exit(1);
1064 }
1065 }
1066 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1067 if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1068 }
1069 else { //--- it's a parented feature (could still be a mRNA)
1070 bool found_parent=false;
1071 GfoHolder* newgfo=prevseen;
1072 for (int i=0;i<gffline->num_parents;i++) {
1073 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1074 continue; //skipping discarded parent feature
1075 GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1076 if (parentgfo!=NULL) { //parent GffObj parsed earlier
1077 found_parent=true;
1078 if (parentgfo->gffobj->isGene() && gffline->is_transcript
1079 && gffline->exontype==0) {
1080 //not an exon, but a transcript parented by a gene
1081 if (newgfo) {
1082 updateParent(newgfo, parentgfo->gffobj);
1083 }
1084 else {
1085 newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1086 }
1087 }
1088 else { //potential exon subfeature
1089 if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1090 validation_errors=true;
1091 }
1092 }
1093 } //for each parsed parent Id
1094 if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1095 //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1096 //check if this feature isn't parented by a previously stored "exon" subfeature
1097 char* subp_name=NULL;
1098 CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1099 if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1100 //promote that subfeature to a full GffObj
1101 GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1102 //add current gffline as an exon of the newly promoted subfeature
1103 if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1104 validation_errors=true;
1105 }
1106 else { //no parent seen before, create one directly with this exon
1107 //loc_debug=true;
1108 GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1109 if (gffline->ID!=NULL && gffline->exontype==0)
1110 subfPoolAdd(pex, newgfo);
1111 //even those with errors will be added here!
1112 }
1113 GFREE(subp_name);
1114 } //no previous parent found
1115 } //parented feature
1116 //--
1117 delete gffline;
1118 gffline=NULL;
1119 }//while gff lines
1120 gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1121 // all gff records are now loaded in GList gflst
1122 // so we can free the hash
1123 phash.Clear();
1124 tids.Clear();
1125 if (validation_errors) {
1126 exit(1);
1127 }
1128 }
1129
1130 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1131 //merge
1132 //always merge adjacent or overlapping segments
1133 //but if mergeCloseExons then merge even when distance is up to 5 bases
1134 udata=0;
1135 uptr=NULL;
1136 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1137 isDiscarded(true);
1138 }
1139 if (ftype_id==gff_fid_transcript && CDstart>0) {
1140 ftype_id=gff_fid_mRNA;
1141 //exon_ftype_id=gff_fid_exon;
1142 }
1143 //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1144 if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1145 int mindist=mergeCloseExons ? 5:1;
1146 for (int i=0;i<exons.Count()-1;i++) {
1147 int ni=i+1;
1148 uint mend=exons[i]->end;
1149 while (ni<exons.Count()) {
1150 int dist=(int)(exons[ni]->start-mend);
1151 if (dist>mindist) break; //no merging with next segment
1152 if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1153 GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1154 gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1155 }
1156 mend=exons[ni]->end;
1157 covlen-=exons[i]->len();
1158 exons[i]->end=mend;
1159 covlen+=exons[i]->len();
1160 covlen-=exons[ni]->len();
1161 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1162 exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1163 //use the other exon attributes, if more
1164 delete(exons[i]->attrs);
1165 exons[i]->attrs=exons[ni]->attrs;
1166 exons[ni]->attrs=NULL;
1167 }
1168 exons.Delete(ni);
1169 } //check for merge with next exon
1170 } //for each exon
1171 }
1172 //attribute reduction for GTF records
1173 if (keepAttrs && !noExonAttr && !fromGff3()
1174 && exons.Count()>0 && exons[0]->attrs!=NULL) {
1175 bool attrs_discarded=false;
1176 for (int a=0;a<exons[0]->attrs->Count();a++) {
1177 int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1178 char* attr_name=names->attrs.getName(attr_name_id);
1179 char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1180 bool sameExonAttr=true;
1181 for (int i=1;i<exons.Count();i++) {
1182 char* ov=exons[i]->getAttr(attr_name_id);
1183 if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1184 sameExonAttr=false;
1185 break;
1186 }
1187 }
1188 if (sameExonAttr) {
1189 //delete this attribute from exons level
1190 attrs_discarded=true;
1191 this->addAttr(attr_name, attr_val);
1192 for (int i=1;i<exons.Count();i++) {
1193 removeExonAttr(*(exons[i]), attr_name_id);
1194 }
1195 exons[0]->attrs->freeItem(a);
1196 }
1197 }
1198 if (attrs_discarded) exons[0]->attrs->Pack();
1199 }
1200 return this;
1201 }
1202
1203 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1204 if (names==NULL)
1205 GError(ERR_NULL_GFNAMES, "parseAttrs()");
1206 if (atrlist==NULL)
1207 atrlist=new GffAttrs();
1208 char* endinfo=info+strlen(info);
1209 char* start=info;
1210 char* pch=start;
1211 while (start<endinfo) {
1212 //skip spaces
1213 while (*start==' ' && start<endinfo) start++;
1214 pch=strchr(start, ';');
1215 if (pch==NULL) pch=endinfo;
1216 else {
1217 *pch='\0';
1218 pch++;
1219 }
1220 char* ech=strchr(start,'=');
1221 if (ech!=NULL) { // attr=value format found
1222 *ech='\0';
1223 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1224 if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1225 strcmp(start, "exon_id")==0)
1226 { start=pch; continue; }
1227 ech++;
1228 while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1229 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1230 //make sure we don't add the same attribute more than once
1231 if (isExon && (strcmp(start, "protein_id")==0)) {
1232 //Ensembl special case
1233 this->addAttr(start, ech);
1234 start=pch;
1235 continue;
1236 }
1237 atrlist->add_or_update(names, start, ech);
1238 }
1239 /*
1240 else { //not an attr=value format
1241 atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1242 }
1243 */
1244 start=pch;
1245 }
1246 if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1247 }
1248
1249 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1250 if (this->attrs==NULL)
1251 this->attrs=new GffAttrs();
1252 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1253 this->attrs->add_or_update(names, attrname, attrvalue);
1254 }
1255
1256
1257 void GffObj::setFeatureName(const char* feature) {
1258 //change the feature name/type for a transcript
1259 int fid=names->feats.addName(feature);
1260 if (monoFeature() && exons.Count()>0)
1261 this->exon_ftype_id=fid;
1262 this->ftype_id=fid;
1263 }
1264
1265 void GffObj::setRefName(const char* newname) {
1266 //change the feature name/type for a transcript
1267 int rid=names->gseqs.addName(newname);
1268 this->gseq_id=rid;
1269 }
1270
1271
1272
1273 int GffObj::removeAttr(const char* attrname, const char* attrval) {
1274 if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1275 int aid=this->names->attrs.getId(attrname);
1276 if (aid<0) return 0;
1277 int delcount=0; //could be more than one ?
1278 for (int i=0;i<this->attrs->Count();i++) {
1279 if (aid==this->attrs->Get(i)->attr_id) {
1280 if (attrval==NULL ||
1281 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1282 delcount++;
1283 this->attrs->freeItem(i);
1284 }
1285 }
1286 }
1287 if (delcount>0) this->attrs->Pack();
1288 return delcount;
1289 }
1290
1291 int GffObj::removeAttr(int aid, const char* attrval) {
1292 if (this->attrs==NULL || aid<0) return 0;
1293 int delcount=0; //could be more than one ?
1294 for (int i=0;i<this->attrs->Count();i++) {
1295 if (aid==this->attrs->Get(i)->attr_id) {
1296 if (attrval==NULL ||
1297 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1298 delcount++;
1299 this->attrs->freeItem(i);
1300 }
1301 }
1302 }
1303 if (delcount>0) this->attrs->Pack();
1304 return delcount;
1305 }
1306
1307
1308 int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1309 if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1310 int aid=this->names->attrs.getId(attrname);
1311 if (aid<0) return 0;
1312 int delcount=0; //could be more than one
1313 for (int i=0;i<exon.attrs->Count();i++) {
1314 if (aid==exon.attrs->Get(i)->attr_id) {
1315 if (attrval==NULL ||
1316 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1317 delcount++;
1318 exon.attrs->freeItem(i);
1319 }
1320 }
1321 }
1322 if (delcount>0) exon.attrs->Pack();
1323 return delcount;
1324 }
1325
1326 int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1327 if (exon.attrs==NULL || aid<0) return 0;
1328 int delcount=0; //could be more than one
1329 for (int i=0;i<exon.attrs->Count();i++) {
1330 if (aid==exon.attrs->Get(i)->attr_id) {
1331 if (attrval==NULL ||
1332 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1333 delcount++;
1334 exon.attrs->freeItem(i);
1335 }
1336 }
1337 }
1338 if (delcount>0) exon.attrs->Pack();
1339 return delcount;
1340 }
1341
1342
1343 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1344 cds_start=0;
1345 cds_end=0;
1346 if (CDstart==0 || CDend==0) return; //no CDS info
1347 int cdsadj=0;
1348 if (CDphase=='1' || CDphase=='2') {
1349 cdsadj=CDphase-'0';
1350 }
1351 cds_start=CDstart;
1352 cds_end=CDend;
1353 if (strand=='-') cds_end-=cdsadj;
1354 else cds_start+=cdsadj;
1355 }
1356
1357 void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1358 //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1359 cds_mstart=0;
1360 cds_mend=0;
1361 if (CDstart==0 || CDend==0) return; //no CDS info
1362 //restore normal coordinates, just in case
1363 unxcoord();
1364 int cdsadj=0;
1365 if (CDphase=='1' || CDphase=='2') {
1366 cdsadj=CDphase-'0';
1367 }
1368 /*
1369 uint seqstart=CDstart;
1370 uint seqend=CDend;
1371 */
1372 uint seqstart=exons.First()->start;
1373 uint seqend=exons.Last()->end;
1374 int s=0; //resulting nucleotide counter
1375 if (strand=='-') {
1376 for (int x=exons.Count()-1;x>=0;x--) {
1377 uint sgstart=exons[x]->start;
1378 uint sgend=exons[x]->end;
1379 if (seqend<sgstart || seqstart>sgend) continue;
1380 if (seqstart>=sgstart && seqstart<=sgend)
1381 sgstart=seqstart; //seqstart within this segment
1382 if (seqend>=sgstart && seqend<=sgend)
1383 sgend=seqend; //seqend within this segment
1384 s+=(int)(sgend-sgstart)+1;
1385 if (CDstart>=sgstart && CDstart<=sgend) {
1386 //CDstart in this segment
1387 //and we are getting the whole transcript
1388 cds_mend=s-(int)(CDstart-sgstart);
1389 }
1390 if (CDend>=sgstart && CDend<=sgend) {
1391 //CDstart in this segment
1392 //and we are getting the whole transcript
1393 cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1394 }
1395 } //for each exon
1396 } // - strand
1397 else { // + strand
1398 for (int x=0;x<exons.Count();x++) {
1399 uint sgstart=exons[x]->start;
1400 uint sgend=exons[x]->end;
1401 if (seqend<sgstart || seqstart>sgend) continue;
1402 if (seqstart>=sgstart && seqstart<=sgend)
1403 sgstart=seqstart; //seqstart within this segment
1404 if (seqend>=sgstart && seqend<=sgend)
1405 sgend=seqend; //seqend within this segment
1406 s+=(int)(sgend-sgstart)+1;
1407 /* for (uint i=sgstart;i<=sgend;i++) {
1408 spliced[s]=gsubseq[i-gstart];
1409 s++;
1410 }//for each nt
1411 */
1412 if (CDstart>=sgstart && CDstart<=sgend) {
1413 //CDstart in this segment
1414 cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1415 }
1416 if (CDend>=sgstart && CDend<=sgend) {
1417 //CDend in this segment
1418 cds_mend=s-(int)(sgend-CDend);
1419 }
1420 } //for each exon
1421 } // + strand
1422 //spliced[s]=0;
1423 //if (rlen!=NULL) *rlen=s;
1424 //return spliced;
1425 }
1426
1427 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1428 {
1429 if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1430 return NULL;
1431 }
1432 //restore normal coordinates:
1433 unxcoord();
1434 if (exons.Count()==0) return NULL;
1435 int fspan=end-start+1;
1436 const char* gsubseq=faseq->subseq(start, fspan);
1437 if (gsubseq==NULL) {
1438 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1439 }
1440 char* unspliced=NULL;
1441
1442 int seqstart=exons.First()->start;
1443 int seqend=exons.Last()->end;
1444
1445 int unsplicedlen = 0;
1446
1447 unsplicedlen += seqend - seqstart + 1;
1448
1449 GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1450 //uint seqstart, seqend;
1451
1452 int s = 0; //resulting nucleotide counter
1453 if (strand=='-')
1454 {
1455 if (seglst!=NULL)
1456 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1457 for (int i=seqend;i>=seqstart;i--)
1458 {
1459 unspliced[s] = ntComplement(gsubseq[i-start]);
1460 s++;
1461 }//for each nt
1462 } // - strand
1463 else
1464 { // + strand
1465 if (seglst!=NULL)
1466 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1467 for (int i=seqstart;i<=seqend;i++)
1468 {
1469 unspliced[s]=gsubseq[i-start];
1470 s++;
1471 }//for each nt
1472 } // + strand
1473 //assert(s <= unsplicedlen);
1474 unspliced[s]=0;
1475 if (rlen!=NULL) *rlen=s;
1476 return unspliced;
1477 }
1478
1479 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1480 GList<GSeg>* seglst) {
1481 if (CDSonly && CDstart==0) return NULL;
1482 if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1483 return NULL;
1484 }
1485 //restore normal coordinates:
1486 unxcoord();
1487 if (exons.Count()==0) return NULL;
1488 int fspan=end-start+1;
1489 const char* gsubseq=faseq->subseq(start, fspan);
1490 if (gsubseq==NULL) {
1491 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1492 }
1493 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1494 int endadj=end-start+1-fspan;
1495 uint prevend=end;
1496 end-=endadj;
1497 if (CDend>end) CDend=end;
1498 if (exons.Last()->end>end) {
1499 exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1500 if (exons.Last()->start>exons.Last()->end) {
1501 GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1502 prevend,getGSeqName(), getID());
1503 }
1504 covlen-=endadj;
1505 }
1506 }
1507 char* spliced=NULL;
1508 GMALLOC(spliced, covlen+1); //allocate more here
1509 uint seqstart, seqend;
1510 int cdsadj=0;
1511 if (CDphase=='1' || CDphase=='2') {
1512 cdsadj=CDphase-'0';
1513 }
1514 if (CDSonly) {
1515 seqstart=CDstart;
1516 seqend=CDend;
1517 if (strand=='-') seqend-=cdsadj;
1518 else seqstart+=cdsadj;
1519 }
1520 else {
1521 seqstart=exons.First()->start;
1522 seqend=exons.Last()->end;
1523 }
1524 int s=0; //resulting nucleotide counter
1525 if (strand=='-') {
1526 for (int x=exons.Count()-1;x>=0;x--) {
1527 uint sgstart=exons[x]->start;
1528 uint sgend=exons[x]->end;
1529 if (seqend<sgstart || seqstart>sgend) continue;
1530 if (seqstart>=sgstart && seqstart<=sgend)
1531 sgstart=seqstart; //seqstart within this segment
1532 if (seqend>=sgstart && seqend<=sgend)
1533 sgend=seqend; //seqend within this segment
1534 if (seglst!=NULL)
1535 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1536 for (uint i=sgend;i>=sgstart;i--) {
1537 spliced[s] = ntComplement(gsubseq[i-start]);
1538 s++;
1539 }//for each nt
1540
1541 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1542 if (CDstart>=sgstart && CDstart<=sgend) {
1543 //CDstart in this segment
1544 //and we are getting the whole transcript
1545 *cds_end=s-(CDstart-sgstart);
1546 }
1547 if (CDend>=sgstart && CDend<=sgend) {
1548 //CDstart in this segment
1549 //and we are getting the whole transcript
1550 *cds_start=s-(CDend-cdsadj-sgstart);
1551 }
1552 }//update local CDS coordinates
1553 } //for each exon
1554 } // - strand
1555 else { // + strand
1556 for (int x=0;x<exons.Count();x++) {
1557 uint sgstart=exons[x]->start;
1558 uint sgend=exons[x]->end;
1559 if (seqend<sgstart || seqstart>sgend) continue;
1560 if (seqstart>=sgstart && seqstart<=sgend)
1561 sgstart=seqstart; //seqstart within this segment
1562 if (seqend>=sgstart && seqend<=sgend)
1563 sgend=seqend; //seqend within this segment
1564 if (seglst!=NULL)
1565 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1566 for (uint i=sgstart;i<=sgend;i++) {
1567 spliced[s]=gsubseq[i-start];
1568 s++;
1569 }//for each nt
1570 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1571 if (CDstart>=sgstart && CDstart<=sgend) {
1572 //CDstart in this segment
1573 //and we are getting the whole transcript
1574 *cds_start=s-(sgend-CDstart-cdsadj);
1575 }
1576 if (CDend>=sgstart && CDend<=sgend) {
1577 //CDstart in this segment
1578 //and we are getting the whole transcript
1579 *cds_end=s-(sgend-CDend);
1580 }
1581 }//update local CDS coordinates
1582 } //for each exon
1583 } // + strand
1584 spliced[s]=0;
1585 if (rlen!=NULL) *rlen=s;
1586 return spliced;
1587 }
1588
1589 char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1590 if (CDSonly && CDstart==0) return NULL;
1591 //restore normal coordinates:
1592 unxcoord();
1593 if (exons.Count()==0) return NULL;
1594 int fspan=end-start+1;
1595 const char* gsubseq=faseq->subseq(start, fspan);
1596 if (gsubseq==NULL) {
1597 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1598 }
1599
1600 char* translation=NULL;
1601 GMALLOC(translation, (int)(covlen/3)+1);
1602 uint seqstart, seqend;
1603 int cdsadj=0;
1604 if (CDphase=='1' || CDphase=='2') {
1605 cdsadj=CDphase-'0';
1606 }
1607 if (CDSonly) {
1608 seqstart=CDstart;
1609 seqend=CDend;
1610 if (strand=='-') seqend-=cdsadj;
1611 else seqstart+=cdsadj;
1612 }
1613 else {
1614 seqstart=exons.First()->start;
1615 seqend=exons.Last()->end;
1616 }
1617 Codon codon;
1618 int nt=0; //codon nucleotide counter (0..2)
1619 int aa=0; //aminoacid count
1620 if (strand=='-') {
1621 for (int x=exons.Count()-1;x>=0;x--) {
1622 uint sgstart=exons[x]->start;
1623 uint sgend=exons[x]->end;
1624 if (seqend<sgstart || seqstart>sgend) continue;
1625 if (seqstart>=sgstart && seqstart<=sgend)
1626 sgstart=seqstart; //seqstart within this segment
1627 if (seqend>=sgstart && seqend<=sgend) {
1628 sgend=seqend; //seqend within this segment
1629 }
1630 for (uint i=sgend;i>=sgstart;i--) {
1631 codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1632 nt++;
1633 if (nt==3) {
1634 nt=0;
1635 translation[aa]=codon.translate();
1636 aa++;
1637 }
1638 }//for each nt
1639 } //for each exon
1640 } // - strand
1641 else { // + strand
1642 for (int x=0;x<exons.Count();x++) {
1643 uint sgstart=exons[x]->start;
1644 uint sgend=exons[x]->end;
1645 if (seqend<sgstart || seqstart>sgend) continue;
1646 if (seqstart>=sgstart && seqstart<=sgend)
1647 sgstart=seqstart; //seqstart within this segment
1648 if (seqend>=sgstart && seqend<=sgend)
1649 sgend=seqend; //seqend within this segment
1650 for (uint i=sgstart;i<=sgend;i++) {
1651 codon.nuc[nt]=gsubseq[i-start];
1652 nt++;
1653 if (nt==3) {
1654 nt=0;
1655 translation[aa]=codon.translate();
1656 aa++;
1657 }
1658 }//for each nt
1659 } //for each exon
1660 } // + strand
1661 translation[aa]=0;
1662 if (rlen!=NULL) *rlen=aa;
1663 return translation;
1664 }
1665
1666 void GffObj::printSummary(FILE* fout) {
1667 if (fout==NULL) fout=stdout;
1668 fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1669 strand, start, end, gscore, (float)qcov/10.0);
1670 }
1671
1672 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1673 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1674 static char scorestr[14];
1675 strcpy(scorestr,".");
1676 GffAttrs* xattrs=NULL;
1677 if (exidx>=0) {
1678 if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1679 xattrs=exons[exidx]->attrs;
1680 }
1681 if (phase==0 || !iscds) phase='.';
1682 const char* ftype=iscds ? "CDS" : getSubfName();
1683 if (gff3) {
1684 fprintf(fout,
1685 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1686 gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1687 phase, gffID);
1688 if (xattrs!=NULL) {
1689 for (int i=0;i<xattrs->Count();i++)
1690 fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1691 xattrs->Get(i)->attr_val);
1692 }
1693 fprintf(fout, "\n");
1694 } //GFF
1695 else {//for GTF -- we print only transcripts
1696 //if (isValidTranscript())
1697 fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1698 gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1699 //char* geneid=(geneID!=NULL)? geneID : gffID;
1700 if (geneID)
1701 fprintf(fout," gene_id \"%s\";",geneID);
1702 if (gene_name!=NULL) {
1703 //fprintf(fout, " gene_name ");
1704 //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1705 // else fprintf(fout, "\"%s\";",gene_name);
1706 fprintf(fout," gene_name \"%s\";",gene_name);
1707 }
1708 if (xattrs!=NULL) {
1709 for (int i=0;i<xattrs->Count();i++) {
1710 if (xattrs->Get(i)->attr_val==NULL) continue;
1711 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1712 fprintf(fout, " %s ",attrname);
1713 if (xattrs->Get(i)->attr_val[0]=='"')
1714 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1715 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1716 }
1717 }
1718 //for GTF, also append the GffObj attributes to each exon line
1719 if ((xattrs=this->attrs)!=NULL) {
1720 for (int i=0;i<xattrs->Count();i++) {
1721 if (xattrs->Get(i)->attr_val==NULL) continue;
1722 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1723 fprintf(fout, " %s ",attrname);
1724 if (xattrs->Get(i)->attr_val[0]=='"')
1725 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1726 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1727 }
1728 }
1729 fprintf(fout, "\n");
1730 }//GTF
1731 }
1732
1733 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1734 const char* tlabel, const char* gfparent) {
1735 static char tmpstr[255];
1736 if (tlabel==NULL) {
1737 tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1738 (char*)"gffobj" ;
1739 }
1740 unxcoord();
1741 //if (exons.Count()==0) return;
1742 const char* gseqname=names->gseqs.Get(gseq_id)->name;
1743 bool gff3 = (gffp>=pgffAny);
1744 bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1745 bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1746 if (gff3) {
1747 //print GFF3 mRNA line:
1748 if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1749 else strcpy(tmpstr,".");
1750 uint pstart, pend;
1751 if (gffp==pgffCDS) {
1752 pstart=CDstart;
1753 pend=CDend;
1754 }
1755 else { pstart=start;pend=end; }
1756 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1757 const char* ftype=getFeatureName();
1758 fprintf(fout,
1759 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1760 gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1761 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1762 if (gfparent!=NULL) {
1763 //parent override
1764 fprintf(fout, ";Parent=%s",gfparent);
1765 }
1766 else {
1767 if (parent!=NULL && !parent->isDiscarded())
1768 fprintf(fout, ";Parent=%s",parent->getID());
1769 }
1770 if (geneID!=NULL)
1771 fprintf(fout, ";geneID=%s",geneID);
1772 if (gene_name!=NULL)
1773 fprintf(fout, ";gene_name=%s",gene_name);
1774 if (attrs!=NULL) {
1775 for (int i=0;i<attrs->Count();i++) {
1776 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1777 fprintf(fout,";%s=%s", attrname,
1778 attrs->Get(i)->attr_val);
1779 }
1780 }
1781 fprintf(fout,"\n");
1782 }// gff3 mRNA line
1783 if (showExon) {
1784 //print exons
1785 if (isCDS && exons.Count()>0 &&
1786 ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1787 updateExonPhase();
1788
1789 for (int i=0;i<exons.Count();i++) {
1790 printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1791 }
1792 }//printing exons
1793 if (showCDS && !isCDS && CDstart>0) {
1794 GArray<GffCDSeg> cds(true,true);
1795 getCDSegs(cds);
1796 for (int i=0;i<cds.Count();i++) {
1797 printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1798 }
1799 } //showCDS
1800 }
1801
1802 void GffObj::updateExonPhase() {
1803 if (!isCDS) return;
1804 int cdsacc=0;
1805 if (CDphase=='1' || CDphase=='2') {
1806 cdsacc+= 3-(CDphase-'0');
1807 }
1808 if (strand=='-') { //reverse strand
1809 for (int i=exons.Count()-1;i>=0;i--) {
1810 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1811 cdsacc+=exons[i]->end-exons[i]->start+1;
1812 }
1813 }
1814 else { //forward strand
1815 for (int i=0;i<exons.Count();i++) {
1816 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1817 cdsacc+=exons[i]->end-exons[i]->start+1;
1818 }
1819 }
1820 }
1821
1822
1823 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1824 GffCDSeg cdseg;
1825 int cdsacc=0;
1826 if (CDphase=='1' || CDphase=='2') {
1827 cdsacc+= 3-(CDphase-'0');
1828 }
1829 if (strand=='-') {
1830 for (int x=exons.Count()-1;x>=0;x--) {
1831 uint sgstart=exons[x]->start;
1832 uint sgend=exons[x]->end;
1833 if (CDend<sgstart || CDstart>sgend) continue;
1834 if (CDstart>=sgstart && CDstart<=sgend)
1835 sgstart=CDstart; //cdstart within this segment
1836 if (CDend>=sgstart && CDend<=sgend)
1837 sgend=CDend; //cdend within this segment
1838 cdseg.start=sgstart;
1839 cdseg.end=sgend;
1840 cdseg.exonidx=x;
1841 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1842 cdseg.phase='0'+ (3-cdsacc%3)%3;
1843 cdsacc+=sgend-sgstart+1;
1844 cds.Add(cdseg);
1845 } //for each exon
1846 } // - strand
1847 else { // + strand
1848 for (int x=0;x<exons.Count();x++) {
1849 uint sgstart=exons[x]->start;
1850 uint sgend=exons[x]->end;
1851 if (CDend<sgstart || CDstart>sgend) continue;
1852 if (CDstart>=sgstart && CDstart<=sgend)
1853 sgstart=CDstart; //seqstart within this segment
1854 if (CDend>=sgstart && CDend<=sgend)
1855 sgend=CDend; //seqend within this segment
1856 cdseg.start=sgstart;
1857 cdseg.end=sgend;
1858 cdseg.exonidx=x;
1859 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1860 cdseg.phase='0' + (3-cdsacc%3)%3 ;
1861 cdsacc+=sgend-sgstart+1;
1862 cds.Add(cdseg);
1863 } //for each exon
1864 } // + strand
1865 }