ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 55
Committed: Thu Sep 8 05:46:53 2011 UTC (7 years, 10 months ago) by gpertea
File size: 65483 byte(s)
Log Message:
RPS20P29, ENSG00000241828 not collapsing

Line File contents
1 #include "gff.h"
2
3 //GffNames* GffReader::names=NULL;
4 GffNames* GffObj::names=NULL;
5 //global set of feature names, attribute names etc.
6 // -- common for all GffObjs in current application!
7
8 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10 const uint GFF_MAX_INTRON= 6000000;
11 bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 const int gff_fid_mRNA=0;
13 const int gff_fid_transcript=1;
14 const int gff_fid_exon=2;
15 const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 const uint gfo_flag_HAS_ERRORS = 0x00000001;
17 const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 const uint gfo_flag_IS_GENE = 0x00000004;
19 const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20 const uint gfo_flag_FROM_GFF3 = 0x00000010;
21 const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22 const uint gfo_flag_DISCARDED = 0x00000100;
23 const uint gfo_flag_LST_KEEP = 0x00000200;
24 const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25 const byte gfo_flagShift_LEVEL = 16;
26
27 void gffnames_ref(GffNames* &n) {
28 if (n==NULL) n=new GffNames();
29 n->numrefs++;
30 }
31
32 void gffnames_unref(GffNames* &n) {
33 if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34 n->numrefs--;
35 if (n->numrefs==0) { delete n; n=NULL; }
36 }
37
38 int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39
40 GffObj& g1=*((GffObj*)p1);
41 GffObj& g2=*((GffObj*)p2);
42 if (g1.gseq_id==g2.gseq_id) {
43 if (g1.start!=g2.start)
44 return (int)(g1.start-g2.start);
45 else if (g1.getLevel()!=g2.getLevel())
46 return (int)(g1.getLevel()-g2.getLevel());
47 else
48 if (g1.end!=g2.end)
49 return (int)(g1.end-g2.end);
50 else return strcmp(g1.getID(), g2.getID());
51 }
52 else return (int)(g1.gseq_id-g2.gseq_id);
53 }
54
55 char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56 //parse a key attribute and remove it from the info string
57 //(only works for attributes that have values following them after ' ' or '=')
58 static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 int lpre=strlen(pre);
60 char cend=pre[lpre-1];
61 char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 if (pos==NULL) return NULL;
63 char* findstart=info;
64 //require word boundary on the left:
65 while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 findstart=pos+lpre;
67 pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 }
69 if (pos==NULL) return NULL;
70 if (cend!=' ' && cend!='=') {
71 //require word boundary on the right:
72 while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 findstart=pos+lpre;
74 pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 }
76 }
77 if (pos==NULL) return NULL;
78 char* vp=pos+lpre;
79 while (*vp==' ') vp++;
80 if (*vp==';' || *vp==0)
81 GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82 bool dq_enclosed=false; //value string enclosed by double quotes
83 if (*vp=='"') {
84 dq_enclosed=true;
85 vp++;
86 }
87 if (enforce_GTF2 && !dq_enclosed)
88 GError(GTF2_ERR,pre, dupline);
89 char* vend=vp;
90 if (dq_enclosed) {
91 while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92 }
93 else {
94 while (*vend!=';' && *vend!=0) vend++;
95 }
96 if (enforce_GTF2 && *vend!='"')
97 GError(GTF2_ERR, pre, dupline);
98 char *r=Gstrdup(vp, vend-1);
99 //-- now remove this attribute from the info string
100 while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101 if (*vend==0) vend--;
102 for (char *src=vend, *dest=pos;;src++,dest++) {
103 *dest=*src;
104 if (*src==0) break;
105 }
106 return r;
107 }
108
109 static char fnamelc[128];
110
111 GffLine::GffLine(GffReader* reader, const char* l) {
112 llen=strlen(l);
113 GMALLOC(line,llen+1);
114 memcpy(line, l, llen+1);
115 GMALLOC(dupline, llen+1);
116 memcpy(dupline, l, llen+1);
117 skip=true;
118 gseqname=NULL;
119 track=NULL;
120 ftype=NULL;
121 info=NULL;
122 _parents=NULL;
123 _parents_len=0;
124 num_parents=0;
125 parents=NULL;
126 is_gff3=false;
127 is_cds=false;
128 is_transcript=false;
129 is_exon=false;
130 is_gene=false;
131 exontype=0;
132 gene_id=NULL;
133 gene_name=NULL;
134 qstart=0;
135 qend=0;
136 qlen=0;
137 ID=NULL;
138 char* t[9];
139 int i=0;
140 int tidx=1;
141 t[0]=line;
142
143 while (line[i]!=0) {
144 if (line[i]=='\t') {
145 line[i]=0;
146 t[tidx]=line+i+1;
147 tidx++;
148 if (tidx>8) break;
149 }
150 i++;
151 }
152
153 if (tidx<8) { // ignore non-GFF lines
154 // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
155 return;
156 }
157 gseqname=t[0];
158 track=t[1];
159 ftype=t[2];
160 info=t[8];
161 char* p=t[3];
162 if (!parseUInt(p,fstart))
163 GError("Error parsing start coordinate from GFF line:\n%s\n",l);
164 p=t[4];
165 if (!parseUInt(p,fend))
166 GError("Error parsing end coordinate from GFF line:\n%s\n",l);
167 if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
168 p=t[5];
169 if (p[0]=='.' && p[1]==0) {
170 score=0;
171 }
172 else {
173 if (!parseDouble(p,score))
174 GError("Error parsing feature score from GFF line:\n%s\n",l);
175 }
176 strand=*t[6];
177 if (strand!='+' && strand!='-' && strand!='.')
178 GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
179 phase=*t[7]; // must be '.', '0', '1' or '2'
180 ID=NULL;
181 // exon/CDS/mrna filter
182 strncpy(fnamelc, ftype, 127);
183 fnamelc[127]=0;
184 strlower(fnamelc); //convert to lower case
185 bool is_t_data=false;
186 if (strstr(fnamelc, "utr")!=NULL) {
187 exontype=exgffUTR;
188 is_exon=true;
189 is_t_data=true;
190 }
191 else if (strstr(fnamelc, "exon")!=NULL) {
192 exontype=exgffExon;
193 is_exon=true;
194 is_t_data=true;
195 }
196 else if (strstr(fnamelc, "stop") &&
197 (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
198 exontype=exgffStop;
199 is_cds=true; //though some place it outside the last CDS segment
200 is_t_data=true;
201 }
202 else if (strstr(fnamelc, "start") &&
203 ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
204 exontype=exgffStart;
205 is_cds=true;
206 is_t_data=true;
207 }
208 else if (strcmp(fnamelc, "cds")==0) {
209 exontype=exgffCDS;
210 is_cds=true;
211 is_t_data=true;
212 }
213 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
214 is_gene=true;
215 is_t_data=true; //because its name will be attached to parented transcripts
216 }
217 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
218 is_transcript=true;
219 is_t_data=true;
220 }
221
222 if (reader->transcriptsOnly && !is_t_data) {
223 char* id=extractAttr("ID=");
224 if (id==NULL) id=extractAttr("transcript_id");
225 //GMessage("Discarding non-transcript line:\n%s\n",l);
226 if (id!=NULL) {
227 reader->discarded_ids.Add(id, new int(1));
228 GFREE(id);
229 }
230 return; //skip this line, unwanted feature name
231 }
232 ID=extractAttr("ID=");
233 char* Parent=extractAttr("Parent=");
234 is_gff3=(ID!=NULL || Parent!=NULL);
235 if (is_gff3) {
236 //parse as GFF3
237 if (ID!=NULL) {
238 //has ID attr so it's likely to be a parent feature
239 //look for explicit gene name
240 gene_name=extractAttr("gene_name=",false);
241 if (gene_name==NULL) {
242 gene_name=extractAttr("geneName=",false);
243 if (gene_name==NULL) {
244 gene_name=extractAttr("gene_sym=",false);
245 if (gene_name==NULL) {
246 gene_name=extractAttr("gene=",false);
247 }
248 }
249 }
250 gene_id=extractAttr("geneID=",false);
251 if (gene_id==NULL) {
252 gene_id=extractAttr("gene_id=",false);
253 }
254 if (is_gene) {
255 //special case: keep the Name and ID attributes of the gene feature
256 if (gene_name==NULL)
257 gene_name=extractAttr("Name=");
258 if (gene_id==NULL) //the ID is also gene_id in this case
259 gene_id=Gstrdup(ID);
260 //skip=false;
261 //return;
262 GFREE(Parent); //TMI, we really don't care about gene Parents?
263 } //gene feature
264 }// has GFF3 ID
265 if (Parent!=NULL) {
266 //keep Parent attr
267 //parse multiple parents
268 num_parents=1;
269 p=Parent;
270 int last_delim_pos=-1;
271 while (*p!=';' && *p!=0) {
272 if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
273 num_parents++;
274 last_delim_pos=(p-Parent);
275 }
276 p++;
277 }
278 _parents_len=p-Parent+1;
279 _parents=Parent;
280 GMALLOC(parents, num_parents*sizeof(char*));
281 parents[0]=_parents;
282 int i=1;
283 if (last_delim_pos>0) {
284 for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
285 if (*p==',') {
286 char* ep=p-1;
287 while (*ep==' ' && ep>_parents) ep--;
288 *(ep+1)=0; //end the string there
289 parents[i]=p+1;
290 i++;
291 }
292 }
293 }
294 } //has Parent field
295 } //GFF3
296 else { // GTF-like expected
297 Parent=extractAttr("transcript_id");
298 if (Parent!=NULL) { //GTF2 format detected
299 if (is_transcript) {
300 // atypical GTF with a parent transcript line declared
301 ID=Parent;
302 Parent=NULL;
303 }
304 gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
305 gene_name=extractAttr("gene_name");
306 if (gene_name==NULL) {
307 gene_name=extractAttr("gene_sym");
308 if (gene_name==NULL)
309 gene_name=extractAttr("gene");
310 }
311 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
312 //after the attribute name
313 p=info;
314 bool noed=true; //not edited after the last delim
315 bool nsp=false; //non-space found after last delim
316 while (*p!=0) {
317 if (*p==' ') {
318 if (nsp && noed) {
319 *p='=';
320 noed=false;
321 p++;
322 continue;
323 }
324 }
325 else nsp=true; //non-space
326 if (*p==';') { noed=true; nsp=false; }
327 p++;
328 }
329 } //GTF2 detected (no parent line)
330 else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
331 //char* fexon=strstr(fnamelc, "exon");
332 //if (fexon!=NULL) {
333 if (exontype==exgffExon) {
334 if (startsWith(track,"jigsaw")) {
335 is_cds=true;
336 strcpy(track,"jigsaw");
337 p=strchr(info,';');
338 if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
339 else { Parent=Gstrdup(info,p-1);
340 info=p+1;
341 }
342 }
343 } //exon feature?
344 if (Parent==NULL && exontype>=exgffCDS &&
345 (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
346 //one word ID ? really desperate attempt to parse it here
347 Parent=Gstrdup(info,info+i-1);
348 info=NULL; //discard anything else on the line
349 }
350 }
351 if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
352 _parents=Parent;
353 GMALLOC(parents,sizeof(char*));
354 num_parents=1;
355 parents[0]=_parents;
356 }
357 } //GTF-like
358
359 //parse other potentially useful features
360 if (is_gff3) {
361 if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
362 p+=7;
363 while (*p!=';' && *p!=0 && *p!=' ') p++;
364 if (*p!=' ') {
365 GError("Error parsing target coordinates from GFF line:\n%s\n",l);
366 }
367 if (!parseUInt(p,qstart))
368 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
369 if (*p!=' ') {
370 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
371 }
372 p++;
373 if (!parseUInt(p,qend))
374 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
375 }
376 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
377 p+=5;
378 if (!parseUInt(p,qstart))
379 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
380 if (*p!='-') {
381 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
382 }
383 p++;
384 if (!parseUInt(p,qend))
385 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
386 if (*p=='|' || *p==':') {
387 p++;
388 if (!parseUInt(p,qlen))
389 GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
390 }
391 }//has Qreg attr
392 if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
393 p+=5;
394 if (!parseUInt(p,qlen))
395 GError("Error parsing target length from GFF Qlen:\n%s\n",l);
396 }
397 }//parsing some useful attributes in GFF3 records
398 if (ID==NULL && parents==NULL) {
399 if (reader->gff_warns)
400 GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
401 return; //skip
402 }
403 skip=false;
404 }
405
406
407 void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
408 if (cd_start>=this->start) {
409 this->CDstart=cd_start;
410 if (strand=='+') this->CDphase=phase;
411 }
412 else this->CDstart=this->start;
413 if (cd_end<=this->end) {
414 this->CDend=cd_end;
415 if (strand=='-') this->CDphase=phase;
416 }
417 else this->CDend=this->end;
418 isTranscript(true);
419 exon_ftype_id=gff_fid_exon;
420 if (monoFeature()) {
421 if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
422 else exons[0]->exontype=exgffExon;
423 }
424 }
425
426 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
427 //this will make sure we have the right subftype_id!
428 //int subf_id=-1;
429 if (!isTranscript() && gl->is_cds) {
430 isTranscript(true);
431 exon_ftype_id=gff_fid_exon;
432 if (exons.Count()==1) exons[0]->exontype=exgffExon;
433 }
434 if (isTranscript()) {
435 if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
436 if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
437 else exon_ftype_id=names->feats.addName(gl->ftype);
438 }
439 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
440 if (gl->exontype==0 && !gl->is_transcript) {
441 //extraneous mRNA feature, discard
442 if (reader->gff_warns)
443 GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
444 gl->ftype, gffID);
445 return -1;
446 }
447 }
448 else { //non-mRNA parent feature, check this subf type
449 int subf_id=names->feats.addName(gl->ftype);
450 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
451 exon_ftype_id=subf_id;
452 else {
453 if (exon_ftype_id!=subf_id) {
454 //
455 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
456 //the existing exon was just a dummy one created by default, discard it
457 exons.Clear();
458 covlen=0;
459 exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
460 }
461 else { //multiple subfeatures, prefer those with
462 if (reader->gff_warns)
463 GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
464 names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
465 if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
466 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
467 exon_ftype_id=subf_id;
468 exons.Clear();
469 covlen=0;
470 }
471 else { //discard new feature
472 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
473 return -1; //skip this 2nd subfeature type for this parent!
474 }
475 }
476 } //incoming subfeature is of different type
477 } //new subfeature type
478 } //non-mRNA parent
479 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
480 gl->qstart,gl->qend, gl->is_cds, gl->exontype);
481 if (eidx<0) return eidx; //this should never happen
482 if (keepAttr) {
483 if (noExonAttr) {
484 if (attrs==NULL) //place the parsed attributes directly at transcript level
485 parseAttrs(attrs, gl->info);
486 }
487 else { //need all exon-level attributes
488 parseAttrs(exons[eidx]->attrs, gl->info, true);
489 }
490 }
491 return eidx;
492 }
493
494
495 int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
496 if (exons.Count()==0) {
497 if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
498 if (exon_ftype_id<0) {
499 exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
500 }
501 }
502 //special treatment of start/stop codon features, they might be broken/split between exons
503 //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
504 //so we should not trust the end coordinate for such features
505 if (exontype==exgffStart || exontype==exgffStop) {
506 if (strand=='-') segstart=segend;
507 else segend=segstart;
508 if (exontype==exgffStart) {
509 if (CDstart==0 || segstart<CDstart) CDstart=segstart;
510 }
511 else {
512 if (segstart>CDend) CDend=segstart;
513 }
514 }
515 else if (iscds) { //update CDS anchors:
516 if (CDstart==0 || segstart<CDstart) {
517 CDstart=segstart;
518 if (exontype==exgffCDS && strand=='+') CDphase=fr;
519 }
520 if (segend>CDend) {
521 if (exontype==exgffCDS && strand=='-') CDphase=fr;
522 CDend=segend;
523 }
524 }
525 else { // not a CDS/start/stop
526 isCDS=false;
527 }
528 if (qs || qe) {
529 if (qs>qe) swap(qs,qe);
530 if (qs==0) qs=1;
531 }
532 int ovlen=0;
533 if (exontype>0) { //check for overlaps between exon-type segments
534 int oi=exonOverlapIdx(segstart, segend, &ovlen);
535 if (oi>=0) { //overlap existing segment
536 if (ovlen==0) {
537 //adjacent segments will be merged
538 //e.g. CDS to (UTR|exon)
539 if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
540 (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
541 expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
542 return oi;
543 }
544 //CDS adjacent to stop_codon: UCSC does (did?) this
545 if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
546 (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
547 expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
548 return oi;
549 }
550 }
551 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
552 // start_codon within (CDS|exon)
553 if (exons[oi]->exontype>exontype &&
554 exons[oi]->start<=segstart && exons[oi]->end>=segend &&
555 !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
556 //larger segment given first, now the smaller included one is redundant
557 return oi; //only used to store attributes from current GffLine
558 }
559 if (exontype>exons[oi]->exontype &&
560 segstart<=exons[oi]->start && segend>=exons[oi]->end &&
561 !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
562 //smaller segment given first, so we have to enlarge it
563 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
564 //this should also check for overlapping next exon (oi+1) ?
565 return oi;
566 }
567 //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
568 //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
569 //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
570 // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
571 /*
572 if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
573 if (gff_show_warnings)
574 GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
575 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
576 hasErrors(true);
577 return -1; //segment NOT added
578 }
579 */
580
581 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
582 if (gff_show_warnings)
583 GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
584 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
585 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
586 return oi;
587 }
588 // else add the segment if the overlap is small and between two CDS segments
589 //TODO: we might want to add an attribute here with the slippage coordinate and size?
590 covlen-=ovlen;
591 }//overlap or adjacent to existing segment
592 } //check for overlap
593 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
594 // create & add the new segment
595 /*
596 if (start>0 && exontype==exgffCDS && exons.Count()==0) {
597 //adding a CDS directly as the first subfeature of a declared parent
598 segstart=start;
599 segend=end;
600 }
601 */
602 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
603 int eidx=exons.Add(enew);
604 if (eidx<0) {
605 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
606 if (gff_show_warnings)
607 GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
608 segstart, segend, gffID);
609 delete enew;
610 hasErrors(true);
611 return -1;
612 }
613 covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
614 //adjust parent feature coordinates to contain this exon
615 if (start==0 || start>exons.First()->start) {
616 start=exons.First()->start;
617 }
618 if (end<exons.Last()->end) end=exons.Last()->end;
619
620 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
621 GSeqStat* gsd=(GSeqStat*)uptr;
622 if (start<gsd->mincoord) gsd->mincoord=start;
623 if (end>gsd->maxcoord) gsd->maxcoord=end;
624 if (this->len()>gsd->maxfeat_len) {
625 gsd->maxfeat_len=this->len();
626 gsd->maxfeat=this;
627 }
628 }
629 return eidx;
630 }
631
632 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
633 //oi is the index of the *first* overlapping segment found that must be enlarged
634 covlen-=exons[oi]->len();
635 if (segstart<exons[oi]->start)
636 exons[oi]->start=segstart;
637 if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
638 if (segend>exons[oi]->end)
639 exons[oi]->end=segend;
640 if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
641 //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
642 if (sc!=0) exons[oi]->score=sc;
643 covlen+=exons[oi]->len();
644 //if (exons[oi]->exontype< exontype) -- always true
645 exons[oi]->exontype = exontype;
646 if (exontype==exgffCDS) exons[oi]->phase=fr;
647 //we must check if any more exons are also overlapping this
648 int ni=oi+1; //next exon index after oi
649 while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
650 //only allow this if next segment is fully included, and a subordinate
651 if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
652 /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
653 chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
654 chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
655 chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
656 */
657 if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
658 if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
659 exons.Delete(ni);
660 }
661 else {
662 if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
663 exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
664 //hasErrors(true);
665 break;
666 }
667 }
668 // -- make sure any other related boundaries are updated:
669 start=exons.First()->start;
670 end=exons.Last()->end;
671 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
672 GSeqStat* gsd=(GSeqStat*)uptr;
673 if (start<gsd->mincoord) gsd->mincoord=start;
674 if (end>gsd->maxcoord) gsd->maxcoord=end;
675 if (this->len()>gsd->maxfeat_len) {
676 gsd->maxfeat_len=this->len();
677 gsd->maxfeat=this;
678 }
679 }
680 }
681
682 void GffObj::removeExon(int idx) {
683 /*
684 if (idx==0 && segs[0].start==gstart)
685 gstart=segs[1].start;
686 if (idx==segcount && segs[segcount].end==gend)
687 gend=segs[segcount-1].end;
688 */
689 if (idx<0 || idx>=exons.Count()) return;
690 int segstart=exons[idx]->start;
691 int segend=exons[idx]->end;
692 exons.Delete(idx);
693 covlen -= (int)(segend-segstart)+1;
694 start=exons.First()->start;
695 end=exons.Last()->end;
696 if (isCDS) { CDstart=start; CDend=end; }
697 }
698
699 void GffObj::removeExon(GffExon* p) {
700 for (int idx=0;idx<exons.Count();idx++) {
701 if (exons[idx]==p) {
702 int segstart=exons[idx]->start;
703 int segend=exons[idx]->end;
704 exons.Delete(idx);
705 covlen -= (int)(segend-segstart)+1;
706 start=exons.First()->start;
707 end=exons.Last()->end;
708 if (isCDS) { CDstart=start; CDend=end; }
709 return;
710 }
711 }
712 }
713
714
715
716 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
717 GSeg(0,0), exons(true,true,false), children(1,false) {
718 xstart=0;
719 xend=0;
720 xstatus=0;
721 partial=false;
722 isCDS=false;
723 uptr=NULL;
724 ulink=NULL;
725 parent=NULL;
726 udata=0;
727 flags=0;
728 CDstart=0;
729 CDend=0;
730 CDphase=0;
731 geneID=NULL;
732 gene_name=NULL;
733 attrs=NULL;
734 gffID=NULL;
735 track_id=-1;
736 gseq_id=-1;
737 ftype_id=-1;
738 exon_ftype_id=-1;
739 strand='.';
740 if (gfrd==NULL)
741 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
742 gffnames_ref(names);
743 if (gfrd->names==NULL) gfrd->names=names;
744 //qlen=0;qstart=0;qend=0;
745 gscore=0;
746 uscore=0;
747 covlen=0;
748 qcov=0;
749 start=gffline->fstart;
750 end=gffline->fend;
751 gseq_id=names->gseqs.addName(gffline->gseqname);
752 track_id=names->tracks.addName(gffline->track);
753 strand=gffline->strand;
754 qlen=gffline->qlen;
755 qstart=gffline->qstart;
756 qend=gffline->qend;
757 //setup flags from gffline
758 isCDS=gffline->is_cds; //for now
759 isGene(gffline->is_gene);
760 isTranscript(gffline->is_transcript || gffline->exontype!=0);
761 fromGff3(gffline->is_gff3);
762
763 if (gffline->parents!=NULL) {
764 //GTF style -- create a GffObj directly by subfeature
765 //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
766 if (gffline->exontype!=0) { //recognized exon-like feature
767 ftype_id=gff_fid_transcript; //so this is some sort of transcript
768 exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
769 }
770 else {//unrecognized subfeatures
771 //make this GffObj of the same feature type
772 ftype_id=names->feats.addName(gffline->ftype);
773 }
774 if (gffline->ID==NULL) { //typical GTF
775 gffID=Gstrdup(gffline->parents[0]);
776 this->createdByExon(true);
777 //this is likely the first exon/segment of the feature
778 addExon(gfrd, gffline, keepAttr, noExonAttr);
779 }
780 else { //a parented feature with an ID -- probably an orphan GFF3 line
781 if (gffline->is_gff3 && gffline->exontype!=0) {
782 //premature exon given before its parent transcript
783 //create the transcript entry here
784 gffID=Gstrdup(gffline->parents[0]);
785 this->createdByExon(true);
786 //this is the first exon/segment of the transcript
787 addExon(gfrd, gffline, keepAttr, noExonAttr);
788 }
789 else { //unrecognized non-exon feature ? use the ID instead
790 gffID=Gstrdup(gffline->ID);
791 if (keepAttr) this->parseAttrs(attrs, gffline->info);
792 }
793 }
794 } //subfeature given directly
795 else { //gffline->parents==NULL
796 //create a parent feature in its own right
797 gscore=gffline->score;
798 if (gffline->ID==NULL || gffline->ID[0]==0)
799 GError("Error: no ID found for GFF record start\n");
800 gffID=Gstrdup(gffline->ID); //there must be an ID here
801 //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
802 //else
803 ftype_id=names->feats.addName(gffline->ftype);
804 if (gffline->is_transcript)
805 exon_ftype_id=gff_fid_exon;
806
807 if (keepAttr) this->parseAttrs(attrs, gffline->info);
808 }//no parent
809
810 if (gffline->gene_name!=NULL) {
811 gene_name=Gstrdup(gffline->gene_name);
812 }
813 if (gffline->gene_id!=NULL) {
814 geneID=Gstrdup(gffline->gene_id);
815 }
816
817 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
818 uptr=gsd;
819 if (start<gsd->mincoord) gsd->mincoord=start;
820 if (end>gsd->maxcoord) gsd->maxcoord=end;
821 if (this->len()>gsd->maxfeat_len) {
822 gsd->maxfeat_len=this->len();
823 gsd->maxfeat=this;
824 }
825 }
826
827 GffLine* GffReader::nextGffLine() {
828 if (gffline!=NULL) return gffline; //caller should free gffline after processing
829 while (gffline==NULL) {
830 int llen=0;
831 buflen=GFF_LINELEN-1;
832 char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
833 if (l==NULL) {
834 return NULL; //end of file
835 }
836 int ns=0; //first nonspace position
837 while (l[ns]!=0 && isspace(l[ns])) ns++;
838 if (l[ns]=='#' || llen<10) continue;
839 gffline=new GffLine(this, l);
840 if (gffline->skip) {
841 delete gffline;
842 gffline=NULL;
843 continue;
844 }
845 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
846 //this might not be needed, already checked in the GffLine constructor
847 if (gff_warns)
848 GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
849 delete gffline;
850 gffline=NULL;
851 //continue;
852 }
853 }
854 return gffline;
855 }
856
857 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
858 //caller must free the returned pointer
859 char* buf=NULL;
860 int idlen=strlen(id);
861 GMALLOC(buf, idlen+strlen(ctg)+2);
862 strcpy(buf, id);
863 buf[idlen]='~';
864 strcpy(buf+idlen+1, ctg);
865 return buf;
866 }
867
868 void GffReader::gfoRemove(const char* id, const char* ctg) {
869 char* buf=gfoBuildId(id,ctg);
870 phash.Remove(buf);
871 GFREE(buf);
872 }
873
874 //Warning: if gflst gets altered, idx becomes obsolete
875 GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
876 char* buf=gfoBuildId(id,ctg);
877 GfoHolder* r=new GfoHolder(gfo,idx);
878 phash.Add(buf, r);
879 GFREE(buf);
880 return r;
881 }
882
883 GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
884 char* buf=gfoBuildId(id,ctg);
885 GfoHolder* r=phash.Find(buf);
886 GFREE(buf);
887 return r;
888 }
889
890 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
891 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
892 GfoHolder* r=NULL;
893 if (replaceidx>=0) {
894 gflst.Put(replaceidx,newgfo);
895 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
896 }
897 else {
898 int gfoidx=gflst.Add(newgfo);
899 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
900 }
901 if (gff_warns) {
902 int* pcount=tids.Find(newgfo->gffID);
903 if (pcount!=NULL) {
904 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
905 (*pcount)++;
906 }
907 else {
908 tids.Add(newgfo->gffID,new int(1));
909 }
910 }
911 return r;
912 }
913
914 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
915 //assert(parent);
916 //assert(newgfo);
917 parent->children.Add(newgfh->gffobj);
918 if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
919 newgfh->gffobj->setLevel(parent->getLevel()+1);
920 if (parent->isGene()) {
921 if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
922 newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
923 if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
924 newgfh->gffobj->geneID=Gstrdup(parent->geneID);
925 }
926
927 return newgfh;
928 }
929
930 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
931 GffObj* parent, GffExon* pexon) {
932 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
933 GfoHolder* r=NULL;
934 int gfoidx=gflst.Add(newgfo);
935 r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
936 if (parent!=NULL) {
937 updateParent(r, parent);
938 if (pexon!=NULL) parent->removeExon(pexon);
939 }
940 if (gff_warns) {
941 int* pcount=tids.Find(newgfo->gffID);
942 if (pcount!=NULL) {
943 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
944 (*pcount)++;
945 }
946 else {
947 tids.Add(newgfo->gffID,new int(1));
948 }
949 }
950 return r;
951 }
952
953 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
954 bool keepAttr) {
955 if (prevgfo==NULL) return NULL;
956 prevgfo->gffobj->createdByExon(false);
957 prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
958 prevgfo->gffobj->start=gffline->fstart;
959 prevgfo->gffobj->end=gffline->fend;
960 prevgfo->gffobj->isGene(gffline->is_gene);
961 prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
962 prevgfo->gffobj->fromGff3(gffline->is_gff3);
963 if (keepAttr) {
964 if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
965 prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
966 }
967 return prevgfo;
968 }
969
970
971 bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
972 bool r=true;
973 if (gffline->strand!=prevgfo->gffobj->strand) {
974 GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
975 prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
976 r=false;
977 }
978 int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
979 ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
980 0 );
981 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
982 GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
983 //validation_errors = true;
984 r=false;
985 if (!gff_warns) exit(1);
986 }
987 int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
988 if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
989 subfPoolAdd(pex, prevgfo);
990 return r;
991 }
992
993 CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
994 CNonExon* subp=NULL;
995 subp_name=NULL;
996 for (int i=0;i<gffline->num_parents;i++) {
997 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
998 continue;
999 subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1000 subp=pex.Find(subp_name);
1001 if (subp!=NULL)
1002 return subp;
1003 GFREE(subp_name);
1004 }
1005 return NULL;
1006 }
1007
1008 void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1009 //this might become a parent feature later
1010 if (newgfo->gffobj->exons.Count()>0) {
1011 char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1012 pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1013 newgfo->gffobj->exons[0], gffline));
1014 GFREE(xbuf);
1015 }
1016 }
1017
1018 GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1019 bool keepAttr, bool noExonAttr) {
1020 GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1021 if (prevp!=gflst[subp->idx])
1022 GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1023 subp->gffline->discardParent();
1024 GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1025 pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1026 prevp->promotedChildren(true);
1027 return gfoh; //returns the holder of newly promoted feature
1028 }
1029
1030 //have to parse the whole file because exons can be scattered all over
1031 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1032 bool validation_errors = false;
1033 //loc_debug=false;
1034 GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1035 //and thus could become promoted to parent features
1036 while (nextGffLine()!=NULL) {
1037 //seen this gff ID before?
1038 GfoHolder* prevseen=NULL;
1039 if (gffline->ID) //GFF3
1040 prevseen=gfoFind(gffline->ID, gffline->gseqname);
1041 if (prevseen!=NULL) {
1042 if (prevseen->gffobj->createdByExon()) {
1043 updateGffRec(prevseen, gffline, keepAttr);
1044 }
1045 else {
1046 GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1047 validation_errors = true;
1048 if (gff_warns) {
1049 delete gffline; gffline=NULL; continue;
1050 }
1051 else exit(1);
1052 }
1053 }
1054 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1055 if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1056 }
1057 else { //--- it's a parented feature (could still be a mRNA)
1058 bool found_parent=false;
1059 GfoHolder* newgfo=prevseen;
1060 for (int i=0;i<gffline->num_parents;i++) {
1061 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1062 continue; //skipping discarded parent feature
1063 GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1064 if (parentgfo!=NULL) { //parent GffObj parsed earlier
1065 found_parent=true;
1066 if (parentgfo->gffobj->isGene() && gffline->is_transcript
1067 && gffline->exontype==0) {
1068 //not an exon, but a transcript parented by a gene
1069 if (newgfo) {
1070 updateParent(newgfo, parentgfo->gffobj);
1071 }
1072 else {
1073 newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1074 }
1075 }
1076 else { //potential exon subfeature
1077 if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1078 validation_errors=true;
1079 }
1080 }
1081 } //for each parsed parent Id
1082 if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1083 //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1084 //check if this feature isn't parented by a previously stored "exon" subfeature
1085 char* subp_name=NULL;
1086 CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1087 if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1088 //promote that subfeature to a full GffObj
1089 GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1090 //add current gffline as an exon of the newly promoted subfeature
1091 if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1092 validation_errors=true;
1093 }
1094 else { //no parent seen before, create one directly with this exon
1095 //loc_debug=true;
1096 GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1097 if (gffline->ID!=NULL && gffline->exontype==0)
1098 subfPoolAdd(pex, newgfo);
1099 //even those with errors will be added here!
1100 }
1101 GFREE(subp_name);
1102 } //no previous parent found
1103 } //parented feature
1104 //--
1105 delete gffline;
1106 gffline=NULL;
1107 }//while gff lines
1108 gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1109 // all gff records are now loaded in GList gflst
1110 // so we can free the hash
1111 phash.Clear();
1112 tids.Clear();
1113 if (validation_errors) {
1114 exit(1);
1115 }
1116 }
1117
1118 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1119 //merge
1120 //always merge adjacent or overlapping segments
1121 //but if mergeCloseExons then merge even when distance is up to 5 bases
1122 udata=0;
1123 uptr=NULL;
1124 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1125 isDiscarded(true);
1126 }
1127 if (ftype_id==gff_fid_transcript && CDstart>0) {
1128 ftype_id=gff_fid_mRNA;
1129 //exon_ftype_id=gff_fid_exon;
1130 }
1131 //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1132 if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1133 int mindist=mergeCloseExons ? 5:1;
1134 for (int i=0;i<exons.Count()-1;i++) {
1135 int ni=i+1;
1136 uint mend=exons[i]->end;
1137 while (ni<exons.Count()) {
1138 int dist=(int)(exons[ni]->start-mend);
1139 if (dist>mindist) break; //no merging with next segment
1140 if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1141 GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1142 gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1143 }
1144 mend=exons[ni]->end;
1145 covlen-=exons[i]->len();
1146 exons[i]->end=mend;
1147 covlen+=exons[i]->len();
1148 covlen-=exons[ni]->len();
1149 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1150 exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1151 //use the other exon attributes, if more
1152 delete(exons[i]->attrs);
1153 exons[i]->attrs=exons[ni]->attrs;
1154 exons[ni]->attrs=NULL;
1155 }
1156 exons.Delete(ni);
1157 } //check for merge with next exon
1158 } //for each exon
1159 }
1160 //attribute reduction for GTF records
1161 if (keepAttrs && !noExonAttr && !fromGff3()
1162 && exons.Count()>0 && exons[0]->attrs!=NULL) {
1163 bool attrs_discarded=false;
1164 for (int a=0;a<exons[0]->attrs->Count();a++) {
1165 int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1166 char* attr_name=names->attrs.getName(attr_name_id);
1167 char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1168 bool sameExonAttr=true;
1169 for (int i=1;i<exons.Count();i++) {
1170 char* ov=exons[i]->getAttr(attr_name_id);
1171 if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1172 sameExonAttr=false;
1173 break;
1174 }
1175 }
1176 if (sameExonAttr) {
1177 //delete this attribute from exons level
1178 attrs_discarded=true;
1179 this->addAttr(attr_name, attr_val);
1180 for (int i=1;i<exons.Count();i++) {
1181 removeExonAttr(*(exons[i]), attr_name_id);
1182 }
1183 exons[0]->attrs->freeItem(a);
1184 }
1185 }
1186 if (attrs_discarded) exons[0]->attrs->Pack();
1187 }
1188 return this;
1189 }
1190
1191 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1192 if (names==NULL)
1193 GError(ERR_NULL_GFNAMES, "parseAttrs()");
1194 if (atrlist==NULL)
1195 atrlist=new GffAttrs();
1196 char* endinfo=info+strlen(info);
1197 char* start=info;
1198 char* pch=start;
1199 while (start<endinfo) {
1200 //skip spaces
1201 while (*start==' ' && start<endinfo) start++;
1202 pch=strchr(start, ';');
1203 if (pch==NULL) pch=endinfo;
1204 else {
1205 *pch='\0';
1206 pch++;
1207 }
1208 char* ech=strchr(start,'=');
1209 if (ech!=NULL) { // attr=value format found
1210 *ech='\0';
1211 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1212 if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1213 strcmp(start, "exon_id")==0)
1214 { start=pch; continue; }
1215 ech++;
1216 while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1217 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1218 //make sure we don't add the same attribute more than once
1219 if (isExon && (strcmp(start, "protein_id")==0)) {
1220 //Ensembl special case
1221 this->addAttr(start, ech);
1222 start=pch;
1223 continue;
1224 }
1225 atrlist->add_or_update(names, start, ech);
1226 }
1227 /*
1228 else { //not an attr=value format
1229 atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1230 }
1231 */
1232 start=pch;
1233 }
1234 if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1235 }
1236
1237 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1238 if (this->attrs==NULL)
1239 this->attrs=new GffAttrs();
1240 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1241 this->attrs->add_or_update(names, attrname, attrvalue);
1242 }
1243
1244
1245 void GffObj::setFeatureName(const char* feature) {
1246 //change the feature name/type for a transcript
1247 int fid=names->feats.addName(feature);
1248 if (monoFeature() && exons.Count()>0)
1249 this->exon_ftype_id=fid;
1250 this->ftype_id=fid;
1251 }
1252
1253 void GffObj::setRefName(const char* newname) {
1254 //change the feature name/type for a transcript
1255 int rid=names->gseqs.addName(newname);
1256 this->gseq_id=rid;
1257 }
1258
1259
1260
1261 int GffObj::removeAttr(const char* attrname, const char* attrval) {
1262 if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1263 int aid=this->names->attrs.getId(attrname);
1264 if (aid<0) return 0;
1265 int delcount=0; //could be more than one ?
1266 for (int i=0;i<this->attrs->Count();i++) {
1267 if (aid==this->attrs->Get(i)->attr_id) {
1268 if (attrval==NULL ||
1269 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1270 delcount++;
1271 this->attrs->freeItem(i);
1272 }
1273 }
1274 }
1275 if (delcount>0) this->attrs->Pack();
1276 return delcount;
1277 }
1278
1279 int GffObj::removeAttr(int aid, const char* attrval) {
1280 if (this->attrs==NULL || aid<0) return 0;
1281 int delcount=0; //could be more than one ?
1282 for (int i=0;i<this->attrs->Count();i++) {
1283 if (aid==this->attrs->Get(i)->attr_id) {
1284 if (attrval==NULL ||
1285 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1286 delcount++;
1287 this->attrs->freeItem(i);
1288 }
1289 }
1290 }
1291 if (delcount>0) this->attrs->Pack();
1292 return delcount;
1293 }
1294
1295
1296 int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1297 if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1298 int aid=this->names->attrs.getId(attrname);
1299 if (aid<0) return 0;
1300 int delcount=0; //could be more than one
1301 for (int i=0;i<exon.attrs->Count();i++) {
1302 if (aid==exon.attrs->Get(i)->attr_id) {
1303 if (attrval==NULL ||
1304 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1305 delcount++;
1306 exon.attrs->freeItem(i);
1307 }
1308 }
1309 }
1310 if (delcount>0) exon.attrs->Pack();
1311 return delcount;
1312 }
1313
1314 int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1315 if (exon.attrs==NULL || aid<0) return 0;
1316 int delcount=0; //could be more than one
1317 for (int i=0;i<exon.attrs->Count();i++) {
1318 if (aid==exon.attrs->Get(i)->attr_id) {
1319 if (attrval==NULL ||
1320 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1321 delcount++;
1322 exon.attrs->freeItem(i);
1323 }
1324 }
1325 }
1326 if (delcount>0) exon.attrs->Pack();
1327 return delcount;
1328 }
1329
1330
1331 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1332 cds_start=0;
1333 cds_end=0;
1334 if (CDstart==0 || CDend==0) return; //no CDS info
1335 int cdsadj=0;
1336 if (CDphase=='1' || CDphase=='2') {
1337 cdsadj=CDphase-'0';
1338 }
1339 cds_start=CDstart;
1340 cds_end=CDend;
1341 if (strand=='-') cds_end-=cdsadj;
1342 else cds_start+=cdsadj;
1343 }
1344
1345 void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1346 //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1347 cds_mstart=0;
1348 cds_mend=0;
1349 if (CDstart==0 || CDend==0) return; //no CDS info
1350 //restore normal coordinates, just in case
1351 unxcoord();
1352 int cdsadj=0;
1353 if (CDphase=='1' || CDphase=='2') {
1354 cdsadj=CDphase-'0';
1355 }
1356 /*
1357 uint seqstart=CDstart;
1358 uint seqend=CDend;
1359 */
1360 uint seqstart=exons.First()->start;
1361 uint seqend=exons.Last()->end;
1362 int s=0; //resulting nucleotide counter
1363 if (strand=='-') {
1364 for (int x=exons.Count()-1;x>=0;x--) {
1365 uint sgstart=exons[x]->start;
1366 uint sgend=exons[x]->end;
1367 if (seqend<sgstart || seqstart>sgend) continue;
1368 if (seqstart>=sgstart && seqstart<=sgend)
1369 sgstart=seqstart; //seqstart within this segment
1370 if (seqend>=sgstart && seqend<=sgend)
1371 sgend=seqend; //seqend within this segment
1372 s+=(int)(sgend-sgstart)+1;
1373 if (CDstart>=sgstart && CDstart<=sgend) {
1374 //CDstart in this segment
1375 //and we are getting the whole transcript
1376 cds_mend=s-(int)(CDstart-sgstart);
1377 }
1378 if (CDend>=sgstart && CDend<=sgend) {
1379 //CDstart in this segment
1380 //and we are getting the whole transcript
1381 cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1382 }
1383 } //for each exon
1384 } // - strand
1385 else { // + strand
1386 for (int x=0;x<exons.Count();x++) {
1387 uint sgstart=exons[x]->start;
1388 uint sgend=exons[x]->end;
1389 if (seqend<sgstart || seqstart>sgend) continue;
1390 if (seqstart>=sgstart && seqstart<=sgend)
1391 sgstart=seqstart; //seqstart within this segment
1392 if (seqend>=sgstart && seqend<=sgend)
1393 sgend=seqend; //seqend within this segment
1394 s+=(int)(sgend-sgstart)+1;
1395 /* for (uint i=sgstart;i<=sgend;i++) {
1396 spliced[s]=gsubseq[i-gstart];
1397 s++;
1398 }//for each nt
1399 */
1400 if (CDstart>=sgstart && CDstart<=sgend) {
1401 //CDstart in this segment
1402 cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1403 }
1404 if (CDend>=sgstart && CDend<=sgend) {
1405 //CDend in this segment
1406 cds_mend=s-(int)(sgend-CDend);
1407 }
1408 } //for each exon
1409 } // + strand
1410 //spliced[s]=0;
1411 //if (rlen!=NULL) *rlen=s;
1412 //return spliced;
1413 }
1414
1415 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1416 {
1417 if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1418 return NULL;
1419 }
1420 //restore normal coordinates:
1421 unxcoord();
1422 if (exons.Count()==0) return NULL;
1423 int fspan=end-start+1;
1424 const char* gsubseq=faseq->subseq(start, fspan);
1425 if (gsubseq==NULL) {
1426 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1427 }
1428 char* unspliced=NULL;
1429
1430 int seqstart=exons.First()->start;
1431 int seqend=exons.Last()->end;
1432
1433 int unsplicedlen = 0;
1434
1435 unsplicedlen += seqend - seqstart + 1;
1436
1437 GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1438 //uint seqstart, seqend;
1439
1440 int s = 0; //resulting nucleotide counter
1441 if (strand=='-')
1442 {
1443 if (seglst!=NULL)
1444 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1445 for (int i=seqend;i>=seqstart;i--)
1446 {
1447 unspliced[s] = ntComplement(gsubseq[i-start]);
1448 s++;
1449 }//for each nt
1450 } // - strand
1451 else
1452 { // + strand
1453 if (seglst!=NULL)
1454 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1455 for (int i=seqstart;i<=seqend;i++)
1456 {
1457 unspliced[s]=gsubseq[i-start];
1458 s++;
1459 }//for each nt
1460 } // + strand
1461 //assert(s <= unsplicedlen);
1462 unspliced[s]=0;
1463 if (rlen!=NULL) *rlen=s;
1464 return unspliced;
1465 }
1466
1467 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1468 GList<GSeg>* seglst) {
1469 if (CDSonly && CDstart==0) return NULL;
1470 if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1471 return NULL;
1472 }
1473 //restore normal coordinates:
1474 unxcoord();
1475 if (exons.Count()==0) return NULL;
1476 int fspan=end-start+1;
1477 const char* gsubseq=faseq->subseq(start, fspan);
1478 if (gsubseq==NULL) {
1479 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1480 }
1481 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1482 int endadj=end-start+1-fspan;
1483 uint prevend=end;
1484 end-=endadj;
1485 if (CDend>end) CDend=end;
1486 if (exons.Last()->end>end) {
1487 exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1488 if (exons.Last()->start>exons.Last()->end) {
1489 GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1490 prevend,getGSeqName(), getID());
1491 }
1492 covlen-=endadj;
1493 }
1494 }
1495 char* spliced=NULL;
1496 GMALLOC(spliced, covlen+1); //allocate more here
1497 uint seqstart, seqend;
1498 int cdsadj=0;
1499 if (CDphase=='1' || CDphase=='2') {
1500 cdsadj=CDphase-'0';
1501 }
1502 if (CDSonly) {
1503 seqstart=CDstart;
1504 seqend=CDend;
1505 if (strand=='-') seqend-=cdsadj;
1506 else seqstart+=cdsadj;
1507 }
1508 else {
1509 seqstart=exons.First()->start;
1510 seqend=exons.Last()->end;
1511 }
1512 int s=0; //resulting nucleotide counter
1513 if (strand=='-') {
1514 for (int x=exons.Count()-1;x>=0;x--) {
1515 uint sgstart=exons[x]->start;
1516 uint sgend=exons[x]->end;
1517 if (seqend<sgstart || seqstart>sgend) continue;
1518 if (seqstart>=sgstart && seqstart<=sgend)
1519 sgstart=seqstart; //seqstart within this segment
1520 if (seqend>=sgstart && seqend<=sgend)
1521 sgend=seqend; //seqend within this segment
1522 if (seglst!=NULL)
1523 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1524 for (uint i=sgend;i>=sgstart;i--) {
1525 spliced[s] = ntComplement(gsubseq[i-start]);
1526 s++;
1527 }//for each nt
1528
1529 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1530 if (CDstart>=sgstart && CDstart<=sgend) {
1531 //CDstart in this segment
1532 //and we are getting the whole transcript
1533 *cds_end=s-(CDstart-sgstart);
1534 }
1535 if (CDend>=sgstart && CDend<=sgend) {
1536 //CDstart in this segment
1537 //and we are getting the whole transcript
1538 *cds_start=s-(CDend-cdsadj-sgstart);
1539 }
1540 }//update local CDS coordinates
1541 } //for each exon
1542 } // - strand
1543 else { // + strand
1544 for (int x=0;x<exons.Count();x++) {
1545 uint sgstart=exons[x]->start;
1546 uint sgend=exons[x]->end;
1547 if (seqend<sgstart || seqstart>sgend) continue;
1548 if (seqstart>=sgstart && seqstart<=sgend)
1549 sgstart=seqstart; //seqstart within this segment
1550 if (seqend>=sgstart && seqend<=sgend)
1551 sgend=seqend; //seqend within this segment
1552 if (seglst!=NULL)
1553 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1554 for (uint i=sgstart;i<=sgend;i++) {
1555 spliced[s]=gsubseq[i-start];
1556 s++;
1557 }//for each nt
1558 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1559 if (CDstart>=sgstart && CDstart<=sgend) {
1560 //CDstart in this segment
1561 //and we are getting the whole transcript
1562 *cds_start=s-(sgend-CDstart-cdsadj);
1563 }
1564 if (CDend>=sgstart && CDend<=sgend) {
1565 //CDstart in this segment
1566 //and we are getting the whole transcript
1567 *cds_end=s-(sgend-CDend);
1568 }
1569 }//update local CDS coordinates
1570 } //for each exon
1571 } // + strand
1572 spliced[s]=0;
1573 if (rlen!=NULL) *rlen=s;
1574 return spliced;
1575 }
1576
1577 char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1578 if (CDSonly && CDstart==0) return NULL;
1579 //restore normal coordinates:
1580 unxcoord();
1581 if (exons.Count()==0) return NULL;
1582 int fspan=end-start+1;
1583 const char* gsubseq=faseq->subseq(start, fspan);
1584 if (gsubseq==NULL) {
1585 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1586 }
1587
1588 char* translation=NULL;
1589 GMALLOC(translation, (int)(covlen/3)+1);
1590 uint seqstart, seqend;
1591 int cdsadj=0;
1592 if (CDphase=='1' || CDphase=='2') {
1593 cdsadj=CDphase-'0';
1594 }
1595 if (CDSonly) {
1596 seqstart=CDstart;
1597 seqend=CDend;
1598 if (strand=='-') seqend-=cdsadj;
1599 else seqstart+=cdsadj;
1600 }
1601 else {
1602 seqstart=exons.First()->start;
1603 seqend=exons.Last()->end;
1604 }
1605 Codon codon;
1606 int nt=0; //codon nucleotide counter (0..2)
1607 int aa=0; //aminoacid count
1608 if (strand=='-') {
1609 for (int x=exons.Count()-1;x>=0;x--) {
1610 uint sgstart=exons[x]->start;
1611 uint sgend=exons[x]->end;
1612 if (seqend<sgstart || seqstart>sgend) continue;
1613 if (seqstart>=sgstart && seqstart<=sgend)
1614 sgstart=seqstart; //seqstart within this segment
1615 if (seqend>=sgstart && seqend<=sgend) {
1616 sgend=seqend; //seqend within this segment
1617 }
1618 for (uint i=sgend;i>=sgstart;i--) {
1619 codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1620 nt++;
1621 if (nt==3) {
1622 nt=0;
1623 translation[aa]=codon.translate();
1624 aa++;
1625 }
1626 }//for each nt
1627 } //for each exon
1628 } // - strand
1629 else { // + strand
1630 for (int x=0;x<exons.Count();x++) {
1631 uint sgstart=exons[x]->start;
1632 uint sgend=exons[x]->end;
1633 if (seqend<sgstart || seqstart>sgend) continue;
1634 if (seqstart>=sgstart && seqstart<=sgend)
1635 sgstart=seqstart; //seqstart within this segment
1636 if (seqend>=sgstart && seqend<=sgend)
1637 sgend=seqend; //seqend within this segment
1638 for (uint i=sgstart;i<=sgend;i++) {
1639 codon.nuc[nt]=gsubseq[i-start];
1640 nt++;
1641 if (nt==3) {
1642 nt=0;
1643 translation[aa]=codon.translate();
1644 aa++;
1645 }
1646 }//for each nt
1647 } //for each exon
1648 } // + strand
1649 translation[aa]=0;
1650 if (rlen!=NULL) *rlen=aa;
1651 return translation;
1652 }
1653
1654 void GffObj::printSummary(FILE* fout) {
1655 if (fout==NULL) fout=stdout;
1656 fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1657 strand, start, end, gscore, (float)qcov/10.0);
1658 }
1659
1660 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1661 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1662 static char scorestr[14];
1663 strcpy(scorestr,".");
1664 GffAttrs* xattrs=NULL;
1665 if (exidx>=0) {
1666 if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1667 xattrs=exons[exidx]->attrs;
1668 }
1669 if (phase==0 || !iscds) phase='.';
1670 const char* ftype=iscds ? "CDS" : getSubfName();
1671 if (gff3) {
1672 fprintf(fout,
1673 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1674 gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1675 phase, gffID);
1676 if (xattrs!=NULL) {
1677 for (int i=0;i<xattrs->Count();i++)
1678 fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1679 xattrs->Get(i)->attr_val);
1680 }
1681 fprintf(fout, "\n");
1682 } //GFF
1683 else {//for GTF -- we print only transcripts
1684 //if (isValidTranscript())
1685 fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1686 gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1687 //char* geneid=(geneID!=NULL)? geneID : gffID;
1688 if (geneID)
1689 fprintf(fout," gene_id \"%s\";",geneID);
1690 if (gene_name!=NULL) {
1691 //fprintf(fout, " gene_name ");
1692 //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1693 // else fprintf(fout, "\"%s\";",gene_name);
1694 fprintf(fout," gene_name \"%s\";",gene_name);
1695 }
1696 if (xattrs!=NULL) {
1697 for (int i=0;i<xattrs->Count();i++) {
1698 if (xattrs->Get(i)->attr_val==NULL) continue;
1699 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1700 fprintf(fout, " %s ",attrname);
1701 if (xattrs->Get(i)->attr_val[0]=='"')
1702 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1703 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1704 }
1705 }
1706 //for GTF, also append the GffObj attributes to each exon line
1707 if ((xattrs=this->attrs)!=NULL) {
1708 for (int i=0;i<xattrs->Count();i++) {
1709 if (xattrs->Get(i)->attr_val==NULL) continue;
1710 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1711 fprintf(fout, " %s ",attrname);
1712 if (xattrs->Get(i)->attr_val[0]=='"')
1713 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1714 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1715 }
1716 }
1717 fprintf(fout, "\n");
1718 }//GTF
1719 }
1720
1721 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1722 const char* tlabel, const char* gfparent) {
1723 static char tmpstr[255];
1724 if (tlabel==NULL) {
1725 tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1726 (char*)"gffobj" ;
1727 }
1728 unxcoord();
1729 //if (exons.Count()==0) return;
1730 const char* gseqname=names->gseqs.Get(gseq_id)->name;
1731 bool gff3 = (gffp>=pgffAny);
1732 bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1733 bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1734 if (gff3) {
1735 //print GFF3 mRNA line:
1736 if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1737 else strcpy(tmpstr,".");
1738 uint pstart, pend;
1739 if (gffp==pgffCDS) {
1740 pstart=CDstart;
1741 pend=CDend;
1742 }
1743 else { pstart=start;pend=end; }
1744 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1745 const char* ftype=getFeatureName();
1746 fprintf(fout,
1747 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1748 gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1749 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1750 if (gfparent!=NULL) {
1751 //parent override
1752 fprintf(fout, ";Parent=%s",gfparent);
1753 }
1754 else {
1755 if (parent!=NULL && !parent->isDiscarded())
1756 fprintf(fout, ";Parent=%s",parent->getID());
1757 }
1758 if (geneID!=NULL)
1759 fprintf(fout, ";geneID=%s",geneID);
1760 if (gene_name!=NULL)
1761 fprintf(fout, ";gene_name=%s",gene_name);
1762 if (attrs!=NULL) {
1763 for (int i=0;i<attrs->Count();i++) {
1764 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1765 fprintf(fout,";%s=%s", attrname,
1766 attrs->Get(i)->attr_val);
1767 }
1768 }
1769 fprintf(fout,"\n");
1770 }// gff3 mRNA line
1771 if (showExon) {
1772 //print exons
1773 if (isCDS && exons.Count()>0 &&
1774 ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1775 updateExonPhase();
1776
1777 for (int i=0;i<exons.Count();i++) {
1778 printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1779 }
1780 }//printing exons
1781 if (showCDS && !isCDS && CDstart>0) {
1782 GArray<GffCDSeg> cds(true,true);
1783 getCDSegs(cds);
1784 for (int i=0;i<cds.Count();i++) {
1785 printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1786 }
1787 } //showCDS
1788 }
1789
1790 void GffObj::updateExonPhase() {
1791 if (!isCDS) return;
1792 int cdsacc=0;
1793 if (CDphase=='1' || CDphase=='2') {
1794 cdsacc+= 3-(CDphase-'0');
1795 }
1796 if (strand=='-') { //reverse strand
1797 for (int i=exons.Count()-1;i>=0;i--) {
1798 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1799 cdsacc+=exons[i]->end-exons[i]->start+1;
1800 }
1801 }
1802 else { //forward strand
1803 for (int i=0;i<exons.Count();i++) {
1804 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1805 cdsacc+=exons[i]->end-exons[i]->start+1;
1806 }
1807 }
1808 }
1809
1810
1811 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1812 GffCDSeg cdseg;
1813 int cdsacc=0;
1814 if (CDphase=='1' || CDphase=='2') {
1815 cdsacc+= 3-(CDphase-'0');
1816 }
1817 if (strand=='-') {
1818 for (int x=exons.Count()-1;x>=0;x--) {
1819 uint sgstart=exons[x]->start;
1820 uint sgend=exons[x]->end;
1821 if (CDend<sgstart || CDstart>sgend) continue;
1822 if (CDstart>=sgstart && CDstart<=sgend)
1823 sgstart=CDstart; //cdstart within this segment
1824 if (CDend>=sgstart && CDend<=sgend)
1825 sgend=CDend; //cdend within this segment
1826 cdseg.start=sgstart;
1827 cdseg.end=sgend;
1828 cdseg.exonidx=x;
1829 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1830 cdseg.phase='0'+ (3-cdsacc%3)%3;
1831 cdsacc+=sgend-sgstart+1;
1832 cds.Add(cdseg);
1833 } //for each exon
1834 } // - strand
1835 else { // + strand
1836 for (int x=0;x<exons.Count();x++) {
1837 uint sgstart=exons[x]->start;
1838 uint sgend=exons[x]->end;
1839 if (CDend<sgstart || CDstart>sgend) continue;
1840 if (CDstart>=sgstart && CDstart<=sgend)
1841 sgstart=CDstart; //seqstart within this segment
1842 if (CDend>=sgstart && CDend<=sgend)
1843 sgend=CDend; //seqend within this segment
1844 cdseg.start=sgstart;
1845 cdseg.end=sgend;
1846 cdseg.exonidx=x;
1847 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1848 cdseg.phase='0' + (3-cdsacc%3)%3 ;
1849 cdsacc+=sgend-sgstart+1;
1850 cds.Add(cdseg);
1851 } //for each exon
1852 } // + strand
1853 }