ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 55
Committed: Thu Sep 8 05:46:53 2011 UTC (8 years, 1 month ago) by gpertea
File size: 65483 byte(s)
Log Message:
RPS20P29, ENSG00000241828 not collapsing

Line User Rev File contents
1 gpertea 2 #include "gff.h"
2    
3     //GffNames* GffReader::names=NULL;
4     GffNames* GffObj::names=NULL;
5     //global set of feature names, attribute names etc.
6     // -- common for all GffObjs in current application!
7    
8 gpertea 16 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9     const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10     const uint GFF_MAX_INTRON= 6000000;
11     bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 gpertea 2 const int gff_fid_mRNA=0;
13 gpertea 16 const int gff_fid_transcript=1;
14     const int gff_fid_exon=2;
15     const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16     const uint gfo_flag_HAS_ERRORS = 0x00000001;
17     const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18     const uint gfo_flag_IS_GENE = 0x00000004;
19     const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20     const uint gfo_flag_FROM_GFF3 = 0x00000010;
21     const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22     const uint gfo_flag_DISCARDED = 0x00000100;
23     const uint gfo_flag_LST_KEEP = 0x00000200;
24     const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25     const byte gfo_flagShift_LEVEL = 16;
26 gpertea 2
27     void gffnames_ref(GffNames* &n) {
28     if (n==NULL) n=new GffNames();
29     n->numrefs++;
30     }
31    
32     void gffnames_unref(GffNames* &n) {
33     if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34     n->numrefs--;
35     if (n->numrefs==0) { delete n; n=NULL; }
36     }
37    
38     int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39    
40     GffObj& g1=*((GffObj*)p1);
41     GffObj& g2=*((GffObj*)p2);
42 gpertea 16 if (g1.gseq_id==g2.gseq_id) {
43     if (g1.start!=g2.start)
44     return (int)(g1.start-g2.start);
45     else if (g1.getLevel()!=g2.getLevel())
46     return (int)(g1.getLevel()-g2.getLevel());
47     else
48     if (g1.end!=g2.end)
49     return (int)(g1.end-g2.end);
50     else return strcmp(g1.getID(), g2.getID());
51     }
52     else return (int)(g1.gseq_id-g2.gseq_id);
53 gpertea 2 }
54    
55 gpertea 16 char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56     //parse a key attribute and remove it from the info string
57     //(only works for attributes that have values following them after ' ' or '=')
58     static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59     int lpre=strlen(pre);
60     char cend=pre[lpre-1];
61     char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62     if (pos==NULL) return NULL;
63     char* findstart=info;
64     //require word boundary on the left:
65     while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66     findstart=pos+lpre;
67     pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68     }
69     if (pos==NULL) return NULL;
70     if (cend!=' ' && cend!='=') {
71     //require word boundary on the right:
72     while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73     findstart=pos+lpre;
74     pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75     }
76     }
77     if (pos==NULL) return NULL;
78     char* vp=pos+lpre;
79     while (*vp==' ') vp++;
80     if (*vp==';' || *vp==0)
81     GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82     bool dq_enclosed=false; //value string enclosed by double quotes
83     if (*vp=='"') {
84     dq_enclosed=true;
85     vp++;
86     }
87     if (enforce_GTF2 && !dq_enclosed)
88     GError(GTF2_ERR,pre, dupline);
89     char* vend=vp;
90     if (dq_enclosed) {
91     while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92     }
93     else {
94     while (*vend!=';' && *vend!=0) vend++;
95     }
96     if (enforce_GTF2 && *vend!='"')
97     GError(GTF2_ERR, pre, dupline);
98     char *r=Gstrdup(vp, vend-1);
99     //-- now remove this attribute from the info string
100     while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101     if (*vend==0) vend--;
102     for (char *src=vend, *dest=pos;;src++,dest++) {
103     *dest=*src;
104     if (*src==0) break;
105     }
106     return r;
107     }
108 gpertea 2
109 gpertea 16 static char fnamelc[128];
110    
111 gpertea 2 GffLine::GffLine(GffReader* reader, const char* l) {
112 gpertea 16 llen=strlen(l);
113     GMALLOC(line,llen+1);
114     memcpy(line, l, llen+1);
115     GMALLOC(dupline, llen+1);
116     memcpy(dupline, l, llen+1);
117 gpertea 2 skip=true;
118     gseqname=NULL;
119     track=NULL;
120     ftype=NULL;
121     info=NULL;
122 gpertea 16 _parents=NULL;
123     _parents_len=0;
124     num_parents=0;
125     parents=NULL;
126     is_gff3=false;
127 gpertea 2 is_cds=false;
128 gpertea 16 is_transcript=false;
129 gpertea 2 is_exon=false;
130 gpertea 16 is_gene=false;
131 gpertea 2 exontype=0;
132 gpertea 16 gene_id=NULL;
133     gene_name=NULL;
134 gpertea 2 qstart=0;
135     qend=0;
136     qlen=0;
137     ID=NULL;
138     char* t[9];
139     int i=0;
140     int tidx=1;
141     t[0]=line;
142    
143     while (line[i]!=0) {
144     if (line[i]=='\t') {
145     line[i]=0;
146     t[tidx]=line+i+1;
147     tidx++;
148     if (tidx>8) break;
149     }
150     i++;
151     }
152    
153     if (tidx<8) { // ignore non-GFF lines
154     // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
155     return;
156     }
157     gseqname=t[0];
158     track=t[1];
159     ftype=t[2];
160     info=t[8];
161     char* p=t[3];
162     if (!parseUInt(p,fstart))
163     GError("Error parsing start coordinate from GFF line:\n%s\n",l);
164     p=t[4];
165     if (!parseUInt(p,fend))
166     GError("Error parsing end coordinate from GFF line:\n%s\n",l);
167 gpertea 16 if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
168 gpertea 2 p=t[5];
169     if (p[0]=='.' && p[1]==0) {
170     score=0;
171     }
172     else {
173     if (!parseDouble(p,score))
174     GError("Error parsing feature score from GFF line:\n%s\n",l);
175     }
176     strand=*t[6];
177     if (strand!='+' && strand!='-' && strand!='.')
178     GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
179 gpertea 16 phase=*t[7]; // must be '.', '0', '1' or '2'
180 gpertea 2 ID=NULL;
181     // exon/CDS/mrna filter
182 gpertea 16 strncpy(fnamelc, ftype, 127);
183     fnamelc[127]=0;
184 gpertea 2 strlower(fnamelc); //convert to lower case
185 gpertea 16 bool is_t_data=false;
186 gpertea 2 if (strstr(fnamelc, "utr")!=NULL) {
187     exontype=exgffUTR;
188     is_exon=true;
189 gpertea 16 is_t_data=true;
190 gpertea 2 }
191     else if (strstr(fnamelc, "exon")!=NULL) {
192     exontype=exgffExon;
193     is_exon=true;
194 gpertea 16 is_t_data=true;
195 gpertea 2 }
196 gpertea 16 else if (strstr(fnamelc, "stop") &&
197     (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
198 gpertea 2 exontype=exgffStop;
199 gpertea 16 is_cds=true; //though some place it outside the last CDS segment
200     is_t_data=true;
201     }
202     else if (strstr(fnamelc, "start") &&
203     ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
204     exontype=exgffStart;
205 gpertea 2 is_cds=true;
206 gpertea 16 is_t_data=true;
207 gpertea 2 }
208     else if (strcmp(fnamelc, "cds")==0) {
209     exontype=exgffCDS;
210     is_cds=true;
211 gpertea 16 is_t_data=true;
212 gpertea 2 }
213 gpertea 16 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
214     is_gene=true;
215     is_t_data=true; //because its name will be attached to parented transcripts
216 gpertea 2 }
217 gpertea 16 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
218     is_transcript=true;
219     is_t_data=true;
220     }
221 gpertea 2
222 gpertea 16 if (reader->transcriptsOnly && !is_t_data) {
223     char* id=extractAttr("ID=");
224     if (id==NULL) id=extractAttr("transcript_id");
225     //GMessage("Discarding non-transcript line:\n%s\n",l);
226     if (id!=NULL) {
227     reader->discarded_ids.Add(id, new int(1));
228     GFREE(id);
229     }
230     return; //skip this line, unwanted feature name
231 gpertea 2 }
232 gpertea 16 ID=extractAttr("ID=");
233     char* Parent=extractAttr("Parent=");
234     is_gff3=(ID!=NULL || Parent!=NULL);
235     if (is_gff3) {
236     //parse as GFF3
237     if (ID!=NULL) {
238     //has ID attr so it's likely to be a parent feature
239     //look for explicit gene name
240     gene_name=extractAttr("gene_name=",false);
241     if (gene_name==NULL) {
242     gene_name=extractAttr("geneName=",false);
243     if (gene_name==NULL) {
244     gene_name=extractAttr("gene_sym=",false);
245     if (gene_name==NULL) {
246     gene_name=extractAttr("gene=",false);
247     }
248     }
249     }
250     gene_id=extractAttr("geneID=",false);
251     if (gene_id==NULL) {
252     gene_id=extractAttr("gene_id=",false);
253     }
254     if (is_gene) {
255     //special case: keep the Name and ID attributes of the gene feature
256     if (gene_name==NULL)
257     gene_name=extractAttr("Name=");
258     if (gene_id==NULL) //the ID is also gene_id in this case
259     gene_id=Gstrdup(ID);
260     //skip=false;
261     //return;
262     GFREE(Parent); //TMI, we really don't care about gene Parents?
263     } //gene feature
264     }// has GFF3 ID
265     if (Parent!=NULL) {
266     //keep Parent attr
267     //parse multiple parents
268     num_parents=1;
269     p=Parent;
270     int last_delim_pos=-1;
271     while (*p!=';' && *p!=0) {
272     if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
273     num_parents++;
274     last_delim_pos=(p-Parent);
275     }
276     p++;
277     }
278     _parents_len=p-Parent+1;
279     _parents=Parent;
280     GMALLOC(parents, num_parents*sizeof(char*));
281     parents[0]=_parents;
282     int i=1;
283     if (last_delim_pos>0) {
284     for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
285     if (*p==',') {
286     char* ep=p-1;
287     while (*ep==' ' && ep>_parents) ep--;
288     *(ep+1)=0; //end the string there
289     parents[i]=p+1;
290     i++;
291     }
292     }
293     }
294     } //has Parent field
295     } //GFF3
296     else { // GTF-like expected
297     Parent=extractAttr("transcript_id");
298     if (Parent!=NULL) { //GTF2 format detected
299     if (is_transcript) {
300     // atypical GTF with a parent transcript line declared
301     ID=Parent;
302     Parent=NULL;
303     }
304     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
305     gene_name=extractAttr("gene_name");
306     if (gene_name==NULL) {
307     gene_name=extractAttr("gene_sym");
308     if (gene_name==NULL)
309     gene_name=extractAttr("gene");
310     }
311 gpertea 2 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
312     //after the attribute name
313     p=info;
314     bool noed=true; //not edited after the last delim
315     bool nsp=false; //non-space found after last delim
316     while (*p!=0) {
317 gpertea 16 if (*p==' ') {
318     if (nsp && noed) {
319     *p='=';
320     noed=false;
321     p++;
322     continue;
323     }
324     }
325     else nsp=true; //non-space
326     if (*p==';') { noed=true; nsp=false; }
327     p++;
328     }
329     } //GTF2 detected (no parent line)
330     else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
331     //char* fexon=strstr(fnamelc, "exon");
332     //if (fexon!=NULL) {
333     if (exontype==exgffExon) {
334 gpertea 2 if (startsWith(track,"jigsaw")) {
335 gpertea 16 is_cds=true;
336     strcpy(track,"jigsaw");
337     p=strchr(info,';');
338     if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
339     else { Parent=Gstrdup(info,p-1);
340     info=p+1;
341     }
342     }
343     } //exon feature?
344     if (Parent==NULL && exontype>=exgffCDS &&
345     (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
346     //one word ID ? really desperate attempt to parse it here
347 gpertea 2 Parent=Gstrdup(info,info+i-1);
348 gpertea 16 info=NULL; //discard anything else on the line
349     }
350     }
351     if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
352     _parents=Parent;
353     GMALLOC(parents,sizeof(char*));
354     num_parents=1;
355     parents[0]=_parents;
356     }
357     } //GTF-like
358 gpertea 2
359 gpertea 16 //parse other potentially useful features
360     if (is_gff3) {
361     if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
362     p+=7;
363     while (*p!=';' && *p!=0 && *p!=' ') p++;
364     if (*p!=' ') {
365     GError("Error parsing target coordinates from GFF line:\n%s\n",l);
366     }
367     if (!parseUInt(p,qstart))
368     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
369     if (*p!=' ') {
370     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
371     }
372     p++;
373     if (!parseUInt(p,qend))
374     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
375 gpertea 2 }
376 gpertea 16 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
377     p+=5;
378     if (!parseUInt(p,qstart))
379     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
380     if (*p!='-') {
381     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
382     }
383     p++;
384     if (!parseUInt(p,qend))
385     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
386     if (*p=='|' || *p==':') {
387     p++;
388     if (!parseUInt(p,qlen))
389     GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
390     }
391     }//has Qreg attr
392     if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
393     p+=5;
394     if (!parseUInt(p,qlen))
395     GError("Error parsing target length from GFF Qlen:\n%s\n",l);
396 gpertea 2 }
397 gpertea 16 }//parsing some useful attributes in GFF3 records
398     if (ID==NULL && parents==NULL) {
399     if (reader->gff_warns)
400     GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
401     return; //skip
402 gpertea 2 }
403     skip=false;
404     }
405    
406 gpertea 55
407     void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
408     if (cd_start>=this->start) {
409     this->CDstart=cd_start;
410     if (strand=='+') this->CDphase=phase;
411     }
412     else this->CDstart=this->start;
413     if (cd_end<=this->end) {
414     this->CDend=cd_end;
415     if (strand=='-') this->CDphase=phase;
416     }
417     else this->CDend=this->end;
418     isTranscript(true);
419     exon_ftype_id=gff_fid_exon;
420     if (monoFeature()) {
421     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
422     else exons[0]->exontype=exgffExon;
423     }
424     }
425    
426 gpertea 16 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
427 gpertea 2 //this will make sure we have the right subftype_id!
428 gpertea 55 //int subf_id=-1;
429     if (!isTranscript() && gl->is_cds) {
430 gpertea 50 isTranscript(true);
431     exon_ftype_id=gff_fid_exon;
432     if (exons.Count()==1) exons[0]->exontype=exgffExon;
433     }
434 gpertea 16 if (isTranscript()) {
435     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
436     if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
437     else exon_ftype_id=names->feats.addName(gl->ftype);
438     }
439 gpertea 2 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
440 gpertea 16 if (gl->exontype==0 && !gl->is_transcript) {
441     //extraneous mRNA feature, discard
442     if (reader->gff_warns)
443     GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
444     gl->ftype, gffID);
445 gpertea 2 return -1;
446 gpertea 16 }
447 gpertea 2 }
448 gpertea 16 else { //non-mRNA parent feature, check this subf type
449 gpertea 55 int subf_id=names->feats.addName(gl->ftype);
450 gpertea 16 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
451     exon_ftype_id=subf_id;
452     else {
453     if (exon_ftype_id!=subf_id) {
454 gpertea 50 //
455 gpertea 16 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
456     //the existing exon was just a dummy one created by default, discard it
457     exons.Clear();
458     covlen=0;
459     exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
460     }
461     else { //multiple subfeatures, prefer those with
462     if (reader->gff_warns)
463     GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
464     names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
465     if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
466     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
467     exon_ftype_id=subf_id;
468     exons.Clear();
469     covlen=0;
470     }
471     else { //discard new feature
472     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
473     return -1; //skip this 2nd subfeature type for this parent!
474     }
475     }
476     } //incoming subfeature is of different type
477     } //new subfeature type
478     } //non-mRNA parent
479 gpertea 2 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
480     gl->qstart,gl->qend, gl->is_cds, gl->exontype);
481 gpertea 16 if (eidx<0) return eidx; //this should never happen
482     if (keepAttr) {
483     if (noExonAttr) {
484     if (attrs==NULL) //place the parsed attributes directly at transcript level
485     parseAttrs(attrs, gl->info);
486     }
487     else { //need all exon-level attributes
488     parseAttrs(exons[eidx]->attrs, gl->info, true);
489     }
490 gpertea 2 }
491     return eidx;
492     }
493    
494    
495     int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
496     if (exons.Count()==0) {
497     if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
498 gpertea 16 if (exon_ftype_id<0) {
499     exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
500 gpertea 2 }
501     }
502 gpertea 16 //special treatment of start/stop codon features, they might be broken/split between exons
503     //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
504     //so we should not trust the end coordinate for such features
505     if (exontype==exgffStart || exontype==exgffStop) {
506     if (strand=='-') segstart=segend;
507     else segend=segstart;
508     if (exontype==exgffStart) {
509     if (CDstart==0 || segstart<CDstart) CDstart=segstart;
510     }
511     else {
512     if (segstart>CDend) CDend=segstart;
513     }
514     }
515     else if (iscds) { //update CDS anchors:
516 gpertea 2 if (CDstart==0 || segstart<CDstart) {
517     CDstart=segstart;
518     if (exontype==exgffCDS && strand=='+') CDphase=fr;
519     }
520     if (segend>CDend) {
521     if (exontype==exgffCDS && strand=='-') CDphase=fr;
522     CDend=segend;
523     }
524     }
525 gpertea 16 else { // not a CDS/start/stop
526 gpertea 2 isCDS=false;
527     }
528     if (qs || qe) {
529     if (qs>qe) swap(qs,qe);
530     if (qs==0) qs=1;
531 gpertea 16 }
532 gpertea 2 int ovlen=0;
533 gpertea 16 if (exontype>0) { //check for overlaps between exon-type segments
534     int oi=exonOverlapIdx(segstart, segend, &ovlen);
535     if (oi>=0) { //overlap existing segment
536     if (ovlen==0) {
537     //adjacent segments will be merged
538     //e.g. CDS to (UTR|exon)
539     if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
540     (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
541     expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
542     return oi;
543     }
544     //CDS adjacent to stop_codon: UCSC does (did?) this
545     if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
546     (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
547     expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
548     return oi;
549     }
550     }
551     //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
552     // start_codon within (CDS|exon)
553     if (exons[oi]->exontype>exontype &&
554     exons[oi]->start<=segstart && exons[oi]->end>=segend &&
555     !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
556     //larger segment given first, now the smaller included one is redundant
557     return oi; //only used to store attributes from current GffLine
558     }
559     if (exontype>exons[oi]->exontype &&
560     segstart<=exons[oi]->start && segend>=exons[oi]->end &&
561     !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
562     //smaller segment given first, so we have to enlarge it
563     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
564     //this should also check for overlapping next exon (oi+1) ?
565     return oi;
566     }
567     //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
568     //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
569     //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
570     // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
571     /*
572     if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
573     if (gff_show_warnings)
574     GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
575     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
576     hasErrors(true);
577     return -1; //segment NOT added
578     }
579     */
580 gpertea 2
581 gpertea 16 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
582     if (gff_show_warnings)
583     GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
584     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
585     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
586     return oi;
587     }
588     // else add the segment if the overlap is small and between two CDS segments
589     //TODO: we might want to add an attribute here with the slippage coordinate and size?
590     covlen-=ovlen;
591     }//overlap or adjacent to existing segment
592     } //check for overlap
593 gpertea 2 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
594     // create & add the new segment
595 gpertea 51 /*
596     if (start>0 && exontype==exgffCDS && exons.Count()==0) {
597     //adding a CDS directly as the first subfeature of a declared parent
598     segstart=start;
599     segend=end;
600     }
601     */
602 gpertea 2 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
603     int eidx=exons.Add(enew);
604     if (eidx<0) {
605 gpertea 16 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
606     if (gff_show_warnings)
607     GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
608 gpertea 2 segstart, segend, gffID);
609 gpertea 16 delete enew;
610     hasErrors(true);
611 gpertea 2 return -1;
612     }
613     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
614 gpertea 55 //adjust parent feature coordinates to contain this exon
615     if (start==0 || start>exons.First()->start) {
616 gpertea 51 start=exons.First()->start;
617     }
618 gpertea 55 if (end<exons.Last()->end) end=exons.Last()->end;
619    
620 gpertea 2 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
621     GSeqStat* gsd=(GSeqStat*)uptr;
622     if (start<gsd->mincoord) gsd->mincoord=start;
623     if (end>gsd->maxcoord) gsd->maxcoord=end;
624 gpertea 16 if (this->len()>gsd->maxfeat_len) {
625     gsd->maxfeat_len=this->len();
626     gsd->maxfeat=this;
627     }
628 gpertea 2 }
629     return eidx;
630     }
631    
632 gpertea 16 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
633     //oi is the index of the *first* overlapping segment found that must be enlarged
634     covlen-=exons[oi]->len();
635     if (segstart<exons[oi]->start)
636     exons[oi]->start=segstart;
637     if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
638     if (segend>exons[oi]->end)
639     exons[oi]->end=segend;
640     if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
641     //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
642     if (sc!=0) exons[oi]->score=sc;
643     covlen+=exons[oi]->len();
644     //if (exons[oi]->exontype< exontype) -- always true
645     exons[oi]->exontype = exontype;
646     if (exontype==exgffCDS) exons[oi]->phase=fr;
647     //we must check if any more exons are also overlapping this
648     int ni=oi+1; //next exon index after oi
649     while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
650     //only allow this if next segment is fully included, and a subordinate
651     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
652     /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
653     chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
654     chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
655     chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
656     */
657     if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
658     if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
659     exons.Delete(ni);
660     }
661     else {
662     if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
663     exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
664     //hasErrors(true);
665     break;
666     }
667     }
668     // -- make sure any other related boundaries are updated:
669     start=exons.First()->start;
670     end=exons.Last()->end;
671     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
672     GSeqStat* gsd=(GSeqStat*)uptr;
673     if (start<gsd->mincoord) gsd->mincoord=start;
674     if (end>gsd->maxcoord) gsd->maxcoord=end;
675     if (this->len()>gsd->maxfeat_len) {
676     gsd->maxfeat_len=this->len();
677     gsd->maxfeat=this;
678     }
679     }
680     }
681    
682 gpertea 2 void GffObj::removeExon(int idx) {
683     /*
684     if (idx==0 && segs[0].start==gstart)
685     gstart=segs[1].start;
686     if (idx==segcount && segs[segcount].end==gend)
687     gend=segs[segcount-1].end;
688     */
689     if (idx<0 || idx>=exons.Count()) return;
690     int segstart=exons[idx]->start;
691     int segend=exons[idx]->end;
692     exons.Delete(idx);
693     covlen -= (int)(segend-segstart)+1;
694     start=exons.First()->start;
695     end=exons.Last()->end;
696     if (isCDS) { CDstart=start; CDend=end; }
697     }
698    
699 gpertea 16 void GffObj::removeExon(GffExon* p) {
700     for (int idx=0;idx<exons.Count();idx++) {
701     if (exons[idx]==p) {
702     int segstart=exons[idx]->start;
703     int segend=exons[idx]->end;
704     exons.Delete(idx);
705     covlen -= (int)(segend-segstart)+1;
706     start=exons.First()->start;
707     end=exons.Last()->end;
708     if (isCDS) { CDstart=start; CDend=end; }
709     return;
710     }
711     }
712     }
713    
714    
715    
716 gpertea 2 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
717 gpertea 16 GSeg(0,0), exons(true,true,false), children(1,false) {
718     xstart=0;
719     xend=0;
720     xstatus=0;
721     partial=false;
722     isCDS=false;
723     uptr=NULL;
724     ulink=NULL;
725     parent=NULL;
726     udata=0;
727     flags=0;
728     CDstart=0;
729     CDend=0;
730     CDphase=0;
731     geneID=NULL;
732     gene_name=NULL;
733     attrs=NULL;
734     gffID=NULL;
735     track_id=-1;
736     gseq_id=-1;
737     ftype_id=-1;
738     exon_ftype_id=-1;
739     strand='.';
740     if (gfrd==NULL)
741 gpertea 2 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
742 gpertea 16 gffnames_ref(names);
743     if (gfrd->names==NULL) gfrd->names=names;
744     //qlen=0;qstart=0;qend=0;
745     gscore=0;
746     uscore=0;
747     covlen=0;
748     qcov=0;
749     start=gffline->fstart;
750     end=gffline->fend;
751     gseq_id=names->gseqs.addName(gffline->gseqname);
752     track_id=names->tracks.addName(gffline->track);
753     strand=gffline->strand;
754     qlen=gffline->qlen;
755     qstart=gffline->qstart;
756     qend=gffline->qend;
757     //setup flags from gffline
758     isCDS=gffline->is_cds; //for now
759     isGene(gffline->is_gene);
760     isTranscript(gffline->is_transcript || gffline->exontype!=0);
761     fromGff3(gffline->is_gff3);
762    
763     if (gffline->parents!=NULL) {
764     //GTF style -- create a GffObj directly by subfeature
765     //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
766     if (gffline->exontype!=0) { //recognized exon-like feature
767     ftype_id=gff_fid_transcript; //so this is some sort of transcript
768     exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
769     }
770     else {//unrecognized subfeatures
771     //make this GffObj of the same feature type
772     ftype_id=names->feats.addName(gffline->ftype);
773     }
774     if (gffline->ID==NULL) { //typical GTF
775     gffID=Gstrdup(gffline->parents[0]);
776     this->createdByExon(true);
777     //this is likely the first exon/segment of the feature
778     addExon(gfrd, gffline, keepAttr, noExonAttr);
779 gpertea 2 }
780 gpertea 16 else { //a parented feature with an ID -- probably an orphan GFF3 line
781     if (gffline->is_gff3 && gffline->exontype!=0) {
782     //premature exon given before its parent transcript
783     //create the transcript entry here
784     gffID=Gstrdup(gffline->parents[0]);
785     this->createdByExon(true);
786     //this is the first exon/segment of the transcript
787     addExon(gfrd, gffline, keepAttr, noExonAttr);
788     }
789     else { //unrecognized non-exon feature ? use the ID instead
790     gffID=Gstrdup(gffline->ID);
791     if (keepAttr) this->parseAttrs(attrs, gffline->info);
792     }
793 gpertea 2 }
794 gpertea 16 } //subfeature given directly
795     else { //gffline->parents==NULL
796     //create a parent feature in its own right
797 gpertea 2 gscore=gffline->score;
798     if (gffline->ID==NULL || gffline->ID[0]==0)
799 gpertea 16 GError("Error: no ID found for GFF record start\n");
800     gffID=Gstrdup(gffline->ID); //there must be an ID here
801     //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
802     //else
803     ftype_id=names->feats.addName(gffline->ftype);
804     if (gffline->is_transcript)
805     exon_ftype_id=gff_fid_exon;
806    
807     if (keepAttr) this->parseAttrs(attrs, gffline->info);
808     }//no parent
809    
810     if (gffline->gene_name!=NULL) {
811     gene_name=Gstrdup(gffline->gene_name);
812     }
813     if (gffline->gene_id!=NULL) {
814     geneID=Gstrdup(gffline->gene_id);
815     }
816    
817     GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
818     uptr=gsd;
819     if (start<gsd->mincoord) gsd->mincoord=start;
820     if (end>gsd->maxcoord) gsd->maxcoord=end;
821     if (this->len()>gsd->maxfeat_len) {
822     gsd->maxfeat_len=this->len();
823     gsd->maxfeat=this;
824 gpertea 2 }
825     }
826    
827     GffLine* GffReader::nextGffLine() {
828     if (gffline!=NULL) return gffline; //caller should free gffline after processing
829     while (gffline==NULL) {
830     int llen=0;
831     buflen=GFF_LINELEN-1;
832     char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
833     if (l==NULL) {
834     return NULL; //end of file
835     }
836     int ns=0; //first nonspace position
837     while (l[ns]!=0 && isspace(l[ns])) ns++;
838     if (l[ns]=='#' || llen<10) continue;
839     gffline=new GffLine(this, l);
840     if (gffline->skip) {
841     delete gffline;
842     gffline=NULL;
843 gpertea 16 continue;
844 gpertea 2 }
845 gpertea 16 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
846     //this might not be needed, already checked in the GffLine constructor
847     if (gff_warns)
848     GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
849     delete gffline;
850     gffline=NULL;
851     //continue;
852     }
853 gpertea 2 }
854     return gffline;
855     }
856    
857     char* GffReader::gfoBuildId(const char* id, const char* ctg) {
858     //caller must free the returned pointer
859     char* buf=NULL;
860     int idlen=strlen(id);
861     GMALLOC(buf, idlen+strlen(ctg)+2);
862     strcpy(buf, id);
863     buf[idlen]='~';
864     strcpy(buf+idlen+1, ctg);
865     return buf;
866     }
867    
868     void GffReader::gfoRemove(const char* id, const char* ctg) {
869     char* buf=gfoBuildId(id,ctg);
870     phash.Remove(buf);
871     GFREE(buf);
872     }
873    
874 gpertea 16 //Warning: if gflst gets altered, idx becomes obsolete
875     GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
876 gpertea 2 char* buf=gfoBuildId(id,ctg);
877 gpertea 16 GfoHolder* r=new GfoHolder(gfo,idx);
878     phash.Add(buf, r);
879 gpertea 2 GFREE(buf);
880 gpertea 16 return r;
881 gpertea 2 }
882 gpertea 16
883     GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
884 gpertea 2 char* buf=gfoBuildId(id,ctg);
885 gpertea 16 GfoHolder* r=phash.Find(buf);
886 gpertea 2 GFREE(buf);
887     return r;
888     }
889    
890 gpertea 16 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
891     GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
892     GfoHolder* r=NULL;
893     if (replaceidx>=0) {
894     gflst.Put(replaceidx,newgfo);
895     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
896     }
897     else {
898     int gfoidx=gflst.Add(newgfo);
899     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
900     }
901     if (gff_warns) {
902     int* pcount=tids.Find(newgfo->gffID);
903     if (pcount!=NULL) {
904     if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
905     (*pcount)++;
906     }
907     else {
908     tids.Add(newgfo->gffID,new int(1));
909     }
910     }
911     return r;
912     }
913 gpertea 2
914 gpertea 16 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
915     //assert(parent);
916     //assert(newgfo);
917     parent->children.Add(newgfh->gffobj);
918     if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
919     newgfh->gffobj->setLevel(parent->getLevel()+1);
920     if (parent->isGene()) {
921     if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
922     newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
923     if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
924     newgfh->gffobj->geneID=Gstrdup(parent->geneID);
925     }
926    
927     return newgfh;
928 gpertea 2 }
929    
930 gpertea 16 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
931     GffObj* parent, GffExon* pexon) {
932     GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
933     GfoHolder* r=NULL;
934     int gfoidx=gflst.Add(newgfo);
935     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
936     if (parent!=NULL) {
937     updateParent(r, parent);
938     if (pexon!=NULL) parent->removeExon(pexon);
939     }
940     if (gff_warns) {
941     int* pcount=tids.Find(newgfo->gffID);
942     if (pcount!=NULL) {
943     if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
944     (*pcount)++;
945     }
946     else {
947     tids.Add(newgfo->gffID,new int(1));
948     }
949     }
950     return r;
951     }
952 gpertea 2
953 gpertea 16 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
954     bool keepAttr) {
955     if (prevgfo==NULL) return NULL;
956     prevgfo->gffobj->createdByExon(false);
957     prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
958     prevgfo->gffobj->start=gffline->fstart;
959     prevgfo->gffobj->end=gffline->fend;
960     prevgfo->gffobj->isGene(gffline->is_gene);
961     prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
962     prevgfo->gffobj->fromGff3(gffline->is_gff3);
963     if (keepAttr) {
964     if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
965     prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
966     }
967     return prevgfo;
968     }
969    
970    
971     bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
972     bool r=true;
973     if (gffline->strand!=prevgfo->gffobj->strand) {
974     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
975     prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
976     r=false;
977     }
978     int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
979     ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
980     0 );
981     if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
982     GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
983     //validation_errors = true;
984     r=false;
985     if (!gff_warns) exit(1);
986     }
987     int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
988     if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
989     subfPoolAdd(pex, prevgfo);
990     return r;
991     }
992    
993     CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
994     CNonExon* subp=NULL;
995     subp_name=NULL;
996     for (int i=0;i<gffline->num_parents;i++) {
997     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
998     continue;
999     subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1000     subp=pex.Find(subp_name);
1001     if (subp!=NULL)
1002     return subp;
1003     GFREE(subp_name);
1004     }
1005     return NULL;
1006     }
1007    
1008     void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1009     //this might become a parent feature later
1010     if (newgfo->gffobj->exons.Count()>0) {
1011     char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1012     pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1013     newgfo->gffobj->exons[0], gffline));
1014     GFREE(xbuf);
1015     }
1016     }
1017    
1018     GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1019     bool keepAttr, bool noExonAttr) {
1020     GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1021     if (prevp!=gflst[subp->idx])
1022     GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1023     subp->gffline->discardParent();
1024     GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1025     pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1026     prevp->promotedChildren(true);
1027     return gfoh; //returns the holder of newly promoted feature
1028     }
1029    
1030     //have to parse the whole file because exons can be scattered all over
1031 gpertea 2 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1032 gpertea 16 bool validation_errors = false;
1033     //loc_debug=false;
1034     GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1035     //and thus could become promoted to parent features
1036 gpertea 2 while (nextGffLine()!=NULL) {
1037 gpertea 16 //seen this gff ID before?
1038     GfoHolder* prevseen=NULL;
1039     if (gffline->ID) //GFF3
1040     prevseen=gfoFind(gffline->ID, gffline->gseqname);
1041     if (prevseen!=NULL) {
1042     if (prevseen->gffobj->createdByExon()) {
1043     updateGffRec(prevseen, gffline, keepAttr);
1044     }
1045     else {
1046     GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1047     validation_errors = true;
1048     if (gff_warns) {
1049     delete gffline; gffline=NULL; continue;
1050     }
1051     else exit(1);
1052     }
1053 gpertea 2 }
1054 gpertea 16 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1055     if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1056 gpertea 2 }
1057 gpertea 16 else { //--- it's a parented feature (could still be a mRNA)
1058     bool found_parent=false;
1059     GfoHolder* newgfo=prevseen;
1060     for (int i=0;i<gffline->num_parents;i++) {
1061     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1062     continue; //skipping discarded parent feature
1063     GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1064     if (parentgfo!=NULL) { //parent GffObj parsed earlier
1065     found_parent=true;
1066     if (parentgfo->gffobj->isGene() && gffline->is_transcript
1067     && gffline->exontype==0) {
1068     //not an exon, but a transcript parented by a gene
1069     if (newgfo) {
1070     updateParent(newgfo, parentgfo->gffobj);
1071     }
1072     else {
1073     newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1074     }
1075     }
1076     else { //potential exon subfeature
1077     if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1078     validation_errors=true;
1079     }
1080 gpertea 2 }
1081 gpertea 16 } //for each parsed parent Id
1082     if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1083     //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1084     //check if this feature isn't parented by a previously stored "exon" subfeature
1085     char* subp_name=NULL;
1086     CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1087     if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1088     //promote that subfeature to a full GffObj
1089     GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1090     //add current gffline as an exon of the newly promoted subfeature
1091     if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1092     validation_errors=true;
1093     }
1094     else { //no parent seen before, create one directly with this exon
1095     //loc_debug=true;
1096     GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1097     if (gffline->ID!=NULL && gffline->exontype==0)
1098     subfPoolAdd(pex, newgfo);
1099     //even those with errors will be added here!
1100     }
1101     GFREE(subp_name);
1102     } //no previous parent found
1103     } //parented feature
1104     //--
1105     delete gffline;
1106     gffline=NULL;
1107     }//while gff lines
1108     gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1109 gpertea 2 // all gff records are now loaded in GList gflst
1110     // so we can free the hash
1111     phash.Clear();
1112 gpertea 16 tids.Clear();
1113     if (validation_errors) {
1114     exit(1);
1115     }
1116 gpertea 2 }
1117    
1118 gpertea 16 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1119     //merge
1120     //always merge adjacent or overlapping segments
1121     //but if mergeCloseExons then merge even when distance is up to 5 bases
1122 gpertea 2 udata=0;
1123     uptr=NULL;
1124 gpertea 16 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1125     isDiscarded(true);
1126     }
1127     if (ftype_id==gff_fid_transcript && CDstart>0) {
1128     ftype_id=gff_fid_mRNA;
1129     //exon_ftype_id=gff_fid_exon;
1130     }
1131     //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1132     if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1133 gpertea 2 int mindist=mergeCloseExons ? 5:1;
1134     for (int i=0;i<exons.Count()-1;i++) {
1135     int ni=i+1;
1136     uint mend=exons[i]->end;
1137     while (ni<exons.Count()) {
1138     int dist=(int)(exons[ni]->start-mend);
1139 gpertea 16 if (dist>mindist) break; //no merging with next segment
1140     if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1141     GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1142     gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1143     }
1144 gpertea 2 mend=exons[ni]->end;
1145 gpertea 16 covlen-=exons[i]->len();
1146 gpertea 2 exons[i]->end=mend;
1147 gpertea 16 covlen+=exons[i]->len();
1148     covlen-=exons[ni]->len();
1149 gpertea 2 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1150     exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1151     //use the other exon attributes, if more
1152     delete(exons[i]->attrs);
1153     exons[i]->attrs=exons[ni]->attrs;
1154     exons[ni]->attrs=NULL;
1155     }
1156     exons.Delete(ni);
1157     } //check for merge with next exon
1158     } //for each exon
1159     }
1160 gpertea 16 //attribute reduction for GTF records
1161     if (keepAttrs && !noExonAttr && !fromGff3()
1162     && exons.Count()>0 && exons[0]->attrs!=NULL) {
1163     bool attrs_discarded=false;
1164     for (int a=0;a<exons[0]->attrs->Count();a++) {
1165     int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1166     char* attr_name=names->attrs.getName(attr_name_id);
1167     char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1168     bool sameExonAttr=true;
1169     for (int i=1;i<exons.Count();i++) {
1170     char* ov=exons[i]->getAttr(attr_name_id);
1171     if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1172     sameExonAttr=false;
1173     break;
1174     }
1175     }
1176     if (sameExonAttr) {
1177     //delete this attribute from exons level
1178     attrs_discarded=true;
1179     this->addAttr(attr_name, attr_val);
1180     for (int i=1;i<exons.Count();i++) {
1181     removeExonAttr(*(exons[i]), attr_name_id);
1182     }
1183     exons[0]->attrs->freeItem(a);
1184     }
1185     }
1186     if (attrs_discarded) exons[0]->attrs->Pack();
1187     }
1188 gpertea 2 return this;
1189     }
1190    
1191 gpertea 16 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1192 gpertea 2 if (names==NULL)
1193     GError(ERR_NULL_GFNAMES, "parseAttrs()");
1194     if (atrlist==NULL)
1195     atrlist=new GffAttrs();
1196     char* endinfo=info+strlen(info);
1197     char* start=info;
1198     char* pch=start;
1199     while (start<endinfo) {
1200     //skip spaces
1201     while (*start==' ' && start<endinfo) start++;
1202     pch=strchr(start, ';');
1203     if (pch==NULL) pch=endinfo;
1204     else {
1205     *pch='\0';
1206     pch++;
1207     }
1208     char* ech=strchr(start,'=');
1209     if (ech!=NULL) { // attr=value format found
1210     *ech='\0';
1211 gpertea 16 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1212     if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1213     strcmp(start, "exon_id")==0)
1214     { start=pch; continue; }
1215 gpertea 2 ech++;
1216     while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1217 gpertea 16 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1218     //make sure we don't add the same attribute more than once
1219     if (isExon && (strcmp(start, "protein_id")==0)) {
1220     //Ensembl special case
1221     this->addAttr(start, ech);
1222     start=pch;
1223     continue;
1224     }
1225     atrlist->add_or_update(names, start, ech);
1226 gpertea 2 }
1227     /*
1228     else { //not an attr=value format
1229     atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1230     }
1231     */
1232     start=pch;
1233     }
1234     if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1235     }
1236    
1237 gpertea 16 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1238 gpertea 2 if (this->attrs==NULL)
1239     this->attrs=new GffAttrs();
1240 gpertea 16 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1241     this->attrs->add_or_update(names, attrname, attrvalue);
1242 gpertea 2 }
1243    
1244 gpertea 16
1245     void GffObj::setFeatureName(const char* feature) {
1246     //change the feature name/type for a transcript
1247     int fid=names->feats.addName(feature);
1248     if (monoFeature() && exons.Count()>0)
1249     this->exon_ftype_id=fid;
1250     this->ftype_id=fid;
1251     }
1252    
1253     void GffObj::setRefName(const char* newname) {
1254     //change the feature name/type for a transcript
1255     int rid=names->gseqs.addName(newname);
1256     this->gseq_id=rid;
1257     }
1258    
1259    
1260    
1261     int GffObj::removeAttr(const char* attrname, const char* attrval) {
1262     if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1263     int aid=this->names->attrs.getId(attrname);
1264     if (aid<0) return 0;
1265     int delcount=0; //could be more than one ?
1266     for (int i=0;i<this->attrs->Count();i++) {
1267     if (aid==this->attrs->Get(i)->attr_id) {
1268     if (attrval==NULL ||
1269     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1270     delcount++;
1271     this->attrs->freeItem(i);
1272     }
1273     }
1274     }
1275     if (delcount>0) this->attrs->Pack();
1276     return delcount;
1277     }
1278    
1279     int GffObj::removeAttr(int aid, const char* attrval) {
1280     if (this->attrs==NULL || aid<0) return 0;
1281     int delcount=0; //could be more than one ?
1282     for (int i=0;i<this->attrs->Count();i++) {
1283     if (aid==this->attrs->Get(i)->attr_id) {
1284     if (attrval==NULL ||
1285     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1286     delcount++;
1287     this->attrs->freeItem(i);
1288     }
1289     }
1290     }
1291     if (delcount>0) this->attrs->Pack();
1292     return delcount;
1293     }
1294    
1295    
1296     int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1297     if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1298     int aid=this->names->attrs.getId(attrname);
1299     if (aid<0) return 0;
1300     int delcount=0; //could be more than one
1301     for (int i=0;i<exon.attrs->Count();i++) {
1302     if (aid==exon.attrs->Get(i)->attr_id) {
1303     if (attrval==NULL ||
1304     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1305     delcount++;
1306     exon.attrs->freeItem(i);
1307     }
1308     }
1309     }
1310     if (delcount>0) exon.attrs->Pack();
1311     return delcount;
1312     }
1313    
1314     int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1315     if (exon.attrs==NULL || aid<0) return 0;
1316     int delcount=0; //could be more than one
1317     for (int i=0;i<exon.attrs->Count();i++) {
1318     if (aid==exon.attrs->Get(i)->attr_id) {
1319     if (attrval==NULL ||
1320     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1321     delcount++;
1322     exon.attrs->freeItem(i);
1323     }
1324     }
1325     }
1326     if (delcount>0) exon.attrs->Pack();
1327     return delcount;
1328     }
1329    
1330    
1331 gpertea 2 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1332     cds_start=0;
1333     cds_end=0;
1334     if (CDstart==0 || CDend==0) return; //no CDS info
1335     int cdsadj=0;
1336     if (CDphase=='1' || CDphase=='2') {
1337     cdsadj=CDphase-'0';
1338     }
1339     cds_start=CDstart;
1340     cds_end=CDend;
1341     if (strand=='-') cds_end-=cdsadj;
1342     else cds_start+=cdsadj;
1343     }
1344    
1345     void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1346     //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1347     cds_mstart=0;
1348     cds_mend=0;
1349     if (CDstart==0 || CDend==0) return; //no CDS info
1350     //restore normal coordinates, just in case
1351     unxcoord();
1352     int cdsadj=0;
1353     if (CDphase=='1' || CDphase=='2') {
1354     cdsadj=CDphase-'0';
1355     }
1356     /*
1357     uint seqstart=CDstart;
1358     uint seqend=CDend;
1359     */
1360     uint seqstart=exons.First()->start;
1361     uint seqend=exons.Last()->end;
1362     int s=0; //resulting nucleotide counter
1363     if (strand=='-') {
1364     for (int x=exons.Count()-1;x>=0;x--) {
1365     uint sgstart=exons[x]->start;
1366     uint sgend=exons[x]->end;
1367     if (seqend<sgstart || seqstart>sgend) continue;
1368     if (seqstart>=sgstart && seqstart<=sgend)
1369     sgstart=seqstart; //seqstart within this segment
1370     if (seqend>=sgstart && seqend<=sgend)
1371     sgend=seqend; //seqend within this segment
1372     s+=(int)(sgend-sgstart)+1;
1373     if (CDstart>=sgstart && CDstart<=sgend) {
1374     //CDstart in this segment
1375     //and we are getting the whole transcript
1376     cds_mend=s-(int)(CDstart-sgstart);
1377     }
1378     if (CDend>=sgstart && CDend<=sgend) {
1379     //CDstart in this segment
1380     //and we are getting the whole transcript
1381     cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1382     }
1383     } //for each exon
1384     } // - strand
1385     else { // + strand
1386     for (int x=0;x<exons.Count();x++) {
1387     uint sgstart=exons[x]->start;
1388     uint sgend=exons[x]->end;
1389     if (seqend<sgstart || seqstart>sgend) continue;
1390     if (seqstart>=sgstart && seqstart<=sgend)
1391     sgstart=seqstart; //seqstart within this segment
1392     if (seqend>=sgstart && seqend<=sgend)
1393     sgend=seqend; //seqend within this segment
1394     s+=(int)(sgend-sgstart)+1;
1395     /* for (uint i=sgstart;i<=sgend;i++) {
1396     spliced[s]=gsubseq[i-gstart];
1397     s++;
1398     }//for each nt
1399     */
1400     if (CDstart>=sgstart && CDstart<=sgend) {
1401     //CDstart in this segment
1402     cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1403     }
1404     if (CDend>=sgstart && CDend<=sgend) {
1405     //CDend in this segment
1406     cds_mend=s-(int)(sgend-CDend);
1407     }
1408     } //for each exon
1409     } // + strand
1410     //spliced[s]=0;
1411     //if (rlen!=NULL) *rlen=s;
1412     //return spliced;
1413     }
1414    
1415 gpertea 16 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1416     {
1417     if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1418     return NULL;
1419     }
1420     //restore normal coordinates:
1421     unxcoord();
1422     if (exons.Count()==0) return NULL;
1423     int fspan=end-start+1;
1424     const char* gsubseq=faseq->subseq(start, fspan);
1425     if (gsubseq==NULL) {
1426     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1427     }
1428     char* unspliced=NULL;
1429    
1430     int seqstart=exons.First()->start;
1431     int seqend=exons.Last()->end;
1432    
1433     int unsplicedlen = 0;
1434    
1435     unsplicedlen += seqend - seqstart + 1;
1436    
1437     GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1438     //uint seqstart, seqend;
1439    
1440     int s = 0; //resulting nucleotide counter
1441     if (strand=='-')
1442     {
1443     if (seglst!=NULL)
1444     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1445     for (int i=seqend;i>=seqstart;i--)
1446     {
1447     unspliced[s] = ntComplement(gsubseq[i-start]);
1448     s++;
1449     }//for each nt
1450     } // - strand
1451     else
1452     { // + strand
1453     if (seglst!=NULL)
1454     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1455     for (int i=seqstart;i<=seqend;i++)
1456     {
1457     unspliced[s]=gsubseq[i-start];
1458     s++;
1459     }//for each nt
1460     } // + strand
1461     //assert(s <= unsplicedlen);
1462     unspliced[s]=0;
1463     if (rlen!=NULL) *rlen=s;
1464     return unspliced;
1465     }
1466    
1467 gpertea 2 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1468     GList<GSeg>* seglst) {
1469     if (CDSonly && CDstart==0) return NULL;
1470     if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1471     return NULL;
1472     }
1473     //restore normal coordinates:
1474     unxcoord();
1475     if (exons.Count()==0) return NULL;
1476     int fspan=end-start+1;
1477     const char* gsubseq=faseq->subseq(start, fspan);
1478     if (gsubseq==NULL) {
1479     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1480     }
1481 gpertea 16 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1482     int endadj=end-start+1-fspan;
1483     uint prevend=end;
1484     end-=endadj;
1485     if (CDend>end) CDend=end;
1486     if (exons.Last()->end>end) {
1487     exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1488     if (exons.Last()->start>exons.Last()->end) {
1489     GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1490     prevend,getGSeqName(), getID());
1491     }
1492     covlen-=endadj;
1493     }
1494     }
1495 gpertea 2 char* spliced=NULL;
1496     GMALLOC(spliced, covlen+1); //allocate more here
1497     uint seqstart, seqend;
1498     int cdsadj=0;
1499     if (CDphase=='1' || CDphase=='2') {
1500     cdsadj=CDphase-'0';
1501     }
1502     if (CDSonly) {
1503     seqstart=CDstart;
1504     seqend=CDend;
1505     if (strand=='-') seqend-=cdsadj;
1506     else seqstart+=cdsadj;
1507     }
1508     else {
1509     seqstart=exons.First()->start;
1510     seqend=exons.Last()->end;
1511     }
1512     int s=0; //resulting nucleotide counter
1513     if (strand=='-') {
1514     for (int x=exons.Count()-1;x>=0;x--) {
1515     uint sgstart=exons[x]->start;
1516     uint sgend=exons[x]->end;
1517     if (seqend<sgstart || seqstart>sgend) continue;
1518     if (seqstart>=sgstart && seqstart<=sgend)
1519     sgstart=seqstart; //seqstart within this segment
1520     if (seqend>=sgstart && seqend<=sgend)
1521     sgend=seqend; //seqend within this segment
1522     if (seglst!=NULL)
1523     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1524     for (uint i=sgend;i>=sgstart;i--) {
1525     spliced[s] = ntComplement(gsubseq[i-start]);
1526     s++;
1527     }//for each nt
1528    
1529     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1530     if (CDstart>=sgstart && CDstart<=sgend) {
1531     //CDstart in this segment
1532     //and we are getting the whole transcript
1533     *cds_end=s-(CDstart-sgstart);
1534     }
1535     if (CDend>=sgstart && CDend<=sgend) {
1536     //CDstart in this segment
1537     //and we are getting the whole transcript
1538     *cds_start=s-(CDend-cdsadj-sgstart);
1539     }
1540     }//update local CDS coordinates
1541     } //for each exon
1542     } // - strand
1543     else { // + strand
1544     for (int x=0;x<exons.Count();x++) {
1545     uint sgstart=exons[x]->start;
1546     uint sgend=exons[x]->end;
1547     if (seqend<sgstart || seqstart>sgend) continue;
1548     if (seqstart>=sgstart && seqstart<=sgend)
1549     sgstart=seqstart; //seqstart within this segment
1550     if (seqend>=sgstart && seqend<=sgend)
1551     sgend=seqend; //seqend within this segment
1552     if (seglst!=NULL)
1553     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1554     for (uint i=sgstart;i<=sgend;i++) {
1555     spliced[s]=gsubseq[i-start];
1556     s++;
1557     }//for each nt
1558     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1559     if (CDstart>=sgstart && CDstart<=sgend) {
1560     //CDstart in this segment
1561     //and we are getting the whole transcript
1562     *cds_start=s-(sgend-CDstart-cdsadj);
1563     }
1564     if (CDend>=sgstart && CDend<=sgend) {
1565     //CDstart in this segment
1566     //and we are getting the whole transcript
1567     *cds_end=s-(sgend-CDend);
1568     }
1569     }//update local CDS coordinates
1570     } //for each exon
1571     } // + strand
1572     spliced[s]=0;
1573     if (rlen!=NULL) *rlen=s;
1574     return spliced;
1575     }
1576    
1577     char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1578     if (CDSonly && CDstart==0) return NULL;
1579     //restore normal coordinates:
1580     unxcoord();
1581     if (exons.Count()==0) return NULL;
1582     int fspan=end-start+1;
1583     const char* gsubseq=faseq->subseq(start, fspan);
1584     if (gsubseq==NULL) {
1585     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1586     }
1587    
1588     char* translation=NULL;
1589     GMALLOC(translation, (int)(covlen/3)+1);
1590     uint seqstart, seqend;
1591     int cdsadj=0;
1592     if (CDphase=='1' || CDphase=='2') {
1593     cdsadj=CDphase-'0';
1594     }
1595     if (CDSonly) {
1596     seqstart=CDstart;
1597     seqend=CDend;
1598     if (strand=='-') seqend-=cdsadj;
1599     else seqstart+=cdsadj;
1600     }
1601     else {
1602     seqstart=exons.First()->start;
1603     seqend=exons.Last()->end;
1604     }
1605     Codon codon;
1606     int nt=0; //codon nucleotide counter (0..2)
1607     int aa=0; //aminoacid count
1608     if (strand=='-') {
1609     for (int x=exons.Count()-1;x>=0;x--) {
1610     uint sgstart=exons[x]->start;
1611     uint sgend=exons[x]->end;
1612     if (seqend<sgstart || seqstart>sgend) continue;
1613     if (seqstart>=sgstart && seqstart<=sgend)
1614     sgstart=seqstart; //seqstart within this segment
1615     if (seqend>=sgstart && seqend<=sgend) {
1616     sgend=seqend; //seqend within this segment
1617     }
1618     for (uint i=sgend;i>=sgstart;i--) {
1619     codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1620     nt++;
1621     if (nt==3) {
1622     nt=0;
1623     translation[aa]=codon.translate();
1624     aa++;
1625     }
1626     }//for each nt
1627     } //for each exon
1628     } // - strand
1629     else { // + strand
1630     for (int x=0;x<exons.Count();x++) {
1631     uint sgstart=exons[x]->start;
1632     uint sgend=exons[x]->end;
1633     if (seqend<sgstart || seqstart>sgend) continue;
1634     if (seqstart>=sgstart && seqstart<=sgend)
1635     sgstart=seqstart; //seqstart within this segment
1636     if (seqend>=sgstart && seqend<=sgend)
1637     sgend=seqend; //seqend within this segment
1638     for (uint i=sgstart;i<=sgend;i++) {
1639     codon.nuc[nt]=gsubseq[i-start];
1640     nt++;
1641     if (nt==3) {
1642     nt=0;
1643     translation[aa]=codon.translate();
1644     aa++;
1645     }
1646     }//for each nt
1647     } //for each exon
1648     } // + strand
1649     translation[aa]=0;
1650     if (rlen!=NULL) *rlen=aa;
1651     return translation;
1652     }
1653    
1654     void GffObj::printSummary(FILE* fout) {
1655     if (fout==NULL) fout=stdout;
1656     fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1657     strand, start, end, gscore, (float)qcov/10.0);
1658     }
1659    
1660 gpertea 16 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1661 gpertea 2 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1662     static char scorestr[14];
1663     strcpy(scorestr,".");
1664     GffAttrs* xattrs=NULL;
1665     if (exidx>=0) {
1666     if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1667     xattrs=exons[exidx]->attrs;
1668     }
1669     if (phase==0 || !iscds) phase='.';
1670     const char* ftype=iscds ? "CDS" : getSubfName();
1671     if (gff3) {
1672     fprintf(fout,
1673     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1674     gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1675     phase, gffID);
1676     if (xattrs!=NULL) {
1677     for (int i=0;i<xattrs->Count();i++)
1678     fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1679     xattrs->Get(i)->attr_val);
1680     }
1681     fprintf(fout, "\n");
1682     } //GFF
1683 gpertea 16 else {//for GTF -- we print only transcripts
1684     //if (isValidTranscript())
1685     fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1686     gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1687     //char* geneid=(geneID!=NULL)? geneID : gffID;
1688     if (geneID)
1689     fprintf(fout," gene_id \"%s\";",geneID);
1690     if (gene_name!=NULL) {
1691     //fprintf(fout, " gene_name ");
1692     //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1693     // else fprintf(fout, "\"%s\";",gene_name);
1694     fprintf(fout," gene_name \"%s\";",gene_name);
1695     }
1696 gpertea 2 if (xattrs!=NULL) {
1697 gpertea 16 for (int i=0;i<xattrs->Count();i++) {
1698     if (xattrs->Get(i)->attr_val==NULL) continue;
1699     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1700     fprintf(fout, " %s ",attrname);
1701     if (xattrs->Get(i)->attr_val[0]=='"')
1702     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1703     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1704     }
1705 gpertea 2 }
1706 gpertea 16 //for GTF, also append the GffObj attributes to each exon line
1707     if ((xattrs=this->attrs)!=NULL) {
1708     for (int i=0;i<xattrs->Count();i++) {
1709     if (xattrs->Get(i)->attr_val==NULL) continue;
1710     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1711     fprintf(fout, " %s ",attrname);
1712     if (xattrs->Get(i)->attr_val[0]=='"')
1713     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1714     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1715     }
1716     }
1717 gpertea 2 fprintf(fout, "\n");
1718     }//GTF
1719     }
1720    
1721 gpertea 16 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1722     const char* tlabel, const char* gfparent) {
1723 gpertea 2 static char tmpstr[255];
1724     if (tlabel==NULL) {
1725     tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1726     (char*)"gffobj" ;
1727     }
1728     unxcoord();
1729 gpertea 16 //if (exons.Count()==0) return;
1730     const char* gseqname=names->gseqs.Get(gseq_id)->name;
1731 gpertea 2 bool gff3 = (gffp>=pgffAny);
1732     bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1733     bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1734     if (gff3) {
1735     //print GFF3 mRNA line:
1736     if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1737     else strcpy(tmpstr,".");
1738     uint pstart, pend;
1739     if (gffp==pgffCDS) {
1740     pstart=CDstart;
1741     pend=CDend;
1742     }
1743     else { pstart=start;pend=end; }
1744 gpertea 16 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1745     const char* ftype=getFeatureName();
1746 gpertea 2 fprintf(fout,
1747     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1748     gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1749 gpertea 16 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1750     if (gfparent!=NULL) {
1751     //parent override
1752     fprintf(fout, ";Parent=%s",gfparent);
1753     }
1754     else {
1755     if (parent!=NULL && !parent->isDiscarded())
1756     fprintf(fout, ";Parent=%s",parent->getID());
1757     }
1758     if (geneID!=NULL)
1759     fprintf(fout, ";geneID=%s",geneID);
1760     if (gene_name!=NULL)
1761     fprintf(fout, ";gene_name=%s",gene_name);
1762 gpertea 2 if (attrs!=NULL) {
1763     for (int i=0;i<attrs->Count();i++) {
1764 gpertea 16 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1765     fprintf(fout,";%s=%s", attrname,
1766 gpertea 2 attrs->Get(i)->attr_val);
1767     }
1768     }
1769     fprintf(fout,"\n");
1770     }// gff3 mRNA line
1771     if (showExon) {
1772     //print exons
1773 gpertea 16 if (isCDS && exons.Count()>0 &&
1774     ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1775     updateExonPhase();
1776    
1777     for (int i=0;i<exons.Count();i++) {
1778     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1779     }
1780     }//printing exons
1781 gpertea 2 if (showCDS && !isCDS && CDstart>0) {
1782     GArray<GffCDSeg> cds(true,true);
1783     getCDSegs(cds);
1784     for (int i=0;i<cds.Count();i++) {
1785     printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1786     }
1787     } //showCDS
1788     }
1789    
1790 gpertea 16 void GffObj::updateExonPhase() {
1791     if (!isCDS) return;
1792     int cdsacc=0;
1793     if (CDphase=='1' || CDphase=='2') {
1794     cdsacc+= 3-(CDphase-'0');
1795     }
1796     if (strand=='-') { //reverse strand
1797     for (int i=exons.Count()-1;i>=0;i--) {
1798     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1799     cdsacc+=exons[i]->end-exons[i]->start+1;
1800     }
1801     }
1802     else { //forward strand
1803     for (int i=0;i<exons.Count();i++) {
1804     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1805     cdsacc+=exons[i]->end-exons[i]->start+1;
1806     }
1807     }
1808     }
1809 gpertea 2
1810 gpertea 16
1811 gpertea 2 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1812     GffCDSeg cdseg;
1813     int cdsacc=0;
1814     if (CDphase=='1' || CDphase=='2') {
1815     cdsacc+= 3-(CDphase-'0');
1816     }
1817     if (strand=='-') {
1818     for (int x=exons.Count()-1;x>=0;x--) {
1819     uint sgstart=exons[x]->start;
1820     uint sgend=exons[x]->end;
1821     if (CDend<sgstart || CDstart>sgend) continue;
1822     if (CDstart>=sgstart && CDstart<=sgend)
1823     sgstart=CDstart; //cdstart within this segment
1824     if (CDend>=sgstart && CDend<=sgend)
1825     sgend=CDend; //cdend within this segment
1826     cdseg.start=sgstart;
1827     cdseg.end=sgend;
1828     cdseg.exonidx=x;
1829     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1830     cdseg.phase='0'+ (3-cdsacc%3)%3;
1831     cdsacc+=sgend-sgstart+1;
1832     cds.Add(cdseg);
1833     } //for each exon
1834     } // - strand
1835     else { // + strand
1836     for (int x=0;x<exons.Count();x++) {
1837     uint sgstart=exons[x]->start;
1838     uint sgend=exons[x]->end;
1839     if (CDend<sgstart || CDstart>sgend) continue;
1840     if (CDstart>=sgstart && CDstart<=sgend)
1841     sgstart=CDstart; //seqstart within this segment
1842     if (CDend>=sgstart && CDend<=sgend)
1843     sgend=CDend; //seqend within this segment
1844     cdseg.start=sgstart;
1845     cdseg.end=sgend;
1846     cdseg.exonidx=x;
1847     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1848     cdseg.phase='0' + (3-cdsacc%3)%3 ;
1849     cdsacc+=sgend-sgstart+1;
1850     cds.Add(cdseg);
1851     } //for each exon
1852     } // + strand
1853     }