ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 115
Committed: Mon Nov 7 21:25:03 2011 UTC (7 years, 11 months ago) by gpertea
File size: 65948 byte(s)
Log Message:
fixed horrendous bug in GList.hh (resizing GVec)

Line User Rev File contents
1 gpertea 2 #include "gff.h"
2    
3     //GffNames* GffReader::names=NULL;
4     GffNames* GffObj::names=NULL;
5     //global set of feature names, attribute names etc.
6     // -- common for all GffObjs in current application!
7    
8 gpertea 16 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9     const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10 gpertea 115 const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11 gpertea 16 bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 gpertea 2 const int gff_fid_mRNA=0;
13 gpertea 16 const int gff_fid_transcript=1;
14     const int gff_fid_exon=2;
15     const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16     const uint gfo_flag_HAS_ERRORS = 0x00000001;
17     const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18     const uint gfo_flag_IS_GENE = 0x00000004;
19     const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20     const uint gfo_flag_FROM_GFF3 = 0x00000010;
21     const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22     const uint gfo_flag_DISCARDED = 0x00000100;
23     const uint gfo_flag_LST_KEEP = 0x00000200;
24     const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25     const byte gfo_flagShift_LEVEL = 16;
26 gpertea 2
27     void gffnames_ref(GffNames* &n) {
28     if (n==NULL) n=new GffNames();
29     n->numrefs++;
30     }
31    
32     void gffnames_unref(GffNames* &n) {
33     if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34     n->numrefs--;
35     if (n->numrefs==0) { delete n; n=NULL; }
36     }
37    
38     int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39    
40     GffObj& g1=*((GffObj*)p1);
41     GffObj& g2=*((GffObj*)p2);
42 gpertea 16 if (g1.gseq_id==g2.gseq_id) {
43     if (g1.start!=g2.start)
44     return (int)(g1.start-g2.start);
45     else if (g1.getLevel()!=g2.getLevel())
46     return (int)(g1.getLevel()-g2.getLevel());
47     else
48     if (g1.end!=g2.end)
49     return (int)(g1.end-g2.end);
50     else return strcmp(g1.getID(), g2.getID());
51     }
52     else return (int)(g1.gseq_id-g2.gseq_id);
53 gpertea 2 }
54    
55 gpertea 16 char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56     //parse a key attribute and remove it from the info string
57     //(only works for attributes that have values following them after ' ' or '=')
58     static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59     int lpre=strlen(pre);
60     char cend=pre[lpre-1];
61     char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62     if (pos==NULL) return NULL;
63     char* findstart=info;
64     //require word boundary on the left:
65     while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66     findstart=pos+lpre;
67     pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68     }
69     if (pos==NULL) return NULL;
70     if (cend!=' ' && cend!='=') {
71     //require word boundary on the right:
72     while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73     findstart=pos+lpre;
74     pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75     }
76     }
77     if (pos==NULL) return NULL;
78     char* vp=pos+lpre;
79     while (*vp==' ') vp++;
80     if (*vp==';' || *vp==0)
81     GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82     bool dq_enclosed=false; //value string enclosed by double quotes
83     if (*vp=='"') {
84     dq_enclosed=true;
85     vp++;
86     }
87     if (enforce_GTF2 && !dq_enclosed)
88     GError(GTF2_ERR,pre, dupline);
89     char* vend=vp;
90     if (dq_enclosed) {
91     while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92     }
93     else {
94     while (*vend!=';' && *vend!=0) vend++;
95     }
96     if (enforce_GTF2 && *vend!='"')
97     GError(GTF2_ERR, pre, dupline);
98     char *r=Gstrdup(vp, vend-1);
99     //-- now remove this attribute from the info string
100     while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101     if (*vend==0) vend--;
102     for (char *src=vend, *dest=pos;;src++,dest++) {
103     *dest=*src;
104     if (*src==0) break;
105     }
106     return r;
107     }
108 gpertea 2
109 gpertea 16 static char fnamelc[128];
110    
111 gpertea 2 GffLine::GffLine(GffReader* reader, const char* l) {
112 gpertea 16 llen=strlen(l);
113     GMALLOC(line,llen+1);
114     memcpy(line, l, llen+1);
115     GMALLOC(dupline, llen+1);
116     memcpy(dupline, l, llen+1);
117 gpertea 2 skip=true;
118     gseqname=NULL;
119     track=NULL;
120     ftype=NULL;
121     info=NULL;
122 gpertea 16 _parents=NULL;
123     _parents_len=0;
124     num_parents=0;
125     parents=NULL;
126     is_gff3=false;
127 gpertea 2 is_cds=false;
128 gpertea 16 is_transcript=false;
129 gpertea 2 is_exon=false;
130 gpertea 16 is_gene=false;
131 gpertea 2 exontype=0;
132 gpertea 16 gene_id=NULL;
133     gene_name=NULL;
134 gpertea 2 qstart=0;
135     qend=0;
136     qlen=0;
137     ID=NULL;
138     char* t[9];
139     int i=0;
140     int tidx=1;
141     t[0]=line;
142    
143     while (line[i]!=0) {
144     if (line[i]=='\t') {
145     line[i]=0;
146     t[tidx]=line+i+1;
147     tidx++;
148     if (tidx>8) break;
149     }
150     i++;
151     }
152    
153     if (tidx<8) { // ignore non-GFF lines
154     // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
155     return;
156     }
157     gseqname=t[0];
158     track=t[1];
159     ftype=t[2];
160     info=t[8];
161     char* p=t[3];
162 gpertea 66 if (!parseUInt(p,fstart)) {
163     //FIXME: chromosome_band entries in Flybase
164     GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
165     return;
166     }
167 gpertea 2 p=t[4];
168 gpertea 66 if (!parseUInt(p,fend)) {
169     GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
170     return;
171     }
172 gpertea 16 if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
173 gpertea 2 p=t[5];
174     if (p[0]=='.' && p[1]==0) {
175     score=0;
176     }
177     else {
178     if (!parseDouble(p,score))
179     GError("Error parsing feature score from GFF line:\n%s\n",l);
180     }
181     strand=*t[6];
182     if (strand!='+' && strand!='-' && strand!='.')
183     GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
184 gpertea 16 phase=*t[7]; // must be '.', '0', '1' or '2'
185 gpertea 2 ID=NULL;
186     // exon/CDS/mrna filter
187 gpertea 16 strncpy(fnamelc, ftype, 127);
188     fnamelc[127]=0;
189 gpertea 2 strlower(fnamelc); //convert to lower case
190 gpertea 16 bool is_t_data=false;
191 gpertea 2 if (strstr(fnamelc, "utr")!=NULL) {
192     exontype=exgffUTR;
193     is_exon=true;
194 gpertea 16 is_t_data=true;
195 gpertea 2 }
196 gpertea 66 else if (endsWith(fnamelc, "exon")) {
197 gpertea 2 exontype=exgffExon;
198     is_exon=true;
199 gpertea 16 is_t_data=true;
200 gpertea 2 }
201 gpertea 16 else if (strstr(fnamelc, "stop") &&
202     (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
203 gpertea 2 exontype=exgffStop;
204 gpertea 16 is_cds=true; //though some place it outside the last CDS segment
205     is_t_data=true;
206     }
207     else if (strstr(fnamelc, "start") &&
208     ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
209     exontype=exgffStart;
210 gpertea 2 is_cds=true;
211 gpertea 16 is_t_data=true;
212 gpertea 2 }
213     else if (strcmp(fnamelc, "cds")==0) {
214     exontype=exgffCDS;
215     is_cds=true;
216 gpertea 16 is_t_data=true;
217 gpertea 2 }
218 gpertea 16 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
219     is_gene=true;
220     is_t_data=true; //because its name will be attached to parented transcripts
221 gpertea 2 }
222 gpertea 16 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
223     is_transcript=true;
224     is_t_data=true;
225     }
226 gpertea 2
227 gpertea 16 if (reader->transcriptsOnly && !is_t_data) {
228     char* id=extractAttr("ID=");
229     if (id==NULL) id=extractAttr("transcript_id");
230     //GMessage("Discarding non-transcript line:\n%s\n",l);
231     if (id!=NULL) {
232     reader->discarded_ids.Add(id, new int(1));
233     GFREE(id);
234     }
235     return; //skip this line, unwanted feature name
236 gpertea 2 }
237 gpertea 16 ID=extractAttr("ID=");
238     char* Parent=extractAttr("Parent=");
239     is_gff3=(ID!=NULL || Parent!=NULL);
240     if (is_gff3) {
241     //parse as GFF3
242     if (ID!=NULL) {
243     //has ID attr so it's likely to be a parent feature
244     //look for explicit gene name
245     gene_name=extractAttr("gene_name=",false);
246     if (gene_name==NULL) {
247     gene_name=extractAttr("geneName=",false);
248     if (gene_name==NULL) {
249     gene_name=extractAttr("gene_sym=",false);
250     if (gene_name==NULL) {
251     gene_name=extractAttr("gene=",false);
252     }
253     }
254     }
255     gene_id=extractAttr("geneID=",false);
256     if (gene_id==NULL) {
257     gene_id=extractAttr("gene_id=",false);
258     }
259     if (is_gene) {
260     //special case: keep the Name and ID attributes of the gene feature
261     if (gene_name==NULL)
262     gene_name=extractAttr("Name=");
263     if (gene_id==NULL) //the ID is also gene_id in this case
264     gene_id=Gstrdup(ID);
265     //skip=false;
266     //return;
267     GFREE(Parent); //TMI, we really don't care about gene Parents?
268     } //gene feature
269     }// has GFF3 ID
270     if (Parent!=NULL) {
271     //keep Parent attr
272     //parse multiple parents
273     num_parents=1;
274     p=Parent;
275     int last_delim_pos=-1;
276     while (*p!=';' && *p!=0) {
277     if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
278     num_parents++;
279     last_delim_pos=(p-Parent);
280     }
281     p++;
282     }
283     _parents_len=p-Parent+1;
284     _parents=Parent;
285     GMALLOC(parents, num_parents*sizeof(char*));
286     parents[0]=_parents;
287     int i=1;
288     if (last_delim_pos>0) {
289     for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
290     if (*p==',') {
291     char* ep=p-1;
292     while (*ep==' ' && ep>_parents) ep--;
293     *(ep+1)=0; //end the string there
294     parents[i]=p+1;
295     i++;
296     }
297     }
298     }
299     } //has Parent field
300     } //GFF3
301     else { // GTF-like expected
302     Parent=extractAttr("transcript_id");
303     if (Parent!=NULL) { //GTF2 format detected
304     if (is_transcript) {
305     // atypical GTF with a parent transcript line declared
306     ID=Parent;
307     Parent=NULL;
308     }
309     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
310     gene_name=extractAttr("gene_name");
311     if (gene_name==NULL) {
312     gene_name=extractAttr("gene_sym");
313     if (gene_name==NULL)
314     gene_name=extractAttr("gene");
315     }
316 gpertea 2 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
317     //after the attribute name
318     p=info;
319     bool noed=true; //not edited after the last delim
320     bool nsp=false; //non-space found after last delim
321     while (*p!=0) {
322 gpertea 16 if (*p==' ') {
323     if (nsp && noed) {
324     *p='=';
325     noed=false;
326     p++;
327     continue;
328     }
329     }
330     else nsp=true; //non-space
331     if (*p==';') { noed=true; nsp=false; }
332     p++;
333     }
334     } //GTF2 detected (no parent line)
335     else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
336     //char* fexon=strstr(fnamelc, "exon");
337     //if (fexon!=NULL) {
338     if (exontype==exgffExon) {
339 gpertea 2 if (startsWith(track,"jigsaw")) {
340 gpertea 16 is_cds=true;
341     strcpy(track,"jigsaw");
342     p=strchr(info,';');
343     if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
344     else { Parent=Gstrdup(info,p-1);
345     info=p+1;
346     }
347     }
348     } //exon feature?
349     if (Parent==NULL && exontype>=exgffCDS &&
350     (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
351     //one word ID ? really desperate attempt to parse it here
352 gpertea 2 Parent=Gstrdup(info,info+i-1);
353 gpertea 16 info=NULL; //discard anything else on the line
354     }
355     }
356     if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
357     _parents=Parent;
358     GMALLOC(parents,sizeof(char*));
359     num_parents=1;
360     parents[0]=_parents;
361     }
362     } //GTF-like
363 gpertea 2
364 gpertea 16 //parse other potentially useful features
365     if (is_gff3) {
366     if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
367     p+=7;
368     while (*p!=';' && *p!=0 && *p!=' ') p++;
369     if (*p!=' ') {
370     GError("Error parsing target coordinates from GFF line:\n%s\n",l);
371     }
372     if (!parseUInt(p,qstart))
373     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
374     if (*p!=' ') {
375     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
376     }
377     p++;
378     if (!parseUInt(p,qend))
379     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
380 gpertea 2 }
381 gpertea 16 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
382     p+=5;
383     if (!parseUInt(p,qstart))
384     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
385     if (*p!='-') {
386     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
387     }
388     p++;
389     if (!parseUInt(p,qend))
390     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
391     if (*p=='|' || *p==':') {
392     p++;
393     if (!parseUInt(p,qlen))
394     GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
395     }
396     }//has Qreg attr
397     if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
398     p+=5;
399     if (!parseUInt(p,qlen))
400     GError("Error parsing target length from GFF Qlen:\n%s\n",l);
401 gpertea 2 }
402 gpertea 16 }//parsing some useful attributes in GFF3 records
403     if (ID==NULL && parents==NULL) {
404     if (reader->gff_warns)
405     GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
406     return; //skip
407 gpertea 2 }
408     skip=false;
409     }
410    
411 gpertea 55
412     void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
413     if (cd_start>=this->start) {
414     this->CDstart=cd_start;
415     if (strand=='+') this->CDphase=phase;
416     }
417     else this->CDstart=this->start;
418     if (cd_end<=this->end) {
419     this->CDend=cd_end;
420     if (strand=='-') this->CDphase=phase;
421     }
422     else this->CDend=this->end;
423     isTranscript(true);
424     exon_ftype_id=gff_fid_exon;
425     if (monoFeature()) {
426     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
427     else exons[0]->exontype=exgffExon;
428     }
429     }
430    
431 gpertea 16 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
432 gpertea 2 //this will make sure we have the right subftype_id!
433 gpertea 55 //int subf_id=-1;
434     if (!isTranscript() && gl->is_cds) {
435 gpertea 50 isTranscript(true);
436     exon_ftype_id=gff_fid_exon;
437     if (exons.Count()==1) exons[0]->exontype=exgffExon;
438     }
439 gpertea 16 if (isTranscript()) {
440     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
441     if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
442     else exon_ftype_id=names->feats.addName(gl->ftype);
443     }
444 gpertea 2 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
445 gpertea 16 if (gl->exontype==0 && !gl->is_transcript) {
446     //extraneous mRNA feature, discard
447     if (reader->gff_warns)
448     GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
449     gl->ftype, gffID);
450 gpertea 2 return -1;
451 gpertea 16 }
452 gpertea 2 }
453 gpertea 16 else { //non-mRNA parent feature, check this subf type
454 gpertea 55 int subf_id=names->feats.addName(gl->ftype);
455 gpertea 16 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
456     exon_ftype_id=subf_id;
457     else {
458     if (exon_ftype_id!=subf_id) {
459 gpertea 50 //
460 gpertea 16 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
461     //the existing exon was just a dummy one created by default, discard it
462     exons.Clear();
463     covlen=0;
464     exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
465     }
466     else { //multiple subfeatures, prefer those with
467     if (reader->gff_warns)
468     GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
469     names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
470     if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
471     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
472     exon_ftype_id=subf_id;
473     exons.Clear();
474     covlen=0;
475     }
476     else { //discard new feature
477     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
478     return -1; //skip this 2nd subfeature type for this parent!
479     }
480     }
481     } //incoming subfeature is of different type
482     } //new subfeature type
483     } //non-mRNA parent
484 gpertea 2 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
485     gl->qstart,gl->qend, gl->is_cds, gl->exontype);
486 gpertea 16 if (eidx<0) return eidx; //this should never happen
487     if (keepAttr) {
488     if (noExonAttr) {
489     if (attrs==NULL) //place the parsed attributes directly at transcript level
490     parseAttrs(attrs, gl->info);
491     }
492     else { //need all exon-level attributes
493     parseAttrs(exons[eidx]->attrs, gl->info, true);
494     }
495 gpertea 2 }
496     return eidx;
497     }
498    
499    
500     int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
501     if (exons.Count()==0) {
502     if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
503 gpertea 16 if (exon_ftype_id<0) {
504     exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
505 gpertea 2 }
506     }
507 gpertea 16 //special treatment of start/stop codon features, they might be broken/split between exons
508     //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
509     //so we should not trust the end coordinate for such features
510     if (exontype==exgffStart || exontype==exgffStop) {
511     if (strand=='-') segstart=segend;
512     else segend=segstart;
513     if (exontype==exgffStart) {
514     if (CDstart==0 || segstart<CDstart) CDstart=segstart;
515     }
516     else {
517     if (segstart>CDend) CDend=segstart;
518     }
519     }
520     else if (iscds) { //update CDS anchors:
521 gpertea 2 if (CDstart==0 || segstart<CDstart) {
522     CDstart=segstart;
523     if (exontype==exgffCDS && strand=='+') CDphase=fr;
524     }
525     if (segend>CDend) {
526     if (exontype==exgffCDS && strand=='-') CDphase=fr;
527     CDend=segend;
528     }
529     }
530 gpertea 16 else { // not a CDS/start/stop
531 gpertea 2 isCDS=false;
532     }
533     if (qs || qe) {
534     if (qs>qe) swap(qs,qe);
535     if (qs==0) qs=1;
536 gpertea 16 }
537 gpertea 2 int ovlen=0;
538 gpertea 16 if (exontype>0) { //check for overlaps between exon-type segments
539     int oi=exonOverlapIdx(segstart, segend, &ovlen);
540     if (oi>=0) { //overlap existing segment
541     if (ovlen==0) {
542     //adjacent segments will be merged
543     //e.g. CDS to (UTR|exon)
544     if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
545     (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
546     expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
547     return oi;
548     }
549     //CDS adjacent to stop_codon: UCSC does (did?) this
550     if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
551     (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
552     expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
553     return oi;
554     }
555     }
556     //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
557     // start_codon within (CDS|exon)
558     if (exons[oi]->exontype>exontype &&
559     exons[oi]->start<=segstart && exons[oi]->end>=segend &&
560     !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
561     //larger segment given first, now the smaller included one is redundant
562     return oi; //only used to store attributes from current GffLine
563     }
564     if (exontype>exons[oi]->exontype &&
565     segstart<=exons[oi]->start && segend>=exons[oi]->end &&
566     !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
567     //smaller segment given first, so we have to enlarge it
568     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
569     //this should also check for overlapping next exon (oi+1) ?
570     return oi;
571     }
572     //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
573     //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
574     //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
575     // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
576     /*
577     if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
578     if (gff_show_warnings)
579     GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
580     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
581     hasErrors(true);
582     return -1; //segment NOT added
583     }
584     */
585 gpertea 2
586 gpertea 16 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
587     if (gff_show_warnings)
588     GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
589     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
590     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
591     return oi;
592     }
593     // else add the segment if the overlap is small and between two CDS segments
594     //TODO: we might want to add an attribute here with the slippage coordinate and size?
595     covlen-=ovlen;
596     }//overlap or adjacent to existing segment
597     } //check for overlap
598 gpertea 2 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
599     // create & add the new segment
600 gpertea 51 /*
601     if (start>0 && exontype==exgffCDS && exons.Count()==0) {
602     //adding a CDS directly as the first subfeature of a declared parent
603     segstart=start;
604     segend=end;
605     }
606     */
607 gpertea 2 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
608     int eidx=exons.Add(enew);
609     if (eidx<0) {
610 gpertea 16 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
611     if (gff_show_warnings)
612     GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
613 gpertea 2 segstart, segend, gffID);
614 gpertea 16 delete enew;
615     hasErrors(true);
616 gpertea 2 return -1;
617     }
618     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
619 gpertea 55 //adjust parent feature coordinates to contain this exon
620     if (start==0 || start>exons.First()->start) {
621 gpertea 51 start=exons.First()->start;
622     }
623 gpertea 55 if (end<exons.Last()->end) end=exons.Last()->end;
624    
625 gpertea 2 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
626     GSeqStat* gsd=(GSeqStat*)uptr;
627     if (start<gsd->mincoord) gsd->mincoord=start;
628     if (end>gsd->maxcoord) gsd->maxcoord=end;
629 gpertea 16 if (this->len()>gsd->maxfeat_len) {
630     gsd->maxfeat_len=this->len();
631     gsd->maxfeat=this;
632     }
633 gpertea 2 }
634     return eidx;
635     }
636    
637 gpertea 16 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
638     //oi is the index of the *first* overlapping segment found that must be enlarged
639     covlen-=exons[oi]->len();
640     if (segstart<exons[oi]->start)
641     exons[oi]->start=segstart;
642     if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
643     if (segend>exons[oi]->end)
644     exons[oi]->end=segend;
645     if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
646     //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
647     if (sc!=0) exons[oi]->score=sc;
648     covlen+=exons[oi]->len();
649     //if (exons[oi]->exontype< exontype) -- always true
650     exons[oi]->exontype = exontype;
651     if (exontype==exgffCDS) exons[oi]->phase=fr;
652     //we must check if any more exons are also overlapping this
653     int ni=oi+1; //next exon index after oi
654     while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
655     //only allow this if next segment is fully included, and a subordinate
656     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
657     /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
658     chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
659     chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
660     chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
661     */
662     if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
663     if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
664     exons.Delete(ni);
665     }
666     else {
667     if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
668     exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
669     //hasErrors(true);
670     break;
671     }
672     }
673     // -- make sure any other related boundaries are updated:
674     start=exons.First()->start;
675     end=exons.Last()->end;
676     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
677     GSeqStat* gsd=(GSeqStat*)uptr;
678     if (start<gsd->mincoord) gsd->mincoord=start;
679     if (end>gsd->maxcoord) gsd->maxcoord=end;
680     if (this->len()>gsd->maxfeat_len) {
681     gsd->maxfeat_len=this->len();
682     gsd->maxfeat=this;
683     }
684     }
685     }
686    
687 gpertea 2 void GffObj::removeExon(int idx) {
688     /*
689     if (idx==0 && segs[0].start==gstart)
690     gstart=segs[1].start;
691     if (idx==segcount && segs[segcount].end==gend)
692     gend=segs[segcount-1].end;
693     */
694     if (idx<0 || idx>=exons.Count()) return;
695     int segstart=exons[idx]->start;
696     int segend=exons[idx]->end;
697     exons.Delete(idx);
698     covlen -= (int)(segend-segstart)+1;
699     start=exons.First()->start;
700     end=exons.Last()->end;
701     if (isCDS) { CDstart=start; CDend=end; }
702     }
703    
704 gpertea 16 void GffObj::removeExon(GffExon* p) {
705     for (int idx=0;idx<exons.Count();idx++) {
706     if (exons[idx]==p) {
707     int segstart=exons[idx]->start;
708     int segend=exons[idx]->end;
709     exons.Delete(idx);
710     covlen -= (int)(segend-segstart)+1;
711     start=exons.First()->start;
712     end=exons.Last()->end;
713     if (isCDS) { CDstart=start; CDend=end; }
714     return;
715     }
716     }
717     }
718    
719    
720    
721 gpertea 2 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
722 gpertea 16 GSeg(0,0), exons(true,true,false), children(1,false) {
723     xstart=0;
724     xend=0;
725     xstatus=0;
726     partial=false;
727     isCDS=false;
728     uptr=NULL;
729     ulink=NULL;
730     parent=NULL;
731     udata=0;
732     flags=0;
733     CDstart=0;
734     CDend=0;
735     CDphase=0;
736     geneID=NULL;
737     gene_name=NULL;
738     attrs=NULL;
739     gffID=NULL;
740     track_id=-1;
741     gseq_id=-1;
742     ftype_id=-1;
743     exon_ftype_id=-1;
744     strand='.';
745     if (gfrd==NULL)
746 gpertea 2 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
747 gpertea 16 gffnames_ref(names);
748     if (gfrd->names==NULL) gfrd->names=names;
749     //qlen=0;qstart=0;qend=0;
750     gscore=0;
751     uscore=0;
752     covlen=0;
753     qcov=0;
754     start=gffline->fstart;
755     end=gffline->fend;
756     gseq_id=names->gseqs.addName(gffline->gseqname);
757     track_id=names->tracks.addName(gffline->track);
758     strand=gffline->strand;
759     qlen=gffline->qlen;
760     qstart=gffline->qstart;
761     qend=gffline->qend;
762     //setup flags from gffline
763     isCDS=gffline->is_cds; //for now
764     isGene(gffline->is_gene);
765     isTranscript(gffline->is_transcript || gffline->exontype!=0);
766     fromGff3(gffline->is_gff3);
767    
768     if (gffline->parents!=NULL) {
769     //GTF style -- create a GffObj directly by subfeature
770     //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
771     if (gffline->exontype!=0) { //recognized exon-like feature
772     ftype_id=gff_fid_transcript; //so this is some sort of transcript
773     exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
774     }
775     else {//unrecognized subfeatures
776     //make this GffObj of the same feature type
777     ftype_id=names->feats.addName(gffline->ftype);
778     }
779     if (gffline->ID==NULL) { //typical GTF
780     gffID=Gstrdup(gffline->parents[0]);
781     this->createdByExon(true);
782     //this is likely the first exon/segment of the feature
783     addExon(gfrd, gffline, keepAttr, noExonAttr);
784 gpertea 2 }
785 gpertea 16 else { //a parented feature with an ID -- probably an orphan GFF3 line
786     if (gffline->is_gff3 && gffline->exontype!=0) {
787     //premature exon given before its parent transcript
788     //create the transcript entry here
789     gffID=Gstrdup(gffline->parents[0]);
790     this->createdByExon(true);
791     //this is the first exon/segment of the transcript
792     addExon(gfrd, gffline, keepAttr, noExonAttr);
793     }
794     else { //unrecognized non-exon feature ? use the ID instead
795     gffID=Gstrdup(gffline->ID);
796     if (keepAttr) this->parseAttrs(attrs, gffline->info);
797     }
798 gpertea 2 }
799 gpertea 16 } //subfeature given directly
800     else { //gffline->parents==NULL
801     //create a parent feature in its own right
802 gpertea 2 gscore=gffline->score;
803     if (gffline->ID==NULL || gffline->ID[0]==0)
804 gpertea 16 GError("Error: no ID found for GFF record start\n");
805     gffID=Gstrdup(gffline->ID); //there must be an ID here
806     //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
807     //else
808     ftype_id=names->feats.addName(gffline->ftype);
809     if (gffline->is_transcript)
810     exon_ftype_id=gff_fid_exon;
811    
812     if (keepAttr) this->parseAttrs(attrs, gffline->info);
813     }//no parent
814    
815     if (gffline->gene_name!=NULL) {
816     gene_name=Gstrdup(gffline->gene_name);
817     }
818     if (gffline->gene_id!=NULL) {
819     geneID=Gstrdup(gffline->gene_id);
820     }
821    
822     GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
823     uptr=gsd;
824     if (start<gsd->mincoord) gsd->mincoord=start;
825     if (end>gsd->maxcoord) gsd->maxcoord=end;
826     if (this->len()>gsd->maxfeat_len) {
827     gsd->maxfeat_len=this->len();
828     gsd->maxfeat=this;
829 gpertea 2 }
830     }
831    
832     GffLine* GffReader::nextGffLine() {
833     if (gffline!=NULL) return gffline; //caller should free gffline after processing
834     while (gffline==NULL) {
835     int llen=0;
836     buflen=GFF_LINELEN-1;
837     char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
838     if (l==NULL) {
839     return NULL; //end of file
840     }
841     int ns=0; //first nonspace position
842     while (l[ns]!=0 && isspace(l[ns])) ns++;
843     if (l[ns]=='#' || llen<10) continue;
844     gffline=new GffLine(this, l);
845     if (gffline->skip) {
846     delete gffline;
847     gffline=NULL;
848 gpertea 16 continue;
849 gpertea 2 }
850 gpertea 16 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
851     //this might not be needed, already checked in the GffLine constructor
852     if (gff_warns)
853     GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
854     delete gffline;
855     gffline=NULL;
856     //continue;
857     }
858 gpertea 2 }
859     return gffline;
860     }
861    
862     char* GffReader::gfoBuildId(const char* id, const char* ctg) {
863     //caller must free the returned pointer
864     char* buf=NULL;
865     int idlen=strlen(id);
866     GMALLOC(buf, idlen+strlen(ctg)+2);
867     strcpy(buf, id);
868     buf[idlen]='~';
869     strcpy(buf+idlen+1, ctg);
870     return buf;
871     }
872    
873     void GffReader::gfoRemove(const char* id, const char* ctg) {
874     char* buf=gfoBuildId(id,ctg);
875     phash.Remove(buf);
876     GFREE(buf);
877     }
878    
879 gpertea 16 //Warning: if gflst gets altered, idx becomes obsolete
880     GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
881 gpertea 2 char* buf=gfoBuildId(id,ctg);
882 gpertea 16 GfoHolder* r=new GfoHolder(gfo,idx);
883     phash.Add(buf, r);
884 gpertea 2 GFREE(buf);
885 gpertea 16 return r;
886 gpertea 2 }
887 gpertea 16
888     GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
889 gpertea 2 char* buf=gfoBuildId(id,ctg);
890 gpertea 16 GfoHolder* r=phash.Find(buf);
891 gpertea 2 GFREE(buf);
892     return r;
893     }
894    
895 gpertea 16 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
896     GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
897     GfoHolder* r=NULL;
898     if (replaceidx>=0) {
899     gflst.Put(replaceidx,newgfo);
900     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
901     }
902     else {
903     int gfoidx=gflst.Add(newgfo);
904     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
905     }
906     if (gff_warns) {
907     int* pcount=tids.Find(newgfo->gffID);
908     if (pcount!=NULL) {
909     if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
910     (*pcount)++;
911     }
912     else {
913     tids.Add(newgfo->gffID,new int(1));
914     }
915     }
916     return r;
917     }
918 gpertea 2
919 gpertea 16 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
920     //assert(parent);
921     //assert(newgfo);
922     parent->children.Add(newgfh->gffobj);
923     if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
924     newgfh->gffobj->setLevel(parent->getLevel()+1);
925     if (parent->isGene()) {
926     if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
927     newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
928     if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
929     newgfh->gffobj->geneID=Gstrdup(parent->geneID);
930     }
931    
932     return newgfh;
933 gpertea 2 }
934    
935 gpertea 16 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
936     GffObj* parent, GffExon* pexon) {
937     GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
938     GfoHolder* r=NULL;
939     int gfoidx=gflst.Add(newgfo);
940     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
941     if (parent!=NULL) {
942     updateParent(r, parent);
943     if (pexon!=NULL) parent->removeExon(pexon);
944     }
945     if (gff_warns) {
946     int* pcount=tids.Find(newgfo->gffID);
947     if (pcount!=NULL) {
948     if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
949     (*pcount)++;
950     }
951     else {
952     tids.Add(newgfo->gffID,new int(1));
953     }
954     }
955     return r;
956     }
957 gpertea 2
958 gpertea 16 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
959     bool keepAttr) {
960     if (prevgfo==NULL) return NULL;
961     prevgfo->gffobj->createdByExon(false);
962     prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
963     prevgfo->gffobj->start=gffline->fstart;
964     prevgfo->gffobj->end=gffline->fend;
965     prevgfo->gffobj->isGene(gffline->is_gene);
966     prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
967     prevgfo->gffobj->fromGff3(gffline->is_gff3);
968     if (keepAttr) {
969     if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
970     prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
971     }
972     return prevgfo;
973     }
974    
975    
976     bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
977     bool r=true;
978     if (gffline->strand!=prevgfo->gffobj->strand) {
979 gpertea 115 //TODO: add support for trans-splicing and even inter-chromosomal fusions
980     if (prevgfo->gffobj->strand=='.') {
981     prevgfo->gffobj->strand=gffline->strand;
982     }
983     else {
984     GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
985     prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
986     gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
987     //r=false;
988     return true; //FIXME: split trans-spliced mRNAs by strand
989     }
990     }
991 gpertea 16 int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
992     ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
993     0 );
994     if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
995     GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
996     //validation_errors = true;
997     r=false;
998     if (!gff_warns) exit(1);
999     }
1000     int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
1001     if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1002     subfPoolAdd(pex, prevgfo);
1003     return r;
1004     }
1005    
1006     CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1007     CNonExon* subp=NULL;
1008     subp_name=NULL;
1009     for (int i=0;i<gffline->num_parents;i++) {
1010     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1011     continue;
1012     subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1013     subp=pex.Find(subp_name);
1014     if (subp!=NULL)
1015     return subp;
1016     GFREE(subp_name);
1017     }
1018     return NULL;
1019     }
1020    
1021     void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1022     //this might become a parent feature later
1023     if (newgfo->gffobj->exons.Count()>0) {
1024     char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1025     pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1026     newgfo->gffobj->exons[0], gffline));
1027     GFREE(xbuf);
1028     }
1029     }
1030    
1031     GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1032     bool keepAttr, bool noExonAttr) {
1033     GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1034     if (prevp!=gflst[subp->idx])
1035     GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1036     subp->gffline->discardParent();
1037     GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1038     pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1039     prevp->promotedChildren(true);
1040     return gfoh; //returns the holder of newly promoted feature
1041     }
1042    
1043     //have to parse the whole file because exons can be scattered all over
1044 gpertea 2 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1045 gpertea 16 bool validation_errors = false;
1046     //loc_debug=false;
1047     GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1048     //and thus could become promoted to parent features
1049 gpertea 2 while (nextGffLine()!=NULL) {
1050 gpertea 16 //seen this gff ID before?
1051     GfoHolder* prevseen=NULL;
1052     if (gffline->ID) //GFF3
1053     prevseen=gfoFind(gffline->ID, gffline->gseqname);
1054     if (prevseen!=NULL) {
1055     if (prevseen->gffobj->createdByExon()) {
1056     updateGffRec(prevseen, gffline, keepAttr);
1057     }
1058     else {
1059     GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1060     validation_errors = true;
1061     if (gff_warns) {
1062     delete gffline; gffline=NULL; continue;
1063     }
1064     else exit(1);
1065     }
1066 gpertea 2 }
1067 gpertea 16 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1068     if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1069 gpertea 2 }
1070 gpertea 16 else { //--- it's a parented feature (could still be a mRNA)
1071     bool found_parent=false;
1072     GfoHolder* newgfo=prevseen;
1073     for (int i=0;i<gffline->num_parents;i++) {
1074     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1075     continue; //skipping discarded parent feature
1076     GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1077     if (parentgfo!=NULL) { //parent GffObj parsed earlier
1078     found_parent=true;
1079     if (parentgfo->gffobj->isGene() && gffline->is_transcript
1080     && gffline->exontype==0) {
1081     //not an exon, but a transcript parented by a gene
1082     if (newgfo) {
1083     updateParent(newgfo, parentgfo->gffobj);
1084     }
1085     else {
1086     newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1087     }
1088     }
1089     else { //potential exon subfeature
1090     if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1091     validation_errors=true;
1092     }
1093 gpertea 2 }
1094 gpertea 16 } //for each parsed parent Id
1095     if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1096     //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1097     //check if this feature isn't parented by a previously stored "exon" subfeature
1098     char* subp_name=NULL;
1099     CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1100     if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1101     //promote that subfeature to a full GffObj
1102     GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1103     //add current gffline as an exon of the newly promoted subfeature
1104     if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1105     validation_errors=true;
1106     }
1107     else { //no parent seen before, create one directly with this exon
1108     //loc_debug=true;
1109     GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1110     if (gffline->ID!=NULL && gffline->exontype==0)
1111     subfPoolAdd(pex, newgfo);
1112     //even those with errors will be added here!
1113     }
1114     GFREE(subp_name);
1115     } //no previous parent found
1116     } //parented feature
1117     //--
1118     delete gffline;
1119     gffline=NULL;
1120     }//while gff lines
1121     gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1122 gpertea 2 // all gff records are now loaded in GList gflst
1123     // so we can free the hash
1124     phash.Clear();
1125 gpertea 16 tids.Clear();
1126     if (validation_errors) {
1127     exit(1);
1128     }
1129 gpertea 2 }
1130    
1131 gpertea 16 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1132     //merge
1133     //always merge adjacent or overlapping segments
1134     //but if mergeCloseExons then merge even when distance is up to 5 bases
1135 gpertea 2 udata=0;
1136     uptr=NULL;
1137 gpertea 16 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1138     isDiscarded(true);
1139     }
1140     if (ftype_id==gff_fid_transcript && CDstart>0) {
1141     ftype_id=gff_fid_mRNA;
1142     //exon_ftype_id=gff_fid_exon;
1143     }
1144     //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1145     if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1146 gpertea 2 int mindist=mergeCloseExons ? 5:1;
1147     for (int i=0;i<exons.Count()-1;i++) {
1148     int ni=i+1;
1149     uint mend=exons[i]->end;
1150     while (ni<exons.Count()) {
1151     int dist=(int)(exons[ni]->start-mend);
1152 gpertea 16 if (dist>mindist) break; //no merging with next segment
1153     if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1154     GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1155     gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1156     }
1157 gpertea 2 mend=exons[ni]->end;
1158 gpertea 16 covlen-=exons[i]->len();
1159 gpertea 2 exons[i]->end=mend;
1160 gpertea 16 covlen+=exons[i]->len();
1161     covlen-=exons[ni]->len();
1162 gpertea 2 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1163     exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1164     //use the other exon attributes, if more
1165     delete(exons[i]->attrs);
1166     exons[i]->attrs=exons[ni]->attrs;
1167     exons[ni]->attrs=NULL;
1168     }
1169     exons.Delete(ni);
1170     } //check for merge with next exon
1171     } //for each exon
1172     }
1173 gpertea 16 //attribute reduction for GTF records
1174     if (keepAttrs && !noExonAttr && !fromGff3()
1175     && exons.Count()>0 && exons[0]->attrs!=NULL) {
1176     bool attrs_discarded=false;
1177     for (int a=0;a<exons[0]->attrs->Count();a++) {
1178     int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1179     char* attr_name=names->attrs.getName(attr_name_id);
1180     char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1181     bool sameExonAttr=true;
1182     for (int i=1;i<exons.Count();i++) {
1183     char* ov=exons[i]->getAttr(attr_name_id);
1184     if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1185     sameExonAttr=false;
1186     break;
1187     }
1188     }
1189     if (sameExonAttr) {
1190     //delete this attribute from exons level
1191     attrs_discarded=true;
1192     this->addAttr(attr_name, attr_val);
1193     for (int i=1;i<exons.Count();i++) {
1194     removeExonAttr(*(exons[i]), attr_name_id);
1195     }
1196     exons[0]->attrs->freeItem(a);
1197     }
1198     }
1199     if (attrs_discarded) exons[0]->attrs->Pack();
1200     }
1201 gpertea 2 return this;
1202     }
1203    
1204 gpertea 16 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1205 gpertea 2 if (names==NULL)
1206     GError(ERR_NULL_GFNAMES, "parseAttrs()");
1207     if (atrlist==NULL)
1208     atrlist=new GffAttrs();
1209     char* endinfo=info+strlen(info);
1210     char* start=info;
1211     char* pch=start;
1212     while (start<endinfo) {
1213     //skip spaces
1214     while (*start==' ' && start<endinfo) start++;
1215     pch=strchr(start, ';');
1216     if (pch==NULL) pch=endinfo;
1217     else {
1218     *pch='\0';
1219     pch++;
1220     }
1221     char* ech=strchr(start,'=');
1222     if (ech!=NULL) { // attr=value format found
1223     *ech='\0';
1224 gpertea 16 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1225     if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1226     strcmp(start, "exon_id")==0)
1227     { start=pch; continue; }
1228 gpertea 2 ech++;
1229     while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1230 gpertea 16 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1231     //make sure we don't add the same attribute more than once
1232     if (isExon && (strcmp(start, "protein_id")==0)) {
1233     //Ensembl special case
1234     this->addAttr(start, ech);
1235     start=pch;
1236     continue;
1237     }
1238     atrlist->add_or_update(names, start, ech);
1239 gpertea 2 }
1240     /*
1241     else { //not an attr=value format
1242     atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1243     }
1244     */
1245     start=pch;
1246     }
1247     if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1248     }
1249    
1250 gpertea 16 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1251 gpertea 2 if (this->attrs==NULL)
1252     this->attrs=new GffAttrs();
1253 gpertea 16 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1254     this->attrs->add_or_update(names, attrname, attrvalue);
1255 gpertea 2 }
1256    
1257 gpertea 16
1258     void GffObj::setFeatureName(const char* feature) {
1259     //change the feature name/type for a transcript
1260     int fid=names->feats.addName(feature);
1261     if (monoFeature() && exons.Count()>0)
1262     this->exon_ftype_id=fid;
1263     this->ftype_id=fid;
1264     }
1265    
1266     void GffObj::setRefName(const char* newname) {
1267     //change the feature name/type for a transcript
1268     int rid=names->gseqs.addName(newname);
1269     this->gseq_id=rid;
1270     }
1271    
1272    
1273    
1274     int GffObj::removeAttr(const char* attrname, const char* attrval) {
1275     if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1276     int aid=this->names->attrs.getId(attrname);
1277     if (aid<0) return 0;
1278     int delcount=0; //could be more than one ?
1279     for (int i=0;i<this->attrs->Count();i++) {
1280     if (aid==this->attrs->Get(i)->attr_id) {
1281     if (attrval==NULL ||
1282     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1283     delcount++;
1284     this->attrs->freeItem(i);
1285     }
1286     }
1287     }
1288     if (delcount>0) this->attrs->Pack();
1289     return delcount;
1290     }
1291    
1292     int GffObj::removeAttr(int aid, const char* attrval) {
1293     if (this->attrs==NULL || aid<0) return 0;
1294     int delcount=0; //could be more than one ?
1295     for (int i=0;i<this->attrs->Count();i++) {
1296     if (aid==this->attrs->Get(i)->attr_id) {
1297     if (attrval==NULL ||
1298     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1299     delcount++;
1300     this->attrs->freeItem(i);
1301     }
1302     }
1303     }
1304     if (delcount>0) this->attrs->Pack();
1305     return delcount;
1306     }
1307    
1308    
1309     int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1310     if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1311     int aid=this->names->attrs.getId(attrname);
1312     if (aid<0) return 0;
1313     int delcount=0; //could be more than one
1314     for (int i=0;i<exon.attrs->Count();i++) {
1315     if (aid==exon.attrs->Get(i)->attr_id) {
1316     if (attrval==NULL ||
1317     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1318     delcount++;
1319     exon.attrs->freeItem(i);
1320     }
1321     }
1322     }
1323     if (delcount>0) exon.attrs->Pack();
1324     return delcount;
1325     }
1326    
1327     int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1328     if (exon.attrs==NULL || aid<0) return 0;
1329     int delcount=0; //could be more than one
1330     for (int i=0;i<exon.attrs->Count();i++) {
1331     if (aid==exon.attrs->Get(i)->attr_id) {
1332     if (attrval==NULL ||
1333     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1334     delcount++;
1335     exon.attrs->freeItem(i);
1336     }
1337     }
1338     }
1339     if (delcount>0) exon.attrs->Pack();
1340     return delcount;
1341     }
1342    
1343    
1344 gpertea 2 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1345     cds_start=0;
1346     cds_end=0;
1347     if (CDstart==0 || CDend==0) return; //no CDS info
1348     int cdsadj=0;
1349     if (CDphase=='1' || CDphase=='2') {
1350     cdsadj=CDphase-'0';
1351     }
1352     cds_start=CDstart;
1353     cds_end=CDend;
1354     if (strand=='-') cds_end-=cdsadj;
1355     else cds_start+=cdsadj;
1356     }
1357    
1358     void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1359     //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1360     cds_mstart=0;
1361     cds_mend=0;
1362     if (CDstart==0 || CDend==0) return; //no CDS info
1363     //restore normal coordinates, just in case
1364     unxcoord();
1365     int cdsadj=0;
1366     if (CDphase=='1' || CDphase=='2') {
1367     cdsadj=CDphase-'0';
1368     }
1369     /*
1370     uint seqstart=CDstart;
1371     uint seqend=CDend;
1372     */
1373     uint seqstart=exons.First()->start;
1374     uint seqend=exons.Last()->end;
1375     int s=0; //resulting nucleotide counter
1376     if (strand=='-') {
1377     for (int x=exons.Count()-1;x>=0;x--) {
1378     uint sgstart=exons[x]->start;
1379     uint sgend=exons[x]->end;
1380     if (seqend<sgstart || seqstart>sgend) continue;
1381     if (seqstart>=sgstart && seqstart<=sgend)
1382     sgstart=seqstart; //seqstart within this segment
1383     if (seqend>=sgstart && seqend<=sgend)
1384     sgend=seqend; //seqend within this segment
1385     s+=(int)(sgend-sgstart)+1;
1386     if (CDstart>=sgstart && CDstart<=sgend) {
1387     //CDstart in this segment
1388     //and we are getting the whole transcript
1389     cds_mend=s-(int)(CDstart-sgstart);
1390     }
1391     if (CDend>=sgstart && CDend<=sgend) {
1392     //CDstart in this segment
1393     //and we are getting the whole transcript
1394     cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1395     }
1396     } //for each exon
1397     } // - strand
1398     else { // + strand
1399     for (int x=0;x<exons.Count();x++) {
1400     uint sgstart=exons[x]->start;
1401     uint sgend=exons[x]->end;
1402     if (seqend<sgstart || seqstart>sgend) continue;
1403     if (seqstart>=sgstart && seqstart<=sgend)
1404     sgstart=seqstart; //seqstart within this segment
1405     if (seqend>=sgstart && seqend<=sgend)
1406     sgend=seqend; //seqend within this segment
1407     s+=(int)(sgend-sgstart)+1;
1408     /* for (uint i=sgstart;i<=sgend;i++) {
1409     spliced[s]=gsubseq[i-gstart];
1410     s++;
1411     }//for each nt
1412     */
1413     if (CDstart>=sgstart && CDstart<=sgend) {
1414     //CDstart in this segment
1415     cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1416     }
1417     if (CDend>=sgstart && CDend<=sgend) {
1418     //CDend in this segment
1419     cds_mend=s-(int)(sgend-CDend);
1420     }
1421     } //for each exon
1422     } // + strand
1423     //spliced[s]=0;
1424     //if (rlen!=NULL) *rlen=s;
1425     //return spliced;
1426     }
1427    
1428 gpertea 16 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1429     {
1430     if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1431     return NULL;
1432     }
1433     //restore normal coordinates:
1434     unxcoord();
1435     if (exons.Count()==0) return NULL;
1436     int fspan=end-start+1;
1437     const char* gsubseq=faseq->subseq(start, fspan);
1438     if (gsubseq==NULL) {
1439     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1440     }
1441     char* unspliced=NULL;
1442    
1443     int seqstart=exons.First()->start;
1444     int seqend=exons.Last()->end;
1445    
1446     int unsplicedlen = 0;
1447    
1448     unsplicedlen += seqend - seqstart + 1;
1449    
1450     GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1451     //uint seqstart, seqend;
1452    
1453     int s = 0; //resulting nucleotide counter
1454     if (strand=='-')
1455     {
1456     if (seglst!=NULL)
1457     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1458     for (int i=seqend;i>=seqstart;i--)
1459     {
1460     unspliced[s] = ntComplement(gsubseq[i-start]);
1461     s++;
1462     }//for each nt
1463     } // - strand
1464     else
1465     { // + strand
1466     if (seglst!=NULL)
1467     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1468     for (int i=seqstart;i<=seqend;i++)
1469     {
1470     unspliced[s]=gsubseq[i-start];
1471     s++;
1472     }//for each nt
1473     } // + strand
1474     //assert(s <= unsplicedlen);
1475     unspliced[s]=0;
1476     if (rlen!=NULL) *rlen=s;
1477     return unspliced;
1478     }
1479    
1480 gpertea 2 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1481     GList<GSeg>* seglst) {
1482     if (CDSonly && CDstart==0) return NULL;
1483     if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1484     return NULL;
1485     }
1486     //restore normal coordinates:
1487     unxcoord();
1488     if (exons.Count()==0) return NULL;
1489     int fspan=end-start+1;
1490     const char* gsubseq=faseq->subseq(start, fspan);
1491     if (gsubseq==NULL) {
1492     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1493     }
1494 gpertea 16 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1495     int endadj=end-start+1-fspan;
1496     uint prevend=end;
1497     end-=endadj;
1498     if (CDend>end) CDend=end;
1499     if (exons.Last()->end>end) {
1500     exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1501     if (exons.Last()->start>exons.Last()->end) {
1502     GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1503     prevend,getGSeqName(), getID());
1504     }
1505     covlen-=endadj;
1506     }
1507     }
1508 gpertea 2 char* spliced=NULL;
1509     GMALLOC(spliced, covlen+1); //allocate more here
1510     uint seqstart, seqend;
1511     int cdsadj=0;
1512     if (CDphase=='1' || CDphase=='2') {
1513     cdsadj=CDphase-'0';
1514     }
1515     if (CDSonly) {
1516     seqstart=CDstart;
1517     seqend=CDend;
1518     if (strand=='-') seqend-=cdsadj;
1519     else seqstart+=cdsadj;
1520     }
1521     else {
1522     seqstart=exons.First()->start;
1523     seqend=exons.Last()->end;
1524     }
1525     int s=0; //resulting nucleotide counter
1526     if (strand=='-') {
1527     for (int x=exons.Count()-1;x>=0;x--) {
1528     uint sgstart=exons[x]->start;
1529     uint sgend=exons[x]->end;
1530     if (seqend<sgstart || seqstart>sgend) continue;
1531     if (seqstart>=sgstart && seqstart<=sgend)
1532     sgstart=seqstart; //seqstart within this segment
1533     if (seqend>=sgstart && seqend<=sgend)
1534     sgend=seqend; //seqend within this segment
1535     if (seglst!=NULL)
1536     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1537     for (uint i=sgend;i>=sgstart;i--) {
1538     spliced[s] = ntComplement(gsubseq[i-start]);
1539     s++;
1540     }//for each nt
1541    
1542     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1543     if (CDstart>=sgstart && CDstart<=sgend) {
1544     //CDstart in this segment
1545     //and we are getting the whole transcript
1546     *cds_end=s-(CDstart-sgstart);
1547     }
1548     if (CDend>=sgstart && CDend<=sgend) {
1549     //CDstart in this segment
1550     //and we are getting the whole transcript
1551     *cds_start=s-(CDend-cdsadj-sgstart);
1552     }
1553     }//update local CDS coordinates
1554     } //for each exon
1555     } // - strand
1556     else { // + strand
1557     for (int x=0;x<exons.Count();x++) {
1558     uint sgstart=exons[x]->start;
1559     uint sgend=exons[x]->end;
1560     if (seqend<sgstart || seqstart>sgend) continue;
1561     if (seqstart>=sgstart && seqstart<=sgend)
1562     sgstart=seqstart; //seqstart within this segment
1563     if (seqend>=sgstart && seqend<=sgend)
1564     sgend=seqend; //seqend within this segment
1565     if (seglst!=NULL)
1566     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1567     for (uint i=sgstart;i<=sgend;i++) {
1568     spliced[s]=gsubseq[i-start];
1569     s++;
1570     }//for each nt
1571     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1572     if (CDstart>=sgstart && CDstart<=sgend) {
1573     //CDstart in this segment
1574     //and we are getting the whole transcript
1575     *cds_start=s-(sgend-CDstart-cdsadj);
1576     }
1577     if (CDend>=sgstart && CDend<=sgend) {
1578     //CDstart in this segment
1579     //and we are getting the whole transcript
1580     *cds_end=s-(sgend-CDend);
1581     }
1582     }//update local CDS coordinates
1583     } //for each exon
1584     } // + strand
1585     spliced[s]=0;
1586     if (rlen!=NULL) *rlen=s;
1587     return spliced;
1588     }
1589    
1590     char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1591     if (CDSonly && CDstart==0) return NULL;
1592     //restore normal coordinates:
1593     unxcoord();
1594     if (exons.Count()==0) return NULL;
1595     int fspan=end-start+1;
1596     const char* gsubseq=faseq->subseq(start, fspan);
1597     if (gsubseq==NULL) {
1598     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1599     }
1600    
1601     char* translation=NULL;
1602     GMALLOC(translation, (int)(covlen/3)+1);
1603     uint seqstart, seqend;
1604     int cdsadj=0;
1605     if (CDphase=='1' || CDphase=='2') {
1606     cdsadj=CDphase-'0';
1607     }
1608     if (CDSonly) {
1609     seqstart=CDstart;
1610     seqend=CDend;
1611     if (strand=='-') seqend-=cdsadj;
1612     else seqstart+=cdsadj;
1613     }
1614     else {
1615     seqstart=exons.First()->start;
1616     seqend=exons.Last()->end;
1617     }
1618     Codon codon;
1619     int nt=0; //codon nucleotide counter (0..2)
1620     int aa=0; //aminoacid count
1621     if (strand=='-') {
1622     for (int x=exons.Count()-1;x>=0;x--) {
1623     uint sgstart=exons[x]->start;
1624     uint sgend=exons[x]->end;
1625     if (seqend<sgstart || seqstart>sgend) continue;
1626     if (seqstart>=sgstart && seqstart<=sgend)
1627     sgstart=seqstart; //seqstart within this segment
1628     if (seqend>=sgstart && seqend<=sgend) {
1629     sgend=seqend; //seqend within this segment
1630     }
1631     for (uint i=sgend;i>=sgstart;i--) {
1632     codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1633     nt++;
1634     if (nt==3) {
1635     nt=0;
1636     translation[aa]=codon.translate();
1637     aa++;
1638     }
1639     }//for each nt
1640     } //for each exon
1641     } // - strand
1642     else { // + strand
1643     for (int x=0;x<exons.Count();x++) {
1644     uint sgstart=exons[x]->start;
1645     uint sgend=exons[x]->end;
1646     if (seqend<sgstart || seqstart>sgend) continue;
1647     if (seqstart>=sgstart && seqstart<=sgend)
1648     sgstart=seqstart; //seqstart within this segment
1649     if (seqend>=sgstart && seqend<=sgend)
1650     sgend=seqend; //seqend within this segment
1651     for (uint i=sgstart;i<=sgend;i++) {
1652     codon.nuc[nt]=gsubseq[i-start];
1653     nt++;
1654     if (nt==3) {
1655     nt=0;
1656     translation[aa]=codon.translate();
1657     aa++;
1658     }
1659     }//for each nt
1660     } //for each exon
1661     } // + strand
1662     translation[aa]=0;
1663     if (rlen!=NULL) *rlen=aa;
1664     return translation;
1665     }
1666    
1667     void GffObj::printSummary(FILE* fout) {
1668     if (fout==NULL) fout=stdout;
1669     fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1670     strand, start, end, gscore, (float)qcov/10.0);
1671     }
1672    
1673 gpertea 16 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1674 gpertea 2 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1675     static char scorestr[14];
1676     strcpy(scorestr,".");
1677     GffAttrs* xattrs=NULL;
1678     if (exidx>=0) {
1679     if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1680     xattrs=exons[exidx]->attrs;
1681     }
1682     if (phase==0 || !iscds) phase='.';
1683     const char* ftype=iscds ? "CDS" : getSubfName();
1684     if (gff3) {
1685     fprintf(fout,
1686     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1687     gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1688     phase, gffID);
1689     if (xattrs!=NULL) {
1690     for (int i=0;i<xattrs->Count();i++)
1691     fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1692     xattrs->Get(i)->attr_val);
1693     }
1694     fprintf(fout, "\n");
1695     } //GFF
1696 gpertea 16 else {//for GTF -- we print only transcripts
1697     //if (isValidTranscript())
1698     fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1699     gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1700     //char* geneid=(geneID!=NULL)? geneID : gffID;
1701     if (geneID)
1702     fprintf(fout," gene_id \"%s\";",geneID);
1703     if (gene_name!=NULL) {
1704     //fprintf(fout, " gene_name ");
1705     //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1706     // else fprintf(fout, "\"%s\";",gene_name);
1707     fprintf(fout," gene_name \"%s\";",gene_name);
1708     }
1709 gpertea 2 if (xattrs!=NULL) {
1710 gpertea 16 for (int i=0;i<xattrs->Count();i++) {
1711     if (xattrs->Get(i)->attr_val==NULL) continue;
1712     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1713     fprintf(fout, " %s ",attrname);
1714     if (xattrs->Get(i)->attr_val[0]=='"')
1715     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1716     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1717     }
1718 gpertea 2 }
1719 gpertea 16 //for GTF, also append the GffObj attributes to each exon line
1720     if ((xattrs=this->attrs)!=NULL) {
1721     for (int i=0;i<xattrs->Count();i++) {
1722     if (xattrs->Get(i)->attr_val==NULL) continue;
1723     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1724     fprintf(fout, " %s ",attrname);
1725     if (xattrs->Get(i)->attr_val[0]=='"')
1726     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1727     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1728     }
1729     }
1730 gpertea 2 fprintf(fout, "\n");
1731     }//GTF
1732     }
1733    
1734 gpertea 16 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1735     const char* tlabel, const char* gfparent) {
1736 gpertea 2 static char tmpstr[255];
1737     if (tlabel==NULL) {
1738     tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1739     (char*)"gffobj" ;
1740     }
1741     unxcoord();
1742 gpertea 16 //if (exons.Count()==0) return;
1743     const char* gseqname=names->gseqs.Get(gseq_id)->name;
1744 gpertea 2 bool gff3 = (gffp>=pgffAny);
1745     bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1746     bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1747     if (gff3) {
1748     //print GFF3 mRNA line:
1749     if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1750     else strcpy(tmpstr,".");
1751     uint pstart, pend;
1752     if (gffp==pgffCDS) {
1753     pstart=CDstart;
1754     pend=CDend;
1755     }
1756     else { pstart=start;pend=end; }
1757 gpertea 16 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1758     const char* ftype=getFeatureName();
1759 gpertea 2 fprintf(fout,
1760     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1761     gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1762 gpertea 16 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1763     if (gfparent!=NULL) {
1764     //parent override
1765     fprintf(fout, ";Parent=%s",gfparent);
1766     }
1767     else {
1768     if (parent!=NULL && !parent->isDiscarded())
1769     fprintf(fout, ";Parent=%s",parent->getID());
1770     }
1771     if (geneID!=NULL)
1772     fprintf(fout, ";geneID=%s",geneID);
1773     if (gene_name!=NULL)
1774     fprintf(fout, ";gene_name=%s",gene_name);
1775 gpertea 2 if (attrs!=NULL) {
1776     for (int i=0;i<attrs->Count();i++) {
1777 gpertea 16 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1778     fprintf(fout,";%s=%s", attrname,
1779 gpertea 2 attrs->Get(i)->attr_val);
1780     }
1781     }
1782     fprintf(fout,"\n");
1783     }// gff3 mRNA line
1784     if (showExon) {
1785     //print exons
1786 gpertea 16 if (isCDS && exons.Count()>0 &&
1787     ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1788     updateExonPhase();
1789    
1790     for (int i=0;i<exons.Count();i++) {
1791     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1792     }
1793     }//printing exons
1794 gpertea 2 if (showCDS && !isCDS && CDstart>0) {
1795     GArray<GffCDSeg> cds(true,true);
1796     getCDSegs(cds);
1797     for (int i=0;i<cds.Count();i++) {
1798     printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1799     }
1800     } //showCDS
1801     }
1802    
1803 gpertea 16 void GffObj::updateExonPhase() {
1804     if (!isCDS) return;
1805     int cdsacc=0;
1806     if (CDphase=='1' || CDphase=='2') {
1807     cdsacc+= 3-(CDphase-'0');
1808     }
1809     if (strand=='-') { //reverse strand
1810     for (int i=exons.Count()-1;i>=0;i--) {
1811     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1812     cdsacc+=exons[i]->end-exons[i]->start+1;
1813     }
1814     }
1815     else { //forward strand
1816     for (int i=0;i<exons.Count();i++) {
1817     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1818     cdsacc+=exons[i]->end-exons[i]->start+1;
1819     }
1820     }
1821     }
1822 gpertea 2
1823 gpertea 16
1824 gpertea 2 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1825     GffCDSeg cdseg;
1826     int cdsacc=0;
1827     if (CDphase=='1' || CDphase=='2') {
1828     cdsacc+= 3-(CDphase-'0');
1829     }
1830     if (strand=='-') {
1831     for (int x=exons.Count()-1;x>=0;x--) {
1832     uint sgstart=exons[x]->start;
1833     uint sgend=exons[x]->end;
1834     if (CDend<sgstart || CDstart>sgend) continue;
1835     if (CDstart>=sgstart && CDstart<=sgend)
1836     sgstart=CDstart; //cdstart within this segment
1837     if (CDend>=sgstart && CDend<=sgend)
1838     sgend=CDend; //cdend within this segment
1839     cdseg.start=sgstart;
1840     cdseg.end=sgend;
1841     cdseg.exonidx=x;
1842     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1843     cdseg.phase='0'+ (3-cdsacc%3)%3;
1844     cdsacc+=sgend-sgstart+1;
1845     cds.Add(cdseg);
1846     } //for each exon
1847     } // - strand
1848     else { // + strand
1849     for (int x=0;x<exons.Count();x++) {
1850     uint sgstart=exons[x]->start;
1851     uint sgend=exons[x]->end;
1852     if (CDend<sgstart || CDstart>sgend) continue;
1853     if (CDstart>=sgstart && CDstart<=sgend)
1854     sgstart=CDstart; //seqstart within this segment
1855     if (CDend>=sgstart && CDend<=sgend)
1856     sgend=CDend; //seqend within this segment
1857     cdseg.start=sgstart;
1858     cdseg.end=sgend;
1859     cdseg.exonidx=x;
1860     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1861     cdseg.phase='0' + (3-cdsacc%3)%3 ;
1862     cdsacc+=sgend-sgstart+1;
1863     cds.Add(cdseg);
1864     } //for each exon
1865     } // + strand
1866     }