ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 286
Committed: Mon Oct 15 17:31:47 2012 UTC (6 years, 8 months ago) by gpertea
File size: 70631 byte(s)
Log Message:
slight mods for rejoin use

Line User Rev File contents
1 gpertea 2 #include "gff.h"
2    
3     //GffNames* GffReader::names=NULL;
4     GffNames* GffObj::names=NULL;
5     //global set of feature names, attribute names etc.
6     // -- common for all GffObjs in current application!
7    
8 gpertea 16 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9     const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10 gpertea 115 const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11 gpertea 16 bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 gpertea 2 const int gff_fid_mRNA=0;
13 gpertea 16 const int gff_fid_transcript=1;
14     const int gff_fid_exon=2;
15     const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16     const uint gfo_flag_HAS_ERRORS = 0x00000001;
17     const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18     const uint gfo_flag_IS_GENE = 0x00000004;
19     const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20 gpertea 153 const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID
21 gpertea 16 const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22     const uint gfo_flag_DISCARDED = 0x00000100;
23     const uint gfo_flag_LST_KEEP = 0x00000200;
24     const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25     const byte gfo_flagShift_LEVEL = 16;
26 gpertea 2
27     void gffnames_ref(GffNames* &n) {
28     if (n==NULL) n=new GffNames();
29     n->numrefs++;
30     }
31    
32     void gffnames_unref(GffNames* &n) {
33     if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34     n->numrefs--;
35     if (n->numrefs==0) { delete n; n=NULL; }
36     }
37    
38     int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39    
40     GffObj& g1=*((GffObj*)p1);
41     GffObj& g2=*((GffObj*)p2);
42 gpertea 16 if (g1.gseq_id==g2.gseq_id) {
43     if (g1.start!=g2.start)
44     return (int)(g1.start-g2.start);
45     else if (g1.getLevel()!=g2.getLevel())
46     return (int)(g1.getLevel()-g2.getLevel());
47     else
48     if (g1.end!=g2.end)
49     return (int)(g1.end-g2.end);
50     else return strcmp(g1.getID(), g2.getID());
51     }
52     else return (int)(g1.gseq_id-g2.gseq_id);
53 gpertea 2 }
54    
55 gpertea 150 char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
56 gpertea 16 //parse a key attribute and remove it from the info string
57     //(only works for attributes that have values following them after ' ' or '=')
58     static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 gpertea 150 int attrlen=strlen(attr);
60     char cend=attr[attrlen-1];
61     //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
62     //must make sure attr is not found in quoted text
63     char* pos=info;
64     char prevch=0;
65     bool in_str=false;
66     bool notfound=true;
67     int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
68     while (notfound && *pos) {
69     char ch=*pos;
70     if (ch=='"') {
71     in_str=!in_str;
72     pos++;
73     prevch=ch;
74     continue;
75     }
76     if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
77     && strcmpfn(attr, pos, attrlen)==0) {
78     //attr match found
79     //check for word boundary on right
80     char* epos=pos+attrlen;
81     if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
82     notfound=false;
83     break;
84     }
85     //not a perfect match, move on
86     pos=epos;
87     prevch=*(pos-1);
88     continue;
89     }
90     //not a match or in_str
91     prevch=ch;
92     pos++;
93     }
94     if (notfound) return NULL;
95     char* vp=pos+attrlen;
96 gpertea 16 while (*vp==' ') vp++;
97     if (*vp==';' || *vp==0)
98 gpertea 150 GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
99 gpertea 16 bool dq_enclosed=false; //value string enclosed by double quotes
100     if (*vp=='"') {
101     dq_enclosed=true;
102     vp++;
103     }
104     if (enforce_GTF2 && !dq_enclosed)
105 gpertea 150 GError(GTF2_ERR,attr, dupline);
106 gpertea 16 char* vend=vp;
107     if (dq_enclosed) {
108     while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
109     }
110     else {
111     while (*vend!=';' && *vend!=0) vend++;
112     }
113     if (enforce_GTF2 && *vend!='"')
114 gpertea 150 GError(GTF2_ERR, attr, dupline);
115 gpertea 16 char *r=Gstrdup(vp, vend-1);
116     //-- now remove this attribute from the info string
117     while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
118     if (*vend==0) vend--;
119     for (char *src=vend, *dest=pos;;src++,dest++) {
120     *dest=*src;
121     if (*src==0) break;
122     }
123     return r;
124     }
125 gpertea 2
126 gpertea 16 static char fnamelc[128];
127    
128 gpertea 2 GffLine::GffLine(GffReader* reader, const char* l) {
129 gpertea 16 llen=strlen(l);
130     GMALLOC(line,llen+1);
131     memcpy(line, l, llen+1);
132     GMALLOC(dupline, llen+1);
133     memcpy(dupline, l, llen+1);
134 gpertea 2 skip=true;
135     gseqname=NULL;
136     track=NULL;
137     ftype=NULL;
138     info=NULL;
139 gpertea 16 _parents=NULL;
140     _parents_len=0;
141     num_parents=0;
142     parents=NULL;
143     is_gff3=false;
144 gpertea 2 is_cds=false;
145 gpertea 16 is_transcript=false;
146 gpertea 2 is_exon=false;
147 gpertea 16 is_gene=false;
148 gpertea 2 exontype=0;
149 gpertea 16 gene_id=NULL;
150     gene_name=NULL;
151 gpertea 2 qstart=0;
152     qend=0;
153     qlen=0;
154     ID=NULL;
155     char* t[9];
156     int i=0;
157     int tidx=1;
158     t[0]=line;
159    
160     while (line[i]!=0) {
161     if (line[i]=='\t') {
162     line[i]=0;
163     t[tidx]=line+i+1;
164     tidx++;
165     if (tidx>8) break;
166     }
167     i++;
168     }
169    
170     if (tidx<8) { // ignore non-GFF lines
171     // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
172     return;
173     }
174     gseqname=t[0];
175     track=t[1];
176     ftype=t[2];
177     info=t[8];
178     char* p=t[3];
179 gpertea 66 if (!parseUInt(p,fstart)) {
180     //FIXME: chromosome_band entries in Flybase
181     GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
182     return;
183     }
184 gpertea 2 p=t[4];
185 gpertea 66 if (!parseUInt(p,fend)) {
186     GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
187     return;
188     }
189 gpertea 144 if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always
190 gpertea 2 p=t[5];
191     if (p[0]=='.' && p[1]==0) {
192     score=0;
193     }
194     else {
195     if (!parseDouble(p,score))
196     GError("Error parsing feature score from GFF line:\n%s\n",l);
197     }
198     strand=*t[6];
199     if (strand!='+' && strand!='-' && strand!='.')
200     GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
201 gpertea 16 phase=*t[7]; // must be '.', '0', '1' or '2'
202 gpertea 2 ID=NULL;
203     // exon/CDS/mrna filter
204 gpertea 16 strncpy(fnamelc, ftype, 127);
205     fnamelc[127]=0;
206 gpertea 2 strlower(fnamelc); //convert to lower case
207 gpertea 16 bool is_t_data=false;
208 gpertea 2 if (strstr(fnamelc, "utr")!=NULL) {
209     exontype=exgffUTR;
210     is_exon=true;
211 gpertea 16 is_t_data=true;
212 gpertea 2 }
213 gpertea 66 else if (endsWith(fnamelc, "exon")) {
214 gpertea 2 exontype=exgffExon;
215     is_exon=true;
216 gpertea 16 is_t_data=true;
217 gpertea 2 }
218 gpertea 16 else if (strstr(fnamelc, "stop") &&
219     (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
220 gpertea 2 exontype=exgffStop;
221 gpertea 16 is_cds=true; //though some place it outside the last CDS segment
222     is_t_data=true;
223     }
224     else if (strstr(fnamelc, "start") &&
225     ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
226     exontype=exgffStart;
227 gpertea 2 is_cds=true;
228 gpertea 16 is_t_data=true;
229 gpertea 2 }
230     else if (strcmp(fnamelc, "cds")==0) {
231     exontype=exgffCDS;
232     is_cds=true;
233 gpertea 16 is_t_data=true;
234 gpertea 2 }
235 gpertea 16 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
236     is_gene=true;
237     is_t_data=true; //because its name will be attached to parented transcripts
238 gpertea 2 }
239 gpertea 16 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
240     is_transcript=true;
241     is_t_data=true;
242     }
243 gpertea 2
244 gpertea 16 if (reader->transcriptsOnly && !is_t_data) {
245     char* id=extractAttr("ID=");
246     if (id==NULL) id=extractAttr("transcript_id");
247     //GMessage("Discarding non-transcript line:\n%s\n",l);
248     if (id!=NULL) {
249     reader->discarded_ids.Add(id, new int(1));
250     GFREE(id);
251     }
252     return; //skip this line, unwanted feature name
253 gpertea 2 }
254 gpertea 150 ID=extractAttr("ID=",true);
255     char* Parent=extractAttr("Parent=",true);
256 gpertea 16 is_gff3=(ID!=NULL || Parent!=NULL);
257     if (is_gff3) {
258     //parse as GFF3
259     if (ID!=NULL) {
260     //has ID attr so it's likely to be a parent feature
261     //look for explicit gene name
262 gpertea 150 gene_name=extractAttr("gene_name=");
263 gpertea 16 if (gene_name==NULL) {
264 gpertea 150 gene_name=extractAttr("geneName=");
265 gpertea 16 if (gene_name==NULL) {
266 gpertea 150 gene_name=extractAttr("gene_sym=");
267 gpertea 16 if (gene_name==NULL) {
268 gpertea 150 gene_name=extractAttr("gene=");
269 gpertea 16 }
270     }
271     }
272 gpertea 150 gene_id=extractAttr("geneID=");
273 gpertea 16 if (gene_id==NULL) {
274 gpertea 150 gene_id=extractAttr("gene_id=");
275 gpertea 16 }
276     if (is_gene) {
277     //special case: keep the Name and ID attributes of the gene feature
278     if (gene_name==NULL)
279     gene_name=extractAttr("Name=");
280     if (gene_id==NULL) //the ID is also gene_id in this case
281     gene_id=Gstrdup(ID);
282     //skip=false;
283     //return;
284     GFREE(Parent); //TMI, we really don't care about gene Parents?
285     } //gene feature
286     }// has GFF3 ID
287     if (Parent!=NULL) {
288     //keep Parent attr
289     //parse multiple parents
290     num_parents=1;
291     p=Parent;
292     int last_delim_pos=-1;
293     while (*p!=';' && *p!=0) {
294     if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
295     num_parents++;
296     last_delim_pos=(p-Parent);
297     }
298     p++;
299     }
300     _parents_len=p-Parent+1;
301     _parents=Parent;
302     GMALLOC(parents, num_parents*sizeof(char*));
303     parents[0]=_parents;
304     int i=1;
305     if (last_delim_pos>0) {
306     for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
307     if (*p==',') {
308     char* ep=p-1;
309     while (*ep==' ' && ep>_parents) ep--;
310     *(ep+1)=0; //end the string there
311     parents[i]=p+1;
312     i++;
313     }
314     }
315     }
316     } //has Parent field
317     } //GFF3
318     else { // GTF-like expected
319 gpertea 150 Parent=extractAttr("transcript_id",true);
320 gpertea 16 if (Parent!=NULL) { //GTF2 format detected
321     if (is_transcript) {
322     // atypical GTF with a parent transcript line declared
323     ID=Parent;
324     Parent=NULL;
325     }
326     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
327 gpertea 150 if (gene_id==NULL)
328     gene_id=extractAttr("geneid");
329 gpertea 16 gene_name=extractAttr("gene_name");
330     if (gene_name==NULL) {
331 gpertea 150
332 gpertea 16 gene_name=extractAttr("gene_sym");
333 gpertea 150 if (gene_name==NULL) {
334 gpertea 16 gene_name=extractAttr("gene");
335 gpertea 150 if (gene_name==NULL)
336     gene_name=extractAttr("genesymbol");
337     }
338 gpertea 16 }
339 gpertea 2 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
340     //after the attribute name
341     p=info;
342     bool noed=true; //not edited after the last delim
343     bool nsp=false; //non-space found after last delim
344     while (*p!=0) {
345 gpertea 16 if (*p==' ') {
346     if (nsp && noed) {
347     *p='=';
348     noed=false;
349     p++;
350     continue;
351     }
352     }
353     else nsp=true; //non-space
354     if (*p==';') { noed=true; nsp=false; }
355     p++;
356     }
357     } //GTF2 detected (no parent line)
358     else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
359     //char* fexon=strstr(fnamelc, "exon");
360     //if (fexon!=NULL) {
361     if (exontype==exgffExon) {
362 gpertea 2 if (startsWith(track,"jigsaw")) {
363 gpertea 16 is_cds=true;
364     strcpy(track,"jigsaw");
365     p=strchr(info,';');
366     if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
367     else { Parent=Gstrdup(info,p-1);
368     info=p+1;
369     }
370     }
371     } //exon feature?
372     if (Parent==NULL && exontype>=exgffCDS &&
373     (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
374     //one word ID ? really desperate attempt to parse it here
375 gpertea 2 Parent=Gstrdup(info,info+i-1);
376 gpertea 16 info=NULL; //discard anything else on the line
377     }
378     }
379     if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
380     _parents=Parent;
381     GMALLOC(parents,sizeof(char*));
382     num_parents=1;
383     parents[0]=_parents;
384     }
385     } //GTF-like
386 gpertea 2
387 gpertea 16 //parse other potentially useful features
388     if (is_gff3) {
389     if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
390     p+=7;
391     while (*p!=';' && *p!=0 && *p!=' ') p++;
392     if (*p!=' ') {
393     GError("Error parsing target coordinates from GFF line:\n%s\n",l);
394     }
395     if (!parseUInt(p,qstart))
396     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
397     if (*p!=' ') {
398     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
399     }
400     p++;
401     if (!parseUInt(p,qend))
402     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
403 gpertea 2 }
404 gpertea 16 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
405     p+=5;
406     if (!parseUInt(p,qstart))
407     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
408     if (*p!='-') {
409     GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
410     }
411     p++;
412     if (!parseUInt(p,qend))
413     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
414     if (*p=='|' || *p==':') {
415     p++;
416     if (!parseUInt(p,qlen))
417     GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
418     }
419     }//has Qreg attr
420     if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
421     p+=5;
422     if (!parseUInt(p,qlen))
423     GError("Error parsing target length from GFF Qlen:\n%s\n",l);
424 gpertea 2 }
425 gpertea 16 }//parsing some useful attributes in GFF3 records
426     if (ID==NULL && parents==NULL) {
427     if (reader->gff_warns)
428     GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
429     return; //skip
430 gpertea 2 }
431     skip=false;
432     }
433    
434 gpertea 55
435     void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
436     if (cd_start>=this->start) {
437     this->CDstart=cd_start;
438     if (strand=='+') this->CDphase=phase;
439     }
440     else this->CDstart=this->start;
441     if (cd_end<=this->end) {
442     this->CDend=cd_end;
443     if (strand=='-') this->CDphase=phase;
444     }
445     else this->CDend=this->end;
446     isTranscript(true);
447     exon_ftype_id=gff_fid_exon;
448     if (monoFeature()) {
449     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
450     else exons[0]->exontype=exgffExon;
451     }
452     }
453    
454 gpertea 16 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
455 gpertea 2 //this will make sure we have the right subftype_id!
456 gpertea 55 //int subf_id=-1;
457     if (!isTranscript() && gl->is_cds) {
458 gpertea 50 isTranscript(true);
459     exon_ftype_id=gff_fid_exon;
460     if (exons.Count()==1) exons[0]->exontype=exgffExon;
461     }
462 gpertea 16 if (isTranscript()) {
463     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
464     if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
465     else exon_ftype_id=names->feats.addName(gl->ftype);
466     }
467 gpertea 2 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
468 gpertea 16 if (gl->exontype==0 && !gl->is_transcript) {
469     //extraneous mRNA feature, discard
470     if (reader->gff_warns)
471     GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
472     gl->ftype, gffID);
473 gpertea 2 return -1;
474 gpertea 16 }
475 gpertea 2 }
476 gpertea 16 else { //non-mRNA parent feature, check this subf type
477 gpertea 55 int subf_id=names->feats.addName(gl->ftype);
478 gpertea 16 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
479     exon_ftype_id=subf_id;
480     else {
481     if (exon_ftype_id!=subf_id) {
482 gpertea 50 //
483 gpertea 16 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
484     //the existing exon was just a dummy one created by default, discard it
485     exons.Clear();
486     covlen=0;
487     exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
488     }
489     else { //multiple subfeatures, prefer those with
490     if (reader->gff_warns)
491     GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
492     names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
493     if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
494     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
495     exon_ftype_id=subf_id;
496     exons.Clear();
497     covlen=0;
498     }
499     else { //discard new feature
500     if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
501     return -1; //skip this 2nd subfeature type for this parent!
502     }
503     }
504     } //incoming subfeature is of different type
505     } //new subfeature type
506     } //non-mRNA parent
507 gpertea 2 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
508     gl->qstart,gl->qend, gl->is_cds, gl->exontype);
509 gpertea 16 if (eidx<0) return eidx; //this should never happen
510     if (keepAttr) {
511     if (noExonAttr) {
512     if (attrs==NULL) //place the parsed attributes directly at transcript level
513     parseAttrs(attrs, gl->info);
514     }
515     else { //need all exon-level attributes
516     parseAttrs(exons[eidx]->attrs, gl->info, true);
517     }
518 gpertea 2 }
519     return eidx;
520     }
521    
522    
523     int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
524     if (exons.Count()==0) {
525     if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
526 gpertea 16 if (exon_ftype_id<0) {
527     exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
528 gpertea 2 }
529     }
530 gpertea 16 //special treatment of start/stop codon features, they might be broken/split between exons
531     //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
532     //so we should not trust the end coordinate for such features
533     if (exontype==exgffStart || exontype==exgffStop) {
534     if (strand=='-') segstart=segend;
535     else segend=segstart;
536     if (exontype==exgffStart) {
537     if (CDstart==0 || segstart<CDstart) CDstart=segstart;
538     }
539     else {
540     if (segstart>CDend) CDend=segstart;
541     }
542     }
543     else if (iscds) { //update CDS anchors:
544 gpertea 2 if (CDstart==0 || segstart<CDstart) {
545     CDstart=segstart;
546     if (exontype==exgffCDS && strand=='+') CDphase=fr;
547     }
548     if (segend>CDend) {
549     if (exontype==exgffCDS && strand=='-') CDphase=fr;
550     CDend=segend;
551     }
552     }
553 gpertea 16 else { // not a CDS/start/stop
554 gpertea 2 isCDS=false;
555     }
556     if (qs || qe) {
557 gpertea 144 if (qs>qe) Gswap(qs,qe);
558 gpertea 2 if (qs==0) qs=1;
559 gpertea 16 }
560 gpertea 2 int ovlen=0;
561 gpertea 16 if (exontype>0) { //check for overlaps between exon-type segments
562     int oi=exonOverlapIdx(segstart, segend, &ovlen);
563     if (oi>=0) { //overlap existing segment
564     if (ovlen==0) {
565     //adjacent segments will be merged
566     //e.g. CDS to (UTR|exon)
567     if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
568     (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
569     expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
570     return oi;
571     }
572     //CDS adjacent to stop_codon: UCSC does (did?) this
573     if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
574     (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
575     expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
576     return oi;
577     }
578     }
579     //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
580     // start_codon within (CDS|exon)
581     if (exons[oi]->exontype>exontype &&
582     exons[oi]->start<=segstart && exons[oi]->end>=segend &&
583     !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
584     //larger segment given first, now the smaller included one is redundant
585     return oi; //only used to store attributes from current GffLine
586     }
587     if (exontype>exons[oi]->exontype &&
588     segstart<=exons[oi]->start && segend>=exons[oi]->end &&
589     !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
590     //smaller segment given first, so we have to enlarge it
591     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
592     //this should also check for overlapping next exon (oi+1) ?
593     return oi;
594     }
595     //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
596     //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
597     //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
598     // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
599     /*
600     if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
601     if (gff_show_warnings)
602     GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
603     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
604     hasErrors(true);
605     return -1; //segment NOT added
606     }
607     */
608 gpertea 2
609 gpertea 16 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
610     if (gff_show_warnings)
611     GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
612     segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
613     expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
614     return oi;
615     }
616     // else add the segment if the overlap is small and between two CDS segments
617     //TODO: we might want to add an attribute here with the slippage coordinate and size?
618     covlen-=ovlen;
619     }//overlap or adjacent to existing segment
620     } //check for overlap
621 gpertea 2 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
622     // create & add the new segment
623 gpertea 51 /*
624     if (start>0 && exontype==exgffCDS && exons.Count()==0) {
625     //adding a CDS directly as the first subfeature of a declared parent
626     segstart=start;
627     segend=end;
628     }
629     */
630 gpertea 2 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
631     int eidx=exons.Add(enew);
632     if (eidx<0) {
633 gpertea 16 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
634     if (gff_show_warnings)
635     GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
636 gpertea 2 segstart, segend, gffID);
637 gpertea 16 delete enew;
638     hasErrors(true);
639 gpertea 2 return -1;
640     }
641     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
642 gpertea 55 //adjust parent feature coordinates to contain this exon
643     if (start==0 || start>exons.First()->start) {
644 gpertea 51 start=exons.First()->start;
645     }
646 gpertea 55 if (end<exons.Last()->end) end=exons.Last()->end;
647    
648 gpertea 2 return eidx;
649     }
650    
651 gpertea 16 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
652     //oi is the index of the *first* overlapping segment found that must be enlarged
653     covlen-=exons[oi]->len();
654     if (segstart<exons[oi]->start)
655     exons[oi]->start=segstart;
656     if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
657     if (segend>exons[oi]->end)
658     exons[oi]->end=segend;
659     if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
660     //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
661     if (sc!=0) exons[oi]->score=sc;
662     covlen+=exons[oi]->len();
663     //if (exons[oi]->exontype< exontype) -- always true
664     exons[oi]->exontype = exontype;
665     if (exontype==exgffCDS) exons[oi]->phase=fr;
666     //we must check if any more exons are also overlapping this
667     int ni=oi+1; //next exon index after oi
668     while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
669     //only allow this if next segment is fully included, and a subordinate
670     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
671     /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
672     chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
673     chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
674     chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
675     */
676     if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
677     if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
678     exons.Delete(ni);
679     }
680     else {
681     if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
682     exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
683     //hasErrors(true);
684     break;
685     }
686     }
687     // -- make sure any other related boundaries are updated:
688     start=exons.First()->start;
689     end=exons.Last()->end;
690     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
691     GSeqStat* gsd=(GSeqStat*)uptr;
692     if (start<gsd->mincoord) gsd->mincoord=start;
693     if (end>gsd->maxcoord) gsd->maxcoord=end;
694     if (this->len()>gsd->maxfeat_len) {
695     gsd->maxfeat_len=this->len();
696     gsd->maxfeat=this;
697     }
698     }
699     }
700    
701 gpertea 2 void GffObj::removeExon(int idx) {
702     /*
703     if (idx==0 && segs[0].start==gstart)
704     gstart=segs[1].start;
705     if (idx==segcount && segs[segcount].end==gend)
706     gend=segs[segcount-1].end;
707     */
708     if (idx<0 || idx>=exons.Count()) return;
709     int segstart=exons[idx]->start;
710     int segend=exons[idx]->end;
711     exons.Delete(idx);
712     covlen -= (int)(segend-segstart)+1;
713     start=exons.First()->start;
714     end=exons.Last()->end;
715     if (isCDS) { CDstart=start; CDend=end; }
716     }
717    
718 gpertea 16 void GffObj::removeExon(GffExon* p) {
719     for (int idx=0;idx<exons.Count();idx++) {
720     if (exons[idx]==p) {
721     int segstart=exons[idx]->start;
722     int segend=exons[idx]->end;
723     exons.Delete(idx);
724     covlen -= (int)(segend-segstart)+1;
725 gpertea 253
726     if (exons.Count() > 0) {
727     start=exons.First()->start;
728     end=exons.Last()->end;
729     if (isCDS) { CDstart=start; CDend=end; }
730     }
731 gpertea 16 return;
732     }
733     }
734     }
735    
736    
737    
738 gpertea 2 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
739 gpertea 16 GSeg(0,0), exons(true,true,false), children(1,false) {
740     xstart=0;
741     xend=0;
742     xstatus=0;
743     partial=false;
744     isCDS=false;
745     uptr=NULL;
746     ulink=NULL;
747     parent=NULL;
748     udata=0;
749     flags=0;
750     CDstart=0;
751     CDend=0;
752     CDphase=0;
753     geneID=NULL;
754     gene_name=NULL;
755     attrs=NULL;
756     gffID=NULL;
757     track_id=-1;
758     gseq_id=-1;
759     ftype_id=-1;
760     exon_ftype_id=-1;
761     strand='.';
762     if (gfrd==NULL)
763 gpertea 2 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
764 gpertea 16 gffnames_ref(names);
765     if (gfrd->names==NULL) gfrd->names=names;
766     //qlen=0;qstart=0;qend=0;
767     gscore=0;
768     uscore=0;
769     covlen=0;
770     qcov=0;
771     start=gffline->fstart;
772     end=gffline->fend;
773     gseq_id=names->gseqs.addName(gffline->gseqname);
774     track_id=names->tracks.addName(gffline->track);
775     strand=gffline->strand;
776     qlen=gffline->qlen;
777     qstart=gffline->qstart;
778     qend=gffline->qend;
779     //setup flags from gffline
780     isCDS=gffline->is_cds; //for now
781     isGene(gffline->is_gene);
782     isTranscript(gffline->is_transcript || gffline->exontype!=0);
783 gpertea 153 //fromGff3(gffline->is_gff3);
784 gpertea 16
785 gpertea 196 if (gffline->parents!=NULL && !gffline->is_transcript) {
786 gpertea 16 //GTF style -- create a GffObj directly by subfeature
787     //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
788     if (gffline->exontype!=0) { //recognized exon-like feature
789     ftype_id=gff_fid_transcript; //so this is some sort of transcript
790     exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
791     }
792     else {//unrecognized subfeatures
793     //make this GffObj of the same feature type
794     ftype_id=names->feats.addName(gffline->ftype);
795     }
796 gpertea 153 if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
797 gpertea 16 gffID=Gstrdup(gffline->parents[0]);
798     this->createdByExon(true);
799     //this is likely the first exon/segment of the feature
800     addExon(gfrd, gffline, keepAttr, noExonAttr);
801 gpertea 2 }
802 gpertea 196 else { //a parented feature with an ID: orphan or premature GFF3 subfeature line
803 gpertea 16 if (gffline->is_gff3 && gffline->exontype!=0) {
804     //premature exon given before its parent transcript
805     //create the transcript entry here
806     gffID=Gstrdup(gffline->parents[0]);
807     this->createdByExon(true);
808     //this is the first exon/segment of the transcript
809     addExon(gfrd, gffline, keepAttr, noExonAttr);
810     }
811 gpertea 196 else { //unrecognized non-exon feature ? use the ID instead
812     this->hasGffID(true);
813 gpertea 16 gffID=Gstrdup(gffline->ID);
814     if (keepAttr) this->parseAttrs(attrs, gffline->info);
815     }
816 gpertea 2 }
817 gpertea 196 } //non-transcript parented subfeature given directly
818     else {
819     //non-parented feature OR a recognizable transcript
820 gpertea 16 //create a parent feature in its own right
821 gpertea 2 gscore=gffline->score;
822     if (gffline->ID==NULL || gffline->ID[0]==0)
823 gpertea 16 GError("Error: no ID found for GFF record start\n");
824 gpertea 153 this->hasGffID(true);
825 gpertea 16 gffID=Gstrdup(gffline->ID); //there must be an ID here
826     //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
827     //else
828     ftype_id=names->feats.addName(gffline->ftype);
829     if (gffline->is_transcript)
830     exon_ftype_id=gff_fid_exon;
831     if (keepAttr) this->parseAttrs(attrs, gffline->info);
832     }//no parent
833    
834     if (gffline->gene_name!=NULL) {
835     gene_name=Gstrdup(gffline->gene_name);
836     }
837 gpertea 196 if (gffline->gene_id) {
838 gpertea 16 geneID=Gstrdup(gffline->gene_id);
839     }
840 gpertea 196 else if (gffline->is_transcript && gffline->parents) {
841     geneID=Gstrdup(gffline->parents[0]);
842     }
843 gpertea 16
844 gpertea 227 //GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
845     GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,gffline->gseqname), true);
846 gpertea 16 uptr=gsd;
847 gpertea 286 /*
848 gpertea 16 if (start<gsd->mincoord) gsd->mincoord=start;
849     if (end>gsd->maxcoord) gsd->maxcoord=end;
850     if (this->len()>gsd->maxfeat_len) {
851     gsd->maxfeat_len=this->len();
852     gsd->maxfeat=this;
853 gpertea 2 }
854 gpertea 286 */
855 gpertea 2 }
856    
857     GffLine* GffReader::nextGffLine() {
858     if (gffline!=NULL) return gffline; //caller should free gffline after processing
859     while (gffline==NULL) {
860     int llen=0;
861     buflen=GFF_LINELEN-1;
862     char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
863     if (l==NULL) {
864     return NULL; //end of file
865     }
866     int ns=0; //first nonspace position
867     while (l[ns]!=0 && isspace(l[ns])) ns++;
868     if (l[ns]=='#' || llen<10) continue;
869     gffline=new GffLine(this, l);
870     if (gffline->skip) {
871     delete gffline;
872     gffline=NULL;
873 gpertea 16 continue;
874 gpertea 2 }
875 gpertea 16 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
876     //this might not be needed, already checked in the GffLine constructor
877     if (gff_warns)
878     GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
879     delete gffline;
880     gffline=NULL;
881     //continue;
882     }
883 gpertea 2 }
884     return gffline;
885     }
886    
887 gpertea 151
888 gpertea 2 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
889     //caller must free the returned pointer
890     char* buf=NULL;
891     int idlen=strlen(id);
892     GMALLOC(buf, idlen+strlen(ctg)+2);
893     strcpy(buf, id);
894     buf[idlen]='~';
895     strcpy(buf+idlen+1, ctg);
896     return buf;
897     }
898 gpertea 151 /*
899 gpertea 2 void GffReader::gfoRemove(const char* id, const char* ctg) {
900     char* buf=gfoBuildId(id,ctg);
901     phash.Remove(buf);
902     GFREE(buf);
903     }
904 gpertea 151 */
905     GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
906 gpertea 196 //Warning: if gflst gets altered, idx becomes obsolete
907 gpertea 195 GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
908     if (glst==NULL)
909     glst=new GVec<GfoHolder>(1);
910 gpertea 151 GfoHolder gh(gfo,idx);
911     int i=glst->Add(gh);
912     phash.Add(gfo->gffID, glst);
913     return &(glst->Get(i));
914 gpertea 2 }
915 gpertea 16
916 gpertea 151 GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) {
917     GfoHolder gh(gfo,idx);
918     int i=glst.Add(gh);
919     return &(glst[i]);
920 gpertea 2 }
921    
922 gpertea 195 GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
923     GVec<GfoHolder>** glst, char strand, uint start, uint end) {
924 gpertea 151 GVec<GfoHolder>* gl=phash.Find(id);
925     GfoHolder* gh=NULL;
926     if (gl) {
927     for (int i=0;i<gl->Count();i++) {
928     GfoHolder& gfo = gl->Get(i);
929 gpertea 195 if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
930 gpertea 151 continue;
931 gpertea 196 if (strand && gfo.gffobj->strand!='.' && strand != gfo.gffobj->strand)
932 gpertea 151 continue;
933 gpertea 195 if (start>0) {
934 gpertea 253 if (abs((int)start-(int)gfo.gffobj->start)> (int)GFF_MAX_LOCUS)
935 gpertea 195 continue;
936     if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
937     continue;
938     }
939 gpertea 151 //must be the same transcript, according to given comparison criteria
940     gh=&gfo;
941     break;
942     }
943     }
944     if (glst) *glst=gl;
945     return gh;
946     }
947    
948 gpertea 16 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
949     GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
950     GfoHolder* r=NULL;
951     if (replaceidx>=0) {
952     gflst.Put(replaceidx,newgfo);
953 gpertea 151 r=gfoAdd(newgfo, replaceidx);
954 gpertea 16 }
955     else {
956     int gfoidx=gflst.Add(newgfo);
957 gpertea 151 r=gfoAdd(newgfo, gfoidx);
958 gpertea 16 }
959 gpertea 153 /*
960 gpertea 16 if (gff_warns) {
961     int* pcount=tids.Find(newgfo->gffID);
962     if (pcount!=NULL) {
963 gpertea 151 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
964 gpertea 16 (*pcount)++;
965     }
966     else {
967     tids.Add(newgfo->gffID,new int(1));
968     }
969     }
970 gpertea 153 */
971 gpertea 16 return r;
972     }
973 gpertea 2
974 gpertea 16 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
975     //assert(parent);
976     //assert(newgfo);
977     parent->children.Add(newgfh->gffobj);
978     if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
979     newgfh->gffobj->setLevel(parent->getLevel()+1);
980     if (parent->isGene()) {
981     if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
982     newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
983     if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
984     newgfh->gffobj->geneID=Gstrdup(parent->geneID);
985     }
986    
987     return newgfh;
988 gpertea 2 }
989    
990 gpertea 16 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
991 gpertea 151 GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) {
992 gpertea 16 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
993     GfoHolder* r=NULL;
994     int gfoidx=gflst.Add(newgfo);
995 gpertea 195 r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
996 gpertea 16 if (parent!=NULL) {
997     updateParent(r, parent);
998     if (pexon!=NULL) parent->removeExon(pexon);
999     }
1000 gpertea 153 /*
1001 gpertea 16 if (gff_warns) {
1002     int* pcount=tids.Find(newgfo->gffID);
1003     if (pcount!=NULL) {
1004     if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
1005     (*pcount)++;
1006     }
1007     else {
1008     tids.Add(newgfo->gffID,new int(1));
1009     }
1010     }
1011 gpertea 153 */
1012 gpertea 16 return r;
1013     }
1014 gpertea 2
1015 gpertea 16 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1016     bool keepAttr) {
1017     if (prevgfo==NULL) return NULL;
1018 gpertea 153 //prevgfo->gffobj->createdByExon(false);
1019 gpertea 16 prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
1020     prevgfo->gffobj->start=gffline->fstart;
1021     prevgfo->gffobj->end=gffline->fend;
1022     prevgfo->gffobj->isGene(gffline->is_gene);
1023     prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
1024 gpertea 153 prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
1025 gpertea 16 if (keepAttr) {
1026     if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
1027     prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
1028     }
1029     return prevgfo;
1030     }
1031    
1032    
1033     bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1034     bool r=true;
1035     if (gffline->strand!=prevgfo->gffobj->strand) {
1036 gpertea 195 //TODO: add support for trans-splicing and even inter-chromosomal fusions
1037     if (prevgfo->gffobj->strand=='.') {
1038 gpertea 115 prevgfo->gffobj->strand=gffline->strand;
1039     }
1040     else {
1041     GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1042     prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
1043     gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
1044     //r=false;
1045     return true; //FIXME: split trans-spliced mRNAs by strand
1046     }
1047     }
1048 gpertea 16 int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
1049     ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
1050     0 );
1051     if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1052     GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
1053     //validation_errors = true;
1054     r=false;
1055     if (!gff_warns) exit(1);
1056     }
1057     int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
1058     if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1059     subfPoolAdd(pex, prevgfo);
1060     return r;
1061     }
1062    
1063     CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1064     CNonExon* subp=NULL;
1065     subp_name=NULL;
1066     for (int i=0;i<gffline->num_parents;i++) {
1067     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1068     continue;
1069     subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1070     subp=pex.Find(subp_name);
1071     if (subp!=NULL)
1072     return subp;
1073     GFREE(subp_name);
1074     }
1075     return NULL;
1076     }
1077    
1078     void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1079     //this might become a parent feature later
1080     if (newgfo->gffobj->exons.Count()>0) {
1081     char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1082     pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1083     newgfo->gffobj->exons[0], gffline));
1084     GFREE(xbuf);
1085     }
1086     }
1087    
1088     GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1089     bool keepAttr, bool noExonAttr) {
1090     GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1091     if (prevp!=gflst[subp->idx])
1092     GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1093     subp->gffline->discardParent();
1094     GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1095     pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1096     prevp->promotedChildren(true);
1097     return gfoh; //returns the holder of newly promoted feature
1098     }
1099    
1100     //have to parse the whole file because exons can be scattered all over
1101 gpertea 153 //trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
1102 gpertea 2 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1103 gpertea 16 bool validation_errors = false;
1104     //loc_debug=false;
1105     GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1106     //and thus could become promoted to parent features
1107 gpertea 2 while (nextGffLine()!=NULL) {
1108 gpertea 16 GfoHolder* prevseen=NULL;
1109 gpertea 195 GVec<GfoHolder>* prevgflst=NULL;
1110     if (gffline->ID && gffline->exontype==0) {
1111 gpertea 196 //>> for a parent-like IDed feature (mRNA, gene, etc.)
1112 gpertea 195 //look for same ID on the same chromosome/strand/locus
1113     prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
1114     if (prevseen!=NULL) {
1115     //same ID/chromosome combo encountered before
1116     if (prevseen->gffobj->createdByExon() &&
1117     prevseen->gffobj->start>=gffline->fstart &&
1118     prevseen->gffobj->end<=gffline->fend) {
1119     //an exon of this ID was given before
1120     //this line has the main attributes for this ID
1121     updateGffRec(prevseen, gffline, keepAttr);
1122     }
1123     else {
1124     //- duplicate ID -- this must be a discontiguous feature
1125     // e.g. a trans-spliced transcript
1126     if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
1127     //overlapping with same ID not allowed
1128     GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1129     //validation_errors = true;
1130     if (gff_warns) {
1131     delete gffline;
1132     gffline=NULL;
1133     continue;
1134     }
1135     else exit(1);
1136     }
1137     //create a new entry with the same ID
1138     prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1139     prevseen->gffobj->parent, NULL, prevgflst);
1140     } //duplicate ID on the same chromosome
1141     } //prevseeen != NULL
1142     } //parent-like ID feature
1143 gpertea 16 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1144 gpertea 195 if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
1145 gpertea 2 }
1146 gpertea 195 else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
1147 gpertea 196 //updates all the declared parents with this child
1148 gpertea 16 bool found_parent=false;
1149     GfoHolder* newgfo=prevseen;
1150 gpertea 151 GVec<GfoHolder>* newgflst=NULL;
1151 gpertea 16 for (int i=0;i<gffline->num_parents;i++) {
1152     if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1153     continue; //skipping discarded parent feature
1154 gpertea 196 GfoHolder* parentgfo=NULL;
1155 gpertea 197 if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript
1156     parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1157     &newgflst, gffline->strand, gffline->fstart, gffline->fend);
1158     }
1159     else {
1160     //for exon-like entities we only need a parent to be in locus distance,
1161     //on the same strand
1162     parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1163     &newgflst, gffline->strand, gffline->fstart);
1164     }
1165 gpertea 16 if (parentgfo!=NULL) { //parent GffObj parsed earlier
1166 gpertea 197 found_parent=true;
1167 gpertea 16 if (parentgfo->gffobj->isGene() && gffline->is_transcript
1168     && gffline->exontype==0) {
1169     //not an exon, but a transcript parented by a gene
1170     if (newgfo) {
1171     updateParent(newgfo, parentgfo->gffobj);
1172     }
1173     else {
1174     newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1175     }
1176     }
1177     else { //potential exon subfeature
1178     if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1179     validation_errors=true;
1180     }
1181 gpertea 195 } //overlapping parent feature found
1182 gpertea 16 } //for each parsed parent Id
1183     if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1184 gpertea 196 //or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
1185 gpertea 16 //check if this feature isn't parented by a previously stored "exon" subfeature
1186     char* subp_name=NULL;
1187     CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1188     if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1189     //promote that subfeature to a full GffObj
1190     GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1191     //add current gffline as an exon of the newly promoted subfeature
1192     if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1193     validation_errors=true;
1194     }
1195 gpertea 196 else { //no parent seen before,
1196 gpertea 16 //loc_debug=true;
1197 gpertea 195 GfoHolder* ngfo=prevseen;
1198 gpertea 196 if (ngfo==NULL) {
1199     //if it's an exon type, create directly the parent with this exon
1200     //but if it's recognized as a transcript, the object itself is created
1201 gpertea 195 ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1202 gpertea 196 }
1203     if (!ngfo->gffobj->isTranscript() &&
1204     gffline->ID!=NULL && gffline->exontype==0)
1205 gpertea 195 subfPoolAdd(pex, ngfo);
1206 gpertea 16 //even those with errors will be added here!
1207     }
1208     GFREE(subp_name);
1209     } //no previous parent found
1210     } //parented feature
1211     //--
1212     delete gffline;
1213     gffline=NULL;
1214     }//while gff lines
1215     gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1216 gpertea 286 gseqStats.setCount(gseqstats.Last()->gseqid+1);
1217     for (int gi=0;gi<gseqstats.Count();gi++) {
1218     gseqStats.Put(gseqstats[gi]->gseqid, gseqstats[gi]); //copy the pointer only
1219     }
1220 gpertea 2 // all gff records are now loaded in GList gflst
1221     // so we can free the hash
1222     phash.Clear();
1223 gpertea 153 //tids.Clear();
1224 gpertea 16 if (validation_errors) {
1225     exit(1);
1226     }
1227 gpertea 2 }
1228    
1229 gpertea 286 void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
1230     bool keepAttrs, bool noExonAttr) { //if set, enforce sort by locus
1231     if (mustSort) { //force (re-)sorting
1232     this->setSorted(false);
1233     this->setSorted((GCompareProc*)gfo_cmpByLoc);
1234     }
1235     GList<GffObj> discarded(false,true,false);
1236     for (int i=0;i<Count();i++) {
1237     //finish the parsing for each GffObj
1238     fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
1239     if (fList[i]->isDiscarded()) {
1240     discarded.Add(fList[i]);
1241     this->Forget(i);
1242     }
1243     }
1244     if (discarded.Count()>0) this->Pack();
1245     }
1246    
1247 gpertea 16 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1248     //merge
1249     //always merge adjacent or overlapping segments
1250     //but if mergeCloseExons then merge even when distance is up to 5 bases
1251 gpertea 2 udata=0;
1252     uptr=NULL;
1253 gpertea 16 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1254     isDiscarded(true);
1255     }
1256     if (ftype_id==gff_fid_transcript && CDstart>0) {
1257     ftype_id=gff_fid_mRNA;
1258     //exon_ftype_id=gff_fid_exon;
1259     }
1260 gpertea 197 if (exons.Count()>0 && (isTranscript() || exon_ftype_id==gff_fid_exon)) {
1261     if (mergeCloseExons) {
1262     int mindist=mergeCloseExons ? 5:1;
1263     for (int i=0;i<exons.Count()-1;i++) {
1264     int ni=i+1;
1265     uint mend=exons[i]->end;
1266     while (ni<exons.Count()) {
1267     int dist=(int)(exons[ni]->start-mend);
1268     if (dist>mindist) break; //no merging with next segment
1269     if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1270     GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1271     gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1272     }
1273     mend=exons[ni]->end;
1274     covlen-=exons[i]->len();
1275     exons[i]->end=mend;
1276     covlen+=exons[i]->len();
1277     covlen-=exons[ni]->len();
1278     if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1279     exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1280     //use the other exon attributes, if more
1281     delete(exons[i]->attrs);
1282     exons[i]->attrs=exons[ni]->attrs;
1283     exons[ni]->attrs=NULL;
1284     }
1285     exons.Delete(ni);
1286     } //check for merge with next exon
1287 gpertea 2 } //for each exon
1288 gpertea 197 } //merge close exons
1289     //shrink transcript to the exons' span
1290     this->start=exons.First()->start;
1291     this->end=exons.Last()->end;
1292 gpertea 286 //also update the stats for the reference sequence
1293     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
1294     GSeqStat* gsd=(GSeqStat*)uptr;
1295     if (start<gsd->mincoord) gsd->mincoord=start;
1296     if (end>gsd->maxcoord) gsd->maxcoord=end;
1297     if (this->len()>gsd->maxfeat_len) {
1298     gsd->maxfeat_len=this->len();
1299     gsd->maxfeat=this;
1300     }
1301     }
1302     this->uptr=NULL;
1303     this->udata=0;
1304 gpertea 197 }
1305 gpertea 16 //attribute reduction for GTF records
1306 gpertea 153 if (keepAttrs && !noExonAttr && !hasGffID()
1307 gpertea 16 && exons.Count()>0 && exons[0]->attrs!=NULL) {
1308     bool attrs_discarded=false;
1309     for (int a=0;a<exons[0]->attrs->Count();a++) {
1310     int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1311     char* attr_name=names->attrs.getName(attr_name_id);
1312     char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1313     bool sameExonAttr=true;
1314     for (int i=1;i<exons.Count();i++) {
1315     char* ov=exons[i]->getAttr(attr_name_id);
1316     if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1317     sameExonAttr=false;
1318     break;
1319     }
1320     }
1321     if (sameExonAttr) {
1322     //delete this attribute from exons level
1323     attrs_discarded=true;
1324     this->addAttr(attr_name, attr_val);
1325     for (int i=1;i<exons.Count();i++) {
1326     removeExonAttr(*(exons[i]), attr_name_id);
1327     }
1328     exons[0]->attrs->freeItem(a);
1329     }
1330     }
1331     if (attrs_discarded) exons[0]->attrs->Pack();
1332     }
1333 gpertea 2 return this;
1334     }
1335    
1336 gpertea 16 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1337 gpertea 2 if (names==NULL)
1338     GError(ERR_NULL_GFNAMES, "parseAttrs()");
1339     if (atrlist==NULL)
1340     atrlist=new GffAttrs();
1341     char* endinfo=info+strlen(info);
1342     char* start=info;
1343     char* pch=start;
1344     while (start<endinfo) {
1345     //skip spaces
1346     while (*start==' ' && start<endinfo) start++;
1347     pch=strchr(start, ';');
1348     if (pch==NULL) pch=endinfo;
1349     else {
1350     *pch='\0';
1351     pch++;
1352     }
1353     char* ech=strchr(start,'=');
1354     if (ech!=NULL) { // attr=value format found
1355     *ech='\0';
1356 gpertea 16 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1357     if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1358     strcmp(start, "exon_id")==0)
1359     { start=pch; continue; }
1360 gpertea 2 ech++;
1361     while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1362 gpertea 16 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1363     //make sure we don't add the same attribute more than once
1364     if (isExon && (strcmp(start, "protein_id")==0)) {
1365     //Ensembl special case
1366     this->addAttr(start, ech);
1367     start=pch;
1368     continue;
1369     }
1370     atrlist->add_or_update(names, start, ech);
1371 gpertea 2 }
1372     /*
1373     else { //not an attr=value format
1374     atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1375     }
1376     */
1377     start=pch;
1378     }
1379     if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1380     }
1381    
1382 gpertea 16 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1383 gpertea 2 if (this->attrs==NULL)
1384     this->attrs=new GffAttrs();
1385 gpertea 16 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1386     this->attrs->add_or_update(names, attrname, attrvalue);
1387 gpertea 2 }
1388    
1389 gpertea 16
1390     void GffObj::setFeatureName(const char* feature) {
1391     //change the feature name/type for a transcript
1392     int fid=names->feats.addName(feature);
1393     if (monoFeature() && exons.Count()>0)
1394     this->exon_ftype_id=fid;
1395     this->ftype_id=fid;
1396     }
1397    
1398     void GffObj::setRefName(const char* newname) {
1399     //change the feature name/type for a transcript
1400     int rid=names->gseqs.addName(newname);
1401     this->gseq_id=rid;
1402     }
1403    
1404    
1405    
1406     int GffObj::removeAttr(const char* attrname, const char* attrval) {
1407     if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1408     int aid=this->names->attrs.getId(attrname);
1409     if (aid<0) return 0;
1410     int delcount=0; //could be more than one ?
1411     for (int i=0;i<this->attrs->Count();i++) {
1412     if (aid==this->attrs->Get(i)->attr_id) {
1413     if (attrval==NULL ||
1414     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1415     delcount++;
1416     this->attrs->freeItem(i);
1417     }
1418     }
1419     }
1420     if (delcount>0) this->attrs->Pack();
1421     return delcount;
1422     }
1423    
1424     int GffObj::removeAttr(int aid, const char* attrval) {
1425     if (this->attrs==NULL || aid<0) return 0;
1426     int delcount=0; //could be more than one ?
1427     for (int i=0;i<this->attrs->Count();i++) {
1428     if (aid==this->attrs->Get(i)->attr_id) {
1429     if (attrval==NULL ||
1430     strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1431     delcount++;
1432     this->attrs->freeItem(i);
1433     }
1434     }
1435     }
1436     if (delcount>0) this->attrs->Pack();
1437     return delcount;
1438     }
1439    
1440    
1441     int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1442     if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1443     int aid=this->names->attrs.getId(attrname);
1444     if (aid<0) return 0;
1445     int delcount=0; //could be more than one
1446     for (int i=0;i<exon.attrs->Count();i++) {
1447     if (aid==exon.attrs->Get(i)->attr_id) {
1448     if (attrval==NULL ||
1449     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1450     delcount++;
1451     exon.attrs->freeItem(i);
1452     }
1453     }
1454     }
1455     if (delcount>0) exon.attrs->Pack();
1456     return delcount;
1457     }
1458    
1459     int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1460     if (exon.attrs==NULL || aid<0) return 0;
1461     int delcount=0; //could be more than one
1462     for (int i=0;i<exon.attrs->Count();i++) {
1463     if (aid==exon.attrs->Get(i)->attr_id) {
1464     if (attrval==NULL ||
1465     strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1466     delcount++;
1467     exon.attrs->freeItem(i);
1468     }
1469     }
1470     }
1471     if (delcount>0) exon.attrs->Pack();
1472     return delcount;
1473     }
1474    
1475    
1476 gpertea 2 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1477     cds_start=0;
1478     cds_end=0;
1479     if (CDstart==0 || CDend==0) return; //no CDS info
1480     int cdsadj=0;
1481     if (CDphase=='1' || CDphase=='2') {
1482     cdsadj=CDphase-'0';
1483     }
1484     cds_start=CDstart;
1485     cds_end=CDend;
1486     if (strand=='-') cds_end-=cdsadj;
1487     else cds_start+=cdsadj;
1488     }
1489    
1490     void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1491     //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1492     cds_mstart=0;
1493     cds_mend=0;
1494     if (CDstart==0 || CDend==0) return; //no CDS info
1495     //restore normal coordinates, just in case
1496     unxcoord();
1497     int cdsadj=0;
1498     if (CDphase=='1' || CDphase=='2') {
1499     cdsadj=CDphase-'0';
1500     }
1501     /*
1502     uint seqstart=CDstart;
1503     uint seqend=CDend;
1504     */
1505     uint seqstart=exons.First()->start;
1506     uint seqend=exons.Last()->end;
1507     int s=0; //resulting nucleotide counter
1508     if (strand=='-') {
1509     for (int x=exons.Count()-1;x>=0;x--) {
1510     uint sgstart=exons[x]->start;
1511     uint sgend=exons[x]->end;
1512     if (seqend<sgstart || seqstart>sgend) continue;
1513     if (seqstart>=sgstart && seqstart<=sgend)
1514     sgstart=seqstart; //seqstart within this segment
1515     if (seqend>=sgstart && seqend<=sgend)
1516     sgend=seqend; //seqend within this segment
1517     s+=(int)(sgend-sgstart)+1;
1518     if (CDstart>=sgstart && CDstart<=sgend) {
1519     //CDstart in this segment
1520     //and we are getting the whole transcript
1521     cds_mend=s-(int)(CDstart-sgstart);
1522     }
1523     if (CDend>=sgstart && CDend<=sgend) {
1524     //CDstart in this segment
1525     //and we are getting the whole transcript
1526     cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1527     }
1528     } //for each exon
1529     } // - strand
1530     else { // + strand
1531     for (int x=0;x<exons.Count();x++) {
1532     uint sgstart=exons[x]->start;
1533     uint sgend=exons[x]->end;
1534     if (seqend<sgstart || seqstart>sgend) continue;
1535     if (seqstart>=sgstart && seqstart<=sgend)
1536     sgstart=seqstart; //seqstart within this segment
1537     if (seqend>=sgstart && seqend<=sgend)
1538     sgend=seqend; //seqend within this segment
1539     s+=(int)(sgend-sgstart)+1;
1540     /* for (uint i=sgstart;i<=sgend;i++) {
1541     spliced[s]=gsubseq[i-gstart];
1542     s++;
1543     }//for each nt
1544     */
1545     if (CDstart>=sgstart && CDstart<=sgend) {
1546     //CDstart in this segment
1547     cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1548     }
1549     if (CDend>=sgstart && CDend<=sgend) {
1550     //CDend in this segment
1551     cds_mend=s-(int)(sgend-CDend);
1552     }
1553     } //for each exon
1554     } // + strand
1555     //spliced[s]=0;
1556     //if (rlen!=NULL) *rlen=s;
1557     //return spliced;
1558     }
1559    
1560 gpertea 16 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1561     {
1562     if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1563     return NULL;
1564     }
1565     //restore normal coordinates:
1566     unxcoord();
1567     if (exons.Count()==0) return NULL;
1568     int fspan=end-start+1;
1569     const char* gsubseq=faseq->subseq(start, fspan);
1570     if (gsubseq==NULL) {
1571     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1572     }
1573     char* unspliced=NULL;
1574    
1575     int seqstart=exons.First()->start;
1576     int seqend=exons.Last()->end;
1577    
1578     int unsplicedlen = 0;
1579    
1580     unsplicedlen += seqend - seqstart + 1;
1581    
1582     GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1583     //uint seqstart, seqend;
1584    
1585     int s = 0; //resulting nucleotide counter
1586     if (strand=='-')
1587     {
1588     if (seglst!=NULL)
1589     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1590     for (int i=seqend;i>=seqstart;i--)
1591     {
1592     unspliced[s] = ntComplement(gsubseq[i-start]);
1593     s++;
1594     }//for each nt
1595     } // - strand
1596     else
1597     { // + strand
1598     if (seglst!=NULL)
1599     seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1600     for (int i=seqstart;i<=seqend;i++)
1601     {
1602     unspliced[s]=gsubseq[i-start];
1603     s++;
1604     }//for each nt
1605     } // + strand
1606     //assert(s <= unsplicedlen);
1607     unspliced[s]=0;
1608     if (rlen!=NULL) *rlen=s;
1609     return unspliced;
1610     }
1611    
1612 gpertea 2 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1613     GList<GSeg>* seglst) {
1614     if (CDSonly && CDstart==0) return NULL;
1615     if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1616     return NULL;
1617     }
1618     //restore normal coordinates:
1619     unxcoord();
1620     if (exons.Count()==0) return NULL;
1621     int fspan=end-start+1;
1622     const char* gsubseq=faseq->subseq(start, fspan);
1623     if (gsubseq==NULL) {
1624     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1625     }
1626 gpertea 16 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1627     int endadj=end-start+1-fspan;
1628     uint prevend=end;
1629     end-=endadj;
1630     if (CDend>end) CDend=end;
1631     if (exons.Last()->end>end) {
1632     exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1633     if (exons.Last()->start>exons.Last()->end) {
1634     GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1635     prevend,getGSeqName(), getID());
1636     }
1637     covlen-=endadj;
1638     }
1639     }
1640 gpertea 2 char* spliced=NULL;
1641     GMALLOC(spliced, covlen+1); //allocate more here
1642     uint seqstart, seqend;
1643     int cdsadj=0;
1644     if (CDphase=='1' || CDphase=='2') {
1645     cdsadj=CDphase-'0';
1646     }
1647     if (CDSonly) {
1648     seqstart=CDstart;
1649     seqend=CDend;
1650     if (strand=='-') seqend-=cdsadj;
1651     else seqstart+=cdsadj;
1652     }
1653     else {
1654     seqstart=exons.First()->start;
1655     seqend=exons.Last()->end;
1656     }
1657     int s=0; //resulting nucleotide counter
1658     if (strand=='-') {
1659     for (int x=exons.Count()-1;x>=0;x--) {
1660     uint sgstart=exons[x]->start;
1661     uint sgend=exons[x]->end;
1662     if (seqend<sgstart || seqstart>sgend) continue;
1663     if (seqstart>=sgstart && seqstart<=sgend)
1664     sgstart=seqstart; //seqstart within this segment
1665     if (seqend>=sgstart && seqend<=sgend)
1666     sgend=seqend; //seqend within this segment
1667     if (seglst!=NULL)
1668     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1669     for (uint i=sgend;i>=sgstart;i--) {
1670     spliced[s] = ntComplement(gsubseq[i-start]);
1671     s++;
1672     }//for each nt
1673    
1674     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1675     if (CDstart>=sgstart && CDstart<=sgend) {
1676     //CDstart in this segment
1677     //and we are getting the whole transcript
1678     *cds_end=s-(CDstart-sgstart);
1679     }
1680     if (CDend>=sgstart && CDend<=sgend) {
1681     //CDstart in this segment
1682     //and we are getting the whole transcript
1683     *cds_start=s-(CDend-cdsadj-sgstart);
1684     }
1685     }//update local CDS coordinates
1686     } //for each exon
1687     } // - strand
1688     else { // + strand
1689     for (int x=0;x<exons.Count();x++) {
1690     uint sgstart=exons[x]->start;
1691     uint sgend=exons[x]->end;
1692     if (seqend<sgstart || seqstart>sgend) continue;
1693     if (seqstart>=sgstart && seqstart<=sgend)
1694     sgstart=seqstart; //seqstart within this segment
1695     if (seqend>=sgstart && seqend<=sgend)
1696     sgend=seqend; //seqend within this segment
1697     if (seglst!=NULL)
1698     seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1699     for (uint i=sgstart;i<=sgend;i++) {
1700     spliced[s]=gsubseq[i-start];
1701     s++;
1702     }//for each nt
1703     if (!CDSonly && cds_start!=NULL && CDstart>0) {
1704     if (CDstart>=sgstart && CDstart<=sgend) {
1705     //CDstart in this segment
1706     //and we are getting the whole transcript
1707     *cds_start=s-(sgend-CDstart-cdsadj);
1708     }
1709     if (CDend>=sgstart && CDend<=sgend) {
1710     //CDstart in this segment
1711     //and we are getting the whole transcript
1712     *cds_end=s-(sgend-CDend);
1713     }
1714     }//update local CDS coordinates
1715     } //for each exon
1716     } // + strand
1717     spliced[s]=0;
1718     if (rlen!=NULL) *rlen=s;
1719     return spliced;
1720     }
1721    
1722     char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1723     if (CDSonly && CDstart==0) return NULL;
1724     //restore normal coordinates:
1725     unxcoord();
1726     if (exons.Count()==0) return NULL;
1727     int fspan=end-start+1;
1728     const char* gsubseq=faseq->subseq(start, fspan);
1729     if (gsubseq==NULL) {
1730     GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1731     }
1732    
1733     char* translation=NULL;
1734     GMALLOC(translation, (int)(covlen/3)+1);
1735     uint seqstart, seqend;
1736     int cdsadj=0;
1737     if (CDphase=='1' || CDphase=='2') {
1738     cdsadj=CDphase-'0';
1739     }
1740     if (CDSonly) {
1741     seqstart=CDstart;
1742     seqend=CDend;
1743     if (strand=='-') seqend-=cdsadj;
1744     else seqstart+=cdsadj;
1745     }
1746     else {
1747     seqstart=exons.First()->start;
1748     seqend=exons.Last()->end;
1749     }
1750     Codon codon;
1751     int nt=0; //codon nucleotide counter (0..2)
1752     int aa=0; //aminoacid count
1753     if (strand=='-') {
1754     for (int x=exons.Count()-1;x>=0;x--) {
1755     uint sgstart=exons[x]->start;
1756     uint sgend=exons[x]->end;
1757     if (seqend<sgstart || seqstart>sgend) continue;
1758     if (seqstart>=sgstart && seqstart<=sgend)
1759     sgstart=seqstart; //seqstart within this segment
1760     if (seqend>=sgstart && seqend<=sgend) {
1761     sgend=seqend; //seqend within this segment
1762     }
1763     for (uint i=sgend;i>=sgstart;i--) {
1764     codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1765     nt++;
1766     if (nt==3) {
1767     nt=0;
1768     translation[aa]=codon.translate();
1769     aa++;
1770     }
1771     }//for each nt
1772     } //for each exon
1773     } // - strand
1774     else { // + strand
1775     for (int x=0;x<exons.Count();x++) {
1776     uint sgstart=exons[x]->start;
1777     uint sgend=exons[x]->end;
1778     if (seqend<sgstart || seqstart>sgend) continue;
1779     if (seqstart>=sgstart && seqstart<=sgend)
1780     sgstart=seqstart; //seqstart within this segment
1781     if (seqend>=sgstart && seqend<=sgend)
1782     sgend=seqend; //seqend within this segment
1783     for (uint i=sgstart;i<=sgend;i++) {
1784     codon.nuc[nt]=gsubseq[i-start];
1785     nt++;
1786     if (nt==3) {
1787     nt=0;
1788     translation[aa]=codon.translate();
1789     aa++;
1790     }
1791     }//for each nt
1792     } //for each exon
1793     } // + strand
1794     translation[aa]=0;
1795     if (rlen!=NULL) *rlen=aa;
1796     return translation;
1797     }
1798    
1799     void GffObj::printSummary(FILE* fout) {
1800     if (fout==NULL) fout=stdout;
1801     fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1802     strand, start, end, gscore, (float)qcov/10.0);
1803     }
1804    
1805 gpertea 16 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1806 gpertea 2 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1807     static char scorestr[14];
1808     strcpy(scorestr,".");
1809     GffAttrs* xattrs=NULL;
1810     if (exidx>=0) {
1811     if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1812     xattrs=exons[exidx]->attrs;
1813     }
1814     if (phase==0 || !iscds) phase='.';
1815     const char* ftype=iscds ? "CDS" : getSubfName();
1816     if (gff3) {
1817     fprintf(fout,
1818     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1819     gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1820     phase, gffID);
1821     if (xattrs!=NULL) {
1822     for (int i=0;i<xattrs->Count();i++)
1823     fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1824     xattrs->Get(i)->attr_val);
1825     }
1826     fprintf(fout, "\n");
1827     } //GFF
1828 gpertea 16 else {//for GTF -- we print only transcripts
1829     //if (isValidTranscript())
1830     fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1831     gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1832     //char* geneid=(geneID!=NULL)? geneID : gffID;
1833     if (geneID)
1834     fprintf(fout," gene_id \"%s\";",geneID);
1835     if (gene_name!=NULL) {
1836     //fprintf(fout, " gene_name ");
1837     //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1838     // else fprintf(fout, "\"%s\";",gene_name);
1839     fprintf(fout," gene_name \"%s\";",gene_name);
1840     }
1841 gpertea 2 if (xattrs!=NULL) {
1842 gpertea 16 for (int i=0;i<xattrs->Count();i++) {
1843     if (xattrs->Get(i)->attr_val==NULL) continue;
1844     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1845     fprintf(fout, " %s ",attrname);
1846     if (xattrs->Get(i)->attr_val[0]=='"')
1847     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1848     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1849     }
1850 gpertea 2 }
1851 gpertea 16 //for GTF, also append the GffObj attributes to each exon line
1852     if ((xattrs=this->attrs)!=NULL) {
1853     for (int i=0;i<xattrs->Count();i++) {
1854     if (xattrs->Get(i)->attr_val==NULL) continue;
1855     const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1856     fprintf(fout, " %s ",attrname);
1857     if (xattrs->Get(i)->attr_val[0]=='"')
1858     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1859     else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1860     }
1861     }
1862 gpertea 2 fprintf(fout, "\n");
1863     }//GTF
1864     }
1865    
1866 gpertea 16 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1867     const char* tlabel, const char* gfparent) {
1868 gpertea 2 static char tmpstr[255];
1869     if (tlabel==NULL) {
1870     tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1871     (char*)"gffobj" ;
1872     }
1873     unxcoord();
1874 gpertea 16 //if (exons.Count()==0) return;
1875     const char* gseqname=names->gseqs.Get(gseq_id)->name;
1876 gpertea 2 bool gff3 = (gffp>=pgffAny);
1877     bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1878     bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1879     if (gff3) {
1880     //print GFF3 mRNA line:
1881     if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1882     else strcpy(tmpstr,".");
1883     uint pstart, pend;
1884     if (gffp==pgffCDS) {
1885     pstart=CDstart;
1886     pend=CDend;
1887     }
1888     else { pstart=start;pend=end; }
1889 gpertea 16 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1890     const char* ftype=getFeatureName();
1891 gpertea 2 fprintf(fout,
1892     "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1893     gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1894 gpertea 16 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1895     if (gfparent!=NULL) {
1896     //parent override
1897     fprintf(fout, ";Parent=%s",gfparent);
1898     }
1899     else {
1900     if (parent!=NULL && !parent->isDiscarded())
1901     fprintf(fout, ";Parent=%s",parent->getID());
1902     }
1903     if (geneID!=NULL)
1904     fprintf(fout, ";geneID=%s",geneID);
1905     if (gene_name!=NULL)
1906     fprintf(fout, ";gene_name=%s",gene_name);
1907 gpertea 2 if (attrs!=NULL) {
1908     for (int i=0;i<attrs->Count();i++) {
1909 gpertea 16 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1910     fprintf(fout,";%s=%s", attrname,
1911 gpertea 2 attrs->Get(i)->attr_val);
1912     }
1913     }
1914     fprintf(fout,"\n");
1915     }// gff3 mRNA line
1916     if (showExon) {
1917     //print exons
1918 gpertea 16 if (isCDS && exons.Count()>0 &&
1919     ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1920     updateExonPhase();
1921    
1922     for (int i=0;i<exons.Count();i++) {
1923     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1924     }
1925     }//printing exons
1926 gpertea 2 if (showCDS && !isCDS && CDstart>0) {
1927     GArray<GffCDSeg> cds(true,true);
1928     getCDSegs(cds);
1929     for (int i=0;i<cds.Count();i++) {
1930     printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1931     }
1932     } //showCDS
1933     }
1934    
1935 gpertea 16 void GffObj::updateExonPhase() {
1936     if (!isCDS) return;
1937     int cdsacc=0;
1938     if (CDphase=='1' || CDphase=='2') {
1939     cdsacc+= 3-(CDphase-'0');
1940     }
1941     if (strand=='-') { //reverse strand
1942     for (int i=exons.Count()-1;i>=0;i--) {
1943     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1944     cdsacc+=exons[i]->end-exons[i]->start+1;
1945     }
1946     }
1947     else { //forward strand
1948     for (int i=0;i<exons.Count();i++) {
1949     exons[i]->phase='0'+ (3-cdsacc%3)%3;
1950     cdsacc+=exons[i]->end-exons[i]->start+1;
1951     }
1952     }
1953     }
1954 gpertea 2
1955 gpertea 16
1956 gpertea 2 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1957     GffCDSeg cdseg;
1958     int cdsacc=0;
1959     if (CDphase=='1' || CDphase=='2') {
1960     cdsacc+= 3-(CDphase-'0');
1961     }
1962     if (strand=='-') {
1963     for (int x=exons.Count()-1;x>=0;x--) {
1964     uint sgstart=exons[x]->start;
1965     uint sgend=exons[x]->end;
1966     if (CDend<sgstart || CDstart>sgend) continue;
1967     if (CDstart>=sgstart && CDstart<=sgend)
1968     sgstart=CDstart; //cdstart within this segment
1969     if (CDend>=sgstart && CDend<=sgend)
1970     sgend=CDend; //cdend within this segment
1971     cdseg.start=sgstart;
1972     cdseg.end=sgend;
1973     cdseg.exonidx=x;
1974     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1975     cdseg.phase='0'+ (3-cdsacc%3)%3;
1976     cdsacc+=sgend-sgstart+1;
1977     cds.Add(cdseg);
1978     } //for each exon
1979     } // - strand
1980     else { // + strand
1981     for (int x=0;x<exons.Count();x++) {
1982     uint sgstart=exons[x]->start;
1983     uint sgend=exons[x]->end;
1984     if (CDend<sgstart || CDstart>sgend) continue;
1985     if (CDstart>=sgstart && CDstart<=sgend)
1986     sgstart=CDstart; //seqstart within this segment
1987     if (CDend>=sgstart && CDend<=sgend)
1988     sgend=CDend; //seqend within this segment
1989     cdseg.start=sgstart;
1990     cdseg.end=sgend;
1991     cdseg.exonidx=x;
1992     //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1993     cdseg.phase='0' + (3-cdsacc%3)%3 ;
1994     cdsacc+=sgend-sgstart+1;
1995     cds.Add(cdseg);
1996     } //for each exon
1997     } // + strand
1998     }