ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
Revision: 286
Committed: Mon Oct 15 17:31:47 2012 UTC (6 years, 11 months ago) by gpertea
File size: 70631 byte(s)
Log Message:
slight mods for rejoin use

Line File contents
1 #include "gff.h"
2
3 //GffNames* GffReader::names=NULL;
4 GffNames* GffObj::names=NULL;
5 //global set of feature names, attribute names etc.
6 // -- common for all GffObjs in current application!
7
8 const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K
10 const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11 bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12 const int gff_fid_mRNA=0;
13 const int gff_fid_transcript=1;
14 const int gff_fid_exon=2;
15 const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 const uint gfo_flag_HAS_ERRORS = 0x00000001;
17 const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 const uint gfo_flag_IS_GENE = 0x00000004;
19 const uint gfo_flag_IS_TRANSCRIPT = 0x00000008;
20 const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID
21 const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly
22 const uint gfo_flag_DISCARDED = 0x00000100;
23 const uint gfo_flag_LST_KEEP = 0x00000200;
24 const uint gfo_flag_LEVEL_MSK = 0x00FF0000;
25 const byte gfo_flagShift_LEVEL = 16;
26
27 void gffnames_ref(GffNames* &n) {
28 if (n==NULL) n=new GffNames();
29 n->numrefs++;
30 }
31
32 void gffnames_unref(GffNames* &n) {
33 if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n");
34 n->numrefs--;
35 if (n->numrefs==0) { delete n; n=NULL; }
36 }
37
38 int gfo_cmpByLoc(const pointer p1, const pointer p2) {
39
40 GffObj& g1=*((GffObj*)p1);
41 GffObj& g2=*((GffObj*)p2);
42 if (g1.gseq_id==g2.gseq_id) {
43 if (g1.start!=g2.start)
44 return (int)(g1.start-g2.start);
45 else if (g1.getLevel()!=g2.getLevel())
46 return (int)(g1.getLevel()-g2.getLevel());
47 else
48 if (g1.end!=g2.end)
49 return (int)(g1.end-g2.end);
50 else return strcmp(g1.getID(), g2.getID());
51 }
52 else return (int)(g1.gseq_id-g2.gseq_id);
53 }
54
55 char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
56 //parse a key attribute and remove it from the info string
57 //(only works for attributes that have values following them after ' ' or '=')
58 static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 int attrlen=strlen(attr);
60 char cend=attr[attrlen-1];
61 //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
62 //must make sure attr is not found in quoted text
63 char* pos=info;
64 char prevch=0;
65 bool in_str=false;
66 bool notfound=true;
67 int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
68 while (notfound && *pos) {
69 char ch=*pos;
70 if (ch=='"') {
71 in_str=!in_str;
72 pos++;
73 prevch=ch;
74 continue;
75 }
76 if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
77 && strcmpfn(attr, pos, attrlen)==0) {
78 //attr match found
79 //check for word boundary on right
80 char* epos=pos+attrlen;
81 if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
82 notfound=false;
83 break;
84 }
85 //not a perfect match, move on
86 pos=epos;
87 prevch=*(pos-1);
88 continue;
89 }
90 //not a match or in_str
91 prevch=ch;
92 pos++;
93 }
94 if (notfound) return NULL;
95 char* vp=pos+attrlen;
96 while (*vp==' ') vp++;
97 if (*vp==';' || *vp==0)
98 GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
99 bool dq_enclosed=false; //value string enclosed by double quotes
100 if (*vp=='"') {
101 dq_enclosed=true;
102 vp++;
103 }
104 if (enforce_GTF2 && !dq_enclosed)
105 GError(GTF2_ERR,attr, dupline);
106 char* vend=vp;
107 if (dq_enclosed) {
108 while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
109 }
110 else {
111 while (*vend!=';' && *vend!=0) vend++;
112 }
113 if (enforce_GTF2 && *vend!='"')
114 GError(GTF2_ERR, attr, dupline);
115 char *r=Gstrdup(vp, vend-1);
116 //-- now remove this attribute from the info string
117 while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
118 if (*vend==0) vend--;
119 for (char *src=vend, *dest=pos;;src++,dest++) {
120 *dest=*src;
121 if (*src==0) break;
122 }
123 return r;
124 }
125
126 static char fnamelc[128];
127
128 GffLine::GffLine(GffReader* reader, const char* l) {
129 llen=strlen(l);
130 GMALLOC(line,llen+1);
131 memcpy(line, l, llen+1);
132 GMALLOC(dupline, llen+1);
133 memcpy(dupline, l, llen+1);
134 skip=true;
135 gseqname=NULL;
136 track=NULL;
137 ftype=NULL;
138 info=NULL;
139 _parents=NULL;
140 _parents_len=0;
141 num_parents=0;
142 parents=NULL;
143 is_gff3=false;
144 is_cds=false;
145 is_transcript=false;
146 is_exon=false;
147 is_gene=false;
148 exontype=0;
149 gene_id=NULL;
150 gene_name=NULL;
151 qstart=0;
152 qend=0;
153 qlen=0;
154 ID=NULL;
155 char* t[9];
156 int i=0;
157 int tidx=1;
158 t[0]=line;
159
160 while (line[i]!=0) {
161 if (line[i]=='\t') {
162 line[i]=0;
163 t[tidx]=line+i+1;
164 tidx++;
165 if (tidx>8) break;
166 }
167 i++;
168 }
169
170 if (tidx<8) { // ignore non-GFF lines
171 // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l);
172 return;
173 }
174 gseqname=t[0];
175 track=t[1];
176 ftype=t[2];
177 info=t[8];
178 char* p=t[3];
179 if (!parseUInt(p,fstart)) {
180 //FIXME: chromosome_band entries in Flybase
181 GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
182 return;
183 }
184 p=t[4];
185 if (!parseUInt(p,fend)) {
186 GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
187 return;
188 }
189 if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always
190 p=t[5];
191 if (p[0]=='.' && p[1]==0) {
192 score=0;
193 }
194 else {
195 if (!parseDouble(p,score))
196 GError("Error parsing feature score from GFF line:\n%s\n",l);
197 }
198 strand=*t[6];
199 if (strand!='+' && strand!='-' && strand!='.')
200 GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
201 phase=*t[7]; // must be '.', '0', '1' or '2'
202 ID=NULL;
203 // exon/CDS/mrna filter
204 strncpy(fnamelc, ftype, 127);
205 fnamelc[127]=0;
206 strlower(fnamelc); //convert to lower case
207 bool is_t_data=false;
208 if (strstr(fnamelc, "utr")!=NULL) {
209 exontype=exgffUTR;
210 is_exon=true;
211 is_t_data=true;
212 }
213 else if (endsWith(fnamelc, "exon")) {
214 exontype=exgffExon;
215 is_exon=true;
216 is_t_data=true;
217 }
218 else if (strstr(fnamelc, "stop") &&
219 (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
220 exontype=exgffStop;
221 is_cds=true; //though some place it outside the last CDS segment
222 is_t_data=true;
223 }
224 else if (strstr(fnamelc, "start") &&
225 ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
226 exontype=exgffStart;
227 is_cds=true;
228 is_t_data=true;
229 }
230 else if (strcmp(fnamelc, "cds")==0) {
231 exontype=exgffCDS;
232 is_cds=true;
233 is_t_data=true;
234 }
235 else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
236 is_gene=true;
237 is_t_data=true; //because its name will be attached to parented transcripts
238 }
239 else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
240 is_transcript=true;
241 is_t_data=true;
242 }
243
244 if (reader->transcriptsOnly && !is_t_data) {
245 char* id=extractAttr("ID=");
246 if (id==NULL) id=extractAttr("transcript_id");
247 //GMessage("Discarding non-transcript line:\n%s\n",l);
248 if (id!=NULL) {
249 reader->discarded_ids.Add(id, new int(1));
250 GFREE(id);
251 }
252 return; //skip this line, unwanted feature name
253 }
254 ID=extractAttr("ID=",true);
255 char* Parent=extractAttr("Parent=",true);
256 is_gff3=(ID!=NULL || Parent!=NULL);
257 if (is_gff3) {
258 //parse as GFF3
259 if (ID!=NULL) {
260 //has ID attr so it's likely to be a parent feature
261 //look for explicit gene name
262 gene_name=extractAttr("gene_name=");
263 if (gene_name==NULL) {
264 gene_name=extractAttr("geneName=");
265 if (gene_name==NULL) {
266 gene_name=extractAttr("gene_sym=");
267 if (gene_name==NULL) {
268 gene_name=extractAttr("gene=");
269 }
270 }
271 }
272 gene_id=extractAttr("geneID=");
273 if (gene_id==NULL) {
274 gene_id=extractAttr("gene_id=");
275 }
276 if (is_gene) {
277 //special case: keep the Name and ID attributes of the gene feature
278 if (gene_name==NULL)
279 gene_name=extractAttr("Name=");
280 if (gene_id==NULL) //the ID is also gene_id in this case
281 gene_id=Gstrdup(ID);
282 //skip=false;
283 //return;
284 GFREE(Parent); //TMI, we really don't care about gene Parents?
285 } //gene feature
286 }// has GFF3 ID
287 if (Parent!=NULL) {
288 //keep Parent attr
289 //parse multiple parents
290 num_parents=1;
291 p=Parent;
292 int last_delim_pos=-1;
293 while (*p!=';' && *p!=0) {
294 if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
295 num_parents++;
296 last_delim_pos=(p-Parent);
297 }
298 p++;
299 }
300 _parents_len=p-Parent+1;
301 _parents=Parent;
302 GMALLOC(parents, num_parents*sizeof(char*));
303 parents[0]=_parents;
304 int i=1;
305 if (last_delim_pos>0) {
306 for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
307 if (*p==',') {
308 char* ep=p-1;
309 while (*ep==' ' && ep>_parents) ep--;
310 *(ep+1)=0; //end the string there
311 parents[i]=p+1;
312 i++;
313 }
314 }
315 }
316 } //has Parent field
317 } //GFF3
318 else { // GTF-like expected
319 Parent=extractAttr("transcript_id",true);
320 if (Parent!=NULL) { //GTF2 format detected
321 if (is_transcript) {
322 // atypical GTF with a parent transcript line declared
323 ID=Parent;
324 Parent=NULL;
325 }
326 gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
327 if (gene_id==NULL)
328 gene_id=extractAttr("geneid");
329 gene_name=extractAttr("gene_name");
330 if (gene_name==NULL) {
331
332 gene_name=extractAttr("gene_sym");
333 if (gene_name==NULL) {
334 gene_name=extractAttr("gene");
335 if (gene_name==NULL)
336 gene_name=extractAttr("genesymbol");
337 }
338 }
339 //prepare for parseAttr by adding '=' character instead of spaces for all attributes
340 //after the attribute name
341 p=info;
342 bool noed=true; //not edited after the last delim
343 bool nsp=false; //non-space found after last delim
344 while (*p!=0) {
345 if (*p==' ') {
346 if (nsp && noed) {
347 *p='=';
348 noed=false;
349 p++;
350 continue;
351 }
352 }
353 else nsp=true; //non-space
354 if (*p==';') { noed=true; nsp=false; }
355 p++;
356 }
357 } //GTF2 detected (no parent line)
358 else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
359 //char* fexon=strstr(fnamelc, "exon");
360 //if (fexon!=NULL) {
361 if (exontype==exgffExon) {
362 if (startsWith(track,"jigsaw")) {
363 is_cds=true;
364 strcpy(track,"jigsaw");
365 p=strchr(info,';');
366 if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
367 else { Parent=Gstrdup(info,p-1);
368 info=p+1;
369 }
370 }
371 } //exon feature?
372 if (Parent==NULL && exontype>=exgffCDS &&
373 (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
374 //one word ID ? really desperate attempt to parse it here
375 Parent=Gstrdup(info,info+i-1);
376 info=NULL; //discard anything else on the line
377 }
378 }
379 if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
380 _parents=Parent;
381 GMALLOC(parents,sizeof(char*));
382 num_parents=1;
383 parents[0]=_parents;
384 }
385 } //GTF-like
386
387 //parse other potentially useful features
388 if (is_gff3) {
389 if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
390 p+=7;
391 while (*p!=';' && *p!=0 && *p!=' ') p++;
392 if (*p!=' ') {
393 GError("Error parsing target coordinates from GFF line:\n%s\n",l);
394 }
395 if (!parseUInt(p,qstart))
396 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
397 if (*p!=' ') {
398 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
399 }
400 p++;
401 if (!parseUInt(p,qend))
402 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
403 }
404 if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
405 p+=5;
406 if (!parseUInt(p,qstart))
407 GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
408 if (*p!='-') {
409 GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
410 }
411 p++;
412 if (!parseUInt(p,qend))
413 GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
414 if (*p=='|' || *p==':') {
415 p++;
416 if (!parseUInt(p,qlen))
417 GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
418 }
419 }//has Qreg attr
420 if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
421 p+=5;
422 if (!parseUInt(p,qlen))
423 GError("Error parsing target length from GFF Qlen:\n%s\n",l);
424 }
425 }//parsing some useful attributes in GFF3 records
426 if (ID==NULL && parents==NULL) {
427 if (reader->gff_warns)
428 GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
429 return; //skip
430 }
431 skip=false;
432 }
433
434
435 void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
436 if (cd_start>=this->start) {
437 this->CDstart=cd_start;
438 if (strand=='+') this->CDphase=phase;
439 }
440 else this->CDstart=this->start;
441 if (cd_end<=this->end) {
442 this->CDend=cd_end;
443 if (strand=='-') this->CDphase=phase;
444 }
445 else this->CDend=this->end;
446 isTranscript(true);
447 exon_ftype_id=gff_fid_exon;
448 if (monoFeature()) {
449 if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
450 else exons[0]->exontype=exgffExon;
451 }
452 }
453
454 int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
455 //this will make sure we have the right subftype_id!
456 //int subf_id=-1;
457 if (!isTranscript() && gl->is_cds) {
458 isTranscript(true);
459 exon_ftype_id=gff_fid_exon;
460 if (exons.Count()==1) exons[0]->exontype=exgffExon;
461 }
462 if (isTranscript()) {
463 if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
464 if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
465 else exon_ftype_id=names->feats.addName(gl->ftype);
466 }
467 //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
468 if (gl->exontype==0 && !gl->is_transcript) {
469 //extraneous mRNA feature, discard
470 if (reader->gff_warns)
471 GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
472 gl->ftype, gffID);
473 return -1;
474 }
475 }
476 else { //non-mRNA parent feature, check this subf type
477 int subf_id=names->feats.addName(gl->ftype);
478 if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
479 exon_ftype_id=subf_id;
480 else {
481 if (exon_ftype_id!=subf_id) {
482 //
483 if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
484 //the existing exon was just a dummy one created by default, discard it
485 exons.Clear();
486 covlen=0;
487 exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
488 }
489 else { //multiple subfeatures, prefer those with
490 if (reader->gff_warns)
491 GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
492 names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
493 if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
494 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
495 exon_ftype_id=subf_id;
496 exons.Clear();
497 covlen=0;
498 }
499 else { //discard new feature
500 if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
501 return -1; //skip this 2nd subfeature type for this parent!
502 }
503 }
504 } //incoming subfeature is of different type
505 } //new subfeature type
506 } //non-mRNA parent
507 int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
508 gl->qstart,gl->qend, gl->is_cds, gl->exontype);
509 if (eidx<0) return eidx; //this should never happen
510 if (keepAttr) {
511 if (noExonAttr) {
512 if (attrs==NULL) //place the parsed attributes directly at transcript level
513 parseAttrs(attrs, gl->info);
514 }
515 else { //need all exon-level attributes
516 parseAttrs(exons[eidx]->attrs, gl->info, true);
517 }
518 }
519 return eidx;
520 }
521
522
523 int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
524 if (exons.Count()==0) {
525 if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
526 if (exon_ftype_id<0) {
527 exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
528 }
529 }
530 //special treatment of start/stop codon features, they might be broken/split between exons
531 //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
532 //so we should not trust the end coordinate for such features
533 if (exontype==exgffStart || exontype==exgffStop) {
534 if (strand=='-') segstart=segend;
535 else segend=segstart;
536 if (exontype==exgffStart) {
537 if (CDstart==0 || segstart<CDstart) CDstart=segstart;
538 }
539 else {
540 if (segstart>CDend) CDend=segstart;
541 }
542 }
543 else if (iscds) { //update CDS anchors:
544 if (CDstart==0 || segstart<CDstart) {
545 CDstart=segstart;
546 if (exontype==exgffCDS && strand=='+') CDphase=fr;
547 }
548 if (segend>CDend) {
549 if (exontype==exgffCDS && strand=='-') CDphase=fr;
550 CDend=segend;
551 }
552 }
553 else { // not a CDS/start/stop
554 isCDS=false;
555 }
556 if (qs || qe) {
557 if (qs>qe) Gswap(qs,qe);
558 if (qs==0) qs=1;
559 }
560 int ovlen=0;
561 if (exontype>0) { //check for overlaps between exon-type segments
562 int oi=exonOverlapIdx(segstart, segend, &ovlen);
563 if (oi>=0) { //overlap existing segment
564 if (ovlen==0) {
565 //adjacent segments will be merged
566 //e.g. CDS to (UTR|exon)
567 if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
568 (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
569 expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
570 return oi;
571 }
572 //CDS adjacent to stop_codon: UCSC does (did?) this
573 if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
574 (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
575 expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
576 return oi;
577 }
578 }
579 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
580 // start_codon within (CDS|exon)
581 if (exons[oi]->exontype>exontype &&
582 exons[oi]->start<=segstart && exons[oi]->end>=segend &&
583 !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
584 //larger segment given first, now the smaller included one is redundant
585 return oi; //only used to store attributes from current GffLine
586 }
587 if (exontype>exons[oi]->exontype &&
588 segstart<=exons[oi]->start && segend>=exons[oi]->end &&
589 !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
590 //smaller segment given first, so we have to enlarge it
591 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
592 //this should also check for overlapping next exon (oi+1) ?
593 return oi;
594 }
595 //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
596 //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
597 //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
598 // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
599 /*
600 if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
601 if (gff_show_warnings)
602 GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
603 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
604 hasErrors(true);
605 return -1; //segment NOT added
606 }
607 */
608
609 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
610 if (gff_show_warnings)
611 GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
612 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
613 expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
614 return oi;
615 }
616 // else add the segment if the overlap is small and between two CDS segments
617 //TODO: we might want to add an attribute here with the slippage coordinate and size?
618 covlen-=ovlen;
619 }//overlap or adjacent to existing segment
620 } //check for overlap
621 // --- no overlap, or accepted micro-overlap (ribosomal slippage)
622 // create & add the new segment
623 /*
624 if (start>0 && exontype==exgffCDS && exons.Count()==0) {
625 //adding a CDS directly as the first subfeature of a declared parent
626 segstart=start;
627 segend=end;
628 }
629 */
630 GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
631 int eidx=exons.Add(enew);
632 if (eidx<0) {
633 //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
634 if (gff_show_warnings)
635 GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
636 segstart, segend, gffID);
637 delete enew;
638 hasErrors(true);
639 return -1;
640 }
641 covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
642 //adjust parent feature coordinates to contain this exon
643 if (start==0 || start>exons.First()->start) {
644 start=exons.First()->start;
645 }
646 if (end<exons.Last()->end) end=exons.Last()->end;
647
648 return eidx;
649 }
650
651 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
652 //oi is the index of the *first* overlapping segment found that must be enlarged
653 covlen-=exons[oi]->len();
654 if (segstart<exons[oi]->start)
655 exons[oi]->start=segstart;
656 if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
657 if (segend>exons[oi]->end)
658 exons[oi]->end=segend;
659 if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
660 //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
661 if (sc!=0) exons[oi]->score=sc;
662 covlen+=exons[oi]->len();
663 //if (exons[oi]->exontype< exontype) -- always true
664 exons[oi]->exontype = exontype;
665 if (exontype==exgffCDS) exons[oi]->phase=fr;
666 //we must check if any more exons are also overlapping this
667 int ni=oi+1; //next exon index after oi
668 while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
669 //only allow this if next segment is fully included, and a subordinate
670 if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
671 /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
672 chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + .
673 chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0
674 chr1 hg18_knownGene exon 69805456 69806912 0.000000 + .
675 */
676 if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
677 if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
678 exons.Delete(ni);
679 }
680 else {
681 if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
682 exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
683 //hasErrors(true);
684 break;
685 }
686 }
687 // -- make sure any other related boundaries are updated:
688 start=exons.First()->start;
689 end=exons.Last()->end;
690 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
691 GSeqStat* gsd=(GSeqStat*)uptr;
692 if (start<gsd->mincoord) gsd->mincoord=start;
693 if (end>gsd->maxcoord) gsd->maxcoord=end;
694 if (this->len()>gsd->maxfeat_len) {
695 gsd->maxfeat_len=this->len();
696 gsd->maxfeat=this;
697 }
698 }
699 }
700
701 void GffObj::removeExon(int idx) {
702 /*
703 if (idx==0 && segs[0].start==gstart)
704 gstart=segs[1].start;
705 if (idx==segcount && segs[segcount].end==gend)
706 gend=segs[segcount-1].end;
707 */
708 if (idx<0 || idx>=exons.Count()) return;
709 int segstart=exons[idx]->start;
710 int segend=exons[idx]->end;
711 exons.Delete(idx);
712 covlen -= (int)(segend-segstart)+1;
713 start=exons.First()->start;
714 end=exons.Last()->end;
715 if (isCDS) { CDstart=start; CDend=end; }
716 }
717
718 void GffObj::removeExon(GffExon* p) {
719 for (int idx=0;idx<exons.Count();idx++) {
720 if (exons[idx]==p) {
721 int segstart=exons[idx]->start;
722 int segend=exons[idx]->end;
723 exons.Delete(idx);
724 covlen -= (int)(segend-segstart)+1;
725
726 if (exons.Count() > 0) {
727 start=exons.First()->start;
728 end=exons.Last()->end;
729 if (isCDS) { CDstart=start; CDend=end; }
730 }
731 return;
732 }
733 }
734 }
735
736
737
738 GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
739 GSeg(0,0), exons(true,true,false), children(1,false) {
740 xstart=0;
741 xend=0;
742 xstatus=0;
743 partial=false;
744 isCDS=false;
745 uptr=NULL;
746 ulink=NULL;
747 parent=NULL;
748 udata=0;
749 flags=0;
750 CDstart=0;
751 CDend=0;
752 CDphase=0;
753 geneID=NULL;
754 gene_name=NULL;
755 attrs=NULL;
756 gffID=NULL;
757 track_id=-1;
758 gseq_id=-1;
759 ftype_id=-1;
760 exon_ftype_id=-1;
761 strand='.';
762 if (gfrd==NULL)
763 GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
764 gffnames_ref(names);
765 if (gfrd->names==NULL) gfrd->names=names;
766 //qlen=0;qstart=0;qend=0;
767 gscore=0;
768 uscore=0;
769 covlen=0;
770 qcov=0;
771 start=gffline->fstart;
772 end=gffline->fend;
773 gseq_id=names->gseqs.addName(gffline->gseqname);
774 track_id=names->tracks.addName(gffline->track);
775 strand=gffline->strand;
776 qlen=gffline->qlen;
777 qstart=gffline->qstart;
778 qend=gffline->qend;
779 //setup flags from gffline
780 isCDS=gffline->is_cds; //for now
781 isGene(gffline->is_gene);
782 isTranscript(gffline->is_transcript || gffline->exontype!=0);
783 //fromGff3(gffline->is_gff3);
784
785 if (gffline->parents!=NULL && !gffline->is_transcript) {
786 //GTF style -- create a GffObj directly by subfeature
787 //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
788 if (gffline->exontype!=0) { //recognized exon-like feature
789 ftype_id=gff_fid_transcript; //so this is some sort of transcript
790 exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
791 }
792 else {//unrecognized subfeatures
793 //make this GffObj of the same feature type
794 ftype_id=names->feats.addName(gffline->ftype);
795 }
796 if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
797 gffID=Gstrdup(gffline->parents[0]);
798 this->createdByExon(true);
799 //this is likely the first exon/segment of the feature
800 addExon(gfrd, gffline, keepAttr, noExonAttr);
801 }
802 else { //a parented feature with an ID: orphan or premature GFF3 subfeature line
803 if (gffline->is_gff3 && gffline->exontype!=0) {
804 //premature exon given before its parent transcript
805 //create the transcript entry here
806 gffID=Gstrdup(gffline->parents[0]);
807 this->createdByExon(true);
808 //this is the first exon/segment of the transcript
809 addExon(gfrd, gffline, keepAttr, noExonAttr);
810 }
811 else { //unrecognized non-exon feature ? use the ID instead
812 this->hasGffID(true);
813 gffID=Gstrdup(gffline->ID);
814 if (keepAttr) this->parseAttrs(attrs, gffline->info);
815 }
816 }
817 } //non-transcript parented subfeature given directly
818 else {
819 //non-parented feature OR a recognizable transcript
820 //create a parent feature in its own right
821 gscore=gffline->score;
822 if (gffline->ID==NULL || gffline->ID[0]==0)
823 GError("Error: no ID found for GFF record start\n");
824 this->hasGffID(true);
825 gffID=Gstrdup(gffline->ID); //there must be an ID here
826 //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
827 //else
828 ftype_id=names->feats.addName(gffline->ftype);
829 if (gffline->is_transcript)
830 exon_ftype_id=gff_fid_exon;
831 if (keepAttr) this->parseAttrs(attrs, gffline->info);
832 }//no parent
833
834 if (gffline->gene_name!=NULL) {
835 gene_name=Gstrdup(gffline->gene_name);
836 }
837 if (gffline->gene_id) {
838 geneID=Gstrdup(gffline->gene_id);
839 }
840 else if (gffline->is_transcript && gffline->parents) {
841 geneID=Gstrdup(gffline->parents[0]);
842 }
843
844 //GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
845 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,gffline->gseqname), true);
846 uptr=gsd;
847 /*
848 if (start<gsd->mincoord) gsd->mincoord=start;
849 if (end>gsd->maxcoord) gsd->maxcoord=end;
850 if (this->len()>gsd->maxfeat_len) {
851 gsd->maxfeat_len=this->len();
852 gsd->maxfeat=this;
853 }
854 */
855 }
856
857 GffLine* GffReader::nextGffLine() {
858 if (gffline!=NULL) return gffline; //caller should free gffline after processing
859 while (gffline==NULL) {
860 int llen=0;
861 buflen=GFF_LINELEN-1;
862 char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
863 if (l==NULL) {
864 return NULL; //end of file
865 }
866 int ns=0; //first nonspace position
867 while (l[ns]!=0 && isspace(l[ns])) ns++;
868 if (l[ns]=='#' || llen<10) continue;
869 gffline=new GffLine(this, l);
870 if (gffline->skip) {
871 delete gffline;
872 gffline=NULL;
873 continue;
874 }
875 if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID
876 //this might not be needed, already checked in the GffLine constructor
877 if (gff_warns)
878 GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
879 delete gffline;
880 gffline=NULL;
881 //continue;
882 }
883 }
884 return gffline;
885 }
886
887
888 char* GffReader::gfoBuildId(const char* id, const char* ctg) {
889 //caller must free the returned pointer
890 char* buf=NULL;
891 int idlen=strlen(id);
892 GMALLOC(buf, idlen+strlen(ctg)+2);
893 strcpy(buf, id);
894 buf[idlen]='~';
895 strcpy(buf+idlen+1, ctg);
896 return buf;
897 }
898 /*
899 void GffReader::gfoRemove(const char* id, const char* ctg) {
900 char* buf=gfoBuildId(id,ctg);
901 phash.Remove(buf);
902 GFREE(buf);
903 }
904 */
905 GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
906 //Warning: if gflst gets altered, idx becomes obsolete
907 GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
908 if (glst==NULL)
909 glst=new GVec<GfoHolder>(1);
910 GfoHolder gh(gfo,idx);
911 int i=glst->Add(gh);
912 phash.Add(gfo->gffID, glst);
913 return &(glst->Get(i));
914 }
915
916 GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) {
917 GfoHolder gh(gfo,idx);
918 int i=glst.Add(gh);
919 return &(glst[i]);
920 }
921
922 GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
923 GVec<GfoHolder>** glst, char strand, uint start, uint end) {
924 GVec<GfoHolder>* gl=phash.Find(id);
925 GfoHolder* gh=NULL;
926 if (gl) {
927 for (int i=0;i<gl->Count();i++) {
928 GfoHolder& gfo = gl->Get(i);
929 if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
930 continue;
931 if (strand && gfo.gffobj->strand!='.' && strand != gfo.gffobj->strand)
932 continue;
933 if (start>0) {
934 if (abs((int)start-(int)gfo.gffobj->start)> (int)GFF_MAX_LOCUS)
935 continue;
936 if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
937 continue;
938 }
939 //must be the same transcript, according to given comparison criteria
940 gh=&gfo;
941 break;
942 }
943 }
944 if (glst) *glst=gl;
945 return gh;
946 }
947
948 GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
949 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
950 GfoHolder* r=NULL;
951 if (replaceidx>=0) {
952 gflst.Put(replaceidx,newgfo);
953 r=gfoAdd(newgfo, replaceidx);
954 }
955 else {
956 int gfoidx=gflst.Add(newgfo);
957 r=gfoAdd(newgfo, gfoidx);
958 }
959 /*
960 if (gff_warns) {
961 int* pcount=tids.Find(newgfo->gffID);
962 if (pcount!=NULL) {
963 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
964 (*pcount)++;
965 }
966 else {
967 tids.Add(newgfo->gffID,new int(1));
968 }
969 }
970 */
971 return r;
972 }
973
974 GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
975 //assert(parent);
976 //assert(newgfo);
977 parent->children.Add(newgfh->gffobj);
978 if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
979 newgfh->gffobj->setLevel(parent->getLevel()+1);
980 if (parent->isGene()) {
981 if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
982 newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
983 if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
984 newgfh->gffobj->geneID=Gstrdup(parent->geneID);
985 }
986
987 return newgfh;
988 }
989
990 GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
991 GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) {
992 GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
993 GfoHolder* r=NULL;
994 int gfoidx=gflst.Add(newgfo);
995 r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
996 if (parent!=NULL) {
997 updateParent(r, parent);
998 if (pexon!=NULL) parent->removeExon(pexon);
999 }
1000 /*
1001 if (gff_warns) {
1002 int* pcount=tids.Find(newgfo->gffID);
1003 if (pcount!=NULL) {
1004 if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
1005 (*pcount)++;
1006 }
1007 else {
1008 tids.Add(newgfo->gffID,new int(1));
1009 }
1010 }
1011 */
1012 return r;
1013 }
1014
1015 GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1016 bool keepAttr) {
1017 if (prevgfo==NULL) return NULL;
1018 //prevgfo->gffobj->createdByExon(false);
1019 prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
1020 prevgfo->gffobj->start=gffline->fstart;
1021 prevgfo->gffobj->end=gffline->fend;
1022 prevgfo->gffobj->isGene(gffline->is_gene);
1023 prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
1024 prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
1025 if (keepAttr) {
1026 if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
1027 prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
1028 }
1029 return prevgfo;
1030 }
1031
1032
1033 bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1034 bool r=true;
1035 if (gffline->strand!=prevgfo->gffobj->strand) {
1036 //TODO: add support for trans-splicing and even inter-chromosomal fusions
1037 if (prevgfo->gffobj->strand=='.') {
1038 prevgfo->gffobj->strand=gffline->strand;
1039 }
1040 else {
1041 GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1042 prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
1043 gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
1044 //r=false;
1045 return true; //FIXME: split trans-spliced mRNAs by strand
1046 }
1047 }
1048 int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
1049 ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
1050 0 );
1051 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1052 GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
1053 //validation_errors = true;
1054 r=false;
1055 if (!gff_warns) exit(1);
1056 }
1057 int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
1058 if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1059 subfPoolAdd(pex, prevgfo);
1060 return r;
1061 }
1062
1063 CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1064 CNonExon* subp=NULL;
1065 subp_name=NULL;
1066 for (int i=0;i<gffline->num_parents;i++) {
1067 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1068 continue;
1069 subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1070 subp=pex.Find(subp_name);
1071 if (subp!=NULL)
1072 return subp;
1073 GFREE(subp_name);
1074 }
1075 return NULL;
1076 }
1077
1078 void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1079 //this might become a parent feature later
1080 if (newgfo->gffobj->exons.Count()>0) {
1081 char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1082 pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1083 newgfo->gffobj->exons[0], gffline));
1084 GFREE(xbuf);
1085 }
1086 }
1087
1088 GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1089 bool keepAttr, bool noExonAttr) {
1090 GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1091 if (prevp!=gflst[subp->idx])
1092 GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1093 subp->gffline->discardParent();
1094 GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1095 pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1096 prevp->promotedChildren(true);
1097 return gfoh; //returns the holder of newly promoted feature
1098 }
1099
1100 //have to parse the whole file because exons can be scattered all over
1101 //trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
1102 void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1103 bool validation_errors = false;
1104 //loc_debug=false;
1105 GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1106 //and thus could become promoted to parent features
1107 while (nextGffLine()!=NULL) {
1108 GfoHolder* prevseen=NULL;
1109 GVec<GfoHolder>* prevgflst=NULL;
1110 if (gffline->ID && gffline->exontype==0) {
1111 //>> for a parent-like IDed feature (mRNA, gene, etc.)
1112 //look for same ID on the same chromosome/strand/locus
1113 prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
1114 if (prevseen!=NULL) {
1115 //same ID/chromosome combo encountered before
1116 if (prevseen->gffobj->createdByExon() &&
1117 prevseen->gffobj->start>=gffline->fstart &&
1118 prevseen->gffobj->end<=gffline->fend) {
1119 //an exon of this ID was given before
1120 //this line has the main attributes for this ID
1121 updateGffRec(prevseen, gffline, keepAttr);
1122 }
1123 else {
1124 //- duplicate ID -- this must be a discontiguous feature
1125 // e.g. a trans-spliced transcript
1126 if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
1127 //overlapping with same ID not allowed
1128 GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1129 //validation_errors = true;
1130 if (gff_warns) {
1131 delete gffline;
1132 gffline=NULL;
1133 continue;
1134 }
1135 else exit(1);
1136 }
1137 //create a new entry with the same ID
1138 prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1139 prevseen->gffobj->parent, NULL, prevgflst);
1140 } //duplicate ID on the same chromosome
1141 } //prevseeen != NULL
1142 } //parent-like ID feature
1143 if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1144 if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
1145 }
1146 else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
1147 //updates all the declared parents with this child
1148 bool found_parent=false;
1149 GfoHolder* newgfo=prevseen;
1150 GVec<GfoHolder>* newgflst=NULL;
1151 for (int i=0;i<gffline->num_parents;i++) {
1152 if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1153 continue; //skipping discarded parent feature
1154 GfoHolder* parentgfo=NULL;
1155 if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript
1156 parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1157 &newgflst, gffline->strand, gffline->fstart, gffline->fend);
1158 }
1159 else {
1160 //for exon-like entities we only need a parent to be in locus distance,
1161 //on the same strand
1162 parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1163 &newgflst, gffline->strand, gffline->fstart);
1164 }
1165 if (parentgfo!=NULL) { //parent GffObj parsed earlier
1166 found_parent=true;
1167 if (parentgfo->gffobj->isGene() && gffline->is_transcript
1168 && gffline->exontype==0) {
1169 //not an exon, but a transcript parented by a gene
1170 if (newgfo) {
1171 updateParent(newgfo, parentgfo->gffobj);
1172 }
1173 else {
1174 newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1175 }
1176 }
1177 else { //potential exon subfeature
1178 if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1179 validation_errors=true;
1180 }
1181 } //overlapping parent feature found
1182 } //for each parsed parent Id
1183 if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1184 //or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
1185 //check if this feature isn't parented by a previously stored "exon" subfeature
1186 char* subp_name=NULL;
1187 CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1188 if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1189 //promote that subfeature to a full GffObj
1190 GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1191 //add current gffline as an exon of the newly promoted subfeature
1192 if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1193 validation_errors=true;
1194 }
1195 else { //no parent seen before,
1196 //loc_debug=true;
1197 GfoHolder* ngfo=prevseen;
1198 if (ngfo==NULL) {
1199 //if it's an exon type, create directly the parent with this exon
1200 //but if it's recognized as a transcript, the object itself is created
1201 ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1202 }
1203 if (!ngfo->gffobj->isTranscript() &&
1204 gffline->ID!=NULL && gffline->exontype==0)
1205 subfPoolAdd(pex, ngfo);
1206 //even those with errors will be added here!
1207 }
1208 GFREE(subp_name);
1209 } //no previous parent found
1210 } //parented feature
1211 //--
1212 delete gffline;
1213 gffline=NULL;
1214 }//while gff lines
1215 gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1216 gseqStats.setCount(gseqstats.Last()->gseqid+1);
1217 for (int gi=0;gi<gseqstats.Count();gi++) {
1218 gseqStats.Put(gseqstats[gi]->gseqid, gseqstats[gi]); //copy the pointer only
1219 }
1220 // all gff records are now loaded in GList gflst
1221 // so we can free the hash
1222 phash.Clear();
1223 //tids.Clear();
1224 if (validation_errors) {
1225 exit(1);
1226 }
1227 }
1228
1229 void GfList::finalize(GffReader* gfr, bool mergeCloseExons,
1230 bool keepAttrs, bool noExonAttr) { //if set, enforce sort by locus
1231 if (mustSort) { //force (re-)sorting
1232 this->setSorted(false);
1233 this->setSorted((GCompareProc*)gfo_cmpByLoc);
1234 }
1235 GList<GffObj> discarded(false,true,false);
1236 for (int i=0;i<Count();i++) {
1237 //finish the parsing for each GffObj
1238 fList[i]->finalize(gfr, mergeCloseExons, keepAttrs, noExonAttr);
1239 if (fList[i]->isDiscarded()) {
1240 discarded.Add(fList[i]);
1241 this->Forget(i);
1242 }
1243 }
1244 if (discarded.Count()>0) this->Pack();
1245 }
1246
1247 GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1248 //merge
1249 //always merge adjacent or overlapping segments
1250 //but if mergeCloseExons then merge even when distance is up to 5 bases
1251 udata=0;
1252 uptr=NULL;
1253 if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1254 isDiscarded(true);
1255 }
1256 if (ftype_id==gff_fid_transcript && CDstart>0) {
1257 ftype_id=gff_fid_mRNA;
1258 //exon_ftype_id=gff_fid_exon;
1259 }
1260 if (exons.Count()>0 && (isTranscript() || exon_ftype_id==gff_fid_exon)) {
1261 if (mergeCloseExons) {
1262 int mindist=mergeCloseExons ? 5:1;
1263 for (int i=0;i<exons.Count()-1;i++) {
1264 int ni=i+1;
1265 uint mend=exons[i]->end;
1266 while (ni<exons.Count()) {
1267 int dist=(int)(exons[ni]->start-mend);
1268 if (dist>mindist) break; //no merging with next segment
1269 if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1270 GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1271 gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1272 }
1273 mend=exons[ni]->end;
1274 covlen-=exons[i]->len();
1275 exons[i]->end=mend;
1276 covlen+=exons[i]->len();
1277 covlen-=exons[ni]->len();
1278 if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1279 exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1280 //use the other exon attributes, if more
1281 delete(exons[i]->attrs);
1282 exons[i]->attrs=exons[ni]->attrs;
1283 exons[ni]->attrs=NULL;
1284 }
1285 exons.Delete(ni);
1286 } //check for merge with next exon
1287 } //for each exon
1288 } //merge close exons
1289 //shrink transcript to the exons' span
1290 this->start=exons.First()->start;
1291 this->end=exons.Last()->end;
1292 //also update the stats for the reference sequence
1293 if (uptr!=NULL) { //collect stats about the underlying genomic sequence
1294 GSeqStat* gsd=(GSeqStat*)uptr;
1295 if (start<gsd->mincoord) gsd->mincoord=start;
1296 if (end>gsd->maxcoord) gsd->maxcoord=end;
1297 if (this->len()>gsd->maxfeat_len) {
1298 gsd->maxfeat_len=this->len();
1299 gsd->maxfeat=this;
1300 }
1301 }
1302 this->uptr=NULL;
1303 this->udata=0;
1304 }
1305 //attribute reduction for GTF records
1306 if (keepAttrs && !noExonAttr && !hasGffID()
1307 && exons.Count()>0 && exons[0]->attrs!=NULL) {
1308 bool attrs_discarded=false;
1309 for (int a=0;a<exons[0]->attrs->Count();a++) {
1310 int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1311 char* attr_name=names->attrs.getName(attr_name_id);
1312 char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1313 bool sameExonAttr=true;
1314 for (int i=1;i<exons.Count();i++) {
1315 char* ov=exons[i]->getAttr(attr_name_id);
1316 if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1317 sameExonAttr=false;
1318 break;
1319 }
1320 }
1321 if (sameExonAttr) {
1322 //delete this attribute from exons level
1323 attrs_discarded=true;
1324 this->addAttr(attr_name, attr_val);
1325 for (int i=1;i<exons.Count();i++) {
1326 removeExonAttr(*(exons[i]), attr_name_id);
1327 }
1328 exons[0]->attrs->freeItem(a);
1329 }
1330 }
1331 if (attrs_discarded) exons[0]->attrs->Pack();
1332 }
1333 return this;
1334 }
1335
1336 void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1337 if (names==NULL)
1338 GError(ERR_NULL_GFNAMES, "parseAttrs()");
1339 if (atrlist==NULL)
1340 atrlist=new GffAttrs();
1341 char* endinfo=info+strlen(info);
1342 char* start=info;
1343 char* pch=start;
1344 while (start<endinfo) {
1345 //skip spaces
1346 while (*start==' ' && start<endinfo) start++;
1347 pch=strchr(start, ';');
1348 if (pch==NULL) pch=endinfo;
1349 else {
1350 *pch='\0';
1351 pch++;
1352 }
1353 char* ech=strchr(start,'=');
1354 if (ech!=NULL) { // attr=value format found
1355 *ech='\0';
1356 //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1357 if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1358 strcmp(start, "exon_id")==0)
1359 { start=pch; continue; }
1360 ech++;
1361 while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1362 //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1363 //make sure we don't add the same attribute more than once
1364 if (isExon && (strcmp(start, "protein_id")==0)) {
1365 //Ensembl special case
1366 this->addAttr(start, ech);
1367 start=pch;
1368 continue;
1369 }
1370 atrlist->add_or_update(names, start, ech);
1371 }
1372 /*
1373 else { //not an attr=value format
1374 atrlist->Add(new GffAttr(names->attrs.addName(start),"1"));
1375 }
1376 */
1377 start=pch;
1378 }
1379 if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1380 }
1381
1382 void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1383 if (this->attrs==NULL)
1384 this->attrs=new GffAttrs();
1385 //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1386 this->attrs->add_or_update(names, attrname, attrvalue);
1387 }
1388
1389
1390 void GffObj::setFeatureName(const char* feature) {
1391 //change the feature name/type for a transcript
1392 int fid=names->feats.addName(feature);
1393 if (monoFeature() && exons.Count()>0)
1394 this->exon_ftype_id=fid;
1395 this->ftype_id=fid;
1396 }
1397
1398 void GffObj::setRefName(const char* newname) {
1399 //change the feature name/type for a transcript
1400 int rid=names->gseqs.addName(newname);
1401 this->gseq_id=rid;
1402 }
1403
1404
1405
1406 int GffObj::removeAttr(const char* attrname, const char* attrval) {
1407 if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1408 int aid=this->names->attrs.getId(attrname);
1409 if (aid<0) return 0;
1410 int delcount=0; //could be more than one ?
1411 for (int i=0;i<this->attrs->Count();i++) {
1412 if (aid==this->attrs->Get(i)->attr_id) {
1413 if (attrval==NULL ||
1414 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1415 delcount++;
1416 this->attrs->freeItem(i);
1417 }
1418 }
1419 }
1420 if (delcount>0) this->attrs->Pack();
1421 return delcount;
1422 }
1423
1424 int GffObj::removeAttr(int aid, const char* attrval) {
1425 if (this->attrs==NULL || aid<0) return 0;
1426 int delcount=0; //could be more than one ?
1427 for (int i=0;i<this->attrs->Count();i++) {
1428 if (aid==this->attrs->Get(i)->attr_id) {
1429 if (attrval==NULL ||
1430 strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1431 delcount++;
1432 this->attrs->freeItem(i);
1433 }
1434 }
1435 }
1436 if (delcount>0) this->attrs->Pack();
1437 return delcount;
1438 }
1439
1440
1441 int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1442 if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1443 int aid=this->names->attrs.getId(attrname);
1444 if (aid<0) return 0;
1445 int delcount=0; //could be more than one
1446 for (int i=0;i<exon.attrs->Count();i++) {
1447 if (aid==exon.attrs->Get(i)->attr_id) {
1448 if (attrval==NULL ||
1449 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1450 delcount++;
1451 exon.attrs->freeItem(i);
1452 }
1453 }
1454 }
1455 if (delcount>0) exon.attrs->Pack();
1456 return delcount;
1457 }
1458
1459 int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1460 if (exon.attrs==NULL || aid<0) return 0;
1461 int delcount=0; //could be more than one
1462 for (int i=0;i<exon.attrs->Count();i++) {
1463 if (aid==exon.attrs->Get(i)->attr_id) {
1464 if (attrval==NULL ||
1465 strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1466 delcount++;
1467 exon.attrs->freeItem(i);
1468 }
1469 }
1470 }
1471 if (delcount>0) exon.attrs->Pack();
1472 return delcount;
1473 }
1474
1475
1476 void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1477 cds_start=0;
1478 cds_end=0;
1479 if (CDstart==0 || CDend==0) return; //no CDS info
1480 int cdsadj=0;
1481 if (CDphase=='1' || CDphase=='2') {
1482 cdsadj=CDphase-'0';
1483 }
1484 cds_start=CDstart;
1485 cds_end=CDend;
1486 if (strand=='-') cds_end-=cdsadj;
1487 else cds_start+=cdsadj;
1488 }
1489
1490 void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) {
1491 //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript
1492 cds_mstart=0;
1493 cds_mend=0;
1494 if (CDstart==0 || CDend==0) return; //no CDS info
1495 //restore normal coordinates, just in case
1496 unxcoord();
1497 int cdsadj=0;
1498 if (CDphase=='1' || CDphase=='2') {
1499 cdsadj=CDphase-'0';
1500 }
1501 /*
1502 uint seqstart=CDstart;
1503 uint seqend=CDend;
1504 */
1505 uint seqstart=exons.First()->start;
1506 uint seqend=exons.Last()->end;
1507 int s=0; //resulting nucleotide counter
1508 if (strand=='-') {
1509 for (int x=exons.Count()-1;x>=0;x--) {
1510 uint sgstart=exons[x]->start;
1511 uint sgend=exons[x]->end;
1512 if (seqend<sgstart || seqstart>sgend) continue;
1513 if (seqstart>=sgstart && seqstart<=sgend)
1514 sgstart=seqstart; //seqstart within this segment
1515 if (seqend>=sgstart && seqend<=sgend)
1516 sgend=seqend; //seqend within this segment
1517 s+=(int)(sgend-sgstart)+1;
1518 if (CDstart>=sgstart && CDstart<=sgend) {
1519 //CDstart in this segment
1520 //and we are getting the whole transcript
1521 cds_mend=s-(int)(CDstart-sgstart);
1522 }
1523 if (CDend>=sgstart && CDend<=sgend) {
1524 //CDstart in this segment
1525 //and we are getting the whole transcript
1526 cds_mstart=s-(int)(CDend-cdsadj-sgstart);
1527 }
1528 } //for each exon
1529 } // - strand
1530 else { // + strand
1531 for (int x=0;x<exons.Count();x++) {
1532 uint sgstart=exons[x]->start;
1533 uint sgend=exons[x]->end;
1534 if (seqend<sgstart || seqstart>sgend) continue;
1535 if (seqstart>=sgstart && seqstart<=sgend)
1536 sgstart=seqstart; //seqstart within this segment
1537 if (seqend>=sgstart && seqend<=sgend)
1538 sgend=seqend; //seqend within this segment
1539 s+=(int)(sgend-sgstart)+1;
1540 /* for (uint i=sgstart;i<=sgend;i++) {
1541 spliced[s]=gsubseq[i-gstart];
1542 s++;
1543 }//for each nt
1544 */
1545 if (CDstart>=sgstart && CDstart<=sgend) {
1546 //CDstart in this segment
1547 cds_mstart=s-(int)(sgend-CDstart-cdsadj);
1548 }
1549 if (CDend>=sgstart && CDend<=sgend) {
1550 //CDend in this segment
1551 cds_mend=s-(int)(sgend-CDend);
1552 }
1553 } //for each exon
1554 } // + strand
1555 //spliced[s]=0;
1556 //if (rlen!=NULL) *rlen=s;
1557 //return spliced;
1558 }
1559
1560 char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1561 {
1562 if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1563 return NULL;
1564 }
1565 //restore normal coordinates:
1566 unxcoord();
1567 if (exons.Count()==0) return NULL;
1568 int fspan=end-start+1;
1569 const char* gsubseq=faseq->subseq(start, fspan);
1570 if (gsubseq==NULL) {
1571 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1572 }
1573 char* unspliced=NULL;
1574
1575 int seqstart=exons.First()->start;
1576 int seqend=exons.Last()->end;
1577
1578 int unsplicedlen = 0;
1579
1580 unsplicedlen += seqend - seqstart + 1;
1581
1582 GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1583 //uint seqstart, seqend;
1584
1585 int s = 0; //resulting nucleotide counter
1586 if (strand=='-')
1587 {
1588 if (seglst!=NULL)
1589 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1590 for (int i=seqend;i>=seqstart;i--)
1591 {
1592 unspliced[s] = ntComplement(gsubseq[i-start]);
1593 s++;
1594 }//for each nt
1595 } // - strand
1596 else
1597 { // + strand
1598 if (seglst!=NULL)
1599 seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1600 for (int i=seqstart;i<=seqend;i++)
1601 {
1602 unspliced[s]=gsubseq[i-start];
1603 s++;
1604 }//for each nt
1605 } // + strand
1606 //assert(s <= unsplicedlen);
1607 unspliced[s]=0;
1608 if (rlen!=NULL) *rlen=s;
1609 return unspliced;
1610 }
1611
1612 char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1613 GList<GSeg>* seglst) {
1614 if (CDSonly && CDstart==0) return NULL;
1615 if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. ) called!\n");
1616 return NULL;
1617 }
1618 //restore normal coordinates:
1619 unxcoord();
1620 if (exons.Count()==0) return NULL;
1621 int fspan=end-start+1;
1622 const char* gsubseq=faseq->subseq(start, fspan);
1623 if (gsubseq==NULL) {
1624 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1625 }
1626 if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1627 int endadj=end-start+1-fspan;
1628 uint prevend=end;
1629 end-=endadj;
1630 if (CDend>end) CDend=end;
1631 if (exons.Last()->end>end) {
1632 exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1633 if (exons.Last()->start>exons.Last()->end) {
1634 GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1635 prevend,getGSeqName(), getID());
1636 }
1637 covlen-=endadj;
1638 }
1639 }
1640 char* spliced=NULL;
1641 GMALLOC(spliced, covlen+1); //allocate more here
1642 uint seqstart, seqend;
1643 int cdsadj=0;
1644 if (CDphase=='1' || CDphase=='2') {
1645 cdsadj=CDphase-'0';
1646 }
1647 if (CDSonly) {
1648 seqstart=CDstart;
1649 seqend=CDend;
1650 if (strand=='-') seqend-=cdsadj;
1651 else seqstart+=cdsadj;
1652 }
1653 else {
1654 seqstart=exons.First()->start;
1655 seqend=exons.Last()->end;
1656 }
1657 int s=0; //resulting nucleotide counter
1658 if (strand=='-') {
1659 for (int x=exons.Count()-1;x>=0;x--) {
1660 uint sgstart=exons[x]->start;
1661 uint sgend=exons[x]->end;
1662 if (seqend<sgstart || seqstart>sgend) continue;
1663 if (seqstart>=sgstart && seqstart<=sgend)
1664 sgstart=seqstart; //seqstart within this segment
1665 if (seqend>=sgstart && seqend<=sgend)
1666 sgend=seqend; //seqend within this segment
1667 if (seglst!=NULL)
1668 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1669 for (uint i=sgend;i>=sgstart;i--) {
1670 spliced[s] = ntComplement(gsubseq[i-start]);
1671 s++;
1672 }//for each nt
1673
1674 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1675 if (CDstart>=sgstart && CDstart<=sgend) {
1676 //CDstart in this segment
1677 //and we are getting the whole transcript
1678 *cds_end=s-(CDstart-sgstart);
1679 }
1680 if (CDend>=sgstart && CDend<=sgend) {
1681 //CDstart in this segment
1682 //and we are getting the whole transcript
1683 *cds_start=s-(CDend-cdsadj-sgstart);
1684 }
1685 }//update local CDS coordinates
1686 } //for each exon
1687 } // - strand
1688 else { // + strand
1689 for (int x=0;x<exons.Count();x++) {
1690 uint sgstart=exons[x]->start;
1691 uint sgend=exons[x]->end;
1692 if (seqend<sgstart || seqstart>sgend) continue;
1693 if (seqstart>=sgstart && seqstart<=sgend)
1694 sgstart=seqstart; //seqstart within this segment
1695 if (seqend>=sgstart && seqend<=sgend)
1696 sgend=seqend; //seqend within this segment
1697 if (seglst!=NULL)
1698 seglst->Add(new GSeg(s+1,s+1+sgend-sgstart));
1699 for (uint i=sgstart;i<=sgend;i++) {
1700 spliced[s]=gsubseq[i-start];
1701 s++;
1702 }//for each nt
1703 if (!CDSonly && cds_start!=NULL && CDstart>0) {
1704 if (CDstart>=sgstart && CDstart<=sgend) {
1705 //CDstart in this segment
1706 //and we are getting the whole transcript
1707 *cds_start=s-(sgend-CDstart-cdsadj);
1708 }
1709 if (CDend>=sgstart && CDend<=sgend) {
1710 //CDstart in this segment
1711 //and we are getting the whole transcript
1712 *cds_end=s-(sgend-CDend);
1713 }
1714 }//update local CDS coordinates
1715 } //for each exon
1716 } // + strand
1717 spliced[s]=0;
1718 if (rlen!=NULL) *rlen=s;
1719 return spliced;
1720 }
1721
1722 char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) {
1723 if (CDSonly && CDstart==0) return NULL;
1724 //restore normal coordinates:
1725 unxcoord();
1726 if (exons.Count()==0) return NULL;
1727 int fspan=end-start+1;
1728 const char* gsubseq=faseq->subseq(start, fspan);
1729 if (gsubseq==NULL) {
1730 GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1731 }
1732
1733 char* translation=NULL;
1734 GMALLOC(translation, (int)(covlen/3)+1);
1735 uint seqstart, seqend;
1736 int cdsadj=0;
1737 if (CDphase=='1' || CDphase=='2') {
1738 cdsadj=CDphase-'0';
1739 }
1740 if (CDSonly) {
1741 seqstart=CDstart;
1742 seqend=CDend;
1743 if (strand=='-') seqend-=cdsadj;
1744 else seqstart+=cdsadj;
1745 }
1746 else {
1747 seqstart=exons.First()->start;
1748 seqend=exons.Last()->end;
1749 }
1750 Codon codon;
1751 int nt=0; //codon nucleotide counter (0..2)
1752 int aa=0; //aminoacid count
1753 if (strand=='-') {
1754 for (int x=exons.Count()-1;x>=0;x--) {
1755 uint sgstart=exons[x]->start;
1756 uint sgend=exons[x]->end;
1757 if (seqend<sgstart || seqstart>sgend) continue;
1758 if (seqstart>=sgstart && seqstart<=sgend)
1759 sgstart=seqstart; //seqstart within this segment
1760 if (seqend>=sgstart && seqend<=sgend) {
1761 sgend=seqend; //seqend within this segment
1762 }
1763 for (uint i=sgend;i>=sgstart;i--) {
1764 codon.nuc[nt]=ntComplement(gsubseq[i-start]);
1765 nt++;
1766 if (nt==3) {
1767 nt=0;
1768 translation[aa]=codon.translate();
1769 aa++;
1770 }
1771 }//for each nt
1772 } //for each exon
1773 } // - strand
1774 else { // + strand
1775 for (int x=0;x<exons.Count();x++) {
1776 uint sgstart=exons[x]->start;
1777 uint sgend=exons[x]->end;
1778 if (seqend<sgstart || seqstart>sgend) continue;
1779 if (seqstart>=sgstart && seqstart<=sgend)
1780 sgstart=seqstart; //seqstart within this segment
1781 if (seqend>=sgstart && seqend<=sgend)
1782 sgend=seqend; //seqend within this segment
1783 for (uint i=sgstart;i<=sgend;i++) {
1784 codon.nuc[nt]=gsubseq[i-start];
1785 nt++;
1786 if (nt==3) {
1787 nt=0;
1788 translation[aa]=codon.translate();
1789 aa++;
1790 }
1791 }//for each nt
1792 } //for each exon
1793 } // + strand
1794 translation[aa]=0;
1795 if (rlen!=NULL) *rlen=aa;
1796 return translation;
1797 }
1798
1799 void GffObj::printSummary(FILE* fout) {
1800 if (fout==NULL) fout=stdout;
1801 fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID,
1802 strand, start, end, gscore, (float)qcov/10.0);
1803 }
1804
1805 void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1806 uint segstart, uint segend, int exidx, char phase, bool gff3) {
1807 static char scorestr[14];
1808 strcpy(scorestr,".");
1809 GffAttrs* xattrs=NULL;
1810 if (exidx>=0) {
1811 if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1812 xattrs=exons[exidx]->attrs;
1813 }
1814 if (phase==0 || !iscds) phase='.';
1815 const char* ftype=iscds ? "CDS" : getSubfName();
1816 if (gff3) {
1817 fprintf(fout,
1818 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s",
1819 gseqname, tlabel, ftype, segstart, segend, scorestr, strand,
1820 phase, gffID);
1821 if (xattrs!=NULL) {
1822 for (int i=0;i<xattrs->Count();i++)
1823 fprintf(fout, ";%s=%s",names->attrs.getName(xattrs->Get(i)->attr_id),
1824 xattrs->Get(i)->attr_val);
1825 }
1826 fprintf(fout, "\n");
1827 } //GFF
1828 else {//for GTF -- we print only transcripts
1829 //if (isValidTranscript())
1830 fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1831 gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1832 //char* geneid=(geneID!=NULL)? geneID : gffID;
1833 if (geneID)
1834 fprintf(fout," gene_id \"%s\";",geneID);
1835 if (gene_name!=NULL) {
1836 //fprintf(fout, " gene_name ");
1837 //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1838 // else fprintf(fout, "\"%s\";",gene_name);
1839 fprintf(fout," gene_name \"%s\";",gene_name);
1840 }
1841 if (xattrs!=NULL) {
1842 for (int i=0;i<xattrs->Count();i++) {
1843 if (xattrs->Get(i)->attr_val==NULL) continue;
1844 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1845 fprintf(fout, " %s ",attrname);
1846 if (xattrs->Get(i)->attr_val[0]=='"')
1847 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1848 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1849 }
1850 }
1851 //for GTF, also append the GffObj attributes to each exon line
1852 if ((xattrs=this->attrs)!=NULL) {
1853 for (int i=0;i<xattrs->Count();i++) {
1854 if (xattrs->Get(i)->attr_val==NULL) continue;
1855 const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1856 fprintf(fout, " %s ",attrname);
1857 if (xattrs->Get(i)->attr_val[0]=='"')
1858 fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1859 else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1860 }
1861 }
1862 fprintf(fout, "\n");
1863 }//GTF
1864 }
1865
1866 void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1867 const char* tlabel, const char* gfparent) {
1868 static char tmpstr[255];
1869 if (tlabel==NULL) {
1870 tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1871 (char*)"gffobj" ;
1872 }
1873 unxcoord();
1874 //if (exons.Count()==0) return;
1875 const char* gseqname=names->gseqs.Get(gseq_id)->name;
1876 bool gff3 = (gffp>=pgffAny);
1877 bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1878 bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
1879 if (gff3) {
1880 //print GFF3 mRNA line:
1881 if (gscore>0.0) sprintf(tmpstr,"%.2f", gscore);
1882 else strcpy(tmpstr,".");
1883 uint pstart, pend;
1884 if (gffp==pgffCDS) {
1885 pstart=CDstart;
1886 pend=CDend;
1887 }
1888 else { pstart=start;pend=end; }
1889 //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1890 const char* ftype=getFeatureName();
1891 fprintf(fout,
1892 "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1893 gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1894 if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1895 if (gfparent!=NULL) {
1896 //parent override
1897 fprintf(fout, ";Parent=%s",gfparent);
1898 }
1899 else {
1900 if (parent!=NULL && !parent->isDiscarded())
1901 fprintf(fout, ";Parent=%s",parent->getID());
1902 }
1903 if (geneID!=NULL)
1904 fprintf(fout, ";geneID=%s",geneID);
1905 if (gene_name!=NULL)
1906 fprintf(fout, ";gene_name=%s",gene_name);
1907 if (attrs!=NULL) {
1908 for (int i=0;i<attrs->Count();i++) {
1909 const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1910 fprintf(fout,";%s=%s", attrname,
1911 attrs->Get(i)->attr_val);
1912 }
1913 }
1914 fprintf(fout,"\n");
1915 }// gff3 mRNA line
1916 if (showExon) {
1917 //print exons
1918 if (isCDS && exons.Count()>0 &&
1919 ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1920 updateExonPhase();
1921
1922 for (int i=0;i<exons.Count();i++) {
1923 printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1924 }
1925 }//printing exons
1926 if (showCDS && !isCDS && CDstart>0) {
1927 GArray<GffCDSeg> cds(true,true);
1928 getCDSegs(cds);
1929 for (int i=0;i<cds.Count();i++) {
1930 printGxfLine(fout, tlabel, gseqname, true, cds[i].start, cds[i].end, -1, cds[i].phase, gff3);
1931 }
1932 } //showCDS
1933 }
1934
1935 void GffObj::updateExonPhase() {
1936 if (!isCDS) return;
1937 int cdsacc=0;
1938 if (CDphase=='1' || CDphase=='2') {
1939 cdsacc+= 3-(CDphase-'0');
1940 }
1941 if (strand=='-') { //reverse strand
1942 for (int i=exons.Count()-1;i>=0;i--) {
1943 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1944 cdsacc+=exons[i]->end-exons[i]->start+1;
1945 }
1946 }
1947 else { //forward strand
1948 for (int i=0;i<exons.Count();i++) {
1949 exons[i]->phase='0'+ (3-cdsacc%3)%3;
1950 cdsacc+=exons[i]->end-exons[i]->start+1;
1951 }
1952 }
1953 }
1954
1955
1956 void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1957 GffCDSeg cdseg;
1958 int cdsacc=0;
1959 if (CDphase=='1' || CDphase=='2') {
1960 cdsacc+= 3-(CDphase-'0');
1961 }
1962 if (strand=='-') {
1963 for (int x=exons.Count()-1;x>=0;x--) {
1964 uint sgstart=exons[x]->start;
1965 uint sgend=exons[x]->end;
1966 if (CDend<sgstart || CDstart>sgend) continue;
1967 if (CDstart>=sgstart && CDstart<=sgend)
1968 sgstart=CDstart; //cdstart within this segment
1969 if (CDend>=sgstart && CDend<=sgend)
1970 sgend=CDend; //cdend within this segment
1971 cdseg.start=sgstart;
1972 cdseg.end=sgend;
1973 cdseg.exonidx=x;
1974 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1975 cdseg.phase='0'+ (3-cdsacc%3)%3;
1976 cdsacc+=sgend-sgstart+1;
1977 cds.Add(cdseg);
1978 } //for each exon
1979 } // - strand
1980 else { // + strand
1981 for (int x=0;x<exons.Count();x++) {
1982 uint sgstart=exons[x]->start;
1983 uint sgend=exons[x]->end;
1984 if (CDend<sgstart || CDstart>sgend) continue;
1985 if (CDstart>=sgstart && CDstart<=sgend)
1986 sgstart=CDstart; //seqstart within this segment
1987 if (CDend>=sgstart && CDend<=sgend)
1988 sgend=CDend; //seqend within this segment
1989 cdseg.start=sgstart;
1990 cdseg.end=sgend;
1991 cdseg.exonidx=x;
1992 //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0);
1993 cdseg.phase='0' + (3-cdsacc%3)%3 ;
1994 cdsacc+=sgend-sgstart+1;
1995 cds.Add(cdseg);
1996 } //for each exon
1997 } // + strand
1998 }