ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 5 | Line 5
5   //global set of feature names, attribute names etc.
6   // -- common for all GffObjs in current application!
7  
8 < const uint GFF_MAX_LOCUS = 4000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 < const uint GFF_MAX_EXON  =   20000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 1600000;
11 <
8 > const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 > const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 > const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11 > bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13 < const int gff_fid_exon=1;
14 < const int gff_fid_CDS=2;
15 < bool gff_warns=true;
13 > const int gff_fid_transcript=1;
14 > const int gff_fid_exon=2;
15 > const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 > const uint gfo_flag_HAS_ERRORS       = 0x00000001;
17 > const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 > const uint gfo_flag_IS_GENE          = 0x00000004;
19 > const uint gfo_flag_IS_TRANSCRIPT    = 0x00000008;
20 > const uint gfo_flag_HAS_GFF_ID       = 0x00000010; //found GFF3 feature line with its own ID
21 > const uint gfo_flag_BY_EXON          = 0x00000020; //created by subfeature (exon) directly
22 > const uint gfo_flag_DISCARDED        = 0x00000100;
23 > const uint gfo_flag_LST_KEEP         = 0x00000200;
24 > const uint gfo_flag_LEVEL_MSK        = 0x00FF0000;
25 > const byte gfo_flagShift_LEVEL           = 16;
26  
27   void gffnames_ref(GffNames* &n) {
28    if (n==NULL) n=new GffNames();
# Line 29 | Line 39
39  
40   GffObj& g1=*((GffObj*)p1);
41   GffObj& g2=*((GffObj*)p2);
42 < if (g1.gseq_id==g2.gseq_id) return (int)(g1.start-g2.start);
43 <                else return (int)(g1.gseq_id-g2.gseq_id);
42 > if (g1.gseq_id==g2.gseq_id) {
43 >             if (g1.start!=g2.start)
44 >                    return (int)(g1.start-g2.start);
45 >               else if (g1.getLevel()!=g2.getLevel())
46 >                        return (int)(g1.getLevel()-g2.getLevel());
47 >                    else
48 >                        if (g1.end!=g2.end)
49 >                              return (int)(g1.end-g2.end);
50 >                        else return strcmp(g1.getID(), g2.getID());
51 >             }
52 >             else return (int)(g1.gseq_id-g2.gseq_id);
53 > }
54 >
55 > char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
56 > //parse a key attribute and remove it from the info string
57 > //(only works for attributes that have values following them after ' ' or '=')
58 > static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 > int attrlen=strlen(attr);
60 > char cend=attr[attrlen-1];
61 > //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
62 > //must make sure attr is not found in quoted text
63 > char* pos=info;
64 > char prevch=0;
65 > bool in_str=false;
66 > bool notfound=true;
67 > int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
68 > while (notfound && *pos) {
69 >   char ch=*pos;
70 >   if (ch=='"') {
71 >     in_str=!in_str;
72 >     pos++;
73 >     prevch=ch;
74 >     continue;
75 >     }
76 >   if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
77 >          && strcmpfn(attr, pos, attrlen)==0) {
78 >      //attr match found
79 >      //check for word boundary on right
80 >      char* epos=pos+attrlen;
81 >      if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
82 >        notfound=false;
83 >        break;
84 >        }
85 >      //not a perfect match, move on
86 >      pos=epos;
87 >      prevch=*(pos-1);
88 >      continue;
89 >      }
90 >   //not a match or in_str
91 >   prevch=ch;
92 >   pos++;
93 >   }
94 > if (notfound) return NULL;
95 > char* vp=pos+attrlen;
96 > while (*vp==' ') vp++;
97 > if (*vp==';' || *vp==0)
98 >      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
99 > bool dq_enclosed=false; //value string enclosed by double quotes
100 > if (*vp=='"') {
101 >     dq_enclosed=true;
102 >     vp++;
103 >     }
104 > if (enforce_GTF2 && !dq_enclosed)
105 >      GError(GTF2_ERR,attr, dupline);
106 > char* vend=vp;
107 > if (dq_enclosed) {
108 >    while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
109 >    }
110 > else {
111 >    while (*vend!=';' && *vend!=0) vend++;
112 >    }
113 > if (enforce_GTF2 && *vend!='"')
114 >     GError(GTF2_ERR, attr, dupline);
115 > char *r=Gstrdup(vp, vend-1);
116 > //-- now remove this attribute from the info string
117 > while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
118 > if (*vend==0) vend--;
119 > for (char *src=vend, *dest=pos;;src++,dest++) {
120 >   *dest=*src;
121 >   if (*src==0) break;
122 >   }
123 > return r;
124   }
125  
126 < static char fnamelc[32];
126 > static char fnamelc[128];
127  
128   GffLine::GffLine(GffReader* reader, const char* l) {
129 < line=Gstrdup(l);
129 > llen=strlen(l);
130 > GMALLOC(line,llen+1);
131 > memcpy(line, l, llen+1);
132 > GMALLOC(dupline, llen+1);
133 > memcpy(dupline, l, llen+1);
134   skip=true;
135   gseqname=NULL;
136   track=NULL;
137   ftype=NULL;
138   info=NULL;
139 < Parent=NULL;
139 > _parents=NULL;
140 > _parents_len=0;
141 > num_parents=0;
142 > parents=NULL;
143 > is_gff3=false;
144   is_cds=false;
145 < is_mrna=false;
145 > is_transcript=false;
146   is_exon=false;
147 + is_gene=false;
148   exontype=0;
149 < gname=NULL;
149 > gene_id=NULL;
150 > gene_name=NULL;
151   qstart=0;
152   qend=0;
153   qlen=0;
54 exontype=0;
154   ID=NULL;
155   char* t[9];
156   int i=0;
# Line 77 | Line 176
176   ftype=t[2];
177   info=t[8];
178   char* p=t[3];
179 < if (!parseUInt(p,fstart))
180 <   GError("Error parsing start coordinate from GFF line:\n%s\n",l);
179 > if (!parseUInt(p,fstart)) {
180 >   //FIXME: chromosome_band entries in Flybase
181 >   GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
182 >   return;
183 >   }
184   p=t[4];
185 < if (!parseUInt(p,fend))
186 <   GError("Error parsing end coordinate from GFF line:\n%s\n",l);
187 < if (fend<fstart) swap(fend,fstart);
185 > if (!parseUInt(p,fend)) {
186 >   GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
187 >   return;
188 >   }
189 > if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always
190   p=t[5];
191   if (p[0]=='.' && p[1]==0) {
192    score=0;
# Line 94 | Line 198
198   strand=*t[6];
199   if (strand!='+' && strand!='-' && strand!='.')
200       GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
201 < phase=*t[7];
201 > phase=*t[7]; // must be '.', '0', '1' or '2'
202   ID=NULL;
99 Parent=NULL;
203   // exon/CDS/mrna filter
204 < strncpy(fnamelc, ftype, 31);
205 < fnamelc[31]=0;
204 > strncpy(fnamelc, ftype, 127);
205 > fnamelc[127]=0;
206   strlower(fnamelc); //convert to lower case
207 + bool is_t_data=false;
208   if (strstr(fnamelc, "utr")!=NULL) {
209     exontype=exgffUTR;
210     is_exon=true;
211 +   is_t_data=true;
212     }
213 <  else if (strstr(fnamelc, "exon")!=NULL) {
213 >  else if (endsWith(fnamelc, "exon")) {
214     exontype=exgffExon;
215     is_exon=true;
216 +   is_t_data=true;
217     }
218 <  else if (startsWith(fnamelc, "stop") && strstr(fnamelc, "codon")!=NULL) {
218 >  else if (strstr(fnamelc, "stop") &&
219 >      (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
220     exontype=exgffStop;
221 +   is_cds=true; //though some place it outside the last CDS segment
222 +   is_t_data=true;
223 +   }
224 +  else if (strstr(fnamelc, "start") &&
225 +      ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
226 +   exontype=exgffStart;
227     is_cds=true;
228 +   is_t_data=true;
229     }
230   else if (strcmp(fnamelc, "cds")==0) {
231     exontype=exgffCDS;
232     is_cds=true;
233 +   is_t_data=true;
234     }
235 < else { //is_mrna is set only if the *current* line is a mRNA or transcript
236 <   is_mrna=(strcmp(fnamelc,"mrna")==0 ||
237 <          strcmp(fnamelc,"transcript")==0);
123 <   }
124 <
125 < if (reader->mrnaOnly) {
126 <   if (!is_mrna && !is_cds && !is_exon)
127 <                     return; //skip this line, unwanted feature name
128 <   }
129 < p=strstr(info,"ID=");
130 < if (p!=NULL) { //has ID attr
131 <   ID=p+3;
132 <   p=ID;
133 <   while (*p!=';' && *p!=0) p++;
134 <   ID=Gstrdup(ID, p-1);
135 <   //look for a name attr too:
136 <   p=strstr(info,"Name=");
137 <   if (p!=NULL) {
138 <     gname=p+5;
139 <     p=gname;
140 <     while (*p!=';' && *p!=0) p++;
141 <     gname=Gstrdup(gname, p-1);
142 <     }
235 > else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
236 >   is_gene=true;
237 >   is_t_data=true; //because its name will be attached to parented transcripts
238     }
239 < p=NULL;
240 < if (!is_mrna)
241 <     p=strifind(info,"Parent="); //don't care about the parent for mRNA features..
147 < if (p!=NULL) { //has Parent attr
148 <   Parent=p+7;
149 <   p=Parent;
150 <   while (*p!=';' && *p!=0) p++;
151 <   Parent=Gstrdup(Parent, p-1);
239 > else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
240 >   is_transcript=true;
241 >   is_t_data=true;
242     }
243 <  else if (ID==NULL) { //no "Parent=" and no "ID=", attempt GTF parsing instead
244 <   p=strstr(info,"transcript_id");
245 <   if (p!=NULL) { //GTF detected
246 <     p+=13;
247 <     //requires quotes
248 <     while (*p!='"' && *p!=0) p++;
249 <     if (*p==0) GError("Error parsing transcript_id (double quotes expected) at GTF line:\n%s\n",l);
250 <     p++;
251 <     Parent=p;
252 <     while (*p!='"' && *p!=0) p++;
163 <     if (*p==0) GError("Error parsing transcript_id (ending double quotes expected) at GTF line:\n%s\n",l);
164 <     if (is_mrna) { // RGASP GTF exception: a parent "transcript" feature preceding exon/CDS subfeatures
165 <        ID=Gstrdup(Parent, p-1);
166 <        Parent=NULL;
167 <        }
168 <       else {
169 <        Parent=Gstrdup(Parent, p-1);
243 >
244 > if (reader->transcriptsOnly && !is_t_data) {
245 >        char* id=extractAttr("ID=");
246 >        if (id==NULL) id=extractAttr("transcript_id");
247 >        //GMessage("Discarding non-transcript line:\n%s\n",l);
248 >        if (id!=NULL) {
249 >          reader->discarded_ids.Add(id, new int(1));
250 >          GFREE(id);
251 >          }
252 >        return; //skip this line, unwanted feature name
253          }
254 <     //check for gene_name or gene_id
255 <     //p=strstr(info, "gene_name");// this is preferred over gene_id
256 <     //if (p==NULL)
257 <     p=strstr(info,"gene_id");
258 <     if (p!=NULL) {
259 <       p+=7;//skip 'gene_id'
260 <       while (*p!='"' && *p!=0) p++;
261 <       if (*p==0) GError("Error parsing gene_id (double quotes expected) at GTF line:\n%s\n",l);
262 <       p++;
263 <       gname=p;
264 <       while (*p!='"' && *p!=0) p++;
265 <       if (*p==0) GError("Error parsing gene_id (ending double quotes expected) at GTF line:\n%s\n",l);
266 <       gname=Gstrdup(gname, p-1);
267 <       }
254 > ID=extractAttr("ID=",true);
255 > char* Parent=extractAttr("Parent=",true);
256 > is_gff3=(ID!=NULL || Parent!=NULL);
257 > if (is_gff3) {
258 >   //parse as GFF3
259 >    if (ID!=NULL) {
260 >       //has ID attr so it's likely to be a parent feature
261 >       //look for explicit gene name
262 >       gene_name=extractAttr("gene_name=");
263 >       if (gene_name==NULL) {
264 >           gene_name=extractAttr("geneName=");
265 >           if (gene_name==NULL) {
266 >               gene_name=extractAttr("gene_sym=");
267 >               if (gene_name==NULL) {
268 >                   gene_name=extractAttr("gene=");
269 >                   }
270 >               }
271 >           }
272 >       gene_id=extractAttr("geneID=");
273 >       if (gene_id==NULL) {
274 >          gene_id=extractAttr("gene_id=");
275 >          }
276 >       if (is_gene) {
277 >         //special case: keep the Name and ID attributes of the gene feature
278 >         if (gene_name==NULL)
279 >              gene_name=extractAttr("Name=");
280 >         if (gene_id==NULL) //the ID is also gene_id in this case
281 >              gene_id=Gstrdup(ID);
282 >         //skip=false;
283 >         //return;
284 >         GFREE(Parent); //TMI, we really don't care about gene Parents?
285 >         } //gene feature
286 >       }// has GFF3 ID
287 >   if (Parent!=NULL) {
288 >        //keep Parent attr
289 >         //parse multiple parents
290 >         num_parents=1;
291 >         p=Parent;
292 >         int last_delim_pos=-1;
293 >         while (*p!=';' && *p!=0) {
294 >             if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
295 >                 num_parents++;
296 >                 last_delim_pos=(p-Parent);
297 >                 }
298 >             p++;
299 >             }
300 >         _parents_len=p-Parent+1;
301 >         _parents=Parent;
302 >         GMALLOC(parents, num_parents*sizeof(char*));
303 >         parents[0]=_parents;
304 >         int i=1;
305 >         if (last_delim_pos>0) {
306 >           for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
307 >              if (*p==',') {
308 >                 char* ep=p-1;
309 >                 while (*ep==' ' && ep>_parents) ep--;
310 >                 *(ep+1)=0; //end the string there
311 >                 parents[i]=p+1;
312 >                 i++;
313 >                 }
314 >              }
315 >           }
316 >         } //has Parent field
317 >   } //GFF3
318 >  else { // GTF-like expected
319 >   Parent=extractAttr("transcript_id",true);
320 >   if (Parent!=NULL) { //GTF2 format detected
321 >     if (is_transcript) {
322 >         // atypical GTF with a parent transcript line declared
323 >         ID=Parent;
324 >         Parent=NULL;
325 >         }
326 >     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
327 >     if (gene_id==NULL)
328 >       gene_id=extractAttr("geneid");
329 >     gene_name=extractAttr("gene_name");
330 >     if (gene_name==NULL) {
331 >
332 >           gene_name=extractAttr("gene_sym");
333 >           if (gene_name==NULL) {
334 >               gene_name=extractAttr("gene");
335 >               if (gene_name==NULL)
336 >                  gene_name=extractAttr("genesymbol");
337 >               }
338 >           }
339       //prepare for parseAttr by adding '=' character instead of spaces for all attributes
340       //after the attribute name
341       p=info;
342       bool noed=true; //not edited after the last delim
343       bool nsp=false; //non-space found after last delim
344       while (*p!=0) {
345 <      if (*p==' ') {
346 <         if (nsp && noed) {
347 <           *p='=';
348 <            noed=false;
349 <            p++;
350 <            continue;
351 <            }
352 <         }
353 <      else nsp=true;
354 <      if (*p==';') { noed=true; nsp=false; }
355 <      p++;
356 <      }
357 <     } //gtf detected
358 <    else {//check for jigsaw or cufflinks format
359 <     char* fexon=strstr(fnamelc, "exon");
360 <     if (fexon!=NULL) {
345 >       if (*p==' ') {
346 >          if (nsp && noed) {
347 >             *p='=';
348 >             noed=false;
349 >             p++;
350 >             continue;
351 >             }
352 >           }
353 >         else nsp=true; //non-space
354 >       if (*p==';') { noed=true; nsp=false; }
355 >       p++;
356 >       }
357 >     } //GTF2 detected (no parent line)
358 >    else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
359 >     //char* fexon=strstr(fnamelc, "exon");
360 >     //if (fexon!=NULL) {
361 >     if (exontype==exgffExon) {
362         if (startsWith(track,"jigsaw")) {
363 <        is_cds=true;
364 <        strcpy(track,"jigsaw");
365 <        p=strchr(info,';');
366 <        if (p==NULL) Parent=Gstrdup(info);
367 <           else { Parent=Gstrdup(info,p-1); info=p+1;  }
368 <        }
369 <       else if ((i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {//one word ID
363 >          is_cds=true;
364 >          strcpy(track,"jigsaw");
365 >          p=strchr(info,';');
366 >          if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
367 >           else { Parent=Gstrdup(info,p-1);
368 >                  info=p+1;
369 >                }
370 >          }
371 >        } //exon feature?
372 >        if (Parent==NULL && exontype>=exgffCDS &&
373 >               (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
374 >          //one word ID ? really desperate attempt to parse it here
375            Parent=Gstrdup(info,info+i-1);
376 <        }
376 >          info=NULL; //discard anything else on the line
377 >          }
378 >     }
379 >   if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
380 >      _parents=Parent;
381 >      GMALLOC(parents,sizeof(char*));
382 >      num_parents=1;
383 >      parents[0]=_parents;
384 >      }
385 >   } //GTF-like
386  
387 + //parse other potentially useful features
388 + if (is_gff3) {
389 +   if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
390 +      p+=7;
391 +      while (*p!=';' && *p!=0 && *p!=' ') p++;
392 +      if (*p!=' ') {
393 +         GError("Error parsing target coordinates from GFF line:\n%s\n",l);
394 +         }
395 +      if (!parseUInt(p,qstart))
396 +         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
397 +      if (*p!=' ') {
398 +         GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
399 +         }
400 +      p++;
401 +      if (!parseUInt(p,qend))
402 +         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
403        }
404 <      else GError("Error parsing Parent/ID at input line:\n%s\n",l);
405 <     }
406 <   }
407 < p=strstr(info,"Target=");
408 < if (p!=NULL) { //has Target attr
409 <   p+=7;
410 <   while (*p!=';' && *p!=0 && *p!=' ') p++;
226 <   if (*p!=' ') {
227 <      GError("Error parsing target coordinates from GFF line:\n%s\n",l);
228 <      }
229 <   if (!parseUInt(p,qstart))
230 <     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
231 <   if (*p!=' ') {
232 <      GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
233 <      }
234 <   p++;
235 <   if (!parseUInt(p,qend))
236 <     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
237 <   }
238 < else {
239 <   p=strifind(info,"Qreg=");
240 <   if (p!=NULL) { //has Qreg attr
241 <     p+=5;
242 <     if (!parseUInt(p,qstart))
243 <       GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
244 <     if (*p!='-') {
245 <        GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
246 <        }
247 <     p++;
248 <     if (!parseUInt(p,qend))
249 <       GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
250 <     if (*p=='|') {
404 >   if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
405 >       p+=5;
406 >       if (!parseUInt(p,qstart))
407 >         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
408 >       if (*p!='-') {
409 >          GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
410 >          }
411         p++;
412 <       if (!parseUInt(p,qlen))
413 <         GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
414 <       }
415 <     }//has Qreg attr
416 <   }
417 < if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
418 <   p+=5;
419 <   if (!parseUInt(p,qlen))
420 <       GError("Error parsing target length from GFF Qlen:\n%s\n",l);
421 <   }
412 >       if (!parseUInt(p,qend))
413 >         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
414 >       if (*p=='|' || *p==':') {
415 >         p++;
416 >         if (!parseUInt(p,qlen))
417 >           GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
418 >         }
419 >       }//has Qreg attr
420 >   if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
421 >     p+=5;
422 >     if (!parseUInt(p,qlen))
423 >         GError("Error parsing target length from GFF Qlen:\n%s\n",l);
424 >     }
425 >   }//parsing some useful attributes in GFF3 records
426 > if (ID==NULL && parents==NULL) {
427 >      if (reader->gff_warns)
428 >          GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
429 >      return; //skip
430 >      }
431   skip=false;
432   }
433  
434 < int GffObj::addExon(GffLine* gl, bool keepAttr, bool noExonAttr) {
434 >
435 > void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
436 >  if (cd_start>=this->start) {
437 >        this->CDstart=cd_start;
438 >        if (strand=='+') this->CDphase=phase;
439 >        }
440 >      else this->CDstart=this->start;
441 >  if (cd_end<=this->end) {
442 >      this->CDend=cd_end;
443 >      if (strand=='-') this->CDphase=phase;
444 >      }
445 >     else this->CDend=this->end;
446 >  isTranscript(true);
447 >  exon_ftype_id=gff_fid_exon;
448 >  if (monoFeature()) {
449 >     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
450 >            else exons[0]->exontype=exgffExon;
451 >     }
452 > }
453 >
454 > int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
455    //this will make sure we have the right subftype_id!
456 <  int subf_id=-1;
457 <  if (ftype_id==gff_fid_mRNA) {
458 <     if (subftype_id<0) subftype_id=gff_fid_exon;
456 >  //int subf_id=-1;
457 >  if (!isTranscript() && gl->is_cds) {
458 >          isTranscript(true);
459 >          exon_ftype_id=gff_fid_exon;
460 >          if (exons.Count()==1) exons[0]->exontype=exgffExon;
461 >          }
462 >  if (isTranscript()) {
463 >     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
464 >          if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
465 >                         else exon_ftype_id=names->feats.addName(gl->ftype);
466 >          }
467       //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
468 <     if (!gl->is_cds && !gl->is_exon)
469 <         //extraneous mRNA feature, will discard for now
468 >     if (gl->exontype==0 && !gl->is_transcript) {
469 >          //extraneous mRNA feature, discard
470 >          if (reader->gff_warns)
471 >            GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
472 >                gl->ftype, gffID);
473            return -1;
474 +          }
475       }
476 <  else { //other kind of parent feature, check this subf type
477 <    subf_id=names->feats.addName(gl->ftype);
478 <    if (subftype_id<0)
479 <       subftype_id=subf_id;
480 <    else {
481 <       if (subftype_id!=subf_id)
482 <         GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, only %s is kept\n",
483 <             names->feats.getName(subftype_id), names->feats.getName(subf_id),
484 <             gffID,names->feats.getName(subftype_id));
485 <         return -1; //skip this 2nd subfeature type for this parent!
486 <       }
487 <    }
488 <  if (gl->exontype==exgffUTR || gl->exontype==exgffStop)
489 <      udata=1;//merge 0-distance segments
476 >  else { //non-mRNA parent feature, check this subf type
477 >    int subf_id=names->feats.addName(gl->ftype);
478 >    if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
479 >       exon_ftype_id=subf_id;
480 >     else {
481 >       if (exon_ftype_id!=subf_id) {
482 >         //
483 >         if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
484 >            //the existing exon was just a dummy one created by default, discard it
485 >            exons.Clear();
486 >            covlen=0;
487 >            exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
488 >            }
489 >         else { //multiple subfeatures, prefer those with
490 >             if (reader->gff_warns)
491 >               GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
492 >                  names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
493 >            if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
494 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
495 >               exon_ftype_id=subf_id;
496 >               exons.Clear();
497 >               covlen=0;
498 >               }
499 >              else { //discard new feature
500 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
501 >               return -1; //skip this 2nd subfeature type for this parent!
502 >               }
503 >            }
504 >         } //incoming subfeature is of different type
505 >       } //new subfeature type
506 >    } //non-mRNA parent
507    int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
508           gl->qstart,gl->qend, gl->is_cds, gl->exontype);
509 <  if (eidx>=0 && keepAttr) {
510 <      parseAttrs(exons[eidx]->attrs, gl->info, noExonAttr);
509 >  if (eidx<0) return eidx; //this should never happen
510 >  if (keepAttr) {
511 >     if (noExonAttr) {
512 >         if (attrs==NULL) //place the parsed attributes directly at transcript level
513 >           parseAttrs(attrs, gl->info);
514 >         }
515 >       else { //need all exon-level attributes
516 >         parseAttrs(exons[eidx]->attrs, gl->info, true);
517 >         }
518        }
519    return eidx;
520   }
521  
522  
298 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
299  //oi is the index of the *first* overlapping segment found that must be enlarged
300  covlen-=exons[oi]->len();
301  if (segstart<exons[oi]->start)
302    exons[oi]->start=segstart;
303  if (qs<exons[oi]->qstart) exons[oi]->qstart=qs;
304  if (segend>exons[oi]->end)
305    exons[oi]->end=segend;
306  if (qe>exons[oi]->qend) exons[oi]->qend=qe;
307  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
308  if (sc!=0) exons[oi]->score=sc;
309  covlen+=exons[oi]->len();
310  //if (exons[oi]->exontype< exontype) -- always true
311  exons[oi]->exontype = exontype;
312  if (exontype==exgffCDS) exons[oi]->phase=fr;
313  //we must check if any more exons are also overlapping this
314  int ni=oi+1; //next exon index after oi
315  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
316     //only allow this if next segment is fully included, and a subordinate
317     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
318         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
319         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
320         exons.Delete(ni);
321         }
322      else {
323         GMessage("GFF Warning: overlapping feature segment (%d-%d) for GFF ID %s\n", segstart, segend, gffID);
324         hasErrors=true;
325         break;
326         }
327     }
328  // -- make sure any other related boundaries are updated:
329  start=exons.First()->start;
330  end=exons.Last()->end;
331  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
332    GSeqStat* gsd=(GSeqStat*)uptr;
333    if (start<gsd->mincoord) gsd->mincoord=start;
334    if (end>gsd->maxcoord) gsd->maxcoord=end;
335    }
336 }
337
523   int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
524    if (exons.Count()==0) {
525        if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
526 <      if (subftype_id<0) {
527 <         subftype_id = (ftype_id==gff_fid_mRNA) ? gff_fid_exon : ftype_id;
526 >      if (exon_ftype_id<0) {
527 >         exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
528           }
529        }
530 <  if (iscds) { //update CDS anchors:
530 >  //special treatment of start/stop codon features, they might be broken/split between exons
531 >  //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
532 >  //so we should not trust the end coordinate for such features
533 >  if (exontype==exgffStart || exontype==exgffStop) {
534 >     if (strand=='-') segstart=segend;
535 >                else  segend=segstart;
536 >     if (exontype==exgffStart) {
537 >           if (CDstart==0 || segstart<CDstart) CDstart=segstart;
538 >           }
539 >         else {
540 >           if (segstart>CDend) CDend=segstart;
541 >           }
542 >     }
543 >    else if (iscds) { //update CDS anchors:
544       if (CDstart==0 || segstart<CDstart)  {
545             CDstart=segstart;
546             if (exontype==exgffCDS && strand=='+') CDphase=fr;
# Line 352 | Line 550
550             CDend=segend;
551             }
552       }
553 <   else { // !iscds
553 >   else { // not a CDS/start/stop
554       isCDS=false;
555       }
556    if (qs || qe) {
557 <    if (qs>qe) swap(qs,qe);
557 >    if (qs>qe) Gswap(qs,qe);
558      if (qs==0) qs=1;
559 <    }
559 >        }
560    int ovlen=0;
561 <  int oi=exonOverlapIdx(segstart, segend, &ovlen);
562 <  if (oi>=0) { //overlap existing segment
563 <     //only allow this for CDS within exon, stop_codon within exon, stop_codon within UTR,
564 <     //                        or stop_codon within CDS
565 <    if (exons[oi]->exontype>exontype &&
566 <         exons[oi]->start<=segstart && exons[oi]->end>=segend &&
567 <         !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
568 <          //larger segment given first, now the smaller included one
569 <          return oi; //only used to store attributes from current GffLine
570 <          }
571 <    if (exontype>exons[oi]->exontype &&
572 <         segstart<=exons[oi]->start && segend>=exons[oi]->end &&
573 <         !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
574 <           //smaller segment given first, so we have to enlarge it
575 <          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
576 <            //this must also check for overlapping next exon (oi+1)
577 <          return oi;
578 <          }
579 <    //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
580 <    //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
581 <    if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
582 <       GMessage("GFF Warning: discarded overlapping feature segment (%d-%d) for GFF ID %s\n",
583 <           segstart, segend, gffID);
584 <       hasErrors=true;
585 <       return false; //segment NOT added
586 <       }
587 <     /* --
588 <     else {
589 <       // else add the segment if the overlap is small and between two CDS segments
590 <       //we might want to add an attribute here with the slippage coordinate and size?
591 <       }
592 <     */
593 <    }//overlap of existing segment
594 <
561 >  if (exontype>0) { //check for overlaps between exon-type segments
562 >      int oi=exonOverlapIdx(segstart, segend, &ovlen);
563 >      if (oi>=0) { //overlap existing segment
564 >         if (ovlen==0) {
565 >                          //adjacent segments will be merged
566 >                          //e.g. CDS to (UTR|exon)
567 >                          if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
568 >                                  (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
569 >                                        expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
570 >                                        return oi;
571 >                                        }
572 >                          //CDS adjacent to stop_codon: UCSC does (did?) this
573 >                          if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
574 >                                  (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
575 >                                        expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
576 >                                        return oi;
577 >                                        }
578 >             }
579 >                 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
580 >         //                   start_codon within (CDS|exon)
581 >        if (exons[oi]->exontype>exontype &&
582 >             exons[oi]->start<=segstart && exons[oi]->end>=segend &&
583 >             !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
584 >              //larger segment given first, now the smaller included one is redundant
585 >              return oi; //only used to store attributes from current GffLine
586 >              }
587 >        if (exontype>exons[oi]->exontype &&
588 >             segstart<=exons[oi]->start && segend>=exons[oi]->end &&
589 >             !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
590 >               //smaller segment given first, so we have to enlarge it
591 >                          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
592 >                                //this should also check for overlapping next exon (oi+1) ?
593 >              return oi;
594 >              }
595 >        //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
596 >        //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
597 >                //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
598 >                // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
599 >                /*
600 >                if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
601 >                   if (gff_show_warnings)
602 >                           GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
603 >                           segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
604 >                   hasErrors(true);
605 >                   return -1; //segment NOT added
606 >                   }
607 >                */
608 >
609 >                 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
610 >                  if (gff_show_warnings)
611 >                         GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
612 >                                 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
613 >                        expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
614 >                        return oi;
615 >                 }
616 >                // else add the segment if the overlap is small and between two CDS segments
617 >                //TODO: we might want to add an attribute here with the slippage coordinate and size?
618 >        covlen-=ovlen;
619 >                }//overlap or adjacent to existing segment
620 >           } //check for overlap
621     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
622     // create & add the new segment
623 +   /*
624 +   if (start>0 && exontype==exgffCDS && exons.Count()==0) {
625 +      //adding a CDS directly as the first subfeature of a declared parent
626 +      segstart=start;
627 +      segend=end;
628 +      }
629 +   */
630     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
631     int eidx=exons.Add(enew);
632     if (eidx<0) {
633 <     GMessage("GFF Warning: duplicate segment %d-%d not added for GFF ID %s!\n",
633 >    //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
634 >     if (gff_show_warnings)
635 >       GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
636              segstart, segend, gffID);
637 +     delete enew;
638 +     hasErrors(true);
639       return -1;            
640       }
641     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
642 <   start=exons.First()->start;
643 <   end=exons.Last()->end;
642 >   //adjust parent feature coordinates to contain this exon
643 >   if (start==0 || start>exons.First()->start) {
644 >     start=exons.First()->start;
645 >     }
646 >   if (end<exons.Last()->end) end=exons.Last()->end;
647 >    
648     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
649         GSeqStat* gsd=(GSeqStat*)uptr;
650         if (start<gsd->mincoord) gsd->mincoord=start;
651         if (end>gsd->maxcoord) gsd->maxcoord=end;
652 +       if (this->len()>gsd->maxfeat_len) {
653 +          gsd->maxfeat_len=this->len();
654 +          gsd->maxfeat=this;
655 +          }
656         }
657     return eidx;
658   }
659  
660 + void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
661 +  //oi is the index of the *first* overlapping segment found that must be enlarged
662 +  covlen-=exons[oi]->len();
663 +  if (segstart<exons[oi]->start)
664 +    exons[oi]->start=segstart;
665 +  if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
666 +  if (segend>exons[oi]->end)
667 +    exons[oi]->end=segend;
668 +  if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
669 +  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
670 +  if (sc!=0) exons[oi]->score=sc;
671 +  covlen+=exons[oi]->len();
672 +  //if (exons[oi]->exontype< exontype) -- always true
673 +  exons[oi]->exontype = exontype;
674 +  if (exontype==exgffCDS) exons[oi]->phase=fr;
675 +  //we must check if any more exons are also overlapping this
676 +  int ni=oi+1; //next exon index after oi
677 +  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
678 +     //only allow this if next segment is fully included, and a subordinate
679 +     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
680 + /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
681 + chr1    hg18_knownGene  start_codon     69806911        69806913        0.000000        +       .
682 + chr1    hg18_knownGene  CDS     69806911        69806912        0.000000        +       0
683 + chr1    hg18_knownGene  exon    69805456        69806912        0.000000        +       .    
684 + */
685 +         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
686 +         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
687 +         exons.Delete(ni);
688 +         }
689 +      else {
690 +         if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
691 +                exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
692 +         //hasErrors(true);
693 +         break;
694 +         }
695 +     }
696 +  // -- make sure any other related boundaries are updated:
697 +  start=exons.First()->start;
698 +  end=exons.Last()->end;
699 +  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
700 +    GSeqStat* gsd=(GSeqStat*)uptr;
701 +    if (start<gsd->mincoord) gsd->mincoord=start;
702 +    if (end>gsd->maxcoord) gsd->maxcoord=end;
703 +    if (this->len()>gsd->maxfeat_len) {
704 +        gsd->maxfeat_len=this->len();
705 +        gsd->maxfeat=this;
706 +        }
707 +    }
708 + }
709 +
710   void GffObj::removeExon(int idx) {
711    /*
712     if (idx==0 && segs[0].start==gstart)
# Line 431 | Line 724
724    if (isCDS) { CDstart=start; CDend=end; }
725   }
726  
727 + void GffObj::removeExon(GffExon* p) {
728 +  for (int idx=0;idx<exons.Count();idx++) {
729 +     if (exons[idx]==p) {
730 +        int segstart=exons[idx]->start;
731 +        int segend=exons[idx]->end;
732 +        exons.Delete(idx);
733 +        covlen -= (int)(segend-segstart)+1;
734 +        start=exons.First()->start;
735 +        end=exons.Last()->end;
736 +        if (isCDS) { CDstart=start; CDend=end; }
737 +        return;
738 +        }
739 +     }
740 + }
741 +
742 +
743 +
744   GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
745 <     GSeg(0,0), exons(true,true,true) {
746 < xstart=0;
747 < xend=0;
748 < xstatus=0;
749 < partial=false;
750 < isCDS=false;
751 < uptr=NULL;
752 < ulink=NULL;
753 < udata=0;
754 < CDstart=0;
755 < CDend=0;
756 < gname=NULL;
757 < attrs=NULL;
758 < gffID=NULL;
759 < track_id=-1;
760 < gseq_id=-1;
761 < ftype_id=-1;
762 < subftype_id=-1;
763 < hasErrors=false;
764 < if (gfrd==NULL)
745 >     GSeg(0,0), exons(true,true,false), children(1,false) {
746 >  xstart=0;
747 >  xend=0;
748 >  xstatus=0;
749 >  partial=false;
750 >  isCDS=false;
751 >  uptr=NULL;
752 >  ulink=NULL;
753 >  parent=NULL;
754 >  udata=0;
755 >  flags=0;
756 >  CDstart=0;
757 >  CDend=0;
758 >  CDphase=0;
759 >  geneID=NULL;
760 >  gene_name=NULL;
761 >  attrs=NULL;
762 >  gffID=NULL;
763 >  track_id=-1;
764 >  gseq_id=-1;
765 >  ftype_id=-1;
766 >  exon_ftype_id=-1;
767 >  strand='.';
768 >  if (gfrd==NULL)
769      GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
770 < gffnames_ref(names);
771 < if (gfrd->names==NULL) gfrd->names=names;
772 < qlen=0;qstart=0;qend=0;
773 < gscore=0;
774 < uscore=0;
775 < covlen=0;
776 < qcov=0;
777 < if (gffline->Parent!=NULL) {
778 <    //GTF style -- subfeature given directly
779 <    if (gffline->is_cds || gffline->is_exon)
780 <         ftype_id=gff_fid_mRNA;
781 <      else {
782 <        //group of other subfeatures of type ftype:
783 <        ftype_id=names->feats.addName(gffline->ftype);
770 >  gffnames_ref(names);
771 >  if (gfrd->names==NULL) gfrd->names=names;
772 >  //qlen=0;qstart=0;qend=0;
773 >  gscore=0;
774 >  uscore=0;
775 >  covlen=0;
776 >  qcov=0;
777 >  start=gffline->fstart;
778 >  end=gffline->fend;
779 >  gseq_id=names->gseqs.addName(gffline->gseqname);
780 >  track_id=names->tracks.addName(gffline->track);
781 >  strand=gffline->strand;
782 >  qlen=gffline->qlen;
783 >  qstart=gffline->qstart;
784 >  qend=gffline->qend;
785 >  //setup flags from gffline
786 >  isCDS=gffline->is_cds; //for now
787 >  isGene(gffline->is_gene);
788 >  isTranscript(gffline->is_transcript || gffline->exontype!=0);
789 >  //fromGff3(gffline->is_gff3);
790 >
791 >  if (gffline->parents!=NULL) {
792 >    //GTF style -- create a GffObj directly by subfeature
793 >    //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
794 >    if (gffline->exontype!=0) { //recognized exon-like feature
795 >       ftype_id=gff_fid_transcript; //so this is some sort of transcript
796 >       exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
797 >       }
798 >     else {//unrecognized subfeatures
799 >       //make this GffObj of the same feature type
800 >       ftype_id=names->feats.addName(gffline->ftype);
801 >       }
802 >    if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
803 >        gffID=Gstrdup(gffline->parents[0]);
804 >        this->createdByExon(true);
805 >        //this is likely the first exon/segment of the feature
806 >        addExon(gfrd, gffline, keepAttr, noExonAttr);
807          }
808 <    gffID=gffline->Parent;
809 <    gffline->Parent=NULL; //just take over
810 <    if (gffline->gname!=NULL) {
811 <        gname=gffline->gname;
812 <        gffline->gname=NULL;
808 >      else { //a parented feature with an ID -- probably an orphan or premature GFF3 subfeature line
809 >        if (gffline->is_gff3 && gffline->exontype!=0) {
810 >             //premature exon given before its parent transcript
811 >             //create the transcript entry here
812 >             gffID=Gstrdup(gffline->parents[0]);
813 >             this->createdByExon(true);
814 >             //this is the first exon/segment of the transcript
815 >             addExon(gfrd, gffline, keepAttr, noExonAttr);
816 >             }
817 >            else { //unrecognized non-exon feature ? use the ID instead
818 >             gffID=Gstrdup(gffline->ID);
819 >             if (keepAttr) this->parseAttrs(attrs, gffline->info);
820 >             }
821          }
822 <    gseq_id=names->gseqs.addName(gffline->gseqname);
823 <    track_id=names->tracks.addName(gffline->track);
824 <    strand=gffline->strand;
480 <    qlen=gffline->qlen;
481 <    start=gffline->fstart;
482 <    end=gffline->fend;
483 <    isCDS=gffline->is_cds; //for now
484 <    addExon(gffline, keepAttr, noExonAttr);
485 <    if (keepAttr && noExonAttr) {
486 <      //simply move the attrs from this first exon
487 <      //to the transcript
488 <      if (exons.First()->attrs!=NULL) {
489 <         attrs=exons.First()->attrs;
490 <         exons.First()->attrs=NULL;
491 <         }
492 <      }
493 <    } //GTF line
494 < else { //GffReader made sure this is a parent line (no parent)
495 <    //even for a mRNA with a Parent= line
822 >    } //subfeature given directly
823 >  else { //gffline->parents==NULL
824 >    //create a parent feature in its own right
825      gscore=gffline->score;
826      if (gffline->ID==NULL || gffline->ID[0]==0)
827 <       GError("Error: no ID found for GFF record start\n");
828 <    gffID=gffline->ID; //there must be an ID here
829 <    if (gffline->is_mrna) ftype_id=gff_fid_mRNA;
830 <        else ftype_id=names->feats.addName(gffline->ftype);
831 <    gffline->ID=NULL; //steal it
832 <    if (gffline->gname!=NULL) {
833 <        gname=gffline->gname;
834 <        gffline->gname=NULL;
827 >      GError("Error: no ID found for GFF record start\n");
828 >    this->hasGffID(true);
829 >    gffID=Gstrdup(gffline->ID); //there must be an ID here
830 >    //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
831 >      //else
832 >    ftype_id=names->feats.addName(gffline->ftype);
833 >    if (gffline->is_transcript)
834 >      exon_ftype_id=gff_fid_exon;
835 >
836 >    if (keepAttr) this->parseAttrs(attrs, gffline->info);
837 >    }//no parent
838 >
839 >  if (gffline->gene_name!=NULL) {
840 >     gene_name=Gstrdup(gffline->gene_name);
841 >     }
842 >  if (gffline->gene_id!=NULL) {
843 >     geneID=Gstrdup(gffline->gene_id);
844 >     }
845 >
846 >  GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
847 >  uptr=gsd;
848 >  if (start<gsd->mincoord) gsd->mincoord=start;
849 >  if (end>gsd->maxcoord) gsd->maxcoord=end;
850 >    if (this->len()>gsd->maxfeat_len) {
851 >        gsd->maxfeat_len=this->len();
852 >        gsd->maxfeat=this;
853          }
507    start=gffline->fstart;
508    end=gffline->fend;
509    gseq_id=names->gseqs.addName(gffline->gseqname);
510    track_id=names->tracks.addName(gffline->track);
511    qlen=gffline->qlen;
512    qstart=gffline->qstart;
513    qend=gffline->qend;
514    strand=gffline->strand;
515    if (keepAttr) this->parseAttrs(attrs, gffline->info, noExonAttr);
516    }
517 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
518 uptr=gsd;
519 gsd->gflst.Add(this);
520 if (start<gsd->mincoord) gsd->mincoord=start;
521 if (end>gsd->maxcoord) gsd->maxcoord=end;
522 gfrd->gfoAdd(gffID, gffline->gseqname, this);
854   }
855  
525
856   GffLine* GffReader::nextGffLine() {
857   if (gffline!=NULL) return gffline; //caller should free gffline after processing
858   while (gffline==NULL) {
529    //const char* l=linebuf->getLine();
859      int llen=0;
860      buflen=GFF_LINELEN-1;
861      char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
# Line 540 | Line 869
869      if (gffline->skip) {
870         delete gffline;
871         gffline=NULL;
872 +       continue;
873         }
874 +    if (gffline->ID==NULL && gffline->parents==NULL)  { //it must have an ID
875 +        //this might not be needed, already checked in the GffLine constructor
876 +        if (gff_warns)
877 +            GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
878 +        delete gffline;
879 +        gffline=NULL;
880 +        //continue;
881 +        }
882      }
883   return gffline;
884   }
885  
886  
549 GffObj* GffReader::parse(bool keepAttr, bool noExonAttr) { //<- do not use this!
550   //parses one parent feature at a time, assuming absolute grouping of subfeatures
551   //!ASSUMES that subfeatures (exons) are properly grouped together
552   //any interleaving exons from other transcripts will terminate the parsing of the current feature!
553  GffObj* gfo=NULL;
554  while (nextGffLine()!=NULL) {
555    if (gfo==NULL) {//record starts fresh here
556       gfo=new GffObj(this, gffline, keepAttr, noExonAttr);
557       delete gffline; gffline=NULL;
558       continue;
559       }
560 // -- gfo is not NULL from here --
561    if (gffline->Parent==NULL) {// new parent feature starting here
562       //new record start, return what we have so far,
563       //gffline was NOT deleted, so it will be used for the next parse() call
564       return ((gfo==NULL) ? NULL : gfo->finalize());
565       }
566    //-- has a Parent so it's a subfeature segment (exon/CDS/other subfeature)
567      // is it a subfeature of the current gf?
568    if (strcmp(gffline->Parent, gfo->gffID)==0) {
569          //yes, add it
570          gfo->addExon(gffline, !noExonAttr, noExonAttr);
571          delete gffline; gffline=NULL;
572          continue;
573          }
574      // is it a subfeature of a previously loaded gfo?
575    GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
576    if (prevgfo==NULL)
577           return ((gfo==NULL) ? NULL : gfo->finalize()); // new subfeature, gffline will be used for the next parse()
578    //this is for an earlier parent
579    prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
580    delete gffline;
581    gffline=NULL;
582    } //while reading gfflines
583  return ((gfo==NULL) ? NULL : gfo->finalize());
584 }
887   char* GffReader::gfoBuildId(const char* id, const char* ctg) {
888   //caller must free the returned pointer
889   char* buf=NULL;
# Line 592 | Line 894
894   strcpy(buf+idlen+1, ctg);
895   return buf;
896   }
897 <
897 > /*
898   void GffReader::gfoRemove(const char* id, const char* ctg) {
899   char* buf=gfoBuildId(id,ctg);
900   phash.Remove(buf);
901   GFREE(buf);
902   }
903 + */
904 + //Warning: if gflst gets altered, idx becomes obsolete
905 + GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
906 + //TODO: must make sure the gfo ID isn't there already.
907 +
908 + GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
909 + if (glst==NULL)
910 +         glst=new GVec<GfoHolder>(1);
911 + GfoHolder gh(gfo,idx);
912 + int i=glst->Add(gh);
913 + phash.Add(gfo->gffID, glst);
914 + return &(glst->Get(i));
915 + }
916 +
917 + GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) {
918 + GfoHolder gh(gfo,idx);
919 + int i=glst.Add(gh);
920 + return &(glst[i]);
921 + }
922 +
923 + GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
924 +                    GVec<GfoHolder>** glst, char strand, uint start, uint end) {
925 + GVec<GfoHolder>* gl=phash.Find(id);
926 + GfoHolder* gh=NULL;
927 + if (gl) {
928 +   for (int i=0;i<gl->Count();i++) {
929 +      GfoHolder& gfo = gl->Get(i);
930 +      if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
931 +           continue;
932 +      if (strand && strand != gfo.gffobj->strand)
933 +           continue;
934 +      if (start>0) {
935 +           if (abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS)
936 +               continue;
937 +           if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
938 +                   continue;
939 +           }
940 +      //must be the same transcript, according to given comparison criteria
941 +      gh=&gfo;
942 +      break;
943 +      }
944 +   }
945 + if (glst) *glst=gl;
946 + return gh;
947 + }
948  
949 < void GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo) {
950 < char* buf=gfoBuildId(id,ctg);
951 < phash.Add(buf, gfo);
952 < GFREE(buf);
949 > GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
950 >  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
951 >  GfoHolder* r=NULL;
952 >  if (replaceidx>=0) {
953 >     gflst.Put(replaceidx,newgfo);
954 >     r=gfoAdd(newgfo, replaceidx);
955 >     }
956 >   else {
957 >     int gfoidx=gflst.Add(newgfo);
958 >     r=gfoAdd(newgfo, gfoidx);
959 >     }
960 >  /*
961 >  if (gff_warns) {
962 >    int* pcount=tids.Find(newgfo->gffID);
963 >    if (pcount!=NULL) {
964 >      if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
965 >       (*pcount)++;
966 >       }
967 >     else {
968 >       tids.Add(newgfo->gffID,new int(1));
969 >       }
970 >    }
971 >   */
972 >  return r;
973   }
974 < GffObj* GffReader::gfoFind(const char* id, const char* ctg) {
975 < char* buf=gfoBuildId(id,ctg);
976 < GffObj* r=phash.Find(buf);
977 < GFREE(buf);
978 < return r;
974 >
975 > GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
976 >  //assert(parent);
977 >  //assert(newgfo);
978 >  parent->children.Add(newgfh->gffobj);
979 >  if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
980 >  newgfh->gffobj->setLevel(parent->getLevel()+1);
981 >  if (parent->isGene()) {
982 >      if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
983 >        newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
984 >      if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
985 >        newgfh->gffobj->geneID=Gstrdup(parent->geneID);
986 >      }
987 >
988 >  return newgfh;
989   }
990  
991 + GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
992 +                          GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) {
993 +  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
994 +  GfoHolder* r=NULL;
995 +  int gfoidx=gflst.Add(newgfo);
996 +  r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
997 +  if (parent!=NULL) {
998 +    updateParent(r, parent);
999 +    if (pexon!=NULL) parent->removeExon(pexon);
1000 +    }
1001 +  /*
1002 +  if (gff_warns) {
1003 +    int* pcount=tids.Find(newgfo->gffID);
1004 +    if (pcount!=NULL) {
1005 +       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
1006 +       (*pcount)++;
1007 +       }
1008 +     else {
1009 +       tids.Add(newgfo->gffID,new int(1));
1010 +       }
1011 +    }
1012 +  */
1013 +  return r;
1014 + }
1015  
1016 < void GffReader::parseAll(GffRecFunc* gproc, bool keepAttr, bool noExonAttr, void* userptr1, void* userptr2) {
1017 <  //WARNING: this is all messed up if the GFF lines are NOT grouped by parent feature
1018 <  //!!! do not use this !!!
1019 <  //iterates through all mappings in the input file
1020 <  //calling gproc with each parsed mapping
1021 <    GffObj* gfo;
1022 <    while ((gfo=this->parse(keepAttr,noExonAttr))!=NULL) { //a valid gff record was parsed
1023 <        if (gfo->empty()) { //shouldn't happen!
1024 <             delete gfo;
1025 <             gfo=NULL;
1026 <             continue;
1027 <             }
1028 <     if (gproc(gfo, userptr1, userptr2)) {
1029 <             //true returned from GfProcFunc means no longer needed
1030 <              gfoRemove(gfo->gffID, gfo->getGSeqName());
630 <              delete gfo;
631 <              }
632 <         else {
633 <              gflst.Add(gfo);
634 <              }
635 <     gfo=NULL;
636 <     } //while records are parsed
637 <    phash.Clear();
1016 > GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1017 >                                         bool keepAttr) {
1018 > if (prevgfo==NULL) return NULL;
1019 > //prevgfo->gffobj->createdByExon(false);
1020 > prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
1021 > prevgfo->gffobj->start=gffline->fstart;
1022 > prevgfo->gffobj->end=gffline->fend;
1023 > prevgfo->gffobj->isGene(gffline->is_gene);
1024 > prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
1025 > prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
1026 > if (keepAttr) {
1027 >   if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
1028 >   prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
1029 >   }
1030 > return prevgfo;
1031   }
1032  
1033  
1034 + bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1035 +  bool r=true;
1036 +  if (gffline->strand!=prevgfo->gffobj->strand) {
1037 +     //TODO: add support for trans-splicing and even inter-chromosomal fusions
1038 +        if (prevgfo->gffobj->strand=='.') {
1039 +            prevgfo->gffobj->strand=gffline->strand;
1040 +        }
1041 +     else {
1042 +       GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1043 +       prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
1044 +       gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
1045 +       //r=false;
1046 +       return true; //FIXME: split trans-spliced mRNAs by strand
1047 +       }
1048 +   }
1049 +  int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
1050 +                      ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
1051 +                         0 );
1052 +  if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1053 +    GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
1054 +    //validation_errors = true;
1055 +    r=false;
1056 +    if (!gff_warns) exit(1);
1057 +    }
1058 +  int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
1059 +  if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
1060 +      subfPoolAdd(pex, prevgfo);
1061 +  return r;
1062 + }
1063 +
1064 + CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
1065 +  CNonExon* subp=NULL;
1066 +  subp_name=NULL;
1067 +  for (int i=0;i<gffline->num_parents;i++) {
1068 +    if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1069 +        continue;
1070 +    subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
1071 +    subp=pex.Find(subp_name);
1072 +    if (subp!=NULL)
1073 +       return subp;
1074 +    GFREE(subp_name);
1075 +    }
1076 +  return NULL;
1077 + }
1078 +
1079 + void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
1080 + //this might become a parent feature later
1081 + if (newgfo->gffobj->exons.Count()>0) {
1082 +   char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
1083 +   pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
1084 +       newgfo->gffobj->exons[0], gffline));
1085 +   GFREE(xbuf);
1086 +   }
1087 + }
1088 +
1089 + GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
1090 +    bool keepAttr, bool noExonAttr) {
1091 +  GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
1092 +  if (prevp!=gflst[subp->idx])
1093 +    GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
1094 +  subp->gffline->discardParent();
1095 +  GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
1096 +  pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
1097 +  prevp->promotedChildren(true);
1098 +  return gfoh; //returns the holder of newly promoted feature
1099 + }
1100 +
1101 + //have to parse the whole file because exons can be scattered all over
1102 + //trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
1103   void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1104 +  bool validation_errors = false;
1105 +  //loc_debug=false;
1106 +  GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1107 +                     //and thus could become promoted to parent features
1108    while (nextGffLine()!=NULL) {
1109 <    if (gffline->Parent==NULL) {//no parent, new GFF3-like record starting
1110 <       //check for uniqueness of gffline->ID in phash !
1111 <       GffObj* f=gfoFind(gffline->ID, gffline->gseqname);
1112 <       if (f!=NULL) {
1113 <            GError("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1114 <            }
1115 <       gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1109 >     GfoHolder* prevseen=NULL;
1110 >     GVec<GfoHolder>* prevgflst=NULL;
1111 >     if (gffline->ID && gffline->exontype==0) {
1112 >         //>>>>> for a parent-like IDed feature (mRNA, gene, etc.)
1113 >                 //look for same ID on the same chromosome/strand/locus
1114 >                 prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
1115 >                 if (prevseen!=NULL) {
1116 >                                //same ID/chromosome combo encountered before
1117 >                                if (prevseen->gffobj->createdByExon() &&
1118 >                                          prevseen->gffobj->start>=gffline->fstart &&
1119 >                                          prevseen->gffobj->end<=gffline->fend) {
1120 >                                        //an exon of this ID was given before
1121 >                                        //this line has the main attributes for this ID
1122 >                                        updateGffRec(prevseen, gffline, keepAttr);
1123 >                                        }
1124 >                                 else {
1125 >                                        //- duplicate ID -- this must be a discontiguous feature
1126 >                                   //   e.g. a trans-spliced transcript
1127 >                                   if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
1128 >                                          //overlapping with same ID not allowed
1129 >                                         GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1130 >                                         //validation_errors = true;
1131 >                                         if (gff_warns) {
1132 >                                                   delete gffline;
1133 >                                                   gffline=NULL;
1134 >                                                   continue;
1135 >                                                   }
1136 >                                         else exit(1);
1137 >                                     }
1138 >                                    //create a new entry with the same ID
1139 >                    prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1140 >                            prevseen->gffobj->parent, NULL, prevgflst);
1141 >                                        } //duplicate ID on the same chromosome
1142 >                                } //prevseeen != NULL
1143 >        } //parent-like ID feature
1144 >    if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1145 >       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
1146         }
1147 <    else { //--- it's a subfeature (exon/CDS/other):
1148 <       GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
1149 <       if (prevgfo!=NULL) { //exon of a previously seen GffObj
1150 <                 if (gffline->strand!=prevgfo->strand) {
1151 <                    GError("Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
1152 <                       prevgfo->gffID, prevgfo->getGSeqName());
1153 <                    }
1154 <                 int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1155 <                                     ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1156 <                                        0 );
1157 <                 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1158 <                   GError("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1159 <                   }
1160 <                 prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
1161 <                 }
1162 <            else {//new GTF-like record starting here with a subfeature
1163 <                 gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1164 <                 //even those with errors will be added here!
1165 <                 }
1166 <       } //subfeature
1167 <      //--
1168 <    delete gffline;
1169 <    gffline=NULL;
1170 <    }//while
1171 <  for (int i=0;i<gflst.Count();i++) {
1172 <    gflst[i]->finalize(mergeCloseExons); //finalize the parsing - also merge close exons/features if so requested
1173 <    }
1147 >    else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
1148 >       bool found_parent=false;
1149 >       GfoHolder* newgfo=prevseen;
1150 >       GVec<GfoHolder>* newgflst=NULL;
1151 >       for (int i=0;i<gffline->num_parents;i++) {
1152 >            if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1153 >                continue; //skipping discarded parent feature
1154 >            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1155 >                                          &newgflst, gffline->strand, gffline->fstart, gffline->fend);
1156 >            if (parentgfo!=NULL) { //parent GffObj parsed earlier
1157 >                   found_parent=true;
1158 >                   if (parentgfo->gffobj->isGene() && gffline->is_transcript
1159 >                                   && gffline->exontype==0) {
1160 >                       //not an exon, but a transcript parented by a gene
1161 >                       if (newgfo) {
1162 >                           updateParent(newgfo, parentgfo->gffobj);
1163 >                           }
1164 >                         else {
1165 >                           newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1166 >                           }
1167 >                       }
1168 >                   else { //potential exon subfeature
1169 >                       if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1170 >                         validation_errors=true;
1171 >                       }
1172 >                   } //overlapping parent feature found
1173 >            } //for each parsed parent Id
1174 >       if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1175 >             //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1176 >            //check if this feature isn't parented by a previously stored "exon" subfeature
1177 >            char* subp_name=NULL;
1178 >            CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1179 >            if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1180 >               //promote that subfeature to a full GffObj
1181 >               GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1182 >               //add current gffline as an exon of the newly promoted subfeature
1183 >               if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1184 >                      validation_errors=true;
1185 >               }
1186 >              else { //no parent seen before, create one directly with this exon
1187 >               //loc_debug=true;
1188 >               GfoHolder* ngfo=prevseen;
1189 >               if (ngfo==NULL)
1190 >                   ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1191 >               if (gffline->ID!=NULL && gffline->exontype==0)
1192 >                     subfPoolAdd(pex, ngfo);
1193 >               //even those with errors will be added here!
1194 >               }
1195 >            GFREE(subp_name);
1196 >            } //no previous parent found
1197 >       } //parented feature
1198 >        //--
1199 >      delete gffline;
1200 >      gffline=NULL;
1201 >      }//while gff lines
1202 >  gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1203   // all gff records are now loaded in GList gflst
1204   // so we can free the hash
1205    phash.Clear();
1206 +  //tids.Clear();
1207 +  if (validation_errors) {
1208 +    exit(1);
1209 +    }
1210   }
1211  
1212 < //this may be called prematurely if exon records are not grouped by parent
1213 < GffObj* GffObj::finalize(bool mergeCloseExons) {
1212 > GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1213 > //merge
1214 > //always merge adjacent or overlapping segments
1215 > //but if mergeCloseExons then merge even when distance is up to 5 bases
1216   udata=0;
1217   uptr=NULL;
1218 < //TODO: merge adjacent or close segments - for mRNAs
1219 < //must merge adjacent features, and optionally small gaps
1220 < if (ftype_id==gff_fid_mRNA) {
1218 > if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1219 >       isDiscarded(true);
1220 >       }
1221 > if (ftype_id==gff_fid_transcript && CDstart>0) {
1222 >    ftype_id=gff_fid_mRNA;
1223 >    //exon_ftype_id=gff_fid_exon;
1224 >    }
1225 > //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1226 > if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1227     int mindist=mergeCloseExons ? 5:1;
1228     for (int i=0;i<exons.Count()-1;i++) {
1229       int ni=i+1;
1230       uint mend=exons[i]->end;
1231       while (ni<exons.Count()) {
1232         int dist=(int)(exons[ni]->start-mend);
1233 <       if (dist<0 || dist>mindist) break; //no merging with next segment
1233 >       if (dist>mindist) break; //no merging with next segment
1234 >       if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1235 >          GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1236 >               gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1237 >          }
1238         mend=exons[ni]->end;
1239 +       covlen-=exons[i]->len();
1240         exons[i]->end=mend;
1241 +       covlen+=exons[i]->len();
1242 +       covlen-=exons[ni]->len();
1243         if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1244              exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1245                //use the other exon attributes, if more
# Line 707 | Line 1251
1251         } //check for merge with next exon
1252       } //for each exon
1253     }
1254 + //attribute reduction for GTF records
1255 + if (keepAttrs && !noExonAttr && !hasGffID()
1256 +          && exons.Count()>0 && exons[0]->attrs!=NULL) {
1257 +   bool attrs_discarded=false;
1258 +   for (int a=0;a<exons[0]->attrs->Count();a++) {
1259 +      int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1260 +      char* attr_name=names->attrs.getName(attr_name_id);
1261 +      char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1262 +      bool sameExonAttr=true;
1263 +      for (int i=1;i<exons.Count();i++) {
1264 +         char* ov=exons[i]->getAttr(attr_name_id);
1265 +         if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1266 +             sameExonAttr=false;
1267 +             break;
1268 +             }
1269 +         }
1270 +      if (sameExonAttr) {
1271 +             //delete this attribute from exons level
1272 +             attrs_discarded=true;
1273 +             this->addAttr(attr_name, attr_val);
1274 +             for (int i=1;i<exons.Count();i++) {
1275 +                 removeExonAttr(*(exons[i]), attr_name_id);
1276 +                 }
1277 +             exons[0]->attrs->freeItem(a);
1278 +             }
1279 +      }
1280 +   if (attrs_discarded) exons[0]->attrs->Pack();
1281 +   }
1282   return this;
1283   }
1284  
1285 < void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr) {
1285 > void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1286    if (names==NULL)
1287       GError(ERR_NULL_GFNAMES, "parseAttrs()");
1288    if (atrlist==NULL)
# Line 730 | Line 1302
1302      char* ech=strchr(start,'=');
1303      if (ech!=NULL) { // attr=value format found
1304         *ech='\0';
1305 <       if (strcmp(start, "ID")==0 || strcmp(start,"Parent")==0 || strcmp(start,"Name")==0 ||
1306 <            strcmp(start,"transcript_id")==0 || strcmp(start,"gene_id")==0)
1307 <          { start=pch; continue; } //skip this already recognized and stored attribute
1308 <       if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1305 >       //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1306 >       if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1307 >              strcmp(start, "exon_id")==0)
1308 >           { start=pch; continue; }
1309         ech++;
1310         while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1311 <       atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1311 >       //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1312 >       //make sure we don't add the same attribute more than once
1313 >       if (isExon && (strcmp(start, "protein_id")==0)) {
1314 >             //Ensembl special case
1315 >             this->addAttr(start, ech);
1316 >             start=pch;
1317 >             continue;
1318 >             }
1319 >       atrlist->add_or_update(names, start, ech);
1320         }
1321        /*
1322        else { //not an attr=value format
# Line 748 | Line 1328
1328    if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1329   }
1330  
1331 < void GffObj::addAttr(const char* attrname, char* attrvalue) {
1331 > void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1332    if (this->attrs==NULL)
1333        this->attrs=new GffAttrs();
1334 <  this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1334 >  //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1335 >  this->attrs->add_or_update(names, attrname, attrvalue);
1336 > }
1337 >
1338 >
1339 > void GffObj::setFeatureName(const char* feature) {
1340 > //change the feature name/type for a transcript
1341 > int fid=names->feats.addName(feature);
1342 > if (monoFeature() && exons.Count()>0)
1343 >    this->exon_ftype_id=fid;
1344 > this->ftype_id=fid;
1345 > }
1346 >
1347 > void GffObj::setRefName(const char* newname) {
1348 > //change the feature name/type for a transcript
1349 > int rid=names->gseqs.addName(newname);
1350 > this->gseq_id=rid;
1351 > }
1352 >
1353 >
1354 >
1355 > int GffObj::removeAttr(const char* attrname, const char* attrval) {
1356 >  if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1357 >  int aid=this->names->attrs.getId(attrname);
1358 >  if (aid<0) return 0;
1359 >  int delcount=0;  //could be more than one ?
1360 >  for (int i=0;i<this->attrs->Count();i++) {
1361 >     if (aid==this->attrs->Get(i)->attr_id) {
1362 >       if (attrval==NULL ||
1363 >          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1364 >             delcount++;
1365 >             this->attrs->freeItem(i);
1366 >             }
1367 >       }
1368 >     }
1369 >  if (delcount>0) this->attrs->Pack();
1370 >  return delcount;
1371 > }
1372 >
1373 > int GffObj::removeAttr(int aid, const char* attrval) {
1374 >  if (this->attrs==NULL || aid<0) return 0;
1375 >  int delcount=0;  //could be more than one ?
1376 >  for (int i=0;i<this->attrs->Count();i++) {
1377 >     if (aid==this->attrs->Get(i)->attr_id) {
1378 >       if (attrval==NULL ||
1379 >          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1380 >             delcount++;
1381 >             this->attrs->freeItem(i);
1382 >             }
1383 >       }
1384 >     }
1385 >  if (delcount>0) this->attrs->Pack();
1386 >  return delcount;
1387 > }
1388 >
1389 >
1390 > int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1391 >  if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1392 >  int aid=this->names->attrs.getId(attrname);
1393 >  if (aid<0) return 0;
1394 >  int delcount=0;  //could be more than one
1395 >  for (int i=0;i<exon.attrs->Count();i++) {
1396 >     if (aid==exon.attrs->Get(i)->attr_id) {
1397 >       if (attrval==NULL ||
1398 >          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1399 >             delcount++;
1400 >             exon.attrs->freeItem(i);
1401 >             }
1402 >       }
1403 >     }
1404 >  if (delcount>0) exon.attrs->Pack();
1405 >  return delcount;
1406   }
1407  
1408 + int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1409 +  if (exon.attrs==NULL || aid<0) return 0;
1410 +  int delcount=0;  //could be more than one
1411 +  for (int i=0;i<exon.attrs->Count();i++) {
1412 +     if (aid==exon.attrs->Get(i)->attr_id) {
1413 +       if (attrval==NULL ||
1414 +          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1415 +             delcount++;
1416 +             exon.attrs->freeItem(i);
1417 +             }
1418 +       }
1419 +     }
1420 +  if (delcount>0) exon.attrs->Pack();
1421 +  return delcount;
1422 + }
1423 +
1424 +
1425   void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1426    cds_start=0;
1427    cds_end=0;
# Line 800 | Line 1468
1468               //CDstart in this segment
1469               //and we are getting the whole transcript
1470               cds_mend=s-(int)(CDstart-sgstart);
803             //GMessage("Setting cds_mend to %d\n",cds_mend);
1471               }
1472         if (CDend>=sgstart && CDend<=sgend) {
1473               //CDstart in this segment
1474               //and we are getting the whole transcript
1475               cds_mstart=s-(int)(CDend-cdsadj-sgstart);
809             //GMessage("Setting cds_mstart to %d\n",cds_mstart);
1476               }
1477        } //for each exon
1478      } // - strand
# Line 840 | Line 1506
1506    //return spliced;
1507   }
1508  
1509 + char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1510 + {
1511 +    if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1512 +        return NULL;
1513 +    }
1514 +    //restore normal coordinates:
1515 +    unxcoord();
1516 +    if (exons.Count()==0) return NULL;
1517 +    int fspan=end-start+1;
1518 +    const char* gsubseq=faseq->subseq(start, fspan);
1519 +    if (gsubseq==NULL) {
1520 +        GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1521 +    }
1522 +    char* unspliced=NULL;
1523 +
1524 +    int seqstart=exons.First()->start;
1525 +    int seqend=exons.Last()->end;
1526 +    
1527 +    int unsplicedlen = 0;
1528 +
1529 +    unsplicedlen += seqend - seqstart + 1;
1530 +
1531 +    GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1532 +    //uint seqstart, seqend;
1533 +
1534 +    int s = 0; //resulting nucleotide counter
1535 +    if (strand=='-')
1536 +    {
1537 +        if (seglst!=NULL)
1538 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1539 +        for (int i=seqend;i>=seqstart;i--)
1540 +        {
1541 +            unspliced[s] = ntComplement(gsubseq[i-start]);
1542 +            s++;
1543 +        }//for each nt
1544 +    } // - strand
1545 +    else
1546 +    { // + strand
1547 +        if (seglst!=NULL)
1548 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1549 +        for (int i=seqstart;i<=seqend;i++)
1550 +        {
1551 +            unspliced[s]=gsubseq[i-start];
1552 +            s++;
1553 +        }//for each nt
1554 +    } // + strand
1555 +    //assert(s <= unsplicedlen);
1556 +    unspliced[s]=0;
1557 +    if (rlen!=NULL) *rlen=s;
1558 +    return unspliced;
1559 + }
1560 +
1561   char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1562            GList<GSeg>* seglst) {
1563    if (CDSonly && CDstart==0) return NULL;
# Line 854 | Line 1572
1572    if (gsubseq==NULL) {
1573          GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1574          }
1575 +  if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1576 +     int endadj=end-start+1-fspan;
1577 +     uint prevend=end;
1578 +     end-=endadj;
1579 +     if (CDend>end) CDend=end;
1580 +     if (exons.Last()->end>end) {
1581 +         exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1582 +         if (exons.Last()->start>exons.Last()->end) {
1583 +            GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1584 +                  prevend,getGSeqName(), getID());
1585 +            }
1586 +         covlen-=endadj;
1587 +         }
1588 +     }
1589    char* spliced=NULL;
1590    GMALLOC(spliced, covlen+1); //allocate more here
1591    uint seqstart, seqend;
# Line 1019 | Line 1751
1751            strand, start, end, gscore, (float)qcov/10.0);
1752   }
1753  
1754 < void GffObj::printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
1754 > void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1755                               uint segstart, uint segend, int exidx, char phase, bool gff3) {
1756    static char scorestr[14];
1757    strcpy(scorestr,".");
# Line 1028 | Line 1760
1760       if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1761       xattrs=exons[exidx]->attrs;
1762    }
1031  char* geneid=(gname!=NULL)? gname : gffID;
1763    if (phase==0 || !iscds) phase='.';
1764    const char* ftype=iscds ? "CDS" : getSubfName();
1765    if (gff3) {
# Line 1043 | Line 1774
1774           }
1775      fprintf(fout, "\n");
1776      } //GFF
1777 <  else {//for GTF -- we can only print mRNAs here
1778 <    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\t",
1779 <        gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase);
1780 <    if (ismRNA())
1781 <       fprintf(fout,"gene_id \"%s\"; transcript_id \"%s\";", geneid, gffID);
1777 >  else {//for GTF -- we print only transcripts
1778 >    //if (isValidTranscript())
1779 >    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1780 >           gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1781 >    //char* geneid=(geneID!=NULL)? geneID : gffID;
1782 >    if (geneID)
1783 >      fprintf(fout," gene_id \"%s\";",geneID);
1784 >    if (gene_name!=NULL) {
1785 >       //fprintf(fout, " gene_name ");
1786 >       //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1787 >       //              else fprintf(fout, "\"%s\";",gene_name);
1788 >       fprintf(fout," gene_name \"%s\";",gene_name);
1789 >       }
1790      if (xattrs!=NULL) {
1791 <       for (int i=0;i<xattrs->Count();i++) {
1792 <         if (xattrs->Get(i)->attr_val==NULL) continue;
1793 <         fprintf(fout, " %s ",names->attrs.getName(xattrs->Get(i)->attr_id));
1794 <          if (xattrs->Get(i)->attr_val[0]=='"')
1795 <                  fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1796 <             else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1791 >          for (int i=0;i<xattrs->Count();i++) {
1792 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1793 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1794 >            fprintf(fout, " %s ",attrname);
1795 >            if (xattrs->Get(i)->attr_val[0]=='"')
1796 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1797 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1798 >             }
1799            }
1800 <       }
1800 >    //for GTF, also append the GffObj attributes to each exon line
1801 >    if ((xattrs=this->attrs)!=NULL) {
1802 >          for (int i=0;i<xattrs->Count();i++) {
1803 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1804 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1805 >            fprintf(fout, " %s ",attrname);
1806 >            if (xattrs->Get(i)->attr_val[0]=='"')
1807 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1808 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1809 >             }
1810 >           }
1811      fprintf(fout, "\n");
1812      }//GTF
1813   }
1814  
1815 < void GffObj::printGxf(FILE* fout, GffPrintMode gffp, char* tlabel) {
1815 > void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1816 >                   const char* tlabel, const char* gfparent) {
1817   static char tmpstr[255];
1818   if (tlabel==NULL) {
1819      tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1820           (char*)"gffobj" ;
1821      }
1070
1822   unxcoord();
1823 < if (exons.Count()==0) return;
1824 < char* gseqname=names->gseqs.Get(gseq_id)->name;
1823 > //if (exons.Count()==0) return;
1824 > const char* gseqname=names->gseqs.Get(gseq_id)->name;
1825   bool gff3 = (gffp>=pgffAny);
1826   bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1827   bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
# Line 1084 | Line 1835
1835        pend=CDend;
1836        }
1837     else { pstart=start;pend=end; }
1838 <   const char* ftype=ismRNA() ? "mRNA" : getFeatureName();
1838 >   //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1839 >   const char* ftype=getFeatureName();
1840     fprintf(fout,
1841       "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1842       gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1843 <   if (gname!=NULL)
1844 <       fprintf(fout, ";Name=%s",gname);
1845 <   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d:%d",CDstart,CDend);
1843 >   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1844 >   if (gfparent!=NULL) {
1845 >      //parent override
1846 >      fprintf(fout, ";Parent=%s",gfparent);
1847 >      }
1848 >     else {
1849 >       if (parent!=NULL && !parent->isDiscarded())
1850 >           fprintf(fout, ";Parent=%s",parent->getID());
1851 >       }
1852 >   if (geneID!=NULL)
1853 >      fprintf(fout, ";geneID=%s",geneID);
1854 >   if (gene_name!=NULL)
1855 >      fprintf(fout, ";gene_name=%s",gene_name);
1856     if (attrs!=NULL) {
1857        for (int i=0;i<attrs->Count();i++) {
1858 <        fprintf(fout,";%s=%s", names->attrs.getName(attrs->Get(i)->attr_id),
1858 >        const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1859 >        fprintf(fout,";%s=%s", attrname,
1860                 attrs->Get(i)->attr_val);
1861          }
1862        }
# Line 1101 | Line 1864
1864     }// gff3 mRNA line
1865   if (showExon) {
1866     //print exons
1867 <   for (int i=0;i<exons.Count();i++) {
1868 <     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1869 <     }
1870 < }//printing exons
1867 >    if (isCDS && exons.Count()>0 &&
1868 >        ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1869 >         updateExonPhase();
1870 >
1871 >    for (int i=0;i<exons.Count();i++) {
1872 >      printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1873 >      }
1874 >    }//printing exons
1875   if (showCDS && !isCDS && CDstart>0) {
1876      GArray<GffCDSeg> cds(true,true);
1877      getCDSegs(cds);
# Line 1114 | Line 1881
1881    } //showCDS
1882   }
1883  
1884 + void GffObj::updateExonPhase() {
1885 +  if (!isCDS) return;
1886 +  int cdsacc=0;
1887 +  if (CDphase=='1' || CDphase=='2') {
1888 +      cdsacc+= 3-(CDphase-'0');
1889 +      }
1890 +  if (strand=='-') { //reverse strand
1891 +     for (int i=exons.Count()-1;i>=0;i--) {
1892 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1893 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1894 +         }
1895 +     }
1896 +    else { //forward strand
1897 +     for (int i=0;i<exons.Count();i++) {
1898 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1899 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1900 +         }
1901 +     }
1902 + }
1903 +
1904  
1905   void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1906    GffCDSeg cdseg;
# Line 1158 | Line 1945
1945         } //for each exon
1946     } // + strand
1947   }
1161 /*
1162 #ifdef DEBUG
1163 void GffObj::dbgPrint(const char* msg) {
1164 if (msg!=NULL) fprintf(stdout, ">> %s\n",msg);
1165 char* tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1166       (char*)"gmapobj" ;
1167 char scorestr[14];
1168 char strand=revstrand?'-':'+';
1169 unxcoord();
1170 char* gseqname=names->gseqs.Get(gseq_id)->name;
1171 char* fname=f_id>=0 ? names->feats.Get(f_id)->name : (char*)"nofeatname";
1172
1173 fprintf(stdout, "%s\t%s\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s\n",
1174       gseqname, tlabel, fname, start, end, strand, gffID, gffID);
1175
1176 for (int fi=0;fi<features->Count();fi++) {
1177   GFeature* feature=features->Get(fi);
1178   fname=names->feats.Get(feature->name_id)->name;
1179   GffExon* segs=feature->segs;
1180   int segcount=feature->segcount;
1181   if (segcount==0 || segs==NULL) continue;
1182   for (int i=0;i<segcount;i++) {
1183      if (segs[i].start==0) continue;
1184      if (segs[i].score) sprintf(scorestr,"%.2f", segs[i].score/100.00);
1185                  else strcpy(scorestr,".");
1186      fprintf(stdout,
1187         "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tParent=%s\n",
1188         gseqname, tlabel, fname, segs[i].start, segs[i].end, scorestr, strand, gffID);
1189      }
1190   }
1191 fflush(stdout);
1192 }
1193 #endif
1194 */
1195

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines