ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 5 | Line 5
5   //global set of feature names, attribute names etc.
6   // -- common for all GffObjs in current application!
7  
8 < const uint GFF_MAX_LOCUS = 4000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 < const uint GFF_MAX_EXON  =   20000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 1600000;
11 <
8 > const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 > const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 > const uint GFF_MAX_INTRON= 6000000;
11 > bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13 < const int gff_fid_exon=1;
14 < const int gff_fid_CDS=2;
15 < bool gff_warns=true;
13 > const int gff_fid_transcript=1;
14 > const int gff_fid_exon=2;
15 > const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 > const uint gfo_flag_HAS_ERRORS       = 0x00000001;
17 > const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 > const uint gfo_flag_IS_GENE          = 0x00000004;
19 > const uint gfo_flag_IS_TRANSCRIPT    = 0x00000008;
20 > const uint gfo_flag_FROM_GFF3        = 0x00000010;
21 > const uint gfo_flag_BY_EXON          = 0x00000020; //created by subfeature (exon) directly
22 > const uint gfo_flag_DISCARDED        = 0x00000100;
23 > const uint gfo_flag_LST_KEEP         = 0x00000200;
24 > const uint gfo_flag_LEVEL_MSK        = 0x00FF0000;
25 > const byte gfo_flagShift_LEVEL           = 16;
26  
27   void gffnames_ref(GffNames* &n) {
28    if (n==NULL) n=new GffNames();
# Line 29 | Line 39
39  
40   GffObj& g1=*((GffObj*)p1);
41   GffObj& g2=*((GffObj*)p2);
42 < if (g1.gseq_id==g2.gseq_id) return (int)(g1.start-g2.start);
43 <                else return (int)(g1.gseq_id-g2.gseq_id);
42 > if (g1.gseq_id==g2.gseq_id) {
43 >             if (g1.start!=g2.start)
44 >                    return (int)(g1.start-g2.start);
45 >               else if (g1.getLevel()!=g2.getLevel())
46 >                        return (int)(g1.getLevel()-g2.getLevel());
47 >                    else
48 >                        if (g1.end!=g2.end)
49 >                              return (int)(g1.end-g2.end);
50 >                        else return strcmp(g1.getID(), g2.getID());
51 >             }
52 >             else return (int)(g1.gseq_id-g2.gseq_id);
53   }
54  
55 < static char fnamelc[32];
55 > char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56 > //parse a key attribute and remove it from the info string
57 > //(only works for attributes that have values following them after ' ' or '=')
58 > static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 > int lpre=strlen(pre);
60 > char cend=pre[lpre-1];
61 > char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 > if (pos==NULL) return NULL;
63 > char* findstart=info;
64 > //require word boundary on the left:
65 > while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 >    findstart=pos+lpre;
67 >    pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 >    }
69 > if (pos==NULL) return NULL;
70 > if (cend!=' ' && cend!='=') {
71 >    //require word boundary on the right:
72 >    while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 >       findstart=pos+lpre;
74 >       pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 >       }
76 >    }
77 > if (pos==NULL) return NULL;
78 > char* vp=pos+lpre;
79 > while (*vp==' ') vp++;
80 > if (*vp==';' || *vp==0)
81 >      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82 > bool dq_enclosed=false; //value string enclosed by double quotes
83 > if (*vp=='"') {
84 >     dq_enclosed=true;
85 >     vp++;
86 >     }
87 > if (enforce_GTF2 && !dq_enclosed)
88 >      GError(GTF2_ERR,pre, dupline);
89 > char* vend=vp;
90 > if (dq_enclosed) {
91 >    while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92 >    }
93 > else {
94 >    while (*vend!=';' && *vend!=0) vend++;
95 >    }
96 > if (enforce_GTF2 && *vend!='"')
97 >     GError(GTF2_ERR, pre, dupline);
98 > char *r=Gstrdup(vp, vend-1);
99 > //-- now remove this attribute from the info string
100 > while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101 > if (*vend==0) vend--;
102 > for (char *src=vend, *dest=pos;;src++,dest++) {
103 >   *dest=*src;
104 >   if (*src==0) break;
105 >   }
106 > return r;
107 > }
108 >
109 > static char fnamelc[128];
110  
111   GffLine::GffLine(GffReader* reader, const char* l) {
112 < line=Gstrdup(l);
112 > llen=strlen(l);
113 > GMALLOC(line,llen+1);
114 > memcpy(line, l, llen+1);
115 > GMALLOC(dupline, llen+1);
116 > memcpy(dupline, l, llen+1);
117   skip=true;
118   gseqname=NULL;
119   track=NULL;
120   ftype=NULL;
121   info=NULL;
122 < Parent=NULL;
122 > _parents=NULL;
123 > _parents_len=0;
124 > num_parents=0;
125 > parents=NULL;
126 > is_gff3=false;
127   is_cds=false;
128 < is_mrna=false;
128 > is_transcript=false;
129   is_exon=false;
130 + is_gene=false;
131   exontype=0;
132 < gname=NULL;
132 > gene_id=NULL;
133 > gene_name=NULL;
134   qstart=0;
135   qend=0;
136   qlen=0;
54 exontype=0;
137   ID=NULL;
138   char* t[9];
139   int i=0;
# Line 82 | Line 164
164   p=t[4];
165   if (!parseUInt(p,fend))
166     GError("Error parsing end coordinate from GFF line:\n%s\n",l);
167 < if (fend<fstart) swap(fend,fstart);
167 > if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
168   p=t[5];
169   if (p[0]=='.' && p[1]==0) {
170    score=0;
# Line 94 | Line 176
176   strand=*t[6];
177   if (strand!='+' && strand!='-' && strand!='.')
178       GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
179 < phase=*t[7];
179 > phase=*t[7]; // must be '.', '0', '1' or '2'
180   ID=NULL;
99 Parent=NULL;
181   // exon/CDS/mrna filter
182 < strncpy(fnamelc, ftype, 31);
183 < fnamelc[31]=0;
182 > strncpy(fnamelc, ftype, 127);
183 > fnamelc[127]=0;
184   strlower(fnamelc); //convert to lower case
185 + bool is_t_data=false;
186   if (strstr(fnamelc, "utr")!=NULL) {
187     exontype=exgffUTR;
188     is_exon=true;
189 +   is_t_data=true;
190     }
191    else if (strstr(fnamelc, "exon")!=NULL) {
192     exontype=exgffExon;
193     is_exon=true;
194 +   is_t_data=true;
195     }
196 <  else if (startsWith(fnamelc, "stop") && strstr(fnamelc, "codon")!=NULL) {
196 >  else if (strstr(fnamelc, "stop") &&
197 >      (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
198     exontype=exgffStop;
199 +   is_cds=true; //though some place it outside the last CDS segment
200 +   is_t_data=true;
201 +   }
202 +  else if (strstr(fnamelc, "start") &&
203 +      ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
204 +   exontype=exgffStart;
205     is_cds=true;
206 +   is_t_data=true;
207     }
208   else if (strcmp(fnamelc, "cds")==0) {
209     exontype=exgffCDS;
210     is_cds=true;
211 +   is_t_data=true;
212     }
213 < else { //is_mrna is set only if the *current* line is a mRNA or transcript
214 <   is_mrna=(strcmp(fnamelc,"mrna")==0 ||
215 <          strcmp(fnamelc,"transcript")==0);
123 <   }
124 <
125 < if (reader->mrnaOnly) {
126 <   if (!is_mrna && !is_cds && !is_exon)
127 <                     return; //skip this line, unwanted feature name
128 <   }
129 < p=strstr(info,"ID=");
130 < if (p!=NULL) { //has ID attr
131 <   ID=p+3;
132 <   p=ID;
133 <   while (*p!=';' && *p!=0) p++;
134 <   ID=Gstrdup(ID, p-1);
135 <   //look for a name attr too:
136 <   p=strstr(info,"Name=");
137 <   if (p!=NULL) {
138 <     gname=p+5;
139 <     p=gname;
140 <     while (*p!=';' && *p!=0) p++;
141 <     gname=Gstrdup(gname, p-1);
142 <     }
213 > else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
214 >   is_gene=true;
215 >   is_t_data=true; //because its name will be attached to parented transcripts
216     }
217 < p=NULL;
218 < if (!is_mrna)
219 <     p=strifind(info,"Parent="); //don't care about the parent for mRNA features..
147 < if (p!=NULL) { //has Parent attr
148 <   Parent=p+7;
149 <   p=Parent;
150 <   while (*p!=';' && *p!=0) p++;
151 <   Parent=Gstrdup(Parent, p-1);
217 > else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
218 >   is_transcript=true;
219 >   is_t_data=true;
220     }
221 <  else if (ID==NULL) { //no "Parent=" and no "ID=", attempt GTF parsing instead
222 <   p=strstr(info,"transcript_id");
223 <   if (p!=NULL) { //GTF detected
224 <     p+=13;
225 <     //requires quotes
226 <     while (*p!='"' && *p!=0) p++;
227 <     if (*p==0) GError("Error parsing transcript_id (double quotes expected) at GTF line:\n%s\n",l);
228 <     p++;
229 <     Parent=p;
230 <     while (*p!='"' && *p!=0) p++;
163 <     if (*p==0) GError("Error parsing transcript_id (ending double quotes expected) at GTF line:\n%s\n",l);
164 <     if (is_mrna) { // RGASP GTF exception: a parent "transcript" feature preceding exon/CDS subfeatures
165 <        ID=Gstrdup(Parent, p-1);
166 <        Parent=NULL;
167 <        }
168 <       else {
169 <        Parent=Gstrdup(Parent, p-1);
221 >
222 > if (reader->transcriptsOnly && !is_t_data) {
223 >        char* id=extractAttr("ID=");
224 >        if (id==NULL) id=extractAttr("transcript_id");
225 >        //GMessage("Discarding non-transcript line:\n%s\n",l);
226 >        if (id!=NULL) {
227 >          reader->discarded_ids.Add(id, new int(1));
228 >          GFREE(id);
229 >          }
230 >        return; //skip this line, unwanted feature name
231          }
232 <     //check for gene_name or gene_id
233 <     //p=strstr(info, "gene_name");// this is preferred over gene_id
234 <     //if (p==NULL)
235 <     p=strstr(info,"gene_id");
236 <     if (p!=NULL) {
237 <       p+=7;//skip 'gene_id'
238 <       while (*p!='"' && *p!=0) p++;
239 <       if (*p==0) GError("Error parsing gene_id (double quotes expected) at GTF line:\n%s\n",l);
240 <       p++;
241 <       gname=p;
242 <       while (*p!='"' && *p!=0) p++;
243 <       if (*p==0) GError("Error parsing gene_id (ending double quotes expected) at GTF line:\n%s\n",l);
244 <       gname=Gstrdup(gname, p-1);
245 <       }
232 > ID=extractAttr("ID=");
233 > char* Parent=extractAttr("Parent=");
234 > is_gff3=(ID!=NULL || Parent!=NULL);
235 > if (is_gff3) {
236 >   //parse as GFF3
237 >    if (ID!=NULL) {
238 >       //has ID attr so it's likely to be a parent feature
239 >       //look for explicit gene name
240 >       gene_name=extractAttr("gene_name=",false);
241 >       if (gene_name==NULL) {
242 >           gene_name=extractAttr("geneName=",false);
243 >           if (gene_name==NULL) {
244 >               gene_name=extractAttr("gene_sym=",false);
245 >               if (gene_name==NULL) {
246 >                   gene_name=extractAttr("gene=",false);
247 >                   }
248 >               }
249 >           }
250 >       gene_id=extractAttr("geneID=",false);
251 >       if (gene_id==NULL) {
252 >          gene_id=extractAttr("gene_id=",false);
253 >          }
254 >       if (is_gene) {
255 >         //special case: keep the Name and ID attributes of the gene feature
256 >         if (gene_name==NULL)
257 >              gene_name=extractAttr("Name=");
258 >         if (gene_id==NULL) //the ID is also gene_id in this case
259 >              gene_id=Gstrdup(ID);
260 >         //skip=false;
261 >         //return;
262 >         GFREE(Parent); //TMI, we really don't care about gene Parents?
263 >         } //gene feature
264 >       }// has GFF3 ID
265 >   if (Parent!=NULL) {
266 >        //keep Parent attr
267 >         //parse multiple parents
268 >         num_parents=1;
269 >         p=Parent;
270 >         int last_delim_pos=-1;
271 >         while (*p!=';' && *p!=0) {
272 >             if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
273 >                 num_parents++;
274 >                 last_delim_pos=(p-Parent);
275 >                 }
276 >             p++;
277 >             }
278 >         _parents_len=p-Parent+1;
279 >         _parents=Parent;
280 >         GMALLOC(parents, num_parents*sizeof(char*));
281 >         parents[0]=_parents;
282 >         int i=1;
283 >         if (last_delim_pos>0) {
284 >           for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
285 >              if (*p==',') {
286 >                 char* ep=p-1;
287 >                 while (*ep==' ' && ep>_parents) ep--;
288 >                 *(ep+1)=0; //end the string there
289 >                 parents[i]=p+1;
290 >                 i++;
291 >                 }
292 >              }
293 >           }
294 >         } //has Parent field
295 >   } //GFF3
296 >  else { // GTF-like expected
297 >   Parent=extractAttr("transcript_id");
298 >   if (Parent!=NULL) { //GTF2 format detected
299 >     if (is_transcript) {
300 >         // atypical GTF with a parent transcript line declared
301 >         ID=Parent;
302 >         Parent=NULL;
303 >         }
304 >     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
305 >     gene_name=extractAttr("gene_name");
306 >     if (gene_name==NULL) {
307 >           gene_name=extractAttr("gene_sym");
308 >           if (gene_name==NULL)
309 >               gene_name=extractAttr("gene");
310 >           }
311       //prepare for parseAttr by adding '=' character instead of spaces for all attributes
312       //after the attribute name
313       p=info;
314       bool noed=true; //not edited after the last delim
315       bool nsp=false; //non-space found after last delim
316       while (*p!=0) {
317 <      if (*p==' ') {
318 <         if (nsp && noed) {
319 <           *p='=';
320 <            noed=false;
321 <            p++;
322 <            continue;
323 <            }
324 <         }
325 <      else nsp=true;
326 <      if (*p==';') { noed=true; nsp=false; }
327 <      p++;
328 <      }
329 <     } //gtf detected
330 <    else {//check for jigsaw or cufflinks format
331 <     char* fexon=strstr(fnamelc, "exon");
332 <     if (fexon!=NULL) {
317 >       if (*p==' ') {
318 >          if (nsp && noed) {
319 >             *p='=';
320 >             noed=false;
321 >             p++;
322 >             continue;
323 >             }
324 >           }
325 >         else nsp=true; //non-space
326 >       if (*p==';') { noed=true; nsp=false; }
327 >       p++;
328 >       }
329 >     } //GTF2 detected (no parent line)
330 >    else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
331 >     //char* fexon=strstr(fnamelc, "exon");
332 >     //if (fexon!=NULL) {
333 >     if (exontype==exgffExon) {
334         if (startsWith(track,"jigsaw")) {
335 <        is_cds=true;
336 <        strcpy(track,"jigsaw");
337 <        p=strchr(info,';');
338 <        if (p==NULL) Parent=Gstrdup(info);
339 <           else { Parent=Gstrdup(info,p-1); info=p+1;  }
340 <        }
341 <       else if ((i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {//one word ID
335 >          is_cds=true;
336 >          strcpy(track,"jigsaw");
337 >          p=strchr(info,';');
338 >          if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
339 >           else { Parent=Gstrdup(info,p-1);
340 >                  info=p+1;
341 >                }
342 >          }
343 >        } //exon feature?
344 >        if (Parent==NULL && exontype>=exgffCDS &&
345 >               (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
346 >          //one word ID ? really desperate attempt to parse it here
347            Parent=Gstrdup(info,info+i-1);
348 <        }
349 <
218 <      }
219 <      else GError("Error parsing Parent/ID at input line:\n%s\n",l);
348 >          info=NULL; //discard anything else on the line
349 >          }
350       }
351 <   }
352 < p=strstr(info,"Target=");
353 < if (p!=NULL) { //has Target attr
354 <   p+=7;
355 <   while (*p!=';' && *p!=0 && *p!=' ') p++;
226 <   if (*p!=' ') {
227 <      GError("Error parsing target coordinates from GFF line:\n%s\n",l);
351 >   if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
352 >      _parents=Parent;
353 >      GMALLOC(parents,sizeof(char*));
354 >      num_parents=1;
355 >      parents[0]=_parents;
356        }
357 <   if (!parseUInt(p,qstart))
358 <     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
359 <   if (*p!=' ') {
360 <      GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
357 >   } //GTF-like
358 >
359 > //parse other potentially useful features
360 > if (is_gff3) {
361 >   if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
362 >      p+=7;
363 >      while (*p!=';' && *p!=0 && *p!=' ') p++;
364 >      if (*p!=' ') {
365 >         GError("Error parsing target coordinates from GFF line:\n%s\n",l);
366 >         }
367 >      if (!parseUInt(p,qstart))
368 >         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
369 >      if (*p!=' ') {
370 >         GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
371 >         }
372 >      p++;
373 >      if (!parseUInt(p,qend))
374 >         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
375        }
376 <   p++;
377 <   if (!parseUInt(p,qend))
378 <     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
379 <   }
380 < else {
381 <   p=strifind(info,"Qreg=");
382 <   if (p!=NULL) { //has Qreg attr
241 <     p+=5;
242 <     if (!parseUInt(p,qstart))
243 <       GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
244 <     if (*p!='-') {
245 <        GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
246 <        }
247 <     p++;
248 <     if (!parseUInt(p,qend))
249 <       GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
250 <     if (*p=='|') {
376 >   if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
377 >       p+=5;
378 >       if (!parseUInt(p,qstart))
379 >         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
380 >       if (*p!='-') {
381 >          GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
382 >          }
383         p++;
384 <       if (!parseUInt(p,qlen))
385 <         GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
386 <       }
387 <     }//has Qreg attr
388 <   }
389 < if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
390 <   p+=5;
391 <   if (!parseUInt(p,qlen))
392 <       GError("Error parsing target length from GFF Qlen:\n%s\n",l);
393 <   }
384 >       if (!parseUInt(p,qend))
385 >         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
386 >       if (*p=='|' || *p==':') {
387 >         p++;
388 >         if (!parseUInt(p,qlen))
389 >           GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
390 >         }
391 >       }//has Qreg attr
392 >   if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
393 >     p+=5;
394 >     if (!parseUInt(p,qlen))
395 >         GError("Error parsing target length from GFF Qlen:\n%s\n",l);
396 >     }
397 >   }//parsing some useful attributes in GFF3 records
398 > if (ID==NULL && parents==NULL) {
399 >      if (reader->gff_warns)
400 >          GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
401 >      return; //skip
402 >      }
403   skip=false;
404   }
405  
406 < int GffObj::addExon(GffLine* gl, bool keepAttr, bool noExonAttr) {
406 > int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
407    //this will make sure we have the right subftype_id!
408    int subf_id=-1;
409 <  if (ftype_id==gff_fid_mRNA) {
410 <     if (subftype_id<0) subftype_id=gff_fid_exon;
409 >  //if (ftype_id==gff_fid_mRNA) { //for mRNAs only parse known subfeatures!
410 >  if (isTranscript()) {
411 >     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
412 >          if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
413 >                         else exon_ftype_id=names->feats.addName(gl->ftype);
414 >          }
415       //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
416 <     if (!gl->is_cds && !gl->is_exon)
417 <         //extraneous mRNA feature, will discard for now
416 >     if (gl->exontype==0 && !gl->is_transcript) {
417 >          //extraneous mRNA feature, discard
418 >          if (reader->gff_warns)
419 >            GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
420 >                gl->ftype, gffID);
421            return -1;
422 +          }
423       }
424 <  else { //other kind of parent feature, check this subf type
424 >  else { //non-mRNA parent feature, check this subf type
425      subf_id=names->feats.addName(gl->ftype);
426 <    if (subftype_id<0)
427 <       subftype_id=subf_id;
428 <    else {
429 <       if (subftype_id!=subf_id)
430 <         GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, only %s is kept\n",
431 <             names->feats.getName(subftype_id), names->feats.getName(subf_id),
432 <             gffID,names->feats.getName(subftype_id));
433 <         return -1; //skip this 2nd subfeature type for this parent!
434 <       }
435 <    }
436 <  if (gl->exontype==exgffUTR || gl->exontype==exgffStop)
437 <      udata=1;//merge 0-distance segments
426 >    if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
427 >       exon_ftype_id=subf_id;
428 >     else {
429 >       if (exon_ftype_id!=subf_id) {
430 >         //if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
431 >         if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
432 >            //the existing exon was just a dummy one created by default, discard it
433 >            exons.Clear();
434 >            covlen=0;
435 >            exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
436 >            }
437 >         else { //multiple subfeatures, prefer those with
438 >             if (reader->gff_warns)
439 >               GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
440 >                  names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
441 >            if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
442 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
443 >               exon_ftype_id=subf_id;
444 >               exons.Clear();
445 >               covlen=0;
446 >               }
447 >              else { //discard new feature
448 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
449 >               return -1; //skip this 2nd subfeature type for this parent!
450 >               }
451 >            }
452 >         } //incoming subfeature is of different type
453 >       } //new subfeature type
454 >    } //non-mRNA parent
455    int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
456           gl->qstart,gl->qend, gl->is_cds, gl->exontype);
457 <  if (eidx>=0 && keepAttr) {
458 <      parseAttrs(exons[eidx]->attrs, gl->info, noExonAttr);
457 >  if (eidx<0) return eidx; //this should never happen
458 >  if (keepAttr) {
459 >     if (noExonAttr) {
460 >         if (attrs==NULL) //place the parsed attributes directly at transcript level
461 >           parseAttrs(attrs, gl->info);
462 >         }
463 >       else { //need all exon-level attributes
464 >         parseAttrs(exons[eidx]->attrs, gl->info, true);
465 >         }
466        }
467    return eidx;
468   }
469  
470  
298 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
299  //oi is the index of the *first* overlapping segment found that must be enlarged
300  covlen-=exons[oi]->len();
301  if (segstart<exons[oi]->start)
302    exons[oi]->start=segstart;
303  if (qs<exons[oi]->qstart) exons[oi]->qstart=qs;
304  if (segend>exons[oi]->end)
305    exons[oi]->end=segend;
306  if (qe>exons[oi]->qend) exons[oi]->qend=qe;
307  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
308  if (sc!=0) exons[oi]->score=sc;
309  covlen+=exons[oi]->len();
310  //if (exons[oi]->exontype< exontype) -- always true
311  exons[oi]->exontype = exontype;
312  if (exontype==exgffCDS) exons[oi]->phase=fr;
313  //we must check if any more exons are also overlapping this
314  int ni=oi+1; //next exon index after oi
315  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
316     //only allow this if next segment is fully included, and a subordinate
317     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
318         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
319         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
320         exons.Delete(ni);
321         }
322      else {
323         GMessage("GFF Warning: overlapping feature segment (%d-%d) for GFF ID %s\n", segstart, segend, gffID);
324         hasErrors=true;
325         break;
326         }
327     }
328  // -- make sure any other related boundaries are updated:
329  start=exons.First()->start;
330  end=exons.Last()->end;
331  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
332    GSeqStat* gsd=(GSeqStat*)uptr;
333    if (start<gsd->mincoord) gsd->mincoord=start;
334    if (end>gsd->maxcoord) gsd->maxcoord=end;
335    }
336 }
337
471   int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
472    if (exons.Count()==0) {
473        if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
474 <      if (subftype_id<0) {
475 <         subftype_id = (ftype_id==gff_fid_mRNA) ? gff_fid_exon : ftype_id;
474 >      if (exon_ftype_id<0) {
475 >         exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
476           }
477        }
478 <  if (iscds) { //update CDS anchors:
478 >  //special treatment of start/stop codon features, they might be broken/split between exons
479 >  //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
480 >  //so we should not trust the end coordinate for such features
481 >  if (exontype==exgffStart || exontype==exgffStop) {
482 >     if (strand=='-') segstart=segend;
483 >                else  segend=segstart;
484 >     if (exontype==exgffStart) {
485 >           if (CDstart==0 || segstart<CDstart) CDstart=segstart;
486 >           }
487 >         else {
488 >           if (segstart>CDend) CDend=segstart;
489 >           }
490 >     }
491 >    else if (iscds) { //update CDS anchors:
492       if (CDstart==0 || segstart<CDstart)  {
493             CDstart=segstart;
494             if (exontype==exgffCDS && strand=='+') CDphase=fr;
# Line 352 | Line 498
498             CDend=segend;
499             }
500       }
501 <   else { // !iscds
501 >   else { // not a CDS/start/stop
502       isCDS=false;
503       }
504    if (qs || qe) {
505      if (qs>qe) swap(qs,qe);
506      if (qs==0) qs=1;
507 <    }
507 >        }
508    int ovlen=0;
509 <  int oi=exonOverlapIdx(segstart, segend, &ovlen);
510 <  if (oi>=0) { //overlap existing segment
511 <     //only allow this for CDS within exon, stop_codon within exon, stop_codon within UTR,
512 <     //                        or stop_codon within CDS
513 <    if (exons[oi]->exontype>exontype &&
514 <         exons[oi]->start<=segstart && exons[oi]->end>=segend &&
515 <         !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
516 <          //larger segment given first, now the smaller included one
517 <          return oi; //only used to store attributes from current GffLine
518 <          }
519 <    if (exontype>exons[oi]->exontype &&
520 <         segstart<=exons[oi]->start && segend>=exons[oi]->end &&
521 <         !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
522 <           //smaller segment given first, so we have to enlarge it
523 <          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
524 <            //this must also check for overlapping next exon (oi+1)
525 <          return oi;
526 <          }
527 <    //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
528 <    //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
529 <    if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
530 <       GMessage("GFF Warning: discarded overlapping feature segment (%d-%d) for GFF ID %s\n",
531 <           segstart, segend, gffID);
532 <       hasErrors=true;
533 <       return false; //segment NOT added
534 <       }
535 <     /* --
536 <     else {
537 <       // else add the segment if the overlap is small and between two CDS segments
538 <       //we might want to add an attribute here with the slippage coordinate and size?
539 <       }
540 <     */
541 <    }//overlap of existing segment
542 <
509 >  if (exontype>0) { //check for overlaps between exon-type segments
510 >      int oi=exonOverlapIdx(segstart, segend, &ovlen);
511 >      if (oi>=0) { //overlap existing segment
512 >         if (ovlen==0) {
513 >                          //adjacent segments will be merged
514 >                          //e.g. CDS to (UTR|exon)
515 >                          if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
516 >                                  (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
517 >                                        expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
518 >                                        return oi;
519 >                                        }
520 >                          //CDS adjacent to stop_codon: UCSC does (did?) this
521 >                          if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
522 >                                  (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
523 >                                        expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
524 >                                        return oi;
525 >                                        }
526 >             }
527 >                 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
528 >         //                   start_codon within (CDS|exon)
529 >        if (exons[oi]->exontype>exontype &&
530 >             exons[oi]->start<=segstart && exons[oi]->end>=segend &&
531 >             !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
532 >              //larger segment given first, now the smaller included one is redundant
533 >              return oi; //only used to store attributes from current GffLine
534 >              }
535 >        if (exontype>exons[oi]->exontype &&
536 >             segstart<=exons[oi]->start && segend>=exons[oi]->end &&
537 >             !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
538 >               //smaller segment given first, so we have to enlarge it
539 >                          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
540 >                                //this should also check for overlapping next exon (oi+1) ?
541 >              return oi;
542 >              }
543 >        //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
544 >        //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
545 >                //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
546 >                // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
547 >                /*
548 >                if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
549 >                   if (gff_show_warnings)
550 >                           GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
551 >                           segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
552 >                   hasErrors(true);
553 >                   return -1; //segment NOT added
554 >                   }
555 >                */
556 >
557 >                 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
558 >                  if (gff_show_warnings)
559 >                         GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
560 >                                 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
561 >                        expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
562 >                        return oi;
563 >                 }
564 >                // else add the segment if the overlap is small and between two CDS segments
565 >                //TODO: we might want to add an attribute here with the slippage coordinate and size?
566 >        covlen-=ovlen;
567 >                }//overlap or adjacent to existing segment
568 >           } //check for overlap
569     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
570     // create & add the new segment
571     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
572     int eidx=exons.Add(enew);
573     if (eidx<0) {
574 <     GMessage("GFF Warning: duplicate segment %d-%d not added for GFF ID %s!\n",
574 >    //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
575 >     if (gff_show_warnings)
576 >       GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
577              segstart, segend, gffID);
578 +     delete enew;
579 +     hasErrors(true);
580       return -1;            
581       }
582     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
# Line 410 | Line 586
586         GSeqStat* gsd=(GSeqStat*)uptr;
587         if (start<gsd->mincoord) gsd->mincoord=start;
588         if (end>gsd->maxcoord) gsd->maxcoord=end;
589 +       if (this->len()>gsd->maxfeat_len) {
590 +          gsd->maxfeat_len=this->len();
591 +          gsd->maxfeat=this;
592 +          }
593         }
594     return eidx;
595   }
596  
597 + void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
598 +  //oi is the index of the *first* overlapping segment found that must be enlarged
599 +  covlen-=exons[oi]->len();
600 +  if (segstart<exons[oi]->start)
601 +    exons[oi]->start=segstart;
602 +  if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
603 +  if (segend>exons[oi]->end)
604 +    exons[oi]->end=segend;
605 +  if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
606 +  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
607 +  if (sc!=0) exons[oi]->score=sc;
608 +  covlen+=exons[oi]->len();
609 +  //if (exons[oi]->exontype< exontype) -- always true
610 +  exons[oi]->exontype = exontype;
611 +  if (exontype==exgffCDS) exons[oi]->phase=fr;
612 +  //we must check if any more exons are also overlapping this
613 +  int ni=oi+1; //next exon index after oi
614 +  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
615 +     //only allow this if next segment is fully included, and a subordinate
616 +     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
617 + /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
618 + chr1    hg18_knownGene  start_codon     69806911        69806913        0.000000        +       .
619 + chr1    hg18_knownGene  CDS     69806911        69806912        0.000000        +       0
620 + chr1    hg18_knownGene  exon    69805456        69806912        0.000000        +       .    
621 + */
622 +         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
623 +         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
624 +         exons.Delete(ni);
625 +         }
626 +      else {
627 +         if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
628 +                exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
629 +         //hasErrors(true);
630 +         break;
631 +         }
632 +     }
633 +  // -- make sure any other related boundaries are updated:
634 +  start=exons.First()->start;
635 +  end=exons.Last()->end;
636 +  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
637 +    GSeqStat* gsd=(GSeqStat*)uptr;
638 +    if (start<gsd->mincoord) gsd->mincoord=start;
639 +    if (end>gsd->maxcoord) gsd->maxcoord=end;
640 +    if (this->len()>gsd->maxfeat_len) {
641 +        gsd->maxfeat_len=this->len();
642 +        gsd->maxfeat=this;
643 +        }
644 +    }
645 + }
646 +
647   void GffObj::removeExon(int idx) {
648    /*
649     if (idx==0 && segs[0].start==gstart)
# Line 431 | Line 661
661    if (isCDS) { CDstart=start; CDend=end; }
662   }
663  
664 + void GffObj::removeExon(GffExon* p) {
665 +  for (int idx=0;idx<exons.Count();idx++) {
666 +     if (exons[idx]==p) {
667 +        int segstart=exons[idx]->start;
668 +        int segend=exons[idx]->end;
669 +        exons.Delete(idx);
670 +        covlen -= (int)(segend-segstart)+1;
671 +        start=exons.First()->start;
672 +        end=exons.Last()->end;
673 +        if (isCDS) { CDstart=start; CDend=end; }
674 +        return;
675 +        }
676 +     }
677 + }
678 +
679 +
680 +
681   GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
682 <     GSeg(0,0), exons(true,true,true) {
683 < xstart=0;
684 < xend=0;
685 < xstatus=0;
686 < partial=false;
687 < isCDS=false;
688 < uptr=NULL;
689 < ulink=NULL;
690 < udata=0;
691 < CDstart=0;
692 < CDend=0;
693 < gname=NULL;
694 < attrs=NULL;
695 < gffID=NULL;
696 < track_id=-1;
697 < gseq_id=-1;
698 < ftype_id=-1;
699 < subftype_id=-1;
700 < hasErrors=false;
701 < if (gfrd==NULL)
682 >     GSeg(0,0), exons(true,true,false), children(1,false) {
683 >  xstart=0;
684 >  xend=0;
685 >  xstatus=0;
686 >  partial=false;
687 >  isCDS=false;
688 >  uptr=NULL;
689 >  ulink=NULL;
690 >  parent=NULL;
691 >  udata=0;
692 >  flags=0;
693 >  CDstart=0;
694 >  CDend=0;
695 >  CDphase=0;
696 >  geneID=NULL;
697 >  gene_name=NULL;
698 >  attrs=NULL;
699 >  gffID=NULL;
700 >  track_id=-1;
701 >  gseq_id=-1;
702 >  ftype_id=-1;
703 >  exon_ftype_id=-1;
704 >  strand='.';
705 >  if (gfrd==NULL)
706      GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
707 < gffnames_ref(names);
708 < if (gfrd->names==NULL) gfrd->names=names;
709 < qlen=0;qstart=0;qend=0;
710 < gscore=0;
711 < uscore=0;
712 < covlen=0;
713 < qcov=0;
714 < if (gffline->Parent!=NULL) {
715 <    //GTF style -- subfeature given directly
716 <    if (gffline->is_cds || gffline->is_exon)
717 <         ftype_id=gff_fid_mRNA;
718 <      else {
719 <        //group of other subfeatures of type ftype:
720 <        ftype_id=names->feats.addName(gffline->ftype);
707 >  gffnames_ref(names);
708 >  if (gfrd->names==NULL) gfrd->names=names;
709 >  //qlen=0;qstart=0;qend=0;
710 >  gscore=0;
711 >  uscore=0;
712 >  covlen=0;
713 >  qcov=0;
714 >  start=gffline->fstart;
715 >  end=gffline->fend;
716 >  gseq_id=names->gseqs.addName(gffline->gseqname);
717 >  track_id=names->tracks.addName(gffline->track);
718 >  strand=gffline->strand;
719 >  qlen=gffline->qlen;
720 >  qstart=gffline->qstart;
721 >  qend=gffline->qend;
722 >  //setup flags from gffline
723 >  isCDS=gffline->is_cds; //for now
724 >  isGene(gffline->is_gene);
725 >  isTranscript(gffline->is_transcript || gffline->exontype!=0);
726 >  fromGff3(gffline->is_gff3);
727 >
728 >  if (gffline->parents!=NULL) {
729 >    //GTF style -- create a GffObj directly by subfeature
730 >    //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
731 >    if (gffline->exontype!=0) { //recognized exon-like feature
732 >       ftype_id=gff_fid_transcript; //so this is some sort of transcript
733 >       exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
734 >       }
735 >     else {//unrecognized subfeatures
736 >       //make this GffObj of the same feature type
737 >       ftype_id=names->feats.addName(gffline->ftype);
738 >       }
739 >    if (gffline->ID==NULL) { //typical GTF
740 >        gffID=Gstrdup(gffline->parents[0]);
741 >        this->createdByExon(true);
742 >        //this is likely the first exon/segment of the feature
743 >        addExon(gfrd, gffline, keepAttr, noExonAttr);
744          }
745 <    gffID=gffline->Parent;
746 <    gffline->Parent=NULL; //just take over
747 <    if (gffline->gname!=NULL) {
748 <        gname=gffline->gname;
749 <        gffline->gname=NULL;
745 >      else { //a parented feature with an ID -- probably an orphan GFF3 line
746 >        if (gffline->is_gff3 && gffline->exontype!=0) {
747 >             //premature exon given before its parent transcript
748 >             //create the transcript entry here
749 >             gffID=Gstrdup(gffline->parents[0]);
750 >             this->createdByExon(true);
751 >             //this is the first exon/segment of the transcript
752 >             addExon(gfrd, gffline, keepAttr, noExonAttr);
753 >             }
754 >            else { //unrecognized non-exon feature ? use the ID instead
755 >             gffID=Gstrdup(gffline->ID);
756 >             if (keepAttr) this->parseAttrs(attrs, gffline->info);
757 >             }
758          }
759 <    gseq_id=names->gseqs.addName(gffline->gseqname);
760 <    track_id=names->tracks.addName(gffline->track);
761 <    strand=gffline->strand;
480 <    qlen=gffline->qlen;
481 <    start=gffline->fstart;
482 <    end=gffline->fend;
483 <    isCDS=gffline->is_cds; //for now
484 <    addExon(gffline, keepAttr, noExonAttr);
485 <    if (keepAttr && noExonAttr) {
486 <      //simply move the attrs from this first exon
487 <      //to the transcript
488 <      if (exons.First()->attrs!=NULL) {
489 <         attrs=exons.First()->attrs;
490 <         exons.First()->attrs=NULL;
491 <         }
492 <      }
493 <    } //GTF line
494 < else { //GffReader made sure this is a parent line (no parent)
495 <    //even for a mRNA with a Parent= line
759 >    } //subfeature given directly
760 >  else { //gffline->parents==NULL
761 >    //create a parent feature in its own right
762      gscore=gffline->score;
763      if (gffline->ID==NULL || gffline->ID[0]==0)
764 <       GError("Error: no ID found for GFF record start\n");
765 <    gffID=gffline->ID; //there must be an ID here
766 <    if (gffline->is_mrna) ftype_id=gff_fid_mRNA;
767 <        else ftype_id=names->feats.addName(gffline->ftype);
768 <    gffline->ID=NULL; //steal it
769 <    if (gffline->gname!=NULL) {
770 <        gname=gffline->gname;
771 <        gffline->gname=NULL;
764 >      GError("Error: no ID found for GFF record start\n");
765 >    gffID=Gstrdup(gffline->ID); //there must be an ID here
766 >    //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
767 >      //else
768 >    ftype_id=names->feats.addName(gffline->ftype);
769 >    if (gffline->is_transcript)
770 >      exon_ftype_id=gff_fid_exon;
771 >
772 >    if (keepAttr) this->parseAttrs(attrs, gffline->info);
773 >    }//no parent
774 >
775 >  if (gffline->gene_name!=NULL) {
776 >     gene_name=Gstrdup(gffline->gene_name);
777 >     }
778 >  if (gffline->gene_id!=NULL) {
779 >     geneID=Gstrdup(gffline->gene_id);
780 >     }
781 >
782 >  GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
783 >  uptr=gsd;
784 >  if (start<gsd->mincoord) gsd->mincoord=start;
785 >  if (end>gsd->maxcoord) gsd->maxcoord=end;
786 >    if (this->len()>gsd->maxfeat_len) {
787 >        gsd->maxfeat_len=this->len();
788 >        gsd->maxfeat=this;
789          }
507    start=gffline->fstart;
508    end=gffline->fend;
509    gseq_id=names->gseqs.addName(gffline->gseqname);
510    track_id=names->tracks.addName(gffline->track);
511    qlen=gffline->qlen;
512    qstart=gffline->qstart;
513    qend=gffline->qend;
514    strand=gffline->strand;
515    if (keepAttr) this->parseAttrs(attrs, gffline->info, noExonAttr);
516    }
517 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
518 uptr=gsd;
519 gsd->gflst.Add(this);
520 if (start<gsd->mincoord) gsd->mincoord=start;
521 if (end>gsd->maxcoord) gsd->maxcoord=end;
522 gfrd->gfoAdd(gffID, gffline->gseqname, this);
790   }
791  
525
792   GffLine* GffReader::nextGffLine() {
793   if (gffline!=NULL) return gffline; //caller should free gffline after processing
794   while (gffline==NULL) {
529    //const char* l=linebuf->getLine();
795      int llen=0;
796      buflen=GFF_LINELEN-1;
797      char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
# Line 540 | Line 805
805      if (gffline->skip) {
806         delete gffline;
807         gffline=NULL;
808 +       continue;
809         }
810 +    if (gffline->ID==NULL && gffline->parents==NULL)  { //it must have an ID
811 +        //this might not be needed, already checked in the GffLine constructor
812 +        if (gff_warns)
813 +            GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
814 +        delete gffline;
815 +        gffline=NULL;
816 +        //continue;
817 +        }
818      }
819   return gffline;
820   }
821  
548
549 GffObj* GffReader::parse(bool keepAttr, bool noExonAttr) { //<- do not use this!
550   //parses one parent feature at a time, assuming absolute grouping of subfeatures
551   //!ASSUMES that subfeatures (exons) are properly grouped together
552   //any interleaving exons from other transcripts will terminate the parsing of the current feature!
553  GffObj* gfo=NULL;
554  while (nextGffLine()!=NULL) {
555    if (gfo==NULL) {//record starts fresh here
556       gfo=new GffObj(this, gffline, keepAttr, noExonAttr);
557       delete gffline; gffline=NULL;
558       continue;
559       }
560 // -- gfo is not NULL from here --
561    if (gffline->Parent==NULL) {// new parent feature starting here
562       //new record start, return what we have so far,
563       //gffline was NOT deleted, so it will be used for the next parse() call
564       return ((gfo==NULL) ? NULL : gfo->finalize());
565       }
566    //-- has a Parent so it's a subfeature segment (exon/CDS/other subfeature)
567      // is it a subfeature of the current gf?
568    if (strcmp(gffline->Parent, gfo->gffID)==0) {
569          //yes, add it
570          gfo->addExon(gffline, !noExonAttr, noExonAttr);
571          delete gffline; gffline=NULL;
572          continue;
573          }
574      // is it a subfeature of a previously loaded gfo?
575    GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
576    if (prevgfo==NULL)
577           return ((gfo==NULL) ? NULL : gfo->finalize()); // new subfeature, gffline will be used for the next parse()
578    //this is for an earlier parent
579    prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
580    delete gffline;
581    gffline=NULL;
582    } //while reading gfflines
583  return ((gfo==NULL) ? NULL : gfo->finalize());
584 }
822   char* GffReader::gfoBuildId(const char* id, const char* ctg) {
823   //caller must free the returned pointer
824   char* buf=NULL;
# Line 599 | Line 836
836   GFREE(buf);
837   }
838  
839 < void GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo) {
839 > //Warning: if gflst gets altered, idx becomes obsolete
840 > GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
841   char* buf=gfoBuildId(id,ctg);
842 < phash.Add(buf, gfo);
842 > GfoHolder* r=new GfoHolder(gfo,idx);
843 > phash.Add(buf, r);
844   GFREE(buf);
845 + return r;
846   }
847 < GffObj* GffReader::gfoFind(const char* id, const char* ctg) {
847 >
848 > GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
849   char* buf=gfoBuildId(id,ctg);
850 < GffObj* r=phash.Find(buf);
850 > GfoHolder* r=phash.Find(buf);
851   GFREE(buf);
852   return r;
853   }
854  
855 + GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
856 +  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
857 +  GfoHolder* r=NULL;
858 +  if (replaceidx>=0) {
859 +     gflst.Put(replaceidx,newgfo);
860 +     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
861 +     }
862 +   else {
863 +     int gfoidx=gflst.Add(newgfo);
864 +     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
865 +     }
866 +  if (gff_warns) {
867 +    int* pcount=tids.Find(newgfo->gffID);
868 +    if (pcount!=NULL) {
869 +       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
870 +       (*pcount)++;
871 +       }
872 +     else {
873 +       tids.Add(newgfo->gffID,new int(1));
874 +       }
875 +    }
876 +  return r;
877 + }
878  
879 < void GffReader::parseAll(GffRecFunc* gproc, bool keepAttr, bool noExonAttr, void* userptr1, void* userptr2) {
880 <  //WARNING: this is all messed up if the GFF lines are NOT grouped by parent feature
881 <  //!!! do not use this !!!
882 <  //iterates through all mappings in the input file
883 <  //calling gproc with each parsed mapping
884 <    GffObj* gfo;
885 <    while ((gfo=this->parse(keepAttr,noExonAttr))!=NULL) { //a valid gff record was parsed
886 <        if (gfo->empty()) { //shouldn't happen!
887 <             delete gfo;
888 <             gfo=NULL;
889 <             continue;
890 <             }
891 <     if (gproc(gfo, userptr1, userptr2)) {
892 <             //true returned from GfProcFunc means no longer needed
893 <              gfoRemove(gfo->gffID, gfo->getGSeqName());
894 <              delete gfo;
895 <              }
896 <         else {
897 <              gflst.Add(gfo);
898 <              }
899 <     gfo=NULL;
900 <     } //while records are parsed
901 <    phash.Clear();
879 > GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
880 >  //assert(parent);
881 >  //assert(newgfo);
882 >  parent->children.Add(newgfh->gffobj);
883 >  if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
884 >  newgfh->gffobj->setLevel(parent->getLevel()+1);
885 >  if (parent->isGene()) {
886 >      if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
887 >        newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
888 >      if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
889 >        newgfh->gffobj->geneID=Gstrdup(parent->geneID);
890 >      }
891 >
892 >  return newgfh;
893 > }
894 >
895 > GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
896 >                          GffObj* parent, GffExon* pexon) {
897 >  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
898 >  GfoHolder* r=NULL;
899 >  int gfoidx=gflst.Add(newgfo);
900 >  r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
901 >  if (parent!=NULL) {
902 >    updateParent(r, parent);
903 >    if (pexon!=NULL) parent->removeExon(pexon);
904 >    }
905 >  if (gff_warns) {
906 >    int* pcount=tids.Find(newgfo->gffID);
907 >    if (pcount!=NULL) {
908 >       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
909 >       (*pcount)++;
910 >       }
911 >     else {
912 >       tids.Add(newgfo->gffID,new int(1));
913 >       }
914 >    }
915 >  return r;
916 > }
917 >
918 > GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
919 >                                         bool keepAttr) {
920 > if (prevgfo==NULL) return NULL;
921 > prevgfo->gffobj->createdByExon(false);
922 > prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
923 > prevgfo->gffobj->start=gffline->fstart;
924 > prevgfo->gffobj->end=gffline->fend;
925 > prevgfo->gffobj->isGene(gffline->is_gene);
926 > prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
927 > prevgfo->gffobj->fromGff3(gffline->is_gff3);
928 > if (keepAttr) {
929 >   if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
930 >   prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
931 >   }
932 > return prevgfo;
933 > }
934 >
935 >
936 > bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
937 >  bool r=true;
938 >  if (gffline->strand!=prevgfo->gffobj->strand) {
939 >     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
940 >        prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
941 >      r=false;
942 >     }
943 >  int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
944 >                      ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
945 >                         0 );
946 >  if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
947 >    GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
948 >    //validation_errors = true;
949 >    r=false;
950 >    if (!gff_warns) exit(1);
951 >    }
952 >  int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
953 >  if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
954 >      subfPoolAdd(pex, prevgfo);
955 >  return r;
956 > }
957 >
958 > CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
959 >  CNonExon* subp=NULL;
960 >  subp_name=NULL;
961 >  for (int i=0;i<gffline->num_parents;i++) {
962 >    if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
963 >        continue;
964 >    subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
965 >    subp=pex.Find(subp_name);
966 >    if (subp!=NULL)
967 >       return subp;
968 >    GFREE(subp_name);
969 >    }
970 >  return NULL;
971 > }
972 >
973 > void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
974 > //this might become a parent feature later
975 > if (newgfo->gffobj->exons.Count()>0) {
976 >   char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
977 >   pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
978 >       newgfo->gffobj->exons[0], gffline));
979 >   GFREE(xbuf);
980 >   }
981   }
982  
983 + GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
984 +    bool keepAttr, bool noExonAttr) {
985 +  GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
986 +  if (prevp!=gflst[subp->idx])
987 +    GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
988 +  subp->gffline->discardParent();
989 +  GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
990 +  pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
991 +  prevp->promotedChildren(true);
992 +  return gfoh; //returns the holder of newly promoted feature
993 + }
994  
995 + //have to parse the whole file because exons can be scattered all over
996   void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
997 +  bool validation_errors = false;
998 +  //loc_debug=false;
999 +  GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1000 +                     //and thus could become promoted to parent features
1001    while (nextGffLine()!=NULL) {
1002 <    if (gffline->Parent==NULL) {//no parent, new GFF3-like record starting
1003 <       //check for uniqueness of gffline->ID in phash !
1004 <       GffObj* f=gfoFind(gffline->ID, gffline->gseqname);
1005 <       if (f!=NULL) {
1006 <            GError("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1002 >       //seen this gff ID before?
1003 >     GfoHolder* prevseen=NULL;
1004 >     if (gffline->ID) //GFF3
1005 >         prevseen=gfoFind(gffline->ID, gffline->gseqname);
1006 >     if (prevseen!=NULL) {
1007 >            if (prevseen->gffobj->createdByExon()) {
1008 >                updateGffRec(prevseen, gffline, keepAttr);
1009 >                }
1010 >             else {
1011 >                GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1012 >                validation_errors = true;
1013 >                if (gff_warns) {
1014 >                       delete gffline; gffline=NULL; continue;
1015 >                       }
1016 >                   else exit(1);
1017 >                }
1018              }
1019 <       gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1019 >    if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1020 >       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1021         }
1022 <    else { //--- it's a subfeature (exon/CDS/other):
1023 <       GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
1024 <       if (prevgfo!=NULL) { //exon of a previously seen GffObj
1025 <                 if (gffline->strand!=prevgfo->strand) {
1026 <                    GError("Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
1027 <                       prevgfo->gffID, prevgfo->getGSeqName());
1028 <                    }
1029 <                 int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1030 <                                     ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1031 <                                        0 );
1032 <                 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1033 <                   GError("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1022 >    else { //--- it's a parented feature (could still be a mRNA)
1023 >       bool found_parent=false;
1024 >       GfoHolder* newgfo=prevseen;
1025 >       for (int i=0;i<gffline->num_parents;i++) {
1026 >            if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1027 >                continue; //skipping discarded parent feature
1028 >            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1029 >            if (parentgfo!=NULL) { //parent GffObj parsed earlier
1030 >                   found_parent=true;
1031 >                   if (parentgfo->gffobj->isGene() && gffline->is_transcript
1032 >                                   && gffline->exontype==0) {
1033 >                       //not an exon, but a transcript parented by a gene
1034 >                       if (newgfo) {
1035 >                           updateParent(newgfo, parentgfo->gffobj);
1036 >                           }
1037 >                         else {
1038 >                           newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1039 >                           }
1040 >                       }
1041 >                   else { //potential exon subfeature
1042 >                       if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1043 >                         validation_errors=true;
1044 >                       }
1045                     }
1046 <                 prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
1047 <                 }
1048 <            else {//new GTF-like record starting here with a subfeature
1049 <                 gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1050 <                 //even those with errors will be added here!
1051 <                 }
1052 <       } //subfeature
1053 <      //--
1054 <    delete gffline;
1055 <    gffline=NULL;
1056 <    }//while
1057 <  for (int i=0;i<gflst.Count();i++) {
1058 <    gflst[i]->finalize(mergeCloseExons); //finalize the parsing - also merge close exons/features if so requested
1059 <    }
1046 >            } //for each parsed parent Id
1047 >       if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1048 >             //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1049 >            //check if this feature isn't parented by a previously stored "exon" subfeature
1050 >            char* subp_name=NULL;
1051 >            CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1052 >            if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1053 >               //promote that subfeature to a full GffObj
1054 >               GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1055 >               //add current gffline as an exon of the newly promoted subfeature
1056 >               if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1057 >                      validation_errors=true;
1058 >               }
1059 >              else { //no parent seen before, create one directly with this exon
1060 >               //loc_debug=true;
1061 >               GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1062 >               if (gffline->ID!=NULL && gffline->exontype==0)
1063 >                     subfPoolAdd(pex, newgfo);
1064 >               //even those with errors will be added here!
1065 >               }
1066 >            GFREE(subp_name);
1067 >            } //no previous parent found
1068 >       } //parented feature
1069 >        //--
1070 >      delete gffline;
1071 >      gffline=NULL;
1072 >      }//while gff lines
1073 >  gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1074   // all gff records are now loaded in GList gflst
1075   // so we can free the hash
1076    phash.Clear();
1077 +  tids.Clear();
1078 +  if (validation_errors) {
1079 +    exit(1);
1080 +    }
1081   }
1082  
1083 < //this may be called prematurely if exon records are not grouped by parent
1084 < GffObj* GffObj::finalize(bool mergeCloseExons) {
1083 > GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1084 > //merge
1085 > //always merge adjacent or overlapping segments
1086 > //but if mergeCloseExons then merge even when distance is up to 5 bases
1087   udata=0;
1088   uptr=NULL;
1089 < //TODO: merge adjacent or close segments - for mRNAs
1090 < //must merge adjacent features, and optionally small gaps
1091 < if (ftype_id==gff_fid_mRNA) {
1089 > if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1090 >       isDiscarded(true);
1091 >       }
1092 > if (ftype_id==gff_fid_transcript && CDstart>0) {
1093 >    ftype_id=gff_fid_mRNA;
1094 >    //exon_ftype_id=gff_fid_exon;
1095 >    }
1096 > //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1097 > if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1098     int mindist=mergeCloseExons ? 5:1;
1099     for (int i=0;i<exons.Count()-1;i++) {
1100       int ni=i+1;
1101       uint mend=exons[i]->end;
1102       while (ni<exons.Count()) {
1103         int dist=(int)(exons[ni]->start-mend);
1104 <       if (dist<0 || dist>mindist) break; //no merging with next segment
1104 >       if (dist>mindist) break; //no merging with next segment
1105 >       if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1106 >          GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1107 >               gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1108 >          }
1109         mend=exons[ni]->end;
1110 +       covlen-=exons[i]->len();
1111         exons[i]->end=mend;
1112 +       covlen+=exons[i]->len();
1113 +       covlen-=exons[ni]->len();
1114         if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1115              exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1116                //use the other exon attributes, if more
# Line 707 | Line 1122
1122         } //check for merge with next exon
1123       } //for each exon
1124     }
1125 + //attribute reduction for GTF records
1126 + if (keepAttrs && !noExonAttr && !fromGff3()
1127 +          && exons.Count()>0 && exons[0]->attrs!=NULL) {
1128 +   bool attrs_discarded=false;
1129 +   for (int a=0;a<exons[0]->attrs->Count();a++) {
1130 +      int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1131 +      char* attr_name=names->attrs.getName(attr_name_id);
1132 +      char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1133 +      bool sameExonAttr=true;
1134 +      for (int i=1;i<exons.Count();i++) {
1135 +         char* ov=exons[i]->getAttr(attr_name_id);
1136 +         if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1137 +             sameExonAttr=false;
1138 +             break;
1139 +             }
1140 +         }
1141 +      if (sameExonAttr) {
1142 +             //delete this attribute from exons level
1143 +             attrs_discarded=true;
1144 +             this->addAttr(attr_name, attr_val);
1145 +             for (int i=1;i<exons.Count();i++) {
1146 +                 removeExonAttr(*(exons[i]), attr_name_id);
1147 +                 }
1148 +             exons[0]->attrs->freeItem(a);
1149 +             }
1150 +      }
1151 +   if (attrs_discarded) exons[0]->attrs->Pack();
1152 +   }
1153   return this;
1154   }
1155  
1156 < void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr) {
1156 > void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1157    if (names==NULL)
1158       GError(ERR_NULL_GFNAMES, "parseAttrs()");
1159    if (atrlist==NULL)
# Line 730 | Line 1173
1173      char* ech=strchr(start,'=');
1174      if (ech!=NULL) { // attr=value format found
1175         *ech='\0';
1176 <       if (strcmp(start, "ID")==0 || strcmp(start,"Parent")==0 || strcmp(start,"Name")==0 ||
1177 <            strcmp(start,"transcript_id")==0 || strcmp(start,"gene_id")==0)
1178 <          { start=pch; continue; } //skip this already recognized and stored attribute
1179 <       if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1176 >       //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1177 >       if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1178 >              strcmp(start, "exon_id")==0)
1179 >           { start=pch; continue; }
1180         ech++;
1181         while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1182 <       atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1182 >       //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1183 >       //make sure we don't add the same attribute more than once
1184 >       if (isExon && (strcmp(start, "protein_id")==0)) {
1185 >             //Ensembl special case
1186 >             this->addAttr(start, ech);
1187 >             start=pch;
1188 >             continue;
1189 >             }
1190 >       atrlist->add_or_update(names, start, ech);
1191         }
1192        /*
1193        else { //not an attr=value format
# Line 748 | Line 1199
1199    if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1200   }
1201  
1202 < void GffObj::addAttr(const char* attrname, char* attrvalue) {
1202 > void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1203    if (this->attrs==NULL)
1204        this->attrs=new GffAttrs();
1205 <  this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1205 >  //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1206 >  this->attrs->add_or_update(names, attrname, attrvalue);
1207 > }
1208 >
1209 >
1210 > void GffObj::setFeatureName(const char* feature) {
1211 > //change the feature name/type for a transcript
1212 > int fid=names->feats.addName(feature);
1213 > if (monoFeature() && exons.Count()>0)
1214 >    this->exon_ftype_id=fid;
1215 > this->ftype_id=fid;
1216 > }
1217 >
1218 > void GffObj::setRefName(const char* newname) {
1219 > //change the feature name/type for a transcript
1220 > int rid=names->gseqs.addName(newname);
1221 > this->gseq_id=rid;
1222 > }
1223 >
1224 >
1225 >
1226 > int GffObj::removeAttr(const char* attrname, const char* attrval) {
1227 >  if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1228 >  int aid=this->names->attrs.getId(attrname);
1229 >  if (aid<0) return 0;
1230 >  int delcount=0;  //could be more than one ?
1231 >  for (int i=0;i<this->attrs->Count();i++) {
1232 >     if (aid==this->attrs->Get(i)->attr_id) {
1233 >       if (attrval==NULL ||
1234 >          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1235 >             delcount++;
1236 >             this->attrs->freeItem(i);
1237 >             }
1238 >       }
1239 >     }
1240 >  if (delcount>0) this->attrs->Pack();
1241 >  return delcount;
1242   }
1243  
1244 + int GffObj::removeAttr(int aid, const char* attrval) {
1245 +  if (this->attrs==NULL || aid<0) return 0;
1246 +  int delcount=0;  //could be more than one ?
1247 +  for (int i=0;i<this->attrs->Count();i++) {
1248 +     if (aid==this->attrs->Get(i)->attr_id) {
1249 +       if (attrval==NULL ||
1250 +          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1251 +             delcount++;
1252 +             this->attrs->freeItem(i);
1253 +             }
1254 +       }
1255 +     }
1256 +  if (delcount>0) this->attrs->Pack();
1257 +  return delcount;
1258 + }
1259 +
1260 +
1261 + int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1262 +  if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1263 +  int aid=this->names->attrs.getId(attrname);
1264 +  if (aid<0) return 0;
1265 +  int delcount=0;  //could be more than one
1266 +  for (int i=0;i<exon.attrs->Count();i++) {
1267 +     if (aid==exon.attrs->Get(i)->attr_id) {
1268 +       if (attrval==NULL ||
1269 +          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1270 +             delcount++;
1271 +             exon.attrs->freeItem(i);
1272 +             }
1273 +       }
1274 +     }
1275 +  if (delcount>0) exon.attrs->Pack();
1276 +  return delcount;
1277 + }
1278 +
1279 + int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1280 +  if (exon.attrs==NULL || aid<0) return 0;
1281 +  int delcount=0;  //could be more than one
1282 +  for (int i=0;i<exon.attrs->Count();i++) {
1283 +     if (aid==exon.attrs->Get(i)->attr_id) {
1284 +       if (attrval==NULL ||
1285 +          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1286 +             delcount++;
1287 +             exon.attrs->freeItem(i);
1288 +             }
1289 +       }
1290 +     }
1291 +  if (delcount>0) exon.attrs->Pack();
1292 +  return delcount;
1293 + }
1294 +
1295 +
1296   void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1297    cds_start=0;
1298    cds_end=0;
# Line 800 | Line 1339
1339               //CDstart in this segment
1340               //and we are getting the whole transcript
1341               cds_mend=s-(int)(CDstart-sgstart);
803             //GMessage("Setting cds_mend to %d\n",cds_mend);
1342               }
1343         if (CDend>=sgstart && CDend<=sgend) {
1344               //CDstart in this segment
1345               //and we are getting the whole transcript
1346               cds_mstart=s-(int)(CDend-cdsadj-sgstart);
809             //GMessage("Setting cds_mstart to %d\n",cds_mstart);
1347               }
1348        } //for each exon
1349      } // - strand
# Line 840 | Line 1377
1377    //return spliced;
1378   }
1379  
1380 + char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1381 + {
1382 +    if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1383 +        return NULL;
1384 +    }
1385 +    //restore normal coordinates:
1386 +    unxcoord();
1387 +    if (exons.Count()==0) return NULL;
1388 +    int fspan=end-start+1;
1389 +    const char* gsubseq=faseq->subseq(start, fspan);
1390 +    if (gsubseq==NULL) {
1391 +        GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1392 +    }
1393 +    char* unspliced=NULL;
1394 +
1395 +    int seqstart=exons.First()->start;
1396 +    int seqend=exons.Last()->end;
1397 +    
1398 +    int unsplicedlen = 0;
1399 +
1400 +    unsplicedlen += seqend - seqstart + 1;
1401 +
1402 +    GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1403 +    //uint seqstart, seqend;
1404 +
1405 +    int s = 0; //resulting nucleotide counter
1406 +    if (strand=='-')
1407 +    {
1408 +        if (seglst!=NULL)
1409 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1410 +        for (int i=seqend;i>=seqstart;i--)
1411 +        {
1412 +            unspliced[s] = ntComplement(gsubseq[i-start]);
1413 +            s++;
1414 +        }//for each nt
1415 +    } // - strand
1416 +    else
1417 +    { // + strand
1418 +        if (seglst!=NULL)
1419 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1420 +        for (int i=seqstart;i<=seqend;i++)
1421 +        {
1422 +            unspliced[s]=gsubseq[i-start];
1423 +            s++;
1424 +        }//for each nt
1425 +    } // + strand
1426 +    //assert(s <= unsplicedlen);
1427 +    unspliced[s]=0;
1428 +    if (rlen!=NULL) *rlen=s;
1429 +    return unspliced;
1430 + }
1431 +
1432   char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1433            GList<GSeg>* seglst) {
1434    if (CDSonly && CDstart==0) return NULL;
# Line 854 | Line 1443
1443    if (gsubseq==NULL) {
1444          GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1445          }
1446 +  if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1447 +     int endadj=end-start+1-fspan;
1448 +     uint prevend=end;
1449 +     end-=endadj;
1450 +     if (CDend>end) CDend=end;
1451 +     if (exons.Last()->end>end) {
1452 +         exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1453 +         if (exons.Last()->start>exons.Last()->end) {
1454 +            GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1455 +                  prevend,getGSeqName(), getID());
1456 +            }
1457 +         covlen-=endadj;
1458 +         }
1459 +     }
1460    char* spliced=NULL;
1461    GMALLOC(spliced, covlen+1); //allocate more here
1462    uint seqstart, seqend;
# Line 1019 | Line 1622
1622            strand, start, end, gscore, (float)qcov/10.0);
1623   }
1624  
1625 < void GffObj::printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
1625 > void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1626                               uint segstart, uint segend, int exidx, char phase, bool gff3) {
1627    static char scorestr[14];
1628    strcpy(scorestr,".");
# Line 1028 | Line 1631
1631       if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1632       xattrs=exons[exidx]->attrs;
1633    }
1031  char* geneid=(gname!=NULL)? gname : gffID;
1634    if (phase==0 || !iscds) phase='.';
1635    const char* ftype=iscds ? "CDS" : getSubfName();
1636    if (gff3) {
# Line 1043 | Line 1645
1645           }
1646      fprintf(fout, "\n");
1647      } //GFF
1648 <  else {//for GTF -- we can only print mRNAs here
1649 <    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\t",
1650 <        gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase);
1651 <    if (ismRNA())
1652 <       fprintf(fout,"gene_id \"%s\"; transcript_id \"%s\";", geneid, gffID);
1648 >  else {//for GTF -- we print only transcripts
1649 >    //if (isValidTranscript())
1650 >    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1651 >           gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1652 >    //char* geneid=(geneID!=NULL)? geneID : gffID;
1653 >    if (geneID)
1654 >      fprintf(fout," gene_id \"%s\";",geneID);
1655 >    if (gene_name!=NULL) {
1656 >       //fprintf(fout, " gene_name ");
1657 >       //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1658 >       //              else fprintf(fout, "\"%s\";",gene_name);
1659 >       fprintf(fout," gene_name \"%s\";",gene_name);
1660 >       }
1661      if (xattrs!=NULL) {
1662 <       for (int i=0;i<xattrs->Count();i++) {
1663 <         if (xattrs->Get(i)->attr_val==NULL) continue;
1664 <         fprintf(fout, " %s ",names->attrs.getName(xattrs->Get(i)->attr_id));
1665 <          if (xattrs->Get(i)->attr_val[0]=='"')
1666 <                  fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1667 <             else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1662 >          for (int i=0;i<xattrs->Count();i++) {
1663 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1664 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1665 >            fprintf(fout, " %s ",attrname);
1666 >            if (xattrs->Get(i)->attr_val[0]=='"')
1667 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1668 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1669 >             }
1670            }
1671 <       }
1671 >    //for GTF, also append the GffObj attributes to each exon line
1672 >    if ((xattrs=this->attrs)!=NULL) {
1673 >          for (int i=0;i<xattrs->Count();i++) {
1674 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1675 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1676 >            fprintf(fout, " %s ",attrname);
1677 >            if (xattrs->Get(i)->attr_val[0]=='"')
1678 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1679 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1680 >             }
1681 >           }
1682      fprintf(fout, "\n");
1683      }//GTF
1684   }
1685  
1686 < void GffObj::printGxf(FILE* fout, GffPrintMode gffp, char* tlabel) {
1686 > void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1687 >                   const char* tlabel, const char* gfparent) {
1688   static char tmpstr[255];
1689   if (tlabel==NULL) {
1690      tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1691           (char*)"gffobj" ;
1692      }
1070
1693   unxcoord();
1694 < if (exons.Count()==0) return;
1695 < char* gseqname=names->gseqs.Get(gseq_id)->name;
1694 > //if (exons.Count()==0) return;
1695 > const char* gseqname=names->gseqs.Get(gseq_id)->name;
1696   bool gff3 = (gffp>=pgffAny);
1697   bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1698   bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
# Line 1084 | Line 1706
1706        pend=CDend;
1707        }
1708     else { pstart=start;pend=end; }
1709 <   const char* ftype=ismRNA() ? "mRNA" : getFeatureName();
1709 >   //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1710 >   const char* ftype=getFeatureName();
1711     fprintf(fout,
1712       "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1713       gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1714 <   if (gname!=NULL)
1715 <       fprintf(fout, ";Name=%s",gname);
1716 <   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d:%d",CDstart,CDend);
1714 >   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1715 >   if (gfparent!=NULL) {
1716 >      //parent override
1717 >      fprintf(fout, ";Parent=%s",gfparent);
1718 >      }
1719 >     else {
1720 >       if (parent!=NULL && !parent->isDiscarded())
1721 >           fprintf(fout, ";Parent=%s",parent->getID());
1722 >       }
1723 >   if (geneID!=NULL)
1724 >      fprintf(fout, ";geneID=%s",geneID);
1725 >   if (gene_name!=NULL)
1726 >      fprintf(fout, ";gene_name=%s",gene_name);
1727     if (attrs!=NULL) {
1728        for (int i=0;i<attrs->Count();i++) {
1729 <        fprintf(fout,";%s=%s", names->attrs.getName(attrs->Get(i)->attr_id),
1729 >        const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1730 >        fprintf(fout,";%s=%s", attrname,
1731                 attrs->Get(i)->attr_val);
1732          }
1733        }
# Line 1101 | Line 1735
1735     }// gff3 mRNA line
1736   if (showExon) {
1737     //print exons
1738 <   for (int i=0;i<exons.Count();i++) {
1739 <     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1740 <     }
1741 < }//printing exons
1738 >    if (isCDS && exons.Count()>0 &&
1739 >        ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1740 >         updateExonPhase();
1741 >
1742 >    for (int i=0;i<exons.Count();i++) {
1743 >      printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1744 >      }
1745 >    }//printing exons
1746   if (showCDS && !isCDS && CDstart>0) {
1747      GArray<GffCDSeg> cds(true,true);
1748      getCDSegs(cds);
# Line 1114 | Line 1752
1752    } //showCDS
1753   }
1754  
1755 + void GffObj::updateExonPhase() {
1756 +  if (!isCDS) return;
1757 +  int cdsacc=0;
1758 +  if (CDphase=='1' || CDphase=='2') {
1759 +      cdsacc+= 3-(CDphase-'0');
1760 +      }
1761 +  if (strand=='-') { //reverse strand
1762 +     for (int i=exons.Count()-1;i>=0;i--) {
1763 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1764 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1765 +         }
1766 +     }
1767 +    else { //forward strand
1768 +     for (int i=0;i<exons.Count();i++) {
1769 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1770 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1771 +         }
1772 +     }
1773 + }
1774 +
1775  
1776   void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1777    GffCDSeg cdseg;
# Line 1158 | Line 1816
1816         } //for each exon
1817     } // + strand
1818   }
1161 /*
1162 #ifdef DEBUG
1163 void GffObj::dbgPrint(const char* msg) {
1164 if (msg!=NULL) fprintf(stdout, ">> %s\n",msg);
1165 char* tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1166       (char*)"gmapobj" ;
1167 char scorestr[14];
1168 char strand=revstrand?'-':'+';
1169 unxcoord();
1170 char* gseqname=names->gseqs.Get(gseq_id)->name;
1171 char* fname=f_id>=0 ? names->feats.Get(f_id)->name : (char*)"nofeatname";
1172
1173 fprintf(stdout, "%s\t%s\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s\n",
1174       gseqname, tlabel, fname, start, end, strand, gffID, gffID);
1175
1176 for (int fi=0;fi<features->Count();fi++) {
1177   GFeature* feature=features->Get(fi);
1178   fname=names->feats.Get(feature->name_id)->name;
1179   GffExon* segs=feature->segs;
1180   int segcount=feature->segcount;
1181   if (segcount==0 || segs==NULL) continue;
1182   for (int i=0;i<segcount;i++) {
1183      if (segs[i].start==0) continue;
1184      if (segs[i].score) sprintf(scorestr,"%.2f", segs[i].score/100.00);
1185                  else strcpy(scorestr,".");
1186      fprintf(stdout,
1187         "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tParent=%s\n",
1188         gseqname, tlabel, fname, segs[i].start, segs[i].end, scorestr, strand, gffID);
1189      }
1190   }
1191 fflush(stdout);
1192 }
1193 #endif
1194 */
1195

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines