ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 5 | Line 5
5   //global set of feature names, attribute names etc.
6   // -- common for all GffObjs in current application!
7  
8 < const uint GFF_MAX_LOCUS = 4000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 < const uint GFF_MAX_EXON  =   20000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 1600000;
11 <
8 > const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9 > const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 > const uint GFF_MAX_INTRON= 6000000;
11 > bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13 < const int gff_fid_exon=1;
14 < const int gff_fid_CDS=2;
15 < bool gff_warns=true;
13 > const int gff_fid_transcript=1;
14 > const int gff_fid_exon=2;
15 > const int gff_fid_CDS=3; //never really used in GffObj ftype_id or subftype_id
16 > const uint gfo_flag_HAS_ERRORS       = 0x00000001;
17 > const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18 > const uint gfo_flag_IS_GENE          = 0x00000004;
19 > const uint gfo_flag_IS_TRANSCRIPT    = 0x00000008;
20 > const uint gfo_flag_FROM_GFF3        = 0x00000010;
21 > const uint gfo_flag_BY_EXON          = 0x00000020; //created by subfeature (exon) directly
22 > const uint gfo_flag_DISCARDED        = 0x00000100;
23 > const uint gfo_flag_LST_KEEP         = 0x00000200;
24 > const uint gfo_flag_LEVEL_MSK        = 0x00FF0000;
25 > const byte gfo_flagShift_LEVEL           = 16;
26  
27   void gffnames_ref(GffNames* &n) {
28    if (n==NULL) n=new GffNames();
# Line 29 | Line 39
39  
40   GffObj& g1=*((GffObj*)p1);
41   GffObj& g2=*((GffObj*)p2);
42 < if (g1.gseq_id==g2.gseq_id) return (int)(g1.start-g2.start);
43 <                else return (int)(g1.gseq_id-g2.gseq_id);
42 > if (g1.gseq_id==g2.gseq_id) {
43 >             if (g1.start!=g2.start)
44 >                    return (int)(g1.start-g2.start);
45 >               else if (g1.getLevel()!=g2.getLevel())
46 >                        return (int)(g1.getLevel()-g2.getLevel());
47 >                    else
48 >                        if (g1.end!=g2.end)
49 >                              return (int)(g1.end-g2.end);
50 >                        else return strcmp(g1.getID(), g2.getID());
51 >             }
52 >             else return (int)(g1.gseq_id-g2.gseq_id);
53   }
54  
55 < static char fnamelc[32];
55 > char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
56 > //parse a key attribute and remove it from the info string
57 > //(only works for attributes that have values following them after ' ' or '=')
58 > static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 > int lpre=strlen(pre);
60 > char cend=pre[lpre-1];
61 > char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 > if (pos==NULL) return NULL;
63 > char* findstart=info;
64 > //require word boundary on the left:
65 > while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 >    findstart=pos+lpre;
67 >    pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 >    }
69 > if (pos==NULL) return NULL;
70 > if (cend!=' ' && cend!='=') {
71 >    //require word boundary on the right:
72 >    while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 >       findstart=pos+lpre;
74 >       pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 >       }
76 >    }
77 > if (pos==NULL) return NULL;
78 > char* vp=pos+lpre;
79 > while (*vp==' ') vp++;
80 > if (*vp==';' || *vp==0)
81 >      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
82 > bool dq_enclosed=false; //value string enclosed by double quotes
83 > if (*vp=='"') {
84 >     dq_enclosed=true;
85 >     vp++;
86 >     }
87 > if (enforce_GTF2 && !dq_enclosed)
88 >      GError(GTF2_ERR,pre, dupline);
89 > char* vend=vp;
90 > if (dq_enclosed) {
91 >    while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
92 >    }
93 > else {
94 >    while (*vend!=';' && *vend!=0) vend++;
95 >    }
96 > if (enforce_GTF2 && *vend!='"')
97 >     GError(GTF2_ERR, pre, dupline);
98 > char *r=Gstrdup(vp, vend-1);
99 > //-- now remove this attribute from the info string
100 > while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
101 > if (*vend==0) vend--;
102 > for (char *src=vend, *dest=pos;;src++,dest++) {
103 >   *dest=*src;
104 >   if (*src==0) break;
105 >   }
106 > return r;
107 > }
108 >
109 > static char fnamelc[128];
110  
111   GffLine::GffLine(GffReader* reader, const char* l) {
112 < line=Gstrdup(l);
112 > llen=strlen(l);
113 > GMALLOC(line,llen+1);
114 > memcpy(line, l, llen+1);
115 > GMALLOC(dupline, llen+1);
116 > memcpy(dupline, l, llen+1);
117   skip=true;
118   gseqname=NULL;
119   track=NULL;
120   ftype=NULL;
121   info=NULL;
122 < Parent=NULL;
122 > _parents=NULL;
123 > _parents_len=0;
124 > num_parents=0;
125 > parents=NULL;
126 > is_gff3=false;
127   is_cds=false;
128 < is_mrna=false;
128 > is_transcript=false;
129   is_exon=false;
130 + is_gene=false;
131   exontype=0;
132 < gname=NULL;
132 > gene_id=NULL;
133 > gene_name=NULL;
134   qstart=0;
135   qend=0;
136   qlen=0;
54 exontype=0;
137   ID=NULL;
138   char* t[9];
139   int i=0;
# Line 82 | Line 164
164   p=t[4];
165   if (!parseUInt(p,fend))
166     GError("Error parsing end coordinate from GFF line:\n%s\n",l);
167 < if (fend<fstart) swap(fend,fstart);
167 > if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
168   p=t[5];
169   if (p[0]=='.' && p[1]==0) {
170    score=0;
# Line 94 | Line 176
176   strand=*t[6];
177   if (strand!='+' && strand!='-' && strand!='.')
178       GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l);
179 < phase=*t[7];
179 > phase=*t[7]; // must be '.', '0', '1' or '2'
180   ID=NULL;
99 Parent=NULL;
181   // exon/CDS/mrna filter
182 < strncpy(fnamelc, ftype, 31);
183 < fnamelc[31]=0;
182 > strncpy(fnamelc, ftype, 127);
183 > fnamelc[127]=0;
184   strlower(fnamelc); //convert to lower case
185 + bool is_t_data=false;
186   if (strstr(fnamelc, "utr")!=NULL) {
187     exontype=exgffUTR;
188     is_exon=true;
189 +   is_t_data=true;
190     }
191    else if (strstr(fnamelc, "exon")!=NULL) {
192     exontype=exgffExon;
193     is_exon=true;
194 +   is_t_data=true;
195     }
196 <  else if (startsWith(fnamelc, "stop") && strstr(fnamelc, "codon")!=NULL) {
196 >  else if (strstr(fnamelc, "stop") &&
197 >      (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){
198     exontype=exgffStop;
199 +   is_cds=true; //though some place it outside the last CDS segment
200 +   is_t_data=true;
201 +   }
202 +  else if (strstr(fnamelc, "start") &&
203 +      ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){
204 +   exontype=exgffStart;
205     is_cds=true;
206 +   is_t_data=true;
207     }
208   else if (strcmp(fnamelc, "cds")==0) {
209     exontype=exgffCDS;
210     is_cds=true;
211 +   is_t_data=true;
212     }
213 < else { //is_mrna is set only if the *current* line is a mRNA or transcript
214 <   is_mrna=(strcmp(fnamelc,"mrna")==0 ||
215 <          strcmp(fnamelc,"transcript")==0);
123 <   }
124 <
125 < if (reader->mrnaOnly) {
126 <   if (!is_mrna && !is_cds && !is_exon)
127 <                     return; //skip this line, unwanted feature name
128 <   }
129 < p=strstr(info,"ID=");
130 < if (p!=NULL) { //has ID attr
131 <   ID=p+3;
132 <   p=ID;
133 <   while (*p!=';' && *p!=0) p++;
134 <   ID=Gstrdup(ID, p-1);
135 <   //look for a name attr too:
136 <   p=strstr(info,"Name=");
137 <   if (p!=NULL) {
138 <     gname=p+5;
139 <     p=gname;
140 <     while (*p!=';' && *p!=0) p++;
141 <     gname=Gstrdup(gname, p-1);
142 <     }
213 > else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) {
214 >   is_gene=true;
215 >   is_t_data=true; //because its name will be attached to parented transcripts
216     }
217 < p=NULL;
218 < if (!is_mrna)
219 <     p=strifind(info,"Parent="); //don't care about the parent for mRNA features..
147 < if (p!=NULL) { //has Parent attr
148 <   Parent=p+7;
149 <   p=Parent;
150 <   while (*p!=';' && *p!=0) p++;
151 <   Parent=Gstrdup(Parent, p-1);
217 > else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) {
218 >   is_transcript=true;
219 >   is_t_data=true;
220     }
221 <  else if (ID==NULL) { //no "Parent=" and no "ID=", attempt GTF parsing instead
222 <   p=strstr(info,"transcript_id");
223 <   if (p!=NULL) { //GTF detected
224 <     p+=13;
225 <     //requires quotes
226 <     while (*p!='"' && *p!=0) p++;
227 <     if (*p==0) GError("Error parsing transcript_id (double quotes expected) at GTF line:\n%s\n",l);
228 <     p++;
229 <     Parent=p;
230 <     while (*p!='"' && *p!=0) p++;
163 <     if (*p==0) GError("Error parsing transcript_id (ending double quotes expected) at GTF line:\n%s\n",l);
164 <     if (is_mrna) { // RGASP GTF exception: a parent "transcript" feature preceding exon/CDS subfeatures
165 <        ID=Gstrdup(Parent, p-1);
166 <        Parent=NULL;
167 <        }
168 <       else {
169 <        Parent=Gstrdup(Parent, p-1);
221 >
222 > if (reader->transcriptsOnly && !is_t_data) {
223 >        char* id=extractAttr("ID=");
224 >        if (id==NULL) id=extractAttr("transcript_id");
225 >        //GMessage("Discarding non-transcript line:\n%s\n",l);
226 >        if (id!=NULL) {
227 >          reader->discarded_ids.Add(id, new int(1));
228 >          GFREE(id);
229 >          }
230 >        return; //skip this line, unwanted feature name
231          }
232 <     //check for gene_name or gene_id
233 <     //p=strstr(info, "gene_name");// this is preferred over gene_id
234 <     //if (p==NULL)
235 <     p=strstr(info,"gene_id");
236 <     if (p!=NULL) {
237 <       p+=7;//skip 'gene_id'
238 <       while (*p!='"' && *p!=0) p++;
239 <       if (*p==0) GError("Error parsing gene_id (double quotes expected) at GTF line:\n%s\n",l);
240 <       p++;
241 <       gname=p;
242 <       while (*p!='"' && *p!=0) p++;
243 <       if (*p==0) GError("Error parsing gene_id (ending double quotes expected) at GTF line:\n%s\n",l);
244 <       gname=Gstrdup(gname, p-1);
245 <       }
232 > ID=extractAttr("ID=");
233 > char* Parent=extractAttr("Parent=");
234 > is_gff3=(ID!=NULL || Parent!=NULL);
235 > if (is_gff3) {
236 >   //parse as GFF3
237 >    if (ID!=NULL) {
238 >       //has ID attr so it's likely to be a parent feature
239 >       //look for explicit gene name
240 >       gene_name=extractAttr("gene_name=",false);
241 >       if (gene_name==NULL) {
242 >           gene_name=extractAttr("geneName=",false);
243 >           if (gene_name==NULL) {
244 >               gene_name=extractAttr("gene_sym=",false);
245 >               if (gene_name==NULL) {
246 >                   gene_name=extractAttr("gene=",false);
247 >                   }
248 >               }
249 >           }
250 >       gene_id=extractAttr("geneID=",false);
251 >       if (gene_id==NULL) {
252 >          gene_id=extractAttr("gene_id=",false);
253 >          }
254 >       if (is_gene) {
255 >         //special case: keep the Name and ID attributes of the gene feature
256 >         if (gene_name==NULL)
257 >              gene_name=extractAttr("Name=");
258 >         if (gene_id==NULL) //the ID is also gene_id in this case
259 >              gene_id=Gstrdup(ID);
260 >         //skip=false;
261 >         //return;
262 >         GFREE(Parent); //TMI, we really don't care about gene Parents?
263 >         } //gene feature
264 >       }// has GFF3 ID
265 >   if (Parent!=NULL) {
266 >        //keep Parent attr
267 >         //parse multiple parents
268 >         num_parents=1;
269 >         p=Parent;
270 >         int last_delim_pos=-1;
271 >         while (*p!=';' && *p!=0) {
272 >             if (*p==',' && *(p+1)!=0 && *(p+1)!=';') {
273 >                 num_parents++;
274 >                 last_delim_pos=(p-Parent);
275 >                 }
276 >             p++;
277 >             }
278 >         _parents_len=p-Parent+1;
279 >         _parents=Parent;
280 >         GMALLOC(parents, num_parents*sizeof(char*));
281 >         parents[0]=_parents;
282 >         int i=1;
283 >         if (last_delim_pos>0) {
284 >           for (p=_parents+1;p<=_parents+last_delim_pos;p++) {
285 >              if (*p==',') {
286 >                 char* ep=p-1;
287 >                 while (*ep==' ' && ep>_parents) ep--;
288 >                 *(ep+1)=0; //end the string there
289 >                 parents[i]=p+1;
290 >                 i++;
291 >                 }
292 >              }
293 >           }
294 >         } //has Parent field
295 >   } //GFF3
296 >  else { // GTF-like expected
297 >   Parent=extractAttr("transcript_id");
298 >   if (Parent!=NULL) { //GTF2 format detected
299 >     if (is_transcript) {
300 >         // atypical GTF with a parent transcript line declared
301 >         ID=Parent;
302 >         Parent=NULL;
303 >         }
304 >     gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
305 >     gene_name=extractAttr("gene_name");
306 >     if (gene_name==NULL) {
307 >           gene_name=extractAttr("gene_sym");
308 >           if (gene_name==NULL)
309 >               gene_name=extractAttr("gene");
310 >           }
311       //prepare for parseAttr by adding '=' character instead of spaces for all attributes
312       //after the attribute name
313       p=info;
314       bool noed=true; //not edited after the last delim
315       bool nsp=false; //non-space found after last delim
316       while (*p!=0) {
317 <      if (*p==' ') {
318 <         if (nsp && noed) {
319 <           *p='=';
320 <            noed=false;
321 <            p++;
322 <            continue;
323 <            }
324 <         }
325 <      else nsp=true;
326 <      if (*p==';') { noed=true; nsp=false; }
327 <      p++;
328 <      }
329 <     } //gtf detected
330 <    else {//check for jigsaw or cufflinks format
331 <     char* fexon=strstr(fnamelc, "exon");
332 <     if (fexon!=NULL) {
317 >       if (*p==' ') {
318 >          if (nsp && noed) {
319 >             *p='=';
320 >             noed=false;
321 >             p++;
322 >             continue;
323 >             }
324 >           }
325 >         else nsp=true; //non-space
326 >       if (*p==';') { noed=true; nsp=false; }
327 >       p++;
328 >       }
329 >     } //GTF2 detected (no parent line)
330 >    else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format
331 >     //char* fexon=strstr(fnamelc, "exon");
332 >     //if (fexon!=NULL) {
333 >     if (exontype==exgffExon) {
334         if (startsWith(track,"jigsaw")) {
335 <        is_cds=true;
336 <        strcpy(track,"jigsaw");
337 <        p=strchr(info,';');
338 <        if (p==NULL) Parent=Gstrdup(info);
339 <           else { Parent=Gstrdup(info,p-1); info=p+1;  }
340 <        }
341 <       else if ((i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {//one word ID
335 >          is_cds=true;
336 >          strcpy(track,"jigsaw");
337 >          p=strchr(info,';');
338 >          if (p==NULL) { Parent=Gstrdup(info); info=NULL; }
339 >           else { Parent=Gstrdup(info,p-1);
340 >                  info=p+1;
341 >                }
342 >          }
343 >        } //exon feature?
344 >        if (Parent==NULL && exontype>=exgffCDS &&
345 >               (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) {
346 >          //one word ID ? really desperate attempt to parse it here
347            Parent=Gstrdup(info,info+i-1);
348 <        }
349 <
218 <      }
219 <      else GError("Error parsing Parent/ID at input line:\n%s\n",l);
348 >          info=NULL; //discard anything else on the line
349 >          }
350       }
351 <   }
352 < p=strstr(info,"Target=");
353 < if (p!=NULL) { //has Target attr
354 <   p+=7;
355 <   while (*p!=';' && *p!=0 && *p!=' ') p++;
226 <   if (*p!=' ') {
227 <      GError("Error parsing target coordinates from GFF line:\n%s\n",l);
351 >   if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature
352 >      _parents=Parent;
353 >      GMALLOC(parents,sizeof(char*));
354 >      num_parents=1;
355 >      parents[0]=_parents;
356        }
357 <   if (!parseUInt(p,qstart))
358 <     GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
359 <   if (*p!=' ') {
360 <      GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
357 >   } //GTF-like
358 >
359 > //parse other potentially useful features
360 > if (is_gff3) {
361 >   if ((p=strstr(info,"Target="))!=NULL) { //has Target attr
362 >      p+=7;
363 >      while (*p!=';' && *p!=0 && *p!=' ') p++;
364 >      if (*p!=' ') {
365 >         GError("Error parsing target coordinates from GFF line:\n%s\n",l);
366 >         }
367 >      if (!parseUInt(p,qstart))
368 >         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
369 >      if (*p!=' ') {
370 >         GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
371 >         }
372 >      p++;
373 >      if (!parseUInt(p,qend))
374 >         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
375        }
376 <   p++;
377 <   if (!parseUInt(p,qend))
378 <     GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
379 <   }
380 < else {
381 <   p=strifind(info,"Qreg=");
382 <   if (p!=NULL) { //has Qreg attr
241 <     p+=5;
242 <     if (!parseUInt(p,qstart))
243 <       GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
244 <     if (*p!='-') {
245 <        GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
246 <        }
247 <     p++;
248 <     if (!parseUInt(p,qend))
249 <       GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
250 <     if (*p=='|') {
376 >   if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr
377 >       p+=5;
378 >       if (!parseUInt(p,qstart))
379 >         GError("Error parsing target start coordinate from GFF line:\n%s\n",l);
380 >       if (*p!='-') {
381 >          GError("Error parsing next target coordinate from GFF line:\n%s\n",l);
382 >          }
383         p++;
384 <       if (!parseUInt(p,qlen))
385 <         GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
386 <       }
387 <     }//has Qreg attr
388 <   }
389 < if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
390 <   p+=5;
391 <   if (!parseUInt(p,qlen))
392 <       GError("Error parsing target length from GFF Qlen:\n%s\n",l);
393 <   }
384 >       if (!parseUInt(p,qend))
385 >         GError("Error parsing target end coordinate from GFF line:\n%s\n",l);
386 >       if (*p=='|' || *p==':') {
387 >         p++;
388 >         if (!parseUInt(p,qlen))
389 >           GError("Error parsing target length from GFF Qreg|: \n%s\n",l);
390 >         }
391 >       }//has Qreg attr
392 >   if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) {
393 >     p+=5;
394 >     if (!parseUInt(p,qlen))
395 >         GError("Error parsing target length from GFF Qlen:\n%s\n",l);
396 >     }
397 >   }//parsing some useful attributes in GFF3 records
398 > if (ID==NULL && parents==NULL) {
399 >      if (reader->gff_warns)
400 >          GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline);
401 >      return; //skip
402 >      }
403   skip=false;
404   }
405  
406 < int GffObj::addExon(GffLine* gl, bool keepAttr, bool noExonAttr) {
406 > int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
407    //this will make sure we have the right subftype_id!
408    int subf_id=-1;
409 <  if (ftype_id==gff_fid_mRNA) {
410 <     if (subftype_id<0) subftype_id=gff_fid_exon;
409 >  if (!isTranscript() && gl->is_cds && monoFeature()) {
410 >          isTranscript(true);
411 >          exon_ftype_id=gff_fid_exon;
412 >          if (exons.Count()==1) exons[0]->exontype=exgffExon;
413 >          }
414 >  if (isTranscript()) {
415 >     if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
416 >          if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
417 >                         else exon_ftype_id=names->feats.addName(gl->ftype);
418 >          }
419       //any recognized mRNA segment gets the generic "exon" type (also applies to CDS)
420 <     if (!gl->is_cds && !gl->is_exon)
421 <         //extraneous mRNA feature, will discard for now
420 >     if (gl->exontype==0 && !gl->is_transcript) {
421 >          //extraneous mRNA feature, discard
422 >          if (reader->gff_warns)
423 >            GMessage("Warning: discarding unrecognized transcript subfeature %s of %s\n",
424 >                gl->ftype, gffID);
425            return -1;
426 +          }
427       }
428 <  else { //other kind of parent feature, check this subf type
428 >  else { //non-mRNA parent feature, check this subf type
429      subf_id=names->feats.addName(gl->ftype);
430 <    if (subftype_id<0)
431 <       subftype_id=subf_id;
432 <    else {
433 <       if (subftype_id!=subf_id)
434 <         GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, only %s is kept\n",
435 <             names->feats.getName(subftype_id), names->feats.getName(subf_id),
436 <             gffID,names->feats.getName(subftype_id));
437 <         return -1; //skip this 2nd subfeature type for this parent!
438 <       }
439 <    }
440 <  if (gl->exontype==exgffUTR || gl->exontype==exgffStop)
441 <      udata=1;//merge 0-distance segments
430 >    if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
431 >       exon_ftype_id=subf_id;
432 >     else {
433 >       if (exon_ftype_id!=subf_id) {
434 >         //
435 >         if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
436 >            //the existing exon was just a dummy one created by default, discard it
437 >            exons.Clear();
438 >            covlen=0;
439 >            exon_ftype_id=subf_id; //allow the new subfeature to completely takeover
440 >            }
441 >         else { //multiple subfeatures, prefer those with
442 >             if (reader->gff_warns)
443 >               GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ",
444 >                  names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID);
445 >            if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures
446 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id));
447 >               exon_ftype_id=subf_id;
448 >               exons.Clear();
449 >               covlen=0;
450 >               }
451 >              else { //discard new feature
452 >               if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id));
453 >               return -1; //skip this 2nd subfeature type for this parent!
454 >               }
455 >            }
456 >         } //incoming subfeature is of different type
457 >       } //new subfeature type
458 >    } //non-mRNA parent
459    int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase,
460           gl->qstart,gl->qend, gl->is_cds, gl->exontype);
461 <  if (eidx>=0 && keepAttr) {
462 <      parseAttrs(exons[eidx]->attrs, gl->info, noExonAttr);
461 >  if (eidx<0) return eidx; //this should never happen
462 >  if (keepAttr) {
463 >     if (noExonAttr) {
464 >         if (attrs==NULL) //place the parsed attributes directly at transcript level
465 >           parseAttrs(attrs, gl->info);
466 >         }
467 >       else { //need all exon-level attributes
468 >         parseAttrs(exons[eidx]->attrs, gl->info, true);
469 >         }
470        }
471    return eidx;
472   }
473  
474  
298 void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
299  //oi is the index of the *first* overlapping segment found that must be enlarged
300  covlen-=exons[oi]->len();
301  if (segstart<exons[oi]->start)
302    exons[oi]->start=segstart;
303  if (qs<exons[oi]->qstart) exons[oi]->qstart=qs;
304  if (segend>exons[oi]->end)
305    exons[oi]->end=segend;
306  if (qe>exons[oi]->qend) exons[oi]->qend=qe;
307  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
308  if (sc!=0) exons[oi]->score=sc;
309  covlen+=exons[oi]->len();
310  //if (exons[oi]->exontype< exontype) -- always true
311  exons[oi]->exontype = exontype;
312  if (exontype==exgffCDS) exons[oi]->phase=fr;
313  //we must check if any more exons are also overlapping this
314  int ni=oi+1; //next exon index after oi
315  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
316     //only allow this if next segment is fully included, and a subordinate
317     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
318         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
319         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
320         exons.Delete(ni);
321         }
322      else {
323         GMessage("GFF Warning: overlapping feature segment (%d-%d) for GFF ID %s\n", segstart, segend, gffID);
324         hasErrors=true;
325         break;
326         }
327     }
328  // -- make sure any other related boundaries are updated:
329  start=exons.First()->start;
330  end=exons.Last()->end;
331  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
332    GSeqStat* gsd=(GSeqStat*)uptr;
333    if (start<gsd->mincoord) gsd->mincoord=start;
334    if (end>gsd->maxcoord) gsd->maxcoord=end;
335    }
336 }
337
475   int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) {
476    if (exons.Count()==0) {
477        if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS
478 <      if (subftype_id<0) {
479 <         subftype_id = (ftype_id==gff_fid_mRNA) ? gff_fid_exon : ftype_id;
478 >      if (exon_ftype_id<0) {
479 >         exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id;
480           }
481        }
482 <  if (iscds) { //update CDS anchors:
482 >  //special treatment of start/stop codon features, they might be broken/split between exons
483 >  //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC)
484 >  //so we should not trust the end coordinate for such features
485 >  if (exontype==exgffStart || exontype==exgffStop) {
486 >     if (strand=='-') segstart=segend;
487 >                else  segend=segstart;
488 >     if (exontype==exgffStart) {
489 >           if (CDstart==0 || segstart<CDstart) CDstart=segstart;
490 >           }
491 >         else {
492 >           if (segstart>CDend) CDend=segstart;
493 >           }
494 >     }
495 >    else if (iscds) { //update CDS anchors:
496       if (CDstart==0 || segstart<CDstart)  {
497             CDstart=segstart;
498             if (exontype==exgffCDS && strand=='+') CDphase=fr;
# Line 352 | Line 502
502             CDend=segend;
503             }
504       }
505 <   else { // !iscds
505 >   else { // not a CDS/start/stop
506       isCDS=false;
507       }
508    if (qs || qe) {
509      if (qs>qe) swap(qs,qe);
510      if (qs==0) qs=1;
511 <    }
511 >        }
512    int ovlen=0;
513 <  int oi=exonOverlapIdx(segstart, segend, &ovlen);
514 <  if (oi>=0) { //overlap existing segment
515 <     //only allow this for CDS within exon, stop_codon within exon, stop_codon within UTR,
516 <     //                        or stop_codon within CDS
517 <    if (exons[oi]->exontype>exontype &&
518 <         exons[oi]->start<=segstart && exons[oi]->end>=segend &&
519 <         !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
520 <          //larger segment given first, now the smaller included one
521 <          return oi; //only used to store attributes from current GffLine
522 <          }
523 <    if (exontype>exons[oi]->exontype &&
524 <         segstart<=exons[oi]->start && segend>=exons[oi]->end &&
525 <         !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
526 <           //smaller segment given first, so we have to enlarge it
527 <          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
528 <            //this must also check for overlapping next exon (oi+1)
529 <          return oi;
530 <          }
531 <    //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
532 <    //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
533 <    if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
534 <       GMessage("GFF Warning: discarded overlapping feature segment (%d-%d) for GFF ID %s\n",
535 <           segstart, segend, gffID);
536 <       hasErrors=true;
537 <       return false; //segment NOT added
538 <       }
539 <     /* --
540 <     else {
541 <       // else add the segment if the overlap is small and between two CDS segments
542 <       //we might want to add an attribute here with the slippage coordinate and size?
543 <       }
544 <     */
545 <    }//overlap of existing segment
546 <
513 >  if (exontype>0) { //check for overlaps between exon-type segments
514 >      int oi=exonOverlapIdx(segstart, segend, &ovlen);
515 >      if (oi>=0) { //overlap existing segment
516 >         if (ovlen==0) {
517 >                          //adjacent segments will be merged
518 >                          //e.g. CDS to (UTR|exon)
519 >                          if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
520 >                                  (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
521 >                                        expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
522 >                                        return oi;
523 >                                        }
524 >                          //CDS adjacent to stop_codon: UCSC does (did?) this
525 >                          if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
526 >                                  (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
527 >                                        expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
528 >                                        return oi;
529 >                                        }
530 >             }
531 >                 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
532 >         //                   start_codon within (CDS|exon)
533 >        if (exons[oi]->exontype>exontype &&
534 >             exons[oi]->start<=segstart && exons[oi]->end>=segend &&
535 >             !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
536 >              //larger segment given first, now the smaller included one is redundant
537 >              return oi; //only used to store attributes from current GffLine
538 >              }
539 >        if (exontype>exons[oi]->exontype &&
540 >             segstart<=exons[oi]->start && segend>=exons[oi]->end &&
541 >             !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
542 >               //smaller segment given first, so we have to enlarge it
543 >                          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
544 >                                //this should also check for overlapping next exon (oi+1) ?
545 >              return oi;
546 >              }
547 >        //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
548 >        //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
549 >                //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
550 >                // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
551 >                /*
552 >                if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
553 >                   if (gff_show_warnings)
554 >                           GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
555 >                           segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
556 >                   hasErrors(true);
557 >                   return -1; //segment NOT added
558 >                   }
559 >                */
560 >
561 >                 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
562 >                  if (gff_show_warnings)
563 >                         GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
564 >                                 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
565 >                        expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
566 >                        return oi;
567 >                 }
568 >                // else add the segment if the overlap is small and between two CDS segments
569 >                //TODO: we might want to add an attribute here with the slippage coordinate and size?
570 >        covlen-=ovlen;
571 >                }//overlap or adjacent to existing segment
572 >           } //check for overlap
573     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
574     // create & add the new segment
575     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
576     int eidx=exons.Add(enew);
577     if (eidx<0) {
578 <     GMessage("GFF Warning: duplicate segment %d-%d not added for GFF ID %s!\n",
578 >    //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms
579 >     if (gff_show_warnings)
580 >       GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n",
581              segstart, segend, gffID);
582 +     delete enew;
583 +     hasErrors(true);
584       return -1;            
585       }
586     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
# Line 410 | Line 590
590         GSeqStat* gsd=(GSeqStat*)uptr;
591         if (start<gsd->mincoord) gsd->mincoord=start;
592         if (end>gsd->maxcoord) gsd->maxcoord=end;
593 +       if (this->len()>gsd->maxfeat_len) {
594 +          gsd->maxfeat_len=this->len();
595 +          gsd->maxfeat=this;
596 +          }
597         }
598     return eidx;
599   }
600  
601 + void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) {
602 +  //oi is the index of the *first* overlapping segment found that must be enlarged
603 +  covlen-=exons[oi]->len();
604 +  if (segstart<exons[oi]->start)
605 +    exons[oi]->start=segstart;
606 +  if (qs && qs<exons[oi]->qstart) exons[oi]->qstart=qs;
607 +  if (segend>exons[oi]->end)
608 +    exons[oi]->end=segend;
609 +  if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe;
610 +  //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse
611 +  if (sc!=0) exons[oi]->score=sc;
612 +  covlen+=exons[oi]->len();
613 +  //if (exons[oi]->exontype< exontype) -- always true
614 +  exons[oi]->exontype = exontype;
615 +  if (exontype==exgffCDS) exons[oi]->phase=fr;
616 +  //we must check if any more exons are also overlapping this
617 +  int ni=oi+1; //next exon index after oi
618 +  while (ni<exons.Count() && segend>=exons[ni]->start) { // next segment overlaps new enlarged segment
619 +     //only allow this if next segment is fully included, and a subordinate
620 +     if (exons[ni]->exontype<exontype && exons[ni]->end<=segend) {
621 + /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out
622 + chr1    hg18_knownGene  start_codon     69806911        69806913        0.000000        +       .
623 + chr1    hg18_knownGene  CDS     69806911        69806912        0.000000        +       0
624 + chr1    hg18_knownGene  exon    69805456        69806912        0.000000        +       .    
625 + */
626 +         if (exons[ni]->qstart<exons[oi]->qstart) exons[oi]->qstart=exons[ni]->qstart;
627 +         if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend;
628 +         exons.Delete(ni);
629 +         }
630 +      else {
631 +         if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n",
632 +                exons[ni]->start, exons[ni]->end, segstart, segend, gffID);
633 +         //hasErrors(true);
634 +         break;
635 +         }
636 +     }
637 +  // -- make sure any other related boundaries are updated:
638 +  start=exons.First()->start;
639 +  end=exons.Last()->end;
640 +  if (uptr!=NULL) { //collect stats about the underlying genomic sequence
641 +    GSeqStat* gsd=(GSeqStat*)uptr;
642 +    if (start<gsd->mincoord) gsd->mincoord=start;
643 +    if (end>gsd->maxcoord) gsd->maxcoord=end;
644 +    if (this->len()>gsd->maxfeat_len) {
645 +        gsd->maxfeat_len=this->len();
646 +        gsd->maxfeat=this;
647 +        }
648 +    }
649 + }
650 +
651   void GffObj::removeExon(int idx) {
652    /*
653     if (idx==0 && segs[0].start==gstart)
# Line 431 | Line 665
665    if (isCDS) { CDstart=start; CDend=end; }
666   }
667  
668 + void GffObj::removeExon(GffExon* p) {
669 +  for (int idx=0;idx<exons.Count();idx++) {
670 +     if (exons[idx]==p) {
671 +        int segstart=exons[idx]->start;
672 +        int segend=exons[idx]->end;
673 +        exons.Delete(idx);
674 +        covlen -= (int)(segend-segstart)+1;
675 +        start=exons.First()->start;
676 +        end=exons.Last()->end;
677 +        if (isCDS) { CDstart=start; CDend=end; }
678 +        return;
679 +        }
680 +     }
681 + }
682 +
683 +
684 +
685   GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr):
686 <     GSeg(0,0), exons(true,true,true) {
687 < xstart=0;
688 < xend=0;
689 < xstatus=0;
690 < partial=false;
691 < isCDS=false;
692 < uptr=NULL;
693 < ulink=NULL;
694 < udata=0;
695 < CDstart=0;
696 < CDend=0;
697 < gname=NULL;
698 < attrs=NULL;
699 < gffID=NULL;
700 < track_id=-1;
701 < gseq_id=-1;
702 < ftype_id=-1;
703 < subftype_id=-1;
704 < hasErrors=false;
705 < if (gfrd==NULL)
686 >     GSeg(0,0), exons(true,true,false), children(1,false) {
687 >  xstart=0;
688 >  xend=0;
689 >  xstatus=0;
690 >  partial=false;
691 >  isCDS=false;
692 >  uptr=NULL;
693 >  ulink=NULL;
694 >  parent=NULL;
695 >  udata=0;
696 >  flags=0;
697 >  CDstart=0;
698 >  CDend=0;
699 >  CDphase=0;
700 >  geneID=NULL;
701 >  gene_name=NULL;
702 >  attrs=NULL;
703 >  gffID=NULL;
704 >  track_id=-1;
705 >  gseq_id=-1;
706 >  ftype_id=-1;
707 >  exon_ftype_id=-1;
708 >  strand='.';
709 >  if (gfrd==NULL)
710      GError("Cannot use this GffObj constructor with a NULL GffReader!\n");
711 < gffnames_ref(names);
712 < if (gfrd->names==NULL) gfrd->names=names;
713 < qlen=0;qstart=0;qend=0;
714 < gscore=0;
715 < uscore=0;
716 < covlen=0;
717 < qcov=0;
718 < if (gffline->Parent!=NULL) {
719 <    //GTF style -- subfeature given directly
720 <    if (gffline->is_cds || gffline->is_exon)
721 <         ftype_id=gff_fid_mRNA;
722 <      else {
723 <        //group of other subfeatures of type ftype:
724 <        ftype_id=names->feats.addName(gffline->ftype);
711 >  gffnames_ref(names);
712 >  if (gfrd->names==NULL) gfrd->names=names;
713 >  //qlen=0;qstart=0;qend=0;
714 >  gscore=0;
715 >  uscore=0;
716 >  covlen=0;
717 >  qcov=0;
718 >  start=gffline->fstart;
719 >  end=gffline->fend;
720 >  gseq_id=names->gseqs.addName(gffline->gseqname);
721 >  track_id=names->tracks.addName(gffline->track);
722 >  strand=gffline->strand;
723 >  qlen=gffline->qlen;
724 >  qstart=gffline->qstart;
725 >  qend=gffline->qend;
726 >  //setup flags from gffline
727 >  isCDS=gffline->is_cds; //for now
728 >  isGene(gffline->is_gene);
729 >  isTranscript(gffline->is_transcript || gffline->exontype!=0);
730 >  fromGff3(gffline->is_gff3);
731 >
732 >  if (gffline->parents!=NULL) {
733 >    //GTF style -- create a GffObj directly by subfeature
734 >    //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
735 >    if (gffline->exontype!=0) { //recognized exon-like feature
736 >       ftype_id=gff_fid_transcript; //so this is some sort of transcript
737 >       exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons
738 >       }
739 >     else {//unrecognized subfeatures
740 >       //make this GffObj of the same feature type
741 >       ftype_id=names->feats.addName(gffline->ftype);
742 >       }
743 >    if (gffline->ID==NULL) { //typical GTF
744 >        gffID=Gstrdup(gffline->parents[0]);
745 >        this->createdByExon(true);
746 >        //this is likely the first exon/segment of the feature
747 >        addExon(gfrd, gffline, keepAttr, noExonAttr);
748          }
749 <    gffID=gffline->Parent;
750 <    gffline->Parent=NULL; //just take over
751 <    if (gffline->gname!=NULL) {
752 <        gname=gffline->gname;
753 <        gffline->gname=NULL;
749 >      else { //a parented feature with an ID -- probably an orphan GFF3 line
750 >        if (gffline->is_gff3 && gffline->exontype!=0) {
751 >             //premature exon given before its parent transcript
752 >             //create the transcript entry here
753 >             gffID=Gstrdup(gffline->parents[0]);
754 >             this->createdByExon(true);
755 >             //this is the first exon/segment of the transcript
756 >             addExon(gfrd, gffline, keepAttr, noExonAttr);
757 >             }
758 >            else { //unrecognized non-exon feature ? use the ID instead
759 >             gffID=Gstrdup(gffline->ID);
760 >             if (keepAttr) this->parseAttrs(attrs, gffline->info);
761 >             }
762          }
763 <    gseq_id=names->gseqs.addName(gffline->gseqname);
764 <    track_id=names->tracks.addName(gffline->track);
765 <    strand=gffline->strand;
480 <    qlen=gffline->qlen;
481 <    start=gffline->fstart;
482 <    end=gffline->fend;
483 <    isCDS=gffline->is_cds; //for now
484 <    addExon(gffline, keepAttr, noExonAttr);
485 <    if (keepAttr && noExonAttr) {
486 <      //simply move the attrs from this first exon
487 <      //to the transcript
488 <      if (exons.First()->attrs!=NULL) {
489 <         attrs=exons.First()->attrs;
490 <         exons.First()->attrs=NULL;
491 <         }
492 <      }
493 <    } //GTF line
494 < else { //GffReader made sure this is a parent line (no parent)
495 <    //even for a mRNA with a Parent= line
763 >    } //subfeature given directly
764 >  else { //gffline->parents==NULL
765 >    //create a parent feature in its own right
766      gscore=gffline->score;
767      if (gffline->ID==NULL || gffline->ID[0]==0)
768 <       GError("Error: no ID found for GFF record start\n");
769 <    gffID=gffline->ID; //there must be an ID here
770 <    if (gffline->is_mrna) ftype_id=gff_fid_mRNA;
771 <        else ftype_id=names->feats.addName(gffline->ftype);
772 <    gffline->ID=NULL; //steal it
773 <    if (gffline->gname!=NULL) {
774 <        gname=gffline->gname;
775 <        gffline->gname=NULL;
768 >      GError("Error: no ID found for GFF record start\n");
769 >    gffID=Gstrdup(gffline->ID); //there must be an ID here
770 >    //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
771 >      //else
772 >    ftype_id=names->feats.addName(gffline->ftype);
773 >    if (gffline->is_transcript)
774 >      exon_ftype_id=gff_fid_exon;
775 >
776 >    if (keepAttr) this->parseAttrs(attrs, gffline->info);
777 >    }//no parent
778 >
779 >  if (gffline->gene_name!=NULL) {
780 >     gene_name=Gstrdup(gffline->gene_name);
781 >     }
782 >  if (gffline->gene_id!=NULL) {
783 >     geneID=Gstrdup(gffline->gene_id);
784 >     }
785 >
786 >  GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
787 >  uptr=gsd;
788 >  if (start<gsd->mincoord) gsd->mincoord=start;
789 >  if (end>gsd->maxcoord) gsd->maxcoord=end;
790 >    if (this->len()>gsd->maxfeat_len) {
791 >        gsd->maxfeat_len=this->len();
792 >        gsd->maxfeat=this;
793          }
507    start=gffline->fstart;
508    end=gffline->fend;
509    gseq_id=names->gseqs.addName(gffline->gseqname);
510    track_id=names->tracks.addName(gffline->track);
511    qlen=gffline->qlen;
512    qstart=gffline->qstart;
513    qend=gffline->qend;
514    strand=gffline->strand;
515    if (keepAttr) this->parseAttrs(attrs, gffline->info, noExonAttr);
516    }
517 GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
518 uptr=gsd;
519 gsd->gflst.Add(this);
520 if (start<gsd->mincoord) gsd->mincoord=start;
521 if (end>gsd->maxcoord) gsd->maxcoord=end;
522 gfrd->gfoAdd(gffID, gffline->gseqname, this);
794   }
795  
525
796   GffLine* GffReader::nextGffLine() {
797   if (gffline!=NULL) return gffline; //caller should free gffline after processing
798   while (gffline==NULL) {
529    //const char* l=linebuf->getLine();
799      int llen=0;
800      buflen=GFF_LINELEN-1;
801      char* l=fgetline(linebuf, buflen, fh, &fpos, &llen);
# Line 540 | Line 809
809      if (gffline->skip) {
810         delete gffline;
811         gffline=NULL;
812 +       continue;
813         }
814 +    if (gffline->ID==NULL && gffline->parents==NULL)  { //it must have an ID
815 +        //this might not be needed, already checked in the GffLine constructor
816 +        if (gff_warns)
817 +            GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n");
818 +        delete gffline;
819 +        gffline=NULL;
820 +        //continue;
821 +        }
822      }
823   return gffline;
824   }
825  
548
549 GffObj* GffReader::parse(bool keepAttr, bool noExonAttr) { //<- do not use this!
550   //parses one parent feature at a time, assuming absolute grouping of subfeatures
551   //!ASSUMES that subfeatures (exons) are properly grouped together
552   //any interleaving exons from other transcripts will terminate the parsing of the current feature!
553  GffObj* gfo=NULL;
554  while (nextGffLine()!=NULL) {
555    if (gfo==NULL) {//record starts fresh here
556       gfo=new GffObj(this, gffline, keepAttr, noExonAttr);
557       delete gffline; gffline=NULL;
558       continue;
559       }
560 // -- gfo is not NULL from here --
561    if (gffline->Parent==NULL) {// new parent feature starting here
562       //new record start, return what we have so far,
563       //gffline was NOT deleted, so it will be used for the next parse() call
564       return ((gfo==NULL) ? NULL : gfo->finalize());
565       }
566    //-- has a Parent so it's a subfeature segment (exon/CDS/other subfeature)
567      // is it a subfeature of the current gf?
568    if (strcmp(gffline->Parent, gfo->gffID)==0) {
569          //yes, add it
570          gfo->addExon(gffline, !noExonAttr, noExonAttr);
571          delete gffline; gffline=NULL;
572          continue;
573          }
574      // is it a subfeature of a previously loaded gfo?
575    GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
576    if (prevgfo==NULL)
577           return ((gfo==NULL) ? NULL : gfo->finalize()); // new subfeature, gffline will be used for the next parse()
578    //this is for an earlier parent
579    prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
580    delete gffline;
581    gffline=NULL;
582    } //while reading gfflines
583  return ((gfo==NULL) ? NULL : gfo->finalize());
584 }
826   char* GffReader::gfoBuildId(const char* id, const char* ctg) {
827   //caller must free the returned pointer
828   char* buf=NULL;
# Line 599 | Line 840
840   GFREE(buf);
841   }
842  
843 < void GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo) {
843 > //Warning: if gflst gets altered, idx becomes obsolete
844 > GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
845   char* buf=gfoBuildId(id,ctg);
846 < phash.Add(buf, gfo);
846 > GfoHolder* r=new GfoHolder(gfo,idx);
847 > phash.Add(buf, r);
848   GFREE(buf);
849 + return r;
850   }
851 < GffObj* GffReader::gfoFind(const char* id, const char* ctg) {
851 >
852 > GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
853   char* buf=gfoBuildId(id,ctg);
854 < GffObj* r=phash.Find(buf);
854 > GfoHolder* r=phash.Find(buf);
855   GFREE(buf);
856   return r;
857   }
858  
859 + GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
860 +  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
861 +  GfoHolder* r=NULL;
862 +  if (replaceidx>=0) {
863 +     gflst.Put(replaceidx,newgfo);
864 +     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
865 +     }
866 +   else {
867 +     int gfoidx=gflst.Add(newgfo);
868 +     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
869 +     }
870 +  if (gff_warns) {
871 +    int* pcount=tids.Find(newgfo->gffID);
872 +    if (pcount!=NULL) {
873 +       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
874 +       (*pcount)++;
875 +       }
876 +     else {
877 +       tids.Add(newgfo->gffID,new int(1));
878 +       }
879 +    }
880 +  return r;
881 + }
882  
883 < void GffReader::parseAll(GffRecFunc* gproc, bool keepAttr, bool noExonAttr, void* userptr1, void* userptr2) {
884 <  //WARNING: this is all messed up if the GFF lines are NOT grouped by parent feature
885 <  //!!! do not use this !!!
886 <  //iterates through all mappings in the input file
887 <  //calling gproc with each parsed mapping
888 <    GffObj* gfo;
889 <    while ((gfo=this->parse(keepAttr,noExonAttr))!=NULL) { //a valid gff record was parsed
890 <        if (gfo->empty()) { //shouldn't happen!
891 <             delete gfo;
892 <             gfo=NULL;
893 <             continue;
894 <             }
895 <     if (gproc(gfo, userptr1, userptr2)) {
896 <             //true returned from GfProcFunc means no longer needed
897 <              gfoRemove(gfo->gffID, gfo->getGSeqName());
898 <              delete gfo;
899 <              }
900 <         else {
901 <              gflst.Add(gfo);
902 <              }
903 <     gfo=NULL;
904 <     } //while records are parsed
905 <    phash.Clear();
883 > GfoHolder* GffReader::updateParent(GfoHolder* newgfh, GffObj* parent) {
884 >  //assert(parent);
885 >  //assert(newgfo);
886 >  parent->children.Add(newgfh->gffobj);
887 >  if (newgfh->gffobj->parent==NULL) newgfh->gffobj->parent=parent;
888 >  newgfh->gffobj->setLevel(parent->getLevel()+1);
889 >  if (parent->isGene()) {
890 >      if (parent->gene_name!=NULL && newgfh->gffobj->gene_name==NULL)
891 >        newgfh->gffobj->gene_name=Gstrdup(parent->gene_name);
892 >      if (parent->geneID!=NULL && newgfh->gffobj->geneID==NULL)
893 >        newgfh->gffobj->geneID=Gstrdup(parent->geneID);
894 >      }
895 >
896 >  return newgfh;
897 > }
898 >
899 > GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
900 >                          GffObj* parent, GffExon* pexon) {
901 >  GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
902 >  GfoHolder* r=NULL;
903 >  int gfoidx=gflst.Add(newgfo);
904 >  r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
905 >  if (parent!=NULL) {
906 >    updateParent(r, parent);
907 >    if (pexon!=NULL) parent->removeExon(pexon);
908 >    }
909 >  if (gff_warns) {
910 >    int* pcount=tids.Find(newgfo->gffID);
911 >    if (pcount!=NULL) {
912 >       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
913 >       (*pcount)++;
914 >       }
915 >     else {
916 >       tids.Add(newgfo->gffID,new int(1));
917 >       }
918 >    }
919 >  return r;
920 > }
921 >
922 > GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
923 >                                         bool keepAttr) {
924 > if (prevgfo==NULL) return NULL;
925 > prevgfo->gffobj->createdByExon(false);
926 > prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
927 > prevgfo->gffobj->start=gffline->fstart;
928 > prevgfo->gffobj->end=gffline->fend;
929 > prevgfo->gffobj->isGene(gffline->is_gene);
930 > prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
931 > prevgfo->gffobj->fromGff3(gffline->is_gff3);
932 > if (keepAttr) {
933 >   if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
934 >   prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
935 >   }
936 > return prevgfo;
937 > }
938 >
939 >
940 > bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
941 >  bool r=true;
942 >  if (gffline->strand!=prevgfo->gffobj->strand) {
943 >     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
944 >        prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
945 >      r=false;
946 >     }
947 >  int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
948 >                      ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
949 >                         0 );
950 >  if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
951 >    GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffobj->gffID);
952 >    //validation_errors = true;
953 >    r=false;
954 >    if (!gff_warns) exit(1);
955 >    }
956 >  int eidx=prevgfo->gffobj->addExon(this, gffline, !noExonAttr, noExonAttr);
957 >  if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0)
958 >      subfPoolAdd(pex, prevgfo);
959 >  return r;
960 > }
961 >
962 > CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash<CNonExon>& pex, char*& subp_name) {
963 >  CNonExon* subp=NULL;
964 >  subp_name=NULL;
965 >  for (int i=0;i<gffline->num_parents;i++) {
966 >    if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
967 >        continue;
968 >    subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. mRNA name
969 >    subp=pex.Find(subp_name);
970 >    if (subp!=NULL)
971 >       return subp;
972 >    GFREE(subp_name);
973 >    }
974 >  return NULL;
975 > }
976 >
977 > void GffReader::subfPoolAdd(GHash<CNonExon>& pex, GfoHolder* newgfo) {
978 > //this might become a parent feature later
979 > if (newgfo->gffobj->exons.Count()>0) {
980 >   char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname);
981 >   pex.Add(xbuf, new CNonExon(newgfo->idx, newgfo->gffobj,
982 >       newgfo->gffobj->exons[0], gffline));
983 >   GFREE(xbuf);
984 >   }
985   }
986  
987 + GfoHolder* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash<CNonExon>& pex,
988 +    bool keepAttr, bool noExonAttr) {
989 +  GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene)
990 +  if (prevp!=gflst[subp->idx])
991 +    GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID);
992 +  subp->gffline->discardParent();
993 +  GfoHolder* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon);
994 +  pex.Remove(subp_name); //no longer a potential parent, moved it to phash already
995 +  prevp->promotedChildren(true);
996 +  return gfoh; //returns the holder of newly promoted feature
997 + }
998  
999 + //have to parse the whole file because exons can be scattered all over
1000   void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1001 +  bool validation_errors = false;
1002 +  //loc_debug=false;
1003 +  GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1004 +                     //and thus could become promoted to parent features
1005    while (nextGffLine()!=NULL) {
1006 <    if (gffline->Parent==NULL) {//no parent, new GFF3-like record starting
1007 <       //check for uniqueness of gffline->ID in phash !
1008 <       GffObj* f=gfoFind(gffline->ID, gffline->gseqname);
1009 <       if (f!=NULL) {
1010 <            GError("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1006 >       //seen this gff ID before?
1007 >     GfoHolder* prevseen=NULL;
1008 >     if (gffline->ID) //GFF3
1009 >         prevseen=gfoFind(gffline->ID, gffline->gseqname);
1010 >     if (prevseen!=NULL) {
1011 >            if (prevseen->gffobj->createdByExon()) {
1012 >                updateGffRec(prevseen, gffline, keepAttr);
1013 >                }
1014 >             else {
1015 >                GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1016 >                validation_errors = true;
1017 >                if (gff_warns) {
1018 >                       delete gffline; gffline=NULL; continue;
1019 >                       }
1020 >                   else exit(1);
1021 >                }
1022              }
1023 <       gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1023 >    if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1024 >       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1025         }
1026 <    else { //--- it's a subfeature (exon/CDS/other):
1027 <       GffObj* prevgfo=gfoFind(gffline->Parent, gffline->gseqname);
1028 <       if (prevgfo!=NULL) { //exon of a previously seen GffObj
1029 <                 if (gffline->strand!=prevgfo->strand) {
1030 <                    GError("Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
1031 <                       prevgfo->gffID, prevgfo->getGSeqName());
1032 <                    }
1033 <                 int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end :
1034 <                                     ((gffline->fend<prevgfo->start)? prevgfo->start-gffline->fend :
1035 <                                        0 );
1036 <                 if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID
1037 <                   GError("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID);
1026 >    else { //--- it's a parented feature (could still be a mRNA)
1027 >       bool found_parent=false;
1028 >       GfoHolder* newgfo=prevseen;
1029 >       for (int i=0;i<gffline->num_parents;i++) {
1030 >            if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1031 >                continue; //skipping discarded parent feature
1032 >            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1033 >            if (parentgfo!=NULL) { //parent GffObj parsed earlier
1034 >                   found_parent=true;
1035 >                   if (parentgfo->gffobj->isGene() && gffline->is_transcript
1036 >                                   && gffline->exontype==0) {
1037 >                       //not an exon, but a transcript parented by a gene
1038 >                       if (newgfo) {
1039 >                           updateParent(newgfo, parentgfo->gffobj);
1040 >                           }
1041 >                         else {
1042 >                           newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo->gffobj);
1043 >                           }
1044 >                       }
1045 >                   else { //potential exon subfeature
1046 >                       if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1047 >                         validation_errors=true;
1048 >                       }
1049                     }
1050 <                 prevgfo->addExon(gffline, !noExonAttr, noExonAttr);
1051 <                 }
1052 <            else {//new GTF-like record starting here with a subfeature
1053 <                 gflst.Add(new GffObj(this, gffline, keepAttr, noExonAttr));
1054 <                 //even those with errors will be added here!
1055 <                 }
1056 <       } //subfeature
1057 <      //--
1058 <    delete gffline;
1059 <    gffline=NULL;
1060 <    }//while
1061 <  for (int i=0;i<gflst.Count();i++) {
1062 <    gflst[i]->finalize(mergeCloseExons); //finalize the parsing - also merge close exons/features if so requested
1063 <    }
1050 >            } //for each parsed parent Id
1051 >       if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1052 >             //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1053 >            //check if this feature isn't parented by a previously stored "exon" subfeature
1054 >            char* subp_name=NULL;
1055 >            CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
1056 >            if (subp!=NULL) { //found a subfeature that is the parent of this gffline
1057 >               //promote that subfeature to a full GffObj
1058 >               GfoHolder* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr);
1059 >               //add current gffline as an exon of the newly promoted subfeature
1060 >               if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1061 >                      validation_errors=true;
1062 >               }
1063 >              else { //no parent seen before, create one directly with this exon
1064 >               //loc_debug=true;
1065 >               GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1066 >               if (gffline->ID!=NULL && gffline->exontype==0)
1067 >                     subfPoolAdd(pex, newgfo);
1068 >               //even those with errors will be added here!
1069 >               }
1070 >            GFREE(subp_name);
1071 >            } //no previous parent found
1072 >       } //parented feature
1073 >        //--
1074 >      delete gffline;
1075 >      gffline=NULL;
1076 >      }//while gff lines
1077 >  gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed
1078   // all gff records are now loaded in GList gflst
1079   // so we can free the hash
1080    phash.Clear();
1081 +  tids.Clear();
1082 +  if (validation_errors) {
1083 +    exit(1);
1084 +    }
1085   }
1086  
1087 < //this may be called prematurely if exon records are not grouped by parent
1088 < GffObj* GffObj::finalize(bool mergeCloseExons) {
1087 > GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) {
1088 > //merge
1089 > //always merge adjacent or overlapping segments
1090 > //but if mergeCloseExons then merge even when distance is up to 5 bases
1091   udata=0;
1092   uptr=NULL;
1093 < //TODO: merge adjacent or close segments - for mRNAs
1094 < //must merge adjacent features, and optionally small gaps
1095 < if (ftype_id==gff_fid_mRNA) {
1093 > if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) {
1094 >       isDiscarded(true);
1095 >       }
1096 > if (ftype_id==gff_fid_transcript && CDstart>0) {
1097 >    ftype_id=gff_fid_mRNA;
1098 >    //exon_ftype_id=gff_fid_exon;
1099 >    }
1100 > //if (ftype_id==gff_fid_mRNA || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1101 > if (isTranscript() || exon_ftype_id==gff_fid_exon || mergeCloseExons) {
1102     int mindist=mergeCloseExons ? 5:1;
1103     for (int i=0;i<exons.Count()-1;i++) {
1104       int ni=i+1;
1105       uint mend=exons[i]->end;
1106       while (ni<exons.Count()) {
1107         int dist=(int)(exons[ni]->start-mend);
1108 <       if (dist<0 || dist>mindist) break; //no merging with next segment
1108 >       if (dist>mindist) break; //no merging with next segment
1109 >       if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) {
1110 >          GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n",
1111 >               gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end);
1112 >          }
1113         mend=exons[ni]->end;
1114 +       covlen-=exons[i]->len();
1115         exons[i]->end=mend;
1116 +       covlen+=exons[i]->len();
1117 +       covlen-=exons[ni]->len();
1118         if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL ||
1119              exons[i]->attrs->Count()<exons[ni]->attrs->Count())) {
1120                //use the other exon attributes, if more
# Line 707 | Line 1126
1126         } //check for merge with next exon
1127       } //for each exon
1128     }
1129 + //attribute reduction for GTF records
1130 + if (keepAttrs && !noExonAttr && !fromGff3()
1131 +          && exons.Count()>0 && exons[0]->attrs!=NULL) {
1132 +   bool attrs_discarded=false;
1133 +   for (int a=0;a<exons[0]->attrs->Count();a++) {
1134 +      int attr_name_id=exons[0]->attrs->Get(a)->attr_id;
1135 +      char* attr_name=names->attrs.getName(attr_name_id);
1136 +      char* attr_val =exons[0]->attrs->Get(a)->attr_val;
1137 +      bool sameExonAttr=true;
1138 +      for (int i=1;i<exons.Count();i++) {
1139 +         char* ov=exons[i]->getAttr(attr_name_id);
1140 +         if (ov==NULL || (strcmp(ov,attr_val)!=0)) {
1141 +             sameExonAttr=false;
1142 +             break;
1143 +             }
1144 +         }
1145 +      if (sameExonAttr) {
1146 +             //delete this attribute from exons level
1147 +             attrs_discarded=true;
1148 +             this->addAttr(attr_name, attr_val);
1149 +             for (int i=1;i<exons.Count();i++) {
1150 +                 removeExonAttr(*(exons[i]), attr_name_id);
1151 +                 }
1152 +             exons[0]->attrs->freeItem(a);
1153 +             }
1154 +      }
1155 +   if (attrs_discarded) exons[0]->attrs->Pack();
1156 +   }
1157   return this;
1158   }
1159  
1160 < void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool noExonAttr) {
1160 > void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) {
1161    if (names==NULL)
1162       GError(ERR_NULL_GFNAMES, "parseAttrs()");
1163    if (atrlist==NULL)
# Line 730 | Line 1177
1177      char* ech=strchr(start,'=');
1178      if (ech!=NULL) { // attr=value format found
1179         *ech='\0';
1180 <       if (strcmp(start, "ID")==0 || strcmp(start,"Parent")==0 || strcmp(start,"Name")==0 ||
1181 <            strcmp(start,"transcript_id")==0 || strcmp(start,"gene_id")==0)
1182 <          { start=pch; continue; } //skip this already recognized and stored attribute
1183 <       if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1180 >       //if (noExonAttr && (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0)) { start=pch; continue; }
1181 >       if (strcmp(start, "exon_number")==0 || strcmp(start, "exon")==0 ||
1182 >              strcmp(start, "exon_id")==0)
1183 >           { start=pch; continue; }
1184         ech++;
1185         while (*ech==' ' && ech<endinfo) ech++;//skip extra spaces after the '='
1186 <       atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1186 >       //atrlist->Add(new GffAttr(names->attrs.addName(start),ech));
1187 >       //make sure we don't add the same attribute more than once
1188 >       if (isExon && (strcmp(start, "protein_id")==0)) {
1189 >             //Ensembl special case
1190 >             this->addAttr(start, ech);
1191 >             start=pch;
1192 >             continue;
1193 >             }
1194 >       atrlist->add_or_update(names, start, ech);
1195         }
1196        /*
1197        else { //not an attr=value format
# Line 748 | Line 1203
1203    if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; }
1204   }
1205  
1206 < void GffObj::addAttr(const char* attrname, char* attrvalue) {
1206 > void GffObj::addAttr(const char* attrname, const char* attrvalue) {
1207    if (this->attrs==NULL)
1208        this->attrs=new GffAttrs();
1209 <  this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1209 >  //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue));
1210 >  this->attrs->add_or_update(names, attrname, attrvalue);
1211 > }
1212 >
1213 >
1214 > void GffObj::setFeatureName(const char* feature) {
1215 > //change the feature name/type for a transcript
1216 > int fid=names->feats.addName(feature);
1217 > if (monoFeature() && exons.Count()>0)
1218 >    this->exon_ftype_id=fid;
1219 > this->ftype_id=fid;
1220 > }
1221 >
1222 > void GffObj::setRefName(const char* newname) {
1223 > //change the feature name/type for a transcript
1224 > int rid=names->gseqs.addName(newname);
1225 > this->gseq_id=rid;
1226 > }
1227 >
1228 >
1229 >
1230 > int GffObj::removeAttr(const char* attrname, const char* attrval) {
1231 >  if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1232 >  int aid=this->names->attrs.getId(attrname);
1233 >  if (aid<0) return 0;
1234 >  int delcount=0;  //could be more than one ?
1235 >  for (int i=0;i<this->attrs->Count();i++) {
1236 >     if (aid==this->attrs->Get(i)->attr_id) {
1237 >       if (attrval==NULL ||
1238 >          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1239 >             delcount++;
1240 >             this->attrs->freeItem(i);
1241 >             }
1242 >       }
1243 >     }
1244 >  if (delcount>0) this->attrs->Pack();
1245 >  return delcount;
1246   }
1247  
1248 + int GffObj::removeAttr(int aid, const char* attrval) {
1249 +  if (this->attrs==NULL || aid<0) return 0;
1250 +  int delcount=0;  //could be more than one ?
1251 +  for (int i=0;i<this->attrs->Count();i++) {
1252 +     if (aid==this->attrs->Get(i)->attr_id) {
1253 +       if (attrval==NULL ||
1254 +          strcmp(attrval, this->attrs->Get(i)->attr_val)==0) {
1255 +             delcount++;
1256 +             this->attrs->freeItem(i);
1257 +             }
1258 +       }
1259 +     }
1260 +  if (delcount>0) this->attrs->Pack();
1261 +  return delcount;
1262 + }
1263 +
1264 +
1265 + int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) {
1266 +  if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0;
1267 +  int aid=this->names->attrs.getId(attrname);
1268 +  if (aid<0) return 0;
1269 +  int delcount=0;  //could be more than one
1270 +  for (int i=0;i<exon.attrs->Count();i++) {
1271 +     if (aid==exon.attrs->Get(i)->attr_id) {
1272 +       if (attrval==NULL ||
1273 +          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1274 +             delcount++;
1275 +             exon.attrs->freeItem(i);
1276 +             }
1277 +       }
1278 +     }
1279 +  if (delcount>0) exon.attrs->Pack();
1280 +  return delcount;
1281 + }
1282 +
1283 + int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) {
1284 +  if (exon.attrs==NULL || aid<0) return 0;
1285 +  int delcount=0;  //could be more than one
1286 +  for (int i=0;i<exon.attrs->Count();i++) {
1287 +     if (aid==exon.attrs->Get(i)->attr_id) {
1288 +       if (attrval==NULL ||
1289 +          strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) {
1290 +             delcount++;
1291 +             exon.attrs->freeItem(i);
1292 +             }
1293 +       }
1294 +     }
1295 +  if (delcount>0) exon.attrs->Pack();
1296 +  return delcount;
1297 + }
1298 +
1299 +
1300   void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) {
1301    cds_start=0;
1302    cds_end=0;
# Line 800 | Line 1343
1343               //CDstart in this segment
1344               //and we are getting the whole transcript
1345               cds_mend=s-(int)(CDstart-sgstart);
803             //GMessage("Setting cds_mend to %d\n",cds_mend);
1346               }
1347         if (CDend>=sgstart && CDend<=sgend) {
1348               //CDstart in this segment
1349               //and we are getting the whole transcript
1350               cds_mstart=s-(int)(CDend-cdsadj-sgstart);
809             //GMessage("Setting cds_mstart to %d\n",cds_mstart);
1351               }
1352        } //for each exon
1353      } // - strand
# Line 840 | Line 1381
1381    //return spliced;
1382   }
1383  
1384 + char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList<GSeg>* seglst)
1385 + {
1386 +    if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n");
1387 +        return NULL;
1388 +    }
1389 +    //restore normal coordinates:
1390 +    unxcoord();
1391 +    if (exons.Count()==0) return NULL;
1392 +    int fspan=end-start+1;
1393 +    const char* gsubseq=faseq->subseq(start, fspan);
1394 +    if (gsubseq==NULL) {
1395 +        GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1396 +    }
1397 +    char* unspliced=NULL;
1398 +
1399 +    int seqstart=exons.First()->start;
1400 +    int seqend=exons.Last()->end;
1401 +    
1402 +    int unsplicedlen = 0;
1403 +
1404 +    unsplicedlen += seqend - seqstart + 1;
1405 +
1406 +    GMALLOC(unspliced, unsplicedlen+1); //allocate more here
1407 +    //uint seqstart, seqend;
1408 +
1409 +    int s = 0; //resulting nucleotide counter
1410 +    if (strand=='-')
1411 +    {
1412 +        if (seglst!=NULL)
1413 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1414 +        for (int i=seqend;i>=seqstart;i--)
1415 +        {
1416 +            unspliced[s] = ntComplement(gsubseq[i-start]);
1417 +            s++;
1418 +        }//for each nt
1419 +    } // - strand
1420 +    else
1421 +    { // + strand
1422 +        if (seglst!=NULL)
1423 +            seglst->Add(new GSeg(s+1,s+1+seqend-seqstart));
1424 +        for (int i=seqstart;i<=seqend;i++)
1425 +        {
1426 +            unspliced[s]=gsubseq[i-start];
1427 +            s++;
1428 +        }//for each nt
1429 +    } // + strand
1430 +    //assert(s <= unsplicedlen);
1431 +    unspliced[s]=0;
1432 +    if (rlen!=NULL) *rlen=s;
1433 +    return unspliced;
1434 + }
1435 +
1436   char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end,
1437            GList<GSeg>* seglst) {
1438    if (CDSonly && CDstart==0) return NULL;
# Line 854 | Line 1447
1447    if (gsubseq==NULL) {
1448          GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end);
1449          }
1450 +  if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust
1451 +     int endadj=end-start+1-fspan;
1452 +     uint prevend=end;
1453 +     end-=endadj;
1454 +     if (CDend>end) CDend=end;
1455 +     if (exons.Last()->end>end) {
1456 +         exons.Last()->end=end; //this could get us into trouble if exon start is also > end
1457 +         if (exons.Last()->start>exons.Last()->end) {
1458 +            GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n",
1459 +                  prevend,getGSeqName(), getID());
1460 +            }
1461 +         covlen-=endadj;
1462 +         }
1463 +     }
1464    char* spliced=NULL;
1465    GMALLOC(spliced, covlen+1); //allocate more here
1466    uint seqstart, seqend;
# Line 1019 | Line 1626
1626            strand, start, end, gscore, (float)qcov/10.0);
1627   }
1628  
1629 < void GffObj::printGxfLine(FILE* fout, char* tlabel, char* gseqname, bool iscds,
1629 > void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds,
1630                               uint segstart, uint segend, int exidx, char phase, bool gff3) {
1631    static char scorestr[14];
1632    strcpy(scorestr,".");
# Line 1028 | Line 1635
1635       if (exons[exidx]->score) sprintf(scorestr,"%.2f", exons[exidx]->score);
1636       xattrs=exons[exidx]->attrs;
1637    }
1031  char* geneid=(gname!=NULL)? gname : gffID;
1638    if (phase==0 || !iscds) phase='.';
1639    const char* ftype=iscds ? "CDS" : getSubfName();
1640    if (gff3) {
# Line 1043 | Line 1649
1649           }
1650      fprintf(fout, "\n");
1651      } //GFF
1652 <  else {//for GTF -- we can only print mRNAs here
1653 <    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\t",
1654 <        gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase);
1655 <    if (ismRNA())
1656 <       fprintf(fout,"gene_id \"%s\"; transcript_id \"%s\";", geneid, gffID);
1652 >  else {//for GTF -- we print only transcripts
1653 >    //if (isValidTranscript())
1654 >    fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";",
1655 >           gseqname, tlabel, ftype, segstart, segend, scorestr, strand, phase, gffID);
1656 >    //char* geneid=(geneID!=NULL)? geneID : gffID;
1657 >    if (geneID)
1658 >      fprintf(fout," gene_id \"%s\";",geneID);
1659 >    if (gene_name!=NULL) {
1660 >       //fprintf(fout, " gene_name ");
1661 >       //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name);
1662 >       //              else fprintf(fout, "\"%s\";",gene_name);
1663 >       fprintf(fout," gene_name \"%s\";",gene_name);
1664 >       }
1665      if (xattrs!=NULL) {
1666 <       for (int i=0;i<xattrs->Count();i++) {
1667 <         if (xattrs->Get(i)->attr_val==NULL) continue;
1668 <         fprintf(fout, " %s ",names->attrs.getName(xattrs->Get(i)->attr_id));
1669 <          if (xattrs->Get(i)->attr_val[0]=='"')
1670 <                  fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1671 <             else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1666 >          for (int i=0;i<xattrs->Count();i++) {
1667 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1668 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1669 >            fprintf(fout, " %s ",attrname);
1670 >            if (xattrs->Get(i)->attr_val[0]=='"')
1671 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1672 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1673 >             }
1674            }
1675 <       }
1675 >    //for GTF, also append the GffObj attributes to each exon line
1676 >    if ((xattrs=this->attrs)!=NULL) {
1677 >          for (int i=0;i<xattrs->Count();i++) {
1678 >            if (xattrs->Get(i)->attr_val==NULL) continue;
1679 >            const char* attrname=names->attrs.getName(xattrs->Get(i)->attr_id);
1680 >            fprintf(fout, " %s ",attrname);
1681 >            if (xattrs->Get(i)->attr_val[0]=='"')
1682 >                     fprintf(fout, "%s;",xattrs->Get(i)->attr_val);
1683 >                else fprintf(fout, "\"%s\";",xattrs->Get(i)->attr_val);
1684 >             }
1685 >           }
1686      fprintf(fout, "\n");
1687      }//GTF
1688   }
1689  
1690 < void GffObj::printGxf(FILE* fout, GffPrintMode gffp, char* tlabel) {
1690 > void GffObj::printGxf(FILE* fout, GffPrintMode gffp,
1691 >                   const char* tlabel, const char* gfparent) {
1692   static char tmpstr[255];
1693   if (tlabel==NULL) {
1694      tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1695           (char*)"gffobj" ;
1696      }
1070
1697   unxcoord();
1698 < if (exons.Count()==0) return;
1699 < char* gseqname=names->gseqs.Get(gseq_id)->name;
1698 > //if (exons.Count()==0) return;
1699 > const char* gseqname=names->gseqs.Get(gseq_id)->name;
1700   bool gff3 = (gffp>=pgffAny);
1701   bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth);
1702   bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth);
# Line 1084 | Line 1710
1710        pend=CDend;
1711        }
1712     else { pstart=start;pend=end; }
1713 <   const char* ftype=ismRNA() ? "mRNA" : getFeatureName();
1713 >   //const char* ftype=isTranscript() ? "mRNA" : getFeatureName();
1714 >   const char* ftype=getFeatureName();
1715     fprintf(fout,
1716       "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s",
1717       gseqname, tlabel, ftype, pstart, pend, tmpstr, strand, gffID);
1718 <   if (gname!=NULL)
1719 <       fprintf(fout, ";Name=%s",gname);
1720 <   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d:%d",CDstart,CDend);
1718 >   if (CDstart>0 && !showCDS && !isCDS) fprintf(fout,";CDS=%d-%d",CDstart,CDend);
1719 >   if (gfparent!=NULL) {
1720 >      //parent override
1721 >      fprintf(fout, ";Parent=%s",gfparent);
1722 >      }
1723 >     else {
1724 >       if (parent!=NULL && !parent->isDiscarded())
1725 >           fprintf(fout, ";Parent=%s",parent->getID());
1726 >       }
1727 >   if (geneID!=NULL)
1728 >      fprintf(fout, ";geneID=%s",geneID);
1729 >   if (gene_name!=NULL)
1730 >      fprintf(fout, ";gene_name=%s",gene_name);
1731     if (attrs!=NULL) {
1732        for (int i=0;i<attrs->Count();i++) {
1733 <        fprintf(fout,";%s=%s", names->attrs.getName(attrs->Get(i)->attr_id),
1733 >        const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id);
1734 >        fprintf(fout,";%s=%s", attrname,
1735                 attrs->Get(i)->attr_val);
1736          }
1737        }
# Line 1101 | Line 1739
1739     }// gff3 mRNA line
1740   if (showExon) {
1741     //print exons
1742 <   for (int i=0;i<exons.Count();i++) {
1743 <     printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1744 <     }
1745 < }//printing exons
1742 >    if (isCDS && exons.Count()>0 &&
1743 >        ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0')))
1744 >         updateExonPhase();
1745 >
1746 >    for (int i=0;i<exons.Count();i++) {
1747 >      printGxfLine(fout, tlabel, gseqname, isCDS, exons[i]->start, exons[i]->end, i, exons[i]->phase, gff3);
1748 >      }
1749 >    }//printing exons
1750   if (showCDS && !isCDS && CDstart>0) {
1751      GArray<GffCDSeg> cds(true,true);
1752      getCDSegs(cds);
# Line 1114 | Line 1756
1756    } //showCDS
1757   }
1758  
1759 + void GffObj::updateExonPhase() {
1760 +  if (!isCDS) return;
1761 +  int cdsacc=0;
1762 +  if (CDphase=='1' || CDphase=='2') {
1763 +      cdsacc+= 3-(CDphase-'0');
1764 +      }
1765 +  if (strand=='-') { //reverse strand
1766 +     for (int i=exons.Count()-1;i>=0;i--) {
1767 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1768 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1769 +         }
1770 +     }
1771 +    else { //forward strand
1772 +     for (int i=0;i<exons.Count();i++) {
1773 +         exons[i]->phase='0'+ (3-cdsacc%3)%3;
1774 +         cdsacc+=exons[i]->end-exons[i]->start+1;
1775 +         }
1776 +     }
1777 + }
1778 +
1779  
1780   void GffObj::getCDSegs(GArray<GffCDSeg>& cds) {
1781    GffCDSeg cdseg;
# Line 1158 | Line 1820
1820         } //for each exon
1821     } // + strand
1822   }
1161 /*
1162 #ifdef DEBUG
1163 void GffObj::dbgPrint(const char* msg) {
1164 if (msg!=NULL) fprintf(stdout, ">> %s\n",msg);
1165 char* tlabel=track_id>=0 ? names->tracks.Get(track_id)->name :
1166       (char*)"gmapobj" ;
1167 char scorestr[14];
1168 char strand=revstrand?'-':'+';
1169 unxcoord();
1170 char* gseqname=names->gseqs.Get(gseq_id)->name;
1171 char* fname=f_id>=0 ? names->feats.Get(f_id)->name : (char*)"nofeatname";
1172
1173 fprintf(stdout, "%s\t%s\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s\n",
1174       gseqname, tlabel, fname, start, end, strand, gffID, gffID);
1175
1176 for (int fi=0;fi<features->Count();fi++) {
1177   GFeature* feature=features->Get(fi);
1178   fname=names->feats.Get(feature->name_id)->name;
1179   GffExon* segs=feature->segs;
1180   int segcount=feature->segcount;
1181   if (segcount==0 || segs==NULL) continue;
1182   for (int i=0;i<segcount;i++) {
1183      if (segs[i].start==0) continue;
1184      if (segs[i].score) sprintf(scorestr,"%.2f", segs[i].score/100.00);
1185                  else strcpy(scorestr,".");
1186      fprintf(stdout,
1187         "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tParent=%s\n",
1188         gseqname, tlabel, fname, segs[i].start, segs[i].end, scorestr, strand, gffID);
1189      }
1190   }
1191 fflush(stdout);
1192 }
1193 #endif
1194 */
1195

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines