ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9   const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 6000000;
10 > const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11   bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13   const int gff_fid_transcript=1;
# Line 17 | Line 17
17   const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18   const uint gfo_flag_IS_GENE          = 0x00000004;
19   const uint gfo_flag_IS_TRANSCRIPT    = 0x00000008;
20 < const uint gfo_flag_FROM_GFF3        = 0x00000010;
20 > const uint gfo_flag_HAS_GFF_ID       = 0x00000010; //found GFF3 feature line with its own ID
21   const uint gfo_flag_BY_EXON          = 0x00000020; //created by subfeature (exon) directly
22   const uint gfo_flag_DISCARDED        = 0x00000100;
23   const uint gfo_flag_LST_KEEP         = 0x00000200;
# Line 52 | Line 52
52               else return (int)(g1.gseq_id-g2.gseq_id);
53   }
54  
55 < char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
55 > char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
56   //parse a key attribute and remove it from the info string
57   //(only works for attributes that have values following them after ' ' or '=')
58   static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 < int lpre=strlen(pre);
60 < char cend=pre[lpre-1];
61 < char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 < if (pos==NULL) return NULL;
63 < char* findstart=info;
64 < //require word boundary on the left:
65 < while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 <    findstart=pos+lpre;
67 <    pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 <    }
69 < if (pos==NULL) return NULL;
70 < if (cend!=' ' && cend!='=') {
71 <    //require word boundary on the right:
72 <    while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 <       findstart=pos+lpre;
74 <       pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 <       }
76 <    }
77 < if (pos==NULL) return NULL;
78 < char* vp=pos+lpre;
59 > int attrlen=strlen(attr);
60 > char cend=attr[attrlen-1];
61 > //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
62 > //must make sure attr is not found in quoted text
63 > char* pos=info;
64 > char prevch=0;
65 > bool in_str=false;
66 > bool notfound=true;
67 > int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
68 > while (notfound && *pos) {
69 >   char ch=*pos;
70 >   if (ch=='"') {
71 >     in_str=!in_str;
72 >     pos++;
73 >     prevch=ch;
74 >     continue;
75 >     }
76 >   if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
77 >          && strcmpfn(attr, pos, attrlen)==0) {
78 >      //attr match found
79 >      //check for word boundary on right
80 >      char* epos=pos+attrlen;
81 >      if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
82 >        notfound=false;
83 >        break;
84 >        }
85 >      //not a perfect match, move on
86 >      pos=epos;
87 >      prevch=*(pos-1);
88 >      continue;
89 >      }
90 >   //not a match or in_str
91 >   prevch=ch;
92 >   pos++;
93 >   }
94 > if (notfound) return NULL;
95 > char* vp=pos+attrlen;
96   while (*vp==' ') vp++;
97   if (*vp==';' || *vp==0)
98 <      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
98 >      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
99   bool dq_enclosed=false; //value string enclosed by double quotes
100   if (*vp=='"') {
101       dq_enclosed=true;
102       vp++;
103       }
104   if (enforce_GTF2 && !dq_enclosed)
105 <      GError(GTF2_ERR,pre, dupline);
105 >      GError(GTF2_ERR,attr, dupline);
106   char* vend=vp;
107   if (dq_enclosed) {
108      while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
# Line 94 | Line 111
111      while (*vend!=';' && *vend!=0) vend++;
112      }
113   if (enforce_GTF2 && *vend!='"')
114 <     GError(GTF2_ERR, pre, dupline);
114 >     GError(GTF2_ERR, attr, dupline);
115   char *r=Gstrdup(vp, vend-1);
116   //-- now remove this attribute from the info string
117   while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
# Line 159 | Line 176
176   ftype=t[2];
177   info=t[8];
178   char* p=t[3];
179 < if (!parseUInt(p,fstart))
180 <   GError("Error parsing start coordinate from GFF line:\n%s\n",l);
179 > if (!parseUInt(p,fstart)) {
180 >   //FIXME: chromosome_band entries in Flybase
181 >   GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
182 >   return;
183 >   }
184   p=t[4];
185 < if (!parseUInt(p,fend))
186 <   GError("Error parsing end coordinate from GFF line:\n%s\n",l);
187 < if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
185 > if (!parseUInt(p,fend)) {
186 >   GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
187 >   return;
188 >   }
189 > if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always
190   p=t[5];
191   if (p[0]=='.' && p[1]==0) {
192    score=0;
# Line 188 | Line 210
210     is_exon=true;
211     is_t_data=true;
212     }
213 <  else if (strstr(fnamelc, "exon")!=NULL) {
213 >  else if (endsWith(fnamelc, "exon")) {
214     exontype=exgffExon;
215     is_exon=true;
216     is_t_data=true;
# Line 229 | Line 251
251            }
252          return; //skip this line, unwanted feature name
253          }
254 < ID=extractAttr("ID=");
255 < char* Parent=extractAttr("Parent=");
254 > ID=extractAttr("ID=",true);
255 > char* Parent=extractAttr("Parent=",true);
256   is_gff3=(ID!=NULL || Parent!=NULL);
257   if (is_gff3) {
258     //parse as GFF3
259      if (ID!=NULL) {
260         //has ID attr so it's likely to be a parent feature
261         //look for explicit gene name
262 <       gene_name=extractAttr("gene_name=",false);
262 >       gene_name=extractAttr("gene_name=");
263         if (gene_name==NULL) {
264 <           gene_name=extractAttr("geneName=",false);
264 >           gene_name=extractAttr("geneName=");
265             if (gene_name==NULL) {
266 <               gene_name=extractAttr("gene_sym=",false);
266 >               gene_name=extractAttr("gene_sym=");
267                 if (gene_name==NULL) {
268 <                   gene_name=extractAttr("gene=",false);
268 >                   gene_name=extractAttr("gene=");
269                     }
270                 }
271             }
272 <       gene_id=extractAttr("geneID=",false);
272 >       gene_id=extractAttr("geneID=");
273         if (gene_id==NULL) {
274 <          gene_id=extractAttr("gene_id=",false);
274 >          gene_id=extractAttr("gene_id=");
275            }
276         if (is_gene) {
277           //special case: keep the Name and ID attributes of the gene feature
# Line 294 | Line 316
316           } //has Parent field
317     } //GFF3
318    else { // GTF-like expected
319 <   Parent=extractAttr("transcript_id");
319 >   Parent=extractAttr("transcript_id",true);
320     if (Parent!=NULL) { //GTF2 format detected
321       if (is_transcript) {
322           // atypical GTF with a parent transcript line declared
# Line 302 | Line 324
324           Parent=NULL;
325           }
326       gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
327 +     if (gene_id==NULL)
328 +       gene_id=extractAttr("geneid");
329       gene_name=extractAttr("gene_name");
330       if (gene_name==NULL) {
331 +
332             gene_name=extractAttr("gene_sym");
333 <           if (gene_name==NULL)
333 >           if (gene_name==NULL) {
334                 gene_name=extractAttr("gene");
335 +               if (gene_name==NULL)
336 +                  gene_name=extractAttr("genesymbol");
337 +               }
338             }
339       //prepare for parseAttr by adding '=' character instead of spaces for all attributes
340       //after the attribute name
# Line 403 | Line 431
431   skip=false;
432   }
433  
434 +
435 + void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
436 +  if (cd_start>=this->start) {
437 +        this->CDstart=cd_start;
438 +        if (strand=='+') this->CDphase=phase;
439 +        }
440 +      else this->CDstart=this->start;
441 +  if (cd_end<=this->end) {
442 +      this->CDend=cd_end;
443 +      if (strand=='-') this->CDphase=phase;
444 +      }
445 +     else this->CDend=this->end;
446 +  isTranscript(true);
447 +  exon_ftype_id=gff_fid_exon;
448 +  if (monoFeature()) {
449 +     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
450 +            else exons[0]->exontype=exgffExon;
451 +     }
452 + }
453 +
454   int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
455    //this will make sure we have the right subftype_id!
456 <  int subf_id=-1;
457 <  if (!isTranscript() && gl->is_cds && monoFeature()) {
456 >  //int subf_id=-1;
457 >  if (!isTranscript() && gl->is_cds) {
458            isTranscript(true);
459            exon_ftype_id=gff_fid_exon;
460            if (exons.Count()==1) exons[0]->exontype=exgffExon;
# Line 426 | Line 474
474            }
475       }
476    else { //non-mRNA parent feature, check this subf type
477 <    subf_id=names->feats.addName(gl->ftype);
477 >    int subf_id=names->feats.addName(gl->ftype);
478      if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
479         exon_ftype_id=subf_id;
480       else {
# Line 506 | Line 554
554       isCDS=false;
555       }
556    if (qs || qe) {
557 <    if (qs>qe) swap(qs,qe);
557 >    if (qs>qe) Gswap(qs,qe);
558      if (qs==0) qs=1;
559          }
560    int ovlen=0;
# Line 572 | Line 620
620             } //check for overlap
621     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
622     // create & add the new segment
623 +   /*
624 +   if (start>0 && exontype==exgffCDS && exons.Count()==0) {
625 +      //adding a CDS directly as the first subfeature of a declared parent
626 +      segstart=start;
627 +      segend=end;
628 +      }
629 +   */
630     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
631     int eidx=exons.Add(enew);
632     if (eidx<0) {
# Line 584 | Line 639
639       return -1;            
640       }
641     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
642 <   start=exons.First()->start;
643 <   end=exons.Last()->end;
642 >   //adjust parent feature coordinates to contain this exon
643 >   if (start==0 || start>exons.First()->start) {
644 >     start=exons.First()->start;
645 >     }
646 >   if (end<exons.Last()->end) end=exons.Last()->end;
647 >    
648     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
649         GSeqStat* gsd=(GSeqStat*)uptr;
650         if (start<gsd->mincoord) gsd->mincoord=start;
# Line 727 | Line 786
786    isCDS=gffline->is_cds; //for now
787    isGene(gffline->is_gene);
788    isTranscript(gffline->is_transcript || gffline->exontype!=0);
789 <  fromGff3(gffline->is_gff3);
789 >  //fromGff3(gffline->is_gff3);
790  
791 <  if (gffline->parents!=NULL) {
791 >  if (gffline->parents!=NULL && !gffline->is_transcript) {
792      //GTF style -- create a GffObj directly by subfeature
793      //(also possible orphan GFF3 exon line, or an exon given before its parent (chado))
794      if (gffline->exontype!=0) { //recognized exon-like feature
# Line 740 | Line 799
799         //make this GffObj of the same feature type
800         ftype_id=names->feats.addName(gffline->ftype);
801         }
802 <    if (gffline->ID==NULL) { //typical GTF
802 >    if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
803          gffID=Gstrdup(gffline->parents[0]);
804          this->createdByExon(true);
805          //this is likely the first exon/segment of the feature
806          addExon(gfrd, gffline, keepAttr, noExonAttr);
807          }
808 <      else { //a parented feature with an ID -- probably an orphan GFF3 line
808 >      else { //a parented feature with an ID: orphan or premature GFF3 subfeature line
809          if (gffline->is_gff3 && gffline->exontype!=0) {
810               //premature exon given before its parent transcript
811               //create the transcript entry here
# Line 755 | Line 814
814               //this is the first exon/segment of the transcript
815               addExon(gfrd, gffline, keepAttr, noExonAttr);
816               }
817 <            else { //unrecognized non-exon feature ? use the ID instead
817 >        else { //unrecognized non-exon feature ? use the ID instead
818 >             this->hasGffID(true);
819               gffID=Gstrdup(gffline->ID);
820               if (keepAttr) this->parseAttrs(attrs, gffline->info);
821               }
822          }
823 <    } //subfeature given directly
824 <  else { //gffline->parents==NULL
823 >    } //non-transcript parented subfeature given directly
824 >  else {
825 >        //non-parented feature OR a recognizable transcript
826      //create a parent feature in its own right
827      gscore=gffline->score;
828      if (gffline->ID==NULL || gffline->ID[0]==0)
829        GError("Error: no ID found for GFF record start\n");
830 +    this->hasGffID(true);
831      gffID=Gstrdup(gffline->ID); //there must be an ID here
832      //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
833        //else
834      ftype_id=names->feats.addName(gffline->ftype);
835      if (gffline->is_transcript)
836        exon_ftype_id=gff_fid_exon;
775
837      if (keepAttr) this->parseAttrs(attrs, gffline->info);
838      }//no parent
839  
840    if (gffline->gene_name!=NULL) {
841       gene_name=Gstrdup(gffline->gene_name);
842       }
843 <  if (gffline->gene_id!=NULL) {
843 >  if (gffline->gene_id) {
844       geneID=Gstrdup(gffline->gene_id);
845       }
846 +  else if (gffline->is_transcript && gffline->parents) {
847 +         geneID=Gstrdup(gffline->parents[0]);
848 +     }
849  
850    GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true);
851    uptr=gsd;
# Line 823 | Line 887
887   return gffline;
888   }
889  
890 +
891   char* GffReader::gfoBuildId(const char* id, const char* ctg) {
892   //caller must free the returned pointer
893   char* buf=NULL;
# Line 833 | Line 898
898   strcpy(buf+idlen+1, ctg);
899   return buf;
900   }
901 <
901 > /*
902   void GffReader::gfoRemove(const char* id, const char* ctg) {
903   char* buf=gfoBuildId(id,ctg);
904   phash.Remove(buf);
905   GFREE(buf);
906   }
907 <
908 < //Warning: if gflst gets altered, idx becomes obsolete
909 < GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) {
910 < char* buf=gfoBuildId(id,ctg);
911 < GfoHolder* r=new GfoHolder(gfo,idx);
912 < phash.Add(buf, r);
913 < GFREE(buf);
914 < return r;
915 < }
916 <
917 < GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) {
918 < char* buf=gfoBuildId(id,ctg);
919 < GfoHolder* r=phash.Find(buf);
920 < GFREE(buf);
921 < return r;
907 > */
908 > GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
909 > //Warning: if gflst gets altered, idx becomes obsolete
910 > GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
911 > if (glst==NULL)
912 >         glst=new GVec<GfoHolder>(1);
913 > GfoHolder gh(gfo,idx);
914 > int i=glst->Add(gh);
915 > phash.Add(gfo->gffID, glst);
916 > return &(glst->Get(i));
917 > }
918 >
919 > GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) {
920 > GfoHolder gh(gfo,idx);
921 > int i=glst.Add(gh);
922 > return &(glst[i]);
923 > }
924 >
925 > GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
926 >                    GVec<GfoHolder>** glst, char strand, uint start, uint end) {
927 > GVec<GfoHolder>* gl=phash.Find(id);
928 > GfoHolder* gh=NULL;
929 > if (gl) {
930 >   for (int i=0;i<gl->Count();i++) {
931 >      GfoHolder& gfo = gl->Get(i);
932 >      if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
933 >           continue;
934 >      if (strand && gfo.gffobj->strand!='.' && strand != gfo.gffobj->strand)
935 >           continue;
936 >      if (start>0) {
937 >           if (abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS)
938 >               continue;
939 >           if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
940 >                   continue;
941 >           }
942 >      //must be the same transcript, according to given comparison criteria
943 >      gh=&gfo;
944 >      break;
945 >      }
946 >   }
947 > if (glst) *glst=gl;
948 > return gh;
949   }
950  
951   GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) {
# Line 861 | Line 953
953    GfoHolder* r=NULL;
954    if (replaceidx>=0) {
955       gflst.Put(replaceidx,newgfo);
956 <     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx);
956 >     r=gfoAdd(newgfo, replaceidx);
957       }
958     else {
959       int gfoidx=gflst.Add(newgfo);
960 <     r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
960 >     r=gfoAdd(newgfo, gfoidx);
961       }
962 +  /*
963    if (gff_warns) {
964      int* pcount=tids.Find(newgfo->gffID);
965      if (pcount!=NULL) {
966 <       if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
966 >      if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID);
967         (*pcount)++;
968         }
969       else {
970         tids.Add(newgfo->gffID,new int(1));
971         }
972      }
973 +   */
974    return r;
975   }
976  
# Line 897 | Line 991
991   }
992  
993   GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr,
994 <                          GffObj* parent, GffExon* pexon) {
994 >                          GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) {
995    GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
996    GfoHolder* r=NULL;
997    int gfoidx=gflst.Add(newgfo);
998 <  r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx);
998 >  r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
999    if (parent!=NULL) {
1000      updateParent(r, parent);
1001      if (pexon!=NULL) parent->removeExon(pexon);
1002      }
1003 +  /*
1004    if (gff_warns) {
1005      int* pcount=tids.Find(newgfo->gffID);
1006      if (pcount!=NULL) {
# Line 916 | Line 1011
1011         tids.Add(newgfo->gffID,new int(1));
1012         }
1013      }
1014 +  */
1015    return r;
1016   }
1017  
1018   GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1019                                           bool keepAttr) {
1020   if (prevgfo==NULL) return NULL;
1021 < prevgfo->gffobj->createdByExon(false);
1021 > //prevgfo->gffobj->createdByExon(false);
1022   prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
1023   prevgfo->gffobj->start=gffline->fstart;
1024   prevgfo->gffobj->end=gffline->fend;
1025   prevgfo->gffobj->isGene(gffline->is_gene);
1026   prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
1027 < prevgfo->gffobj->fromGff3(gffline->is_gff3);
1027 > prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
1028   if (keepAttr) {
1029     if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
1030     prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
# Line 940 | Line 1036
1036   bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1037    bool r=true;
1038    if (gffline->strand!=prevgfo->gffobj->strand) {
1039 <     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
1040 <        prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
1041 <      r=false;
1042 <     }
1039 >     //TODO: add support for trans-splicing and even inter-chromosomal fusions
1040 >        if (prevgfo->gffobj->strand=='.') {
1041 >            prevgfo->gffobj->strand=gffline->strand;
1042 >        }
1043 >     else {
1044 >       GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
1045 >       prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
1046 >       gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
1047 >       //r=false;
1048 >       return true; //FIXME: split trans-spliced mRNAs by strand
1049 >       }
1050 >   }
1051    int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
1052                        ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
1053                           0 );
# Line 997 | Line 1101
1101   }
1102  
1103   //have to parse the whole file because exons can be scattered all over
1104 + //trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
1105   void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1106    bool validation_errors = false;
1107    //loc_debug=false;
1108    GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1109                       //and thus could become promoted to parent features
1110    while (nextGffLine()!=NULL) {
1006       //seen this gff ID before?
1111       GfoHolder* prevseen=NULL;
1112 <     if (gffline->ID) //GFF3
1113 <         prevseen=gfoFind(gffline->ID, gffline->gseqname);
1114 <     if (prevseen!=NULL) {
1115 <            if (prevseen->gffobj->createdByExon()) {
1116 <                updateGffRec(prevseen, gffline, keepAttr);
1117 <                }
1118 <             else {
1119 <                GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1120 <                validation_errors = true;
1121 <                if (gff_warns) {
1122 <                       delete gffline; gffline=NULL; continue;
1123 <                       }
1124 <                   else exit(1);
1125 <                }
1126 <            }
1112 >     GVec<GfoHolder>* prevgflst=NULL;
1113 >     if (gffline->ID && gffline->exontype==0) {
1114 >         //>> for a parent-like IDed feature (mRNA, gene, etc.)
1115 >                 //look for same ID on the same chromosome/strand/locus
1116 >                 prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
1117 >                 if (prevseen!=NULL) {
1118 >                                //same ID/chromosome combo encountered before
1119 >                                if (prevseen->gffobj->createdByExon() &&
1120 >                                          prevseen->gffobj->start>=gffline->fstart &&
1121 >                                          prevseen->gffobj->end<=gffline->fend) {
1122 >                                        //an exon of this ID was given before
1123 >                                        //this line has the main attributes for this ID
1124 >                                        updateGffRec(prevseen, gffline, keepAttr);
1125 >                                        }
1126 >                                 else {
1127 >                                        //- duplicate ID -- this must be a discontiguous feature
1128 >                                   //   e.g. a trans-spliced transcript
1129 >                                   if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
1130 >                                          //overlapping with same ID not allowed
1131 >                                         GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1132 >                                         //validation_errors = true;
1133 >                                         if (gff_warns) {
1134 >                                                   delete gffline;
1135 >                                                   gffline=NULL;
1136 >                                                   continue;
1137 >                                                   }
1138 >                                         else exit(1);
1139 >                                     }
1140 >                                    //create a new entry with the same ID
1141 >                    prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1142 >                            prevseen->gffobj->parent, NULL, prevgflst);
1143 >                                        } //duplicate ID on the same chromosome
1144 >                                } //prevseeen != NULL
1145 >        } //parent-like ID feature
1146      if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1147 <       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1147 >       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
1148         }
1149 <    else { //--- it's a parented feature (could still be a mRNA)
1149 >    else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
1150 >       //updates all the declared parents with this child
1151         bool found_parent=false;
1152         GfoHolder* newgfo=prevseen;
1153 +       GVec<GfoHolder>* newgflst=NULL;
1154         for (int i=0;i<gffline->num_parents;i++) {
1155              if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1156                  continue; //skipping discarded parent feature
1157 <            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname);
1157 >            GfoHolder* parentgfo=NULL;
1158 >            if (gffline->is_transcript || gffline->exontype==0) //possibly a transcript
1159 >              parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1160 >                                          &newgflst, gffline->strand, gffline->fstart, gffline->fend);
1161 >            else
1162 >              parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1163 >                                          &newgflst, gffline->strand, gffline->fstart);
1164              if (parentgfo!=NULL) { //parent GffObj parsed earlier
1165 <                   found_parent=true;
1165 >
1166 >                   //found_parent=true;
1167                     if (parentgfo->gffobj->isGene() && gffline->is_transcript
1168                                     && gffline->exontype==0) {
1169                         //not an exon, but a transcript parented by a gene
# Line 1046 | Line 1178
1178                         if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1179                           validation_errors=true;
1180                         }
1181 <                   }
1181 >                   } //overlapping parent feature found
1182              } //for each parsed parent Id
1183         if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1184 <             //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
1184 >             //or it could be some chado GFF3 barf with exons coming BEFORE their parent :(
1185              //check if this feature isn't parented by a previously stored "exon" subfeature
1186              char* subp_name=NULL;
1187              CNonExon* subp=subfPoolCheck(gffline, pex, subp_name);
# Line 1060 | Line 1192
1192                 if (!addExonFeature(gfoh, gffline, pex, noExonAttr))
1193                        validation_errors=true;
1194                 }
1195 <              else { //no parent seen before, create one directly with this exon
1195 >              else { //no parent seen before,
1196                 //loc_debug=true;
1197 <               GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr);
1198 <               if (gffline->ID!=NULL && gffline->exontype==0)
1199 <                     subfPoolAdd(pex, newgfo);
1197 >               GfoHolder* ngfo=prevseen;
1198 >               if (ngfo==NULL) {
1199 >                   //if it's an exon type, create directly the parent with this exon
1200 >                   //but if it's recognized as a transcript, the object itself is created
1201 >                   ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1202 >                   }
1203 >               if (!ngfo->gffobj->isTranscript() &&
1204 >                     gffline->ID!=NULL && gffline->exontype==0)
1205 >                     subfPoolAdd(pex, ngfo);
1206                 //even those with errors will be added here!
1207                 }
1208              GFREE(subp_name);
# Line 1078 | Line 1216
1216   // all gff records are now loaded in GList gflst
1217   // so we can free the hash
1218    phash.Clear();
1219 <  tids.Clear();
1219 >  //tids.Clear();
1220    if (validation_errors) {
1221      exit(1);
1222      }
# Line 1127 | Line 1265
1265       } //for each exon
1266     }
1267   //attribute reduction for GTF records
1268 < if (keepAttrs && !noExonAttr && !fromGff3()
1268 > if (keepAttrs && !noExonAttr && !hasGffID()
1269            && exons.Count()>0 && exons[0]->attrs!=NULL) {
1270     bool attrs_discarded=false;
1271     for (int a=0;a<exons[0]->attrs->Count();a++) {

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines