ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 17 | Line 17
17   const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002;
18   const uint gfo_flag_IS_GENE          = 0x00000004;
19   const uint gfo_flag_IS_TRANSCRIPT    = 0x00000008;
20 < const uint gfo_flag_FROM_GFF3        = 0x00000010;
20 > const uint gfo_flag_HAS_GFF_ID       = 0x00000010; //found GFF3 feature line with its own ID
21   const uint gfo_flag_BY_EXON          = 0x00000020; //created by subfeature (exon) directly
22   const uint gfo_flag_DISCARDED        = 0x00000100;
23   const uint gfo_flag_LST_KEEP         = 0x00000200;
# Line 786 | Line 786
786    isCDS=gffline->is_cds; //for now
787    isGene(gffline->is_gene);
788    isTranscript(gffline->is_transcript || gffline->exontype!=0);
789 <  fromGff3(gffline->is_gff3);
789 >  //fromGff3(gffline->is_gff3);
790  
791    if (gffline->parents!=NULL) {
792      //GTF style -- create a GffObj directly by subfeature
# Line 799 | Line 799
799         //make this GffObj of the same feature type
800         ftype_id=names->feats.addName(gffline->ftype);
801         }
802 <    if (gffline->ID==NULL) { //typical GTF
802 >    if (gffline->ID==NULL) { //typical GTF2 without "transcript" line
803          gffID=Gstrdup(gffline->parents[0]);
804          this->createdByExon(true);
805          //this is likely the first exon/segment of the feature
806          addExon(gfrd, gffline, keepAttr, noExonAttr);
807          }
808 <      else { //a parented feature with an ID -- probably an orphan GFF3 line
808 >      else { //a parented feature with an ID -- probably an orphan or premature GFF3 subfeature line
809          if (gffline->is_gff3 && gffline->exontype!=0) {
810               //premature exon given before its parent transcript
811               //create the transcript entry here
# Line 825 | Line 825
825      gscore=gffline->score;
826      if (gffline->ID==NULL || gffline->ID[0]==0)
827        GError("Error: no ID found for GFF record start\n");
828 +    this->hasGffID(true);
829      gffID=Gstrdup(gffline->ID); //there must be an ID here
830      //if (gffline->is_transcript) ftype_id=gff_fid_mRNA;
831        //else
# Line 902 | Line 903
903   */
904   //Warning: if gflst gets altered, idx becomes obsolete
905   GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) {
906 < GVec<GfoHolder>* glst=new GVec<GfoHolder>(1);
906 > //TODO: must make sure the gfo ID isn't there already.
907 >
908 > GVec<GfoHolder>* glst=phash.Find(gfo->gffID);
909 > if (glst==NULL)
910 >         glst=new GVec<GfoHolder>(1);
911   GfoHolder gh(gfo,idx);
912   int i=glst->Add(gh);
913   phash.Add(gfo->gffID, glst);
# Line 915 | Line 920
920   return &(glst[i]);
921   }
922  
923 < GfoHolder* GffReader::gfoFind(const char* id, const char* ctg, char strand, uint start, GVec<GfoHolder>** glst) {
923 > GfoHolder* GffReader::gfoFind(const char* id, const char* ctg,
924 >                    GVec<GfoHolder>** glst, char strand, uint start, uint end) {
925   GVec<GfoHolder>* gl=phash.Find(id);
926   GfoHolder* gh=NULL;
927   if (gl) {
928     for (int i=0;i<gl->Count();i++) {
929        GfoHolder& gfo = gl->Get(i);
930 <      if (ctg!=NULL && strcmp(ctg,gfo.gffobj->getGSeqName())!=0)
930 >      if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0)
931             continue;
932        if (strand && strand != gfo.gffobj->strand)
933             continue;
934 <      if (start>0 && abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS)
935 <           continue;
934 >      if (start>0) {
935 >           if (abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS)
936 >               continue;
937 >           if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start))
938 >                   continue;
939 >           }
940        //must be the same transcript, according to given comparison criteria
941        gh=&gfo;
942        break;
# Line 947 | Line 957
957       int gfoidx=gflst.Add(newgfo);
958       r=gfoAdd(newgfo, gfoidx);
959       }
960 +  /*
961    if (gff_warns) {
962      int* pcount=tids.Find(newgfo->gffID);
963      if (pcount!=NULL) {
# Line 957 | Line 968
968         tids.Add(newgfo->gffID,new int(1));
969         }
970      }
971 +   */
972    return r;
973   }
974  
# Line 981 | Line 993
993    GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr);
994    GfoHolder* r=NULL;
995    int gfoidx=gflst.Add(newgfo);
996 <  r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) :gfoAdd(newgfo, gfoidx);
996 >  r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx);
997    if (parent!=NULL) {
998      updateParent(r, parent);
999      if (pexon!=NULL) parent->removeExon(pexon);
1000      }
1001 +  /*
1002    if (gff_warns) {
1003      int* pcount=tids.Find(newgfo->gffID);
1004      if (pcount!=NULL) {
# Line 996 | Line 1009
1009         tids.Add(newgfo->gffID,new int(1));
1010         }
1011      }
1012 +  */
1013    return r;
1014   }
1015  
1016   GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline,
1017                                           bool keepAttr) {
1018   if (prevgfo==NULL) return NULL;
1019 < prevgfo->gffobj->createdByExon(false);
1019 > //prevgfo->gffobj->createdByExon(false);
1020   prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype);
1021   prevgfo->gffobj->start=gffline->fstart;
1022   prevgfo->gffobj->end=gffline->fend;
1023   prevgfo->gffobj->isGene(gffline->is_gene);
1024   prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0);
1025 < prevgfo->gffobj->fromGff3(gffline->is_gff3);
1025 > prevgfo->gffobj->hasGffID(gffline->ID!=NULL);
1026   if (keepAttr) {
1027     if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear();
1028     prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info);
# Line 1020 | Line 1034
1034   bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
1035    bool r=true;
1036    if (gffline->strand!=prevgfo->gffobj->strand) {
1037 <  //TODO: add support for trans-splicing and even inter-chromosomal fusions
1038 <     if (prevgfo->gffobj->strand=='.') {
1037 >     //TODO: add support for trans-splicing and even inter-chromosomal fusions
1038 >        if (prevgfo->gffobj->strand=='.') {
1039              prevgfo->gffobj->strand=gffline->strand;
1040          }
1041       else {
# Line 1085 | Line 1099
1099   }
1100  
1101   //have to parse the whole file because exons can be scattered all over
1102 + //trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry
1103   void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) {
1104    bool validation_errors = false;
1105    //loc_debug=false;
1106    GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID
1107                       //and thus could become promoted to parent features
1108    while (nextGffLine()!=NULL) {
1094       //seen this gff ID before?
1109       GfoHolder* prevseen=NULL;
1110 <     if (gffline->ID && gffline->exontype==0) //GFF3 parent-like feature (mRNA, gene, etc.)
1111 <         prevseen=gfoFind(gffline->ID, gffline->gseqname);
1112 <     if (prevseen!=NULL) {
1113 <            if (prevseen->gffobj->createdByExon()) {
1114 <                //just in case the exon was found before (shouldn't happen)
1115 <                updateGffRec(prevseen, gffline, keepAttr);
1116 <                }
1117 <             else {
1118 <                GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1119 <                validation_errors = true;
1120 <                if (gff_warns) {
1121 <                       delete gffline; gffline=NULL; continue;
1122 <                       }
1123 <                   else exit(1);
1124 <                }
1125 <            }
1110 >     GVec<GfoHolder>* prevgflst=NULL;
1111 >     if (gffline->ID && gffline->exontype==0) {
1112 >         //>>>>> for a parent-like IDed feature (mRNA, gene, etc.)
1113 >                 //look for same ID on the same chromosome/strand/locus
1114 >                 prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart);
1115 >                 if (prevseen!=NULL) {
1116 >                                //same ID/chromosome combo encountered before
1117 >                                if (prevseen->gffobj->createdByExon() &&
1118 >                                          prevseen->gffobj->start>=gffline->fstart &&
1119 >                                          prevseen->gffobj->end<=gffline->fend) {
1120 >                                        //an exon of this ID was given before
1121 >                                        //this line has the main attributes for this ID
1122 >                                        updateGffRec(prevseen, gffline, keepAttr);
1123 >                                        }
1124 >                                 else {
1125 >                                        //- duplicate ID -- this must be a discontiguous feature
1126 >                                   //   e.g. a trans-spliced transcript
1127 >                                   if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) {
1128 >                                          //overlapping with same ID not allowed
1129 >                                         GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID);
1130 >                                         //validation_errors = true;
1131 >                                         if (gff_warns) {
1132 >                                                   delete gffline;
1133 >                                                   gffline=NULL;
1134 >                                                   continue;
1135 >                                                   }
1136 >                                         else exit(1);
1137 >                                     }
1138 >                                    //create a new entry with the same ID
1139 >                    prevseen=newGffRec(gffline, keepAttr, noExonAttr,
1140 >                            prevseen->gffobj->parent, NULL, prevgflst);
1141 >                                        } //duplicate ID on the same chromosome
1142 >                                } //prevseeen != NULL
1143 >        } //parent-like ID feature
1144      if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene)
1145 <       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr);
1145 >       if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst);
1146         }
1147 <    else { //--- it's a parented feature (exon/CDS, but might still be a mRNA)
1147 >    else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent)
1148         bool found_parent=false;
1149         GfoHolder* newgfo=prevseen;
1150         GVec<GfoHolder>* newgflst=NULL;
1151         for (int i=0;i<gffline->num_parents;i++) {
1152              if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL)
1153                  continue; //skipping discarded parent feature
1154 <            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname, gffline->strand, gffline->fstart, &newgflst);
1154 >            GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname,
1155 >                                          &newgflst, gffline->strand, gffline->fstart, gffline->fend);
1156              if (parentgfo!=NULL) { //parent GffObj parsed earlier
1157                     found_parent=true;
1158                     if (parentgfo->gffobj->isGene() && gffline->is_transcript
# Line 1136 | Line 1169
1169                         if (!addExonFeature(parentgfo, gffline, pex, noExonAttr))
1170                           validation_errors=true;
1171                         }
1172 <                   }
1172 >                   } //overlapping parent feature found
1173              } //for each parsed parent Id
1174         if (!found_parent) { //new GTF-like record starting here with a subfeature directly
1175               //or it could be some chado GFF3 barf with exons declared BEFORE their parent :(
# Line 1152 | Line 1185
1185                 }
1186                else { //no parent seen before, create one directly with this exon
1187                 //loc_debug=true;
1188 <               GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1188 >               GfoHolder* ngfo=prevseen;
1189 >               if (ngfo==NULL)
1190 >                   ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst);
1191                 if (gffline->ID!=NULL && gffline->exontype==0)
1192 <                     subfPoolAdd(pex, newgfo);
1192 >                     subfPoolAdd(pex, ngfo);
1193                 //even those with errors will be added here!
1194                 }
1195              GFREE(subp_name);
# Line 1168 | Line 1203
1203   // all gff records are now loaded in GList gflst
1204   // so we can free the hash
1205    phash.Clear();
1206 <  tids.Clear();
1206 >  //tids.Clear();
1207    if (validation_errors) {
1208      exit(1);
1209      }
# Line 1217 | Line 1252
1252       } //for each exon
1253     }
1254   //attribute reduction for GTF records
1255 < if (keepAttrs && !noExonAttr && !fromGff3()
1255 > if (keepAttrs && !noExonAttr && !hasGffID()
1256            && exons.Count()>0 && exons[0]->attrs!=NULL) {
1257     bool attrs_discarded=false;
1258     for (int a=0;a<exons[0]->attrs->Count();a++) {

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines