17 |
|
// gfo_flag_* bit masks: every value listed here has exactly one bit set, so |
// several of them can be OR-ed together into a single uint. |
// In this diff view the '<' row (gfo_flag_FROM_GFF3) is replaced by the '>' row |
// (gfo_flag_HAS_GFF_ID); both occupy the same bit, 0x00000010. |
// NOTE(review): the variable that stores these flags is declared outside this view. |
const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002; |
18 |
|
const uint gfo_flag_IS_GENE = 0x00000004; |
19 |
|
const uint gfo_flag_IS_TRANSCRIPT = 0x00000008; |
20 |
< |
const uint gfo_flag_FROM_GFF3 = 0x00000010; |
20 |
> |
const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID |
21 |
|
const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly |
22 |
|
const uint gfo_flag_DISCARDED = 0x00000100; |
23 |
|
const uint gfo_flag_LST_KEEP = 0x00000200; |
52 |
|
else return (int)(g1.gseq_id-g2.gseq_id); |
53 |
|
} |
54 |
|
|
55 |
< |
char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) { |
55 |
> |
char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) { |
56 |
|
//parse a key attribute and remove it from the info string |
57 |
|
//(only works for attributes that have values following them after ' ' or '=') |
58 |
|
static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n"; |
59 |
< |
int lpre=strlen(pre); |
60 |
< |
char cend=pre[lpre-1]; |
61 |
< |
char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre); |
62 |
< |
if (pos==NULL) return NULL; |
63 |
< |
char* findstart=info; |
64 |
< |
//require word boundary on the left: |
65 |
< |
while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') { |
66 |
< |
findstart=pos+lpre; |
67 |
< |
pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre); |
68 |
< |
} |
69 |
< |
if (pos==NULL) return NULL; |
70 |
< |
if (cend!=' ' && cend!='=') { |
71 |
< |
//require word boundary on the right: |
72 |
< |
while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') { |
73 |
< |
findstart=pos+lpre; |
74 |
< |
pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre); |
75 |
< |
} |
76 |
< |
} |
77 |
< |
if (pos==NULL) return NULL; |
78 |
< |
char* vp=pos+lpre; |
59 |
> |
int attrlen=strlen(attr); |
60 |
> |
char cend=attr[attrlen-1]; |
61 |
> |
//char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr); |
62 |
> |
//must make sure attr is not found in quoted text |
63 |
> |
char* pos=info; |
64 |
> |
char prevch=0; |
65 |
> |
bool in_str=false; |
66 |
> |
bool notfound=true; |
67 |
> |
int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp; |
68 |
> |
while (notfound && *pos) { |
69 |
> |
char ch=*pos; |
70 |
> |
if (ch=='"') { |
71 |
> |
in_str=!in_str; |
72 |
> |
pos++; |
73 |
> |
prevch=ch; |
74 |
> |
continue; |
75 |
> |
} |
76 |
> |
if (!in_str && (prevch==0 || prevch==' ' || prevch == ';') |
77 |
> |
&& strcmpfn(attr, pos, attrlen)==0) { |
78 |
> |
//attr match found |
79 |
> |
//check for word boundary on right |
80 |
> |
char* epos=pos+attrlen; |
81 |
> |
if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') { |
82 |
> |
notfound=false; |
83 |
> |
break; |
84 |
> |
} |
85 |
> |
//not a perfect match, move on |
86 |
> |
pos=epos; |
87 |
> |
prevch=*(pos-1); |
88 |
> |
continue; |
89 |
> |
} |
90 |
> |
//not a match or in_str |
91 |
> |
prevch=ch; |
92 |
> |
pos++; |
93 |
> |
} |
94 |
> |
if (notfound) return NULL; |
95 |
> |
char* vp=pos+attrlen; |
96 |
|
while (*vp==' ') vp++; |
97 |
|
if (*vp==';' || *vp==0) |
98 |
< |
GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline); |
98 |
> |
GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline); |
99 |
|
bool dq_enclosed=false; //value string enclosed by double quotes |
100 |
|
if (*vp=='"') { |
101 |
|
dq_enclosed=true; |
102 |
|
vp++; |
103 |
|
} |
104 |
|
if (enforce_GTF2 && !dq_enclosed) |
105 |
< |
GError(GTF2_ERR,pre, dupline); |
105 |
> |
GError(GTF2_ERR,attr, dupline); |
106 |
|
char* vend=vp; |
107 |
|
if (dq_enclosed) { |
108 |
|
while (*vend!='"' && *vend!=';' && *vend!=0) vend++; |
111 |
|
while (*vend!=';' && *vend!=0) vend++; |
112 |
|
} |
113 |
|
if (enforce_GTF2 && *vend!='"') |
114 |
< |
GError(GTF2_ERR, pre, dupline); |
114 |
> |
GError(GTF2_ERR, attr, dupline); |
115 |
|
char *r=Gstrdup(vp, vend-1); |
116 |
|
//-- now remove this attribute from the info string |
117 |
|
while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++; |
186 |
|
GMessage("Warning: invalid end coordinate at line:\n%s\n",l); |
187 |
|
return; |
188 |
|
} |
189 |
< |
if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always |
189 |
> |
if (fend<fstart) Gswap(fend,fstart); //make sure fstart<=fend, always |
190 |
|
p=t[5]; |
191 |
|
if (p[0]=='.' && p[1]==0) { |
192 |
|
score=0; |
251 |
|
} |
252 |
|
return; //skip this line, unwanted feature name |
253 |
|
} |
254 |
< |
ID=extractAttr("ID="); |
255 |
< |
char* Parent=extractAttr("Parent="); |
254 |
> |
ID=extractAttr("ID=",true); |
255 |
> |
char* Parent=extractAttr("Parent=",true); |
256 |
|
is_gff3=(ID!=NULL || Parent!=NULL); |
257 |
|
if (is_gff3) { |
258 |
|
//parse as GFF3 |
259 |
|
if (ID!=NULL) { |
260 |
|
//has ID attr so it's likely to be a parent feature |
261 |
|
//look for explicit gene name |
262 |
< |
gene_name=extractAttr("gene_name=",false); |
262 |
> |
gene_name=extractAttr("gene_name="); |
263 |
|
if (gene_name==NULL) { |
264 |
< |
gene_name=extractAttr("geneName=",false); |
264 |
> |
gene_name=extractAttr("geneName="); |
265 |
|
if (gene_name==NULL) { |
266 |
< |
gene_name=extractAttr("gene_sym=",false); |
266 |
> |
gene_name=extractAttr("gene_sym="); |
267 |
|
if (gene_name==NULL) { |
268 |
< |
gene_name=extractAttr("gene=",false); |
268 |
> |
gene_name=extractAttr("gene="); |
269 |
|
} |
270 |
|
} |
271 |
|
} |
272 |
< |
gene_id=extractAttr("geneID=",false); |
272 |
> |
gene_id=extractAttr("geneID="); |
273 |
|
if (gene_id==NULL) { |
274 |
< |
gene_id=extractAttr("gene_id=",false); |
274 |
> |
gene_id=extractAttr("gene_id="); |
275 |
|
} |
276 |
|
if (is_gene) { |
277 |
|
//special case: keep the Name and ID attributes of the gene feature |
316 |
|
} //has Parent field |
317 |
|
} //GFF3 |
318 |
|
else { // GTF-like expected |
319 |
< |
Parent=extractAttr("transcript_id"); |
319 |
> |
Parent=extractAttr("transcript_id",true); |
320 |
|
if (Parent!=NULL) { //GTF2 format detected |
321 |
|
if (is_transcript) { |
322 |
|
// atypical GTF with a parent transcript line declared |
324 |
|
Parent=NULL; |
325 |
|
} |
326 |
|
gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID |
327 |
+ |
if (gene_id==NULL) |
328 |
+ |
gene_id=extractAttr("geneid"); |
329 |
|
gene_name=extractAttr("gene_name"); |
330 |
|
if (gene_name==NULL) { |
331 |
+ |
|
332 |
|
gene_name=extractAttr("gene_sym"); |
333 |
< |
if (gene_name==NULL) |
333 |
> |
if (gene_name==NULL) { |
334 |
|
gene_name=extractAttr("gene"); |
335 |
+ |
if (gene_name==NULL) |
336 |
+ |
gene_name=extractAttr("genesymbol"); |
337 |
+ |
} |
338 |
|
} |
339 |
|
//prepare for parseAttr by adding '=' character instead of spaces for all attributes |
340 |
|
//after the attribute name |
554 |
|
isCDS=false; |
555 |
|
} |
556 |
|
if (qs || qe) { |
557 |
< |
if (qs>qe) swap(qs,qe); |
557 |
> |
if (qs>qe) Gswap(qs,qe); |
558 |
|
if (qs==0) qs=1; |
559 |
|
} |
560 |
|
int ovlen=0; |
786 |
|
isCDS=gffline->is_cds; //for now |
787 |
|
isGene(gffline->is_gene); |
788 |
|
isTranscript(gffline->is_transcript || gffline->exontype!=0); |
789 |
< |
fromGff3(gffline->is_gff3); |
789 |
> |
//fromGff3(gffline->is_gff3); |
790 |
|
|
791 |
|
if (gffline->parents!=NULL) { |
792 |
|
//GTF style -- create a GffObj directly by subfeature |
799 |
|
//make this GffObj of the same feature type |
800 |
|
ftype_id=names->feats.addName(gffline->ftype); |
801 |
|
} |
802 |
< |
if (gffline->ID==NULL) { //typical GTF |
802 |
> |
if (gffline->ID==NULL) { //typical GTF2 without "transcript" line |
803 |
|
gffID=Gstrdup(gffline->parents[0]); |
804 |
|
this->createdByExon(true); |
805 |
|
//this is likely the first exon/segment of the feature |
806 |
|
addExon(gfrd, gffline, keepAttr, noExonAttr); |
807 |
|
} |
808 |
< |
else { //a parented feature with an ID -- probably an orphan GFF3 line |
808 |
> |
else { //a parented feature with an ID -- probably an orphan or premature GFF3 subfeature line |
809 |
|
if (gffline->is_gff3 && gffline->exontype!=0) { |
810 |
|
//premature exon given before its parent transcript |
811 |
|
//create the transcript entry here |
825 |
|
gscore=gffline->score; |
826 |
|
if (gffline->ID==NULL || gffline->ID[0]==0) |
827 |
|
GError("Error: no ID found for GFF record start\n"); |
828 |
+ |
this->hasGffID(true); |
829 |
|
gffID=Gstrdup(gffline->ID); //there must be an ID here |
830 |
|
//if (gffline->is_transcript) ftype_id=gff_fid_mRNA; |
831 |
|
//else |
883 |
|
return gffline; |
884 |
|
} |
885 |
|
|
886 |
+ |
|
887 |
|
char* GffReader::gfoBuildId(const char* id, const char* ctg) { |
888 |
|
//caller must free the returned pointer |
889 |
|
char* buf=NULL; |
894 |
|
strcpy(buf+idlen+1, ctg); |
895 |
|
return buf; |
896 |
|
} |
897 |
< |
|
897 |
> |
/* |
898 |
|
void GffReader::gfoRemove(const char* id, const char* ctg) { |
899 |
|
char* buf=gfoBuildId(id,ctg); |
900 |
|
phash.Remove(buf); |
901 |
|
GFREE(buf); |
902 |
|
} |
903 |
< |
|
903 |
> |
*/ |
904 |
|
//Warning: if gflst gets altered, idx becomes obsolete |
905 |
< |
GfoHolder* GffReader::gfoAdd(const char* id, const char* ctg, GffObj* gfo, int idx) { |
906 |
< |
char* buf=gfoBuildId(id,ctg); |
882 |
< |
GfoHolder* r=new GfoHolder(gfo,idx); |
883 |
< |
phash.Add(buf, r); |
884 |
< |
GFREE(buf); |
885 |
< |
return r; |
886 |
< |
} |
905 |
> |
// Registers gfo in phash under its gffID and returns a pointer to the new holder. |
// phash maps an ID to a GVec<GfoHolder>, so several objects can share one ID: |
//   - the list for gfo->gffID is looked up, and allocated if it does not exist yet; |
//   - a GfoHolder built from (gfo, idx) is appended to that list; |
//   - phash.Add() is then called for the ID on every invocation, including when |
//     the list was found by the preceding phash.Find(). |
// Parameters: gfo - the object to register; idx - the index value stored in the holder. |
// Returns: address of the holder element stored inside the list. |
// NOTE(review): the returned pointer refers to storage owned by the GVec; whether it |
// stays valid after later Add() calls depends on GVec's growth behavior - confirm. |
// NOTE(review): behavior of phash.Add() for an already present key is not visible |
// here - confirm it does not free or duplicate the existing list. |
// The '<' rows below belong to the previous gfoFind(id, ctg) implementation that this |
// revision removes; they are not part of this function. |
GfoHolder* GffReader::gfoAdd(GffObj* gfo, int idx) { |
906 |
> |
//TODO: must make sure the gfo ID isn't there already. |
907 |
|
|
908 |
< |
GfoHolder* GffReader::gfoFind(const char* id, const char* ctg) { |
909 |
< |
char* buf=gfoBuildId(id,ctg); |
910 |
< |
GfoHolder* r=phash.Find(buf); |
911 |
< |
GFREE(buf); |
912 |
< |
return r; |
908 |
> |
GVec<GfoHolder>* glst=phash.Find(gfo->gffID); |
909 |
> |
if (glst==NULL) |
910 |
> |
glst=new GVec<GfoHolder>(1); |
911 |
> |
GfoHolder gh(gfo,idx); |
912 |
> |
int i=glst->Add(gh); |
913 |
> |
phash.Add(gfo->gffID, glst); |
914 |
> |
return &(glst->Get(i)); |
915 |
> |
} |
916 |
> |
|
917 |
> |
// Appends a GfoHolder built from (gfo, idx) to the caller-supplied list glst and |
// returns the address of the element now stored in that list. |
// Unlike the gfoAdd(GffObj*, int) overload, this one never touches phash; the caller |
// provides the list to append to. |
// NOTE(review): the returned pointer refers to storage owned by glst; whether it stays |
// valid after later Add() calls depends on GVec's growth behavior - confirm. |
GfoHolder* GffReader::gfoAdd(GVec<GfoHolder>& glst, GffObj* gfo, int idx) { |
918 |
> |
GfoHolder gh(gfo,idx); |
919 |
> |
int i=glst.Add(gh); |
920 |
> |
return &(glst[i]); |
921 |
> |
} |
922 |
> |
|
923 |
> |
// Looks up the holders registered under `id` in phash and returns the first one that |
// passes every filter the caller enabled, or NULL when none does. |
// Filters (an entry is skipped when an enabled filter fails): |
//   ctg    - if non-NULL, must equal the entry's getGSeqName() (strcmp); |
//   strand - if non-zero, must equal the entry's strand; |
//   start  - if >0, |start - entry start| must not exceed GFF_MAX_LOCUS; |
//   end    - checked only when start>0 and end>0: the entry must overlap [start,end]. |
// With start==0 no positional check is made at all, even if end is given. |
// Out parameter: if glst is non-NULL, *glst receives the list found for `id` |
// (NULL when the ID is unknown), whether or not any entry matched. |
// NOTE(review): GFF_MAX_LOCUS is defined outside this view; one caller in this file |
// omits `end`, so the declaration presumably supplies default arguments - confirm. |
GfoHolder* GffReader::gfoFind(const char* id, const char* ctg, |
924 |
> |
GVec<GfoHolder>** glst, char strand, uint start, uint end) { |
925 |
> |
GVec<GfoHolder>* gl=phash.Find(id); |
926 |
> |
GfoHolder* gh=NULL; |
927 |
> |
if (gl) { |
928 |
> |
for (int i=0;i<gl->Count();i++) { |
929 |
> |
GfoHolder& gfo = gl->Get(i); |
930 |
> |
if (ctg!=NULL && strcmp(ctg, gfo.gffobj->getGSeqName())!=0) |
931 |
> |
continue; |
932 |
> |
if (strand && strand != gfo.gffobj->strand) |
933 |
> |
continue; |
934 |
> |
if (start>0) { |
935 |
> |
if (abs((int)start-(int)gfo.gffobj->start)>GFF_MAX_LOCUS) |
936 |
> |
continue; |
937 |
> |
if (end>0 && (gfo.gffobj->start>end || gfo.gffobj->end<start)) |
938 |
> |
continue; |
939 |
> |
} |
940 |
> |
//must be the same transcript, according to given comparison criteria |
941 |
> |
gh=&gfo; |
942 |
> |
break; |
943 |
> |
} |
944 |
> |
} |
945 |
> |
// hand the whole per-ID list back to the caller, even when no entry matched |
if (glst) *glst=gl; |
946 |
> |
return gh; |
947 |
|
} |
948 |
|
|
949 |
|
GfoHolder* GffReader::replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx) { |
951 |
|
GfoHolder* r=NULL; |
952 |
|
if (replaceidx>=0) { |
953 |
|
gflst.Put(replaceidx,newgfo); |
954 |
< |
r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, replaceidx); |
954 |
> |
r=gfoAdd(newgfo, replaceidx); |
955 |
|
} |
956 |
|
else { |
957 |
|
int gfoidx=gflst.Add(newgfo); |
958 |
< |
r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx); |
958 |
> |
r=gfoAdd(newgfo, gfoidx); |
959 |
|
} |
960 |
+ |
/* |
961 |
|
if (gff_warns) { |
962 |
|
int* pcount=tids.Find(newgfo->gffID); |
963 |
|
if (pcount!=NULL) { |
964 |
< |
if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID); |
964 |
> |
if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID); |
965 |
|
(*pcount)++; |
966 |
|
} |
967 |
|
else { |
968 |
|
tids.Add(newgfo->gffID,new int(1)); |
969 |
|
} |
970 |
|
} |
971 |
+ |
*/ |
972 |
|
return r; |
973 |
|
} |
974 |
|
|
989 |
|
} |
990 |
|
|
991 |
|
GfoHolder* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, |
992 |
< |
GffObj* parent, GffExon* pexon) { |
992 |
> |
GffObj* parent, GffExon* pexon, GVec<GfoHolder>* glst) { |
993 |
|
GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr); |
994 |
|
GfoHolder* r=NULL; |
995 |
|
int gfoidx=gflst.Add(newgfo); |
996 |
< |
r=gfoAdd(newgfo->gffID, gffline->gseqname, newgfo, gfoidx); |
996 |
> |
r=(glst) ? gfoAdd(*glst, newgfo, gfoidx) : gfoAdd(newgfo, gfoidx); |
997 |
|
if (parent!=NULL) { |
998 |
|
updateParent(r, parent); |
999 |
|
if (pexon!=NULL) parent->removeExon(pexon); |
1000 |
|
} |
1001 |
+ |
/* |
1002 |
|
if (gff_warns) { |
1003 |
|
int* pcount=tids.Find(newgfo->gffID); |
1004 |
|
if (pcount!=NULL) { |
1009 |
|
tids.Add(newgfo->gffID,new int(1)); |
1010 |
|
} |
1011 |
|
} |
1012 |
+ |
*/ |
1013 |
|
return r; |
1014 |
|
} |
1015 |
|
|
1016 |
|
GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline, |
1017 |
|
bool keepAttr) { |
1018 |
|
if (prevgfo==NULL) return NULL; |
1019 |
< |
prevgfo->gffobj->createdByExon(false); |
1019 |
> |
//prevgfo->gffobj->createdByExon(false); |
1020 |
|
prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype); |
1021 |
|
prevgfo->gffobj->start=gffline->fstart; |
1022 |
|
prevgfo->gffobj->end=gffline->fend; |
1023 |
|
prevgfo->gffobj->isGene(gffline->is_gene); |
1024 |
|
prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0); |
1025 |
< |
prevgfo->gffobj->fromGff3(gffline->is_gff3); |
1025 |
> |
prevgfo->gffobj->hasGffID(gffline->ID!=NULL); |
1026 |
|
if (keepAttr) { |
1027 |
|
if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear(); |
1028 |
|
prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info); |
1034 |
|
bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) { |
1035 |
|
bool r=true; |
1036 |
|
if (gffline->strand!=prevgfo->gffobj->strand) { |
1037 |
< |
//TODO: add support for trans-splicing and even inter-chromosomal fusions |
1038 |
< |
if (prevgfo->gffobj->strand=='.') { |
1037 |
> |
//TODO: add support for trans-splicing and even inter-chromosomal fusions |
1038 |
> |
if (prevgfo->gffobj->strand=='.') { |
1039 |
|
prevgfo->gffobj->strand=gffline->strand; |
1040 |
|
} |
1041 |
|
else { |
1099 |
|
} |
1100 |
|
|
1101 |
|
//have to parse the whole file because exons can be scattered all over |
1102 |
+ |
//trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry |
1103 |
|
void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) { |
1104 |
|
bool validation_errors = false; |
1105 |
|
//loc_debug=false; |
1106 |
|
GHash<CNonExon> pex; //keep track of any "exon"-like features that have an ID |
1107 |
|
//and thus could become promoted to parent features |
1108 |
|
while (nextGffLine()!=NULL) { |
1050 |
– |
//seen this gff ID before? |
1109 |
|
GfoHolder* prevseen=NULL; |
1110 |
< |
if (gffline->ID) //GFF3 |
1111 |
< |
prevseen=gfoFind(gffline->ID, gffline->gseqname); |
1112 |
< |
if (prevseen!=NULL) { |
1113 |
< |
if (prevseen->gffobj->createdByExon()) { |
1114 |
< |
updateGffRec(prevseen, gffline, keepAttr); |
1115 |
< |
} |
1116 |
< |
else { |
1117 |
< |
GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID); |
1118 |
< |
validation_errors = true; |
1119 |
< |
if (gff_warns) { |
1120 |
< |
delete gffline; gffline=NULL; continue; |
1121 |
< |
} |
1122 |
< |
else exit(1); |
1123 |
< |
} |
1124 |
< |
} |
1110 |
> |
GVec<GfoHolder>* prevgflst=NULL; |
1111 |
> |
if (gffline->ID && gffline->exontype==0) { |
1112 |
> |
//>>>>> for a parent-like IDed feature (mRNA, gene, etc.) |
1113 |
> |
//look for same ID on the same chromosome/strand/locus |
1114 |
> |
prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart); |
1115 |
> |
if (prevseen!=NULL) { |
1116 |
> |
//same ID/chromosome combo encountered before |
1117 |
> |
if (prevseen->gffobj->createdByExon() && |
1118 |
> |
prevseen->gffobj->start>=gffline->fstart && |
1119 |
> |
prevseen->gffobj->end<=gffline->fend) { |
1120 |
> |
//an exon of this ID was given before |
1121 |
> |
//this line has the main attributes for this ID |
1122 |
> |
updateGffRec(prevseen, gffline, keepAttr); |
1123 |
> |
} |
1124 |
> |
else { |
1125 |
> |
//- duplicate ID -- this must be a discontiguous feature |
1126 |
> |
// e.g. a trans-spliced transcript |
1127 |
> |
if (prevseen->gffobj->overlap(gffline->fstart, gffline->fend)) { |
1128 |
> |
//overlapping with same ID not allowed |
1129 |
> |
GMessage("Error: duplicate GFF ID '%s' encountered!\n",gffline->ID); |
1130 |
> |
//validation_errors = true; |
1131 |
> |
if (gff_warns) { |
1132 |
> |
delete gffline; |
1133 |
> |
gffline=NULL; |
1134 |
> |
continue; |
1135 |
> |
} |
1136 |
> |
else exit(1); |
1137 |
> |
} |
1138 |
> |
//create a new entry with the same ID |
1139 |
> |
prevseen=newGffRec(gffline, keepAttr, noExonAttr, |
1140 |
> |
prevseen->gffobj->parent, NULL, prevgflst); |
1141 |
> |
} //duplicate ID on the same chromosome |
1142 |
> |
} //prevseen != NULL |
1143 |
> |
} //parent-like ID feature |
1144 |
|
if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene) |
1145 |
< |
if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr); |
1145 |
> |
if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst); |
1146 |
|
} |
1147 |
< |
else { //--- it's a parented feature (could still be a mRNA) |
1147 |
> |
else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent) |
1148 |
|
bool found_parent=false; |
1149 |
|
GfoHolder* newgfo=prevseen; |
1150 |
+ |
GVec<GfoHolder>* newgflst=NULL; |
1151 |
|
for (int i=0;i<gffline->num_parents;i++) { |
1152 |
|
if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL) |
1153 |
|
continue; //skipping discarded parent feature |
1154 |
< |
GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname); |
1154 |
> |
GfoHolder* parentgfo=gfoFind(gffline->parents[i], gffline->gseqname, |
1155 |
> |
&newgflst, gffline->strand, gffline->fstart, gffline->fend); |
1156 |
|
if (parentgfo!=NULL) { //parent GffObj parsed earlier |
1157 |
|
found_parent=true; |
1158 |
|
if (parentgfo->gffobj->isGene() && gffline->is_transcript |
1169 |
|
if (!addExonFeature(parentgfo, gffline, pex, noExonAttr)) |
1170 |
|
validation_errors=true; |
1171 |
|
} |
1172 |
< |
} |
1172 |
> |
} //overlapping parent feature found |
1173 |
|
} //for each parsed parent Id |
1174 |
|
if (!found_parent) { //new GTF-like record starting here with a subfeature directly |
1175 |
|
//or it could be some chado GFF3 barf with exons declared BEFORE their parent :( |
1185 |
|
} |
1186 |
|
else { //no parent seen before, create one directly with this exon |
1187 |
|
//loc_debug=true; |
1188 |
< |
GfoHolder* newgfo=prevseen ? prevseen : newGffRec(gffline, keepAttr, noExonAttr); |
1188 |
> |
GfoHolder* ngfo=prevseen; |
1189 |
> |
if (ngfo==NULL) |
1190 |
> |
ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst); |
1191 |
|
if (gffline->ID!=NULL && gffline->exontype==0) |
1192 |
< |
subfPoolAdd(pex, newgfo); |
1192 |
> |
subfPoolAdd(pex, ngfo); |
1193 |
|
//even those with errors will be added here! |
1194 |
|
} |
1195 |
|
GFREE(subp_name); |
1203 |
|
// all gff records are now loaded in GList gflst |
1204 |
|
// so we can free the hash |
1205 |
|
phash.Clear(); |
1206 |
< |
tids.Clear(); |
1206 |
> |
//tids.Clear(); |
1207 |
|
if (validation_errors) { |
1208 |
|
exit(1); |
1209 |
|
} |
1252 |
|
} //for each exon |
1253 |
|
} |
1254 |
|
//attribute reduction for GTF records |
1255 |
< |
if (keepAttrs && !noExonAttr && !fromGff3() |
1255 |
> |
if (keepAttrs && !noExonAttr && !hasGffID() |
1256 |
|
&& exons.Count()>0 && exons[0]->attrs!=NULL) { |
1257 |
|
bool attrs_discarded=false; |
1258 |
|
for (int a=0;a<exons[0]->attrs->Count();a++) { |