// gfo_flag_* constants: every value visible in this excerpt has exactly one bit set
// (0x02 .. 0x200), i.e. they are bit masks. Presumably they are OR-ed into a single
// per-object flags word -- the storage itself is not visible here; TODO confirm.
17 |
|
const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002; |
18 |
|
const uint gfo_flag_IS_GENE = 0x00000004; |
19 |
|
const uint gfo_flag_IS_TRANSCRIPT = 0x00000008; |
// NOTE(review): the next two rows are the '<' and '>' sides of the same diff line (20).
// gfo_flag_FROM_GFF3 and gfo_flag_HAS_GFF_ID both use bit 0x00000010, so presumably
// HAS_GFF_ID replaces FROM_GFF3 rather than coexisting with it -- confirm that only
// one of them is defined in the actual header.
20 |
< |
const uint gfo_flag_FROM_GFF3 = 0x00000010; |
20 |
> |
const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID |
21 |
|
const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly |
22 |
|
// NOTE(review): bits 0x40 and 0x80 are not assigned in the visible rows; the meaning of
// DISCARDED and LST_KEEP is not established by this excerpt.
const uint gfo_flag_DISCARDED = 0x00000100; |
23 |
|
const uint gfo_flag_LST_KEEP = 0x00000200; |
786 |
|
isCDS=gffline->is_cds; //for now |
787 |
|
isGene(gffline->is_gene); |
788 |
|
isTranscript(gffline->is_transcript || gffline->exontype!=0); |
789 |
< |
fromGff3(gffline->is_gff3); |
789 |
> |
//fromGff3(gffline->is_gff3); |
790 |
|
|
791 |
|
if (gffline->parents!=NULL) { |
792 |
|
//GTF style -- create a GffObj directly by subfeature |
799 |
|
//make this GffObj of the same feature type |
800 |
|
ftype_id=names->feats.addName(gffline->ftype); |
801 |
|
} |
802 |
< |
if (gffline->ID==NULL) { //typical GTF |
802 |
> |
if (gffline->ID==NULL) { //typical GTF2 without "transcript" line |
803 |
|
gffID=Gstrdup(gffline->parents[0]); |
804 |
|
this->createdByExon(true); |
805 |
|
//this is likely the first exon/segment of the feature |
806 |
|
addExon(gfrd, gffline, keepAttr, noExonAttr); |
807 |
|
} |
808 |
< |
else { //a parented feature with an ID -- probably an orphan GFF3 line |
808 |
> |
else { //a parented feature with an ID -- probably an orphan or premature GFF3 subfeature line |
809 |
|
if (gffline->is_gff3 && gffline->exontype!=0) { |
810 |
|
//premature exon given before its parent transcript |
811 |
|
//create the transcript entry here |
825 |
|
gscore=gffline->score; |
826 |
|
if (gffline->ID==NULL || gffline->ID[0]==0) |
827 |
|
GError("Error: no ID found for GFF record start\n"); |
828 |
+ |
this->hasGffID(true); |
829 |
|
gffID=Gstrdup(gffline->ID); //there must be an ID here |
830 |
|
//if (gffline->is_transcript) ftype_id=gff_fid_mRNA; |
831 |
|
//else |
948 |
|
int gfoidx=gflst.Add(newgfo); |
949 |
|
r=gfoAdd(newgfo, gfoidx); |
950 |
|
} |
951 |
+ |
/* |
952 |
|
if (gff_warns) { |
953 |
|
int* pcount=tids.Find(newgfo->gffID); |
954 |
|
if (pcount!=NULL) { |
959 |
|
tids.Add(newgfo->gffID,new int(1)); |
960 |
|
} |
961 |
|
} |
962 |
+ |
*/ |
963 |
|
return r; |
964 |
|
} |
965 |
|
|
989 |
|
updateParent(r, parent); |
990 |
|
if (pexon!=NULL) parent->removeExon(pexon); |
991 |
|
} |
992 |
+ |
/* |
993 |
|
if (gff_warns) { |
994 |
|
int* pcount=tids.Find(newgfo->gffID); |
995 |
|
if (pcount!=NULL) { |
1000 |
|
tids.Add(newgfo->gffID,new int(1)); |
1001 |
|
} |
1002 |
|
} |
1003 |
+ |
*/ |
1004 |
|
return r; |
1005 |
|
} |
1006 |
|
|
1007 |
|
GfoHolder* GffReader::updateGffRec(GfoHolder* prevgfo, GffLine* gffline, |
1008 |
|
bool keepAttr) { |
1009 |
|
if (prevgfo==NULL) return NULL; |
1010 |
< |
prevgfo->gffobj->createdByExon(false); |
1010 |
> |
//prevgfo->gffobj->createdByExon(false); |
1011 |
|
prevgfo->gffobj->ftype_id=prevgfo->gffobj->names->feats.addName(gffline->ftype); |
1012 |
|
prevgfo->gffobj->start=gffline->fstart; |
1013 |
|
prevgfo->gffobj->end=gffline->fend; |
1014 |
|
prevgfo->gffobj->isGene(gffline->is_gene); |
1015 |
|
prevgfo->gffobj->isTranscript(gffline->is_transcript || gffline->exontype!=0); |
1016 |
< |
prevgfo->gffobj->fromGff3(gffline->is_gff3); |
1016 |
> |
prevgfo->gffobj->hasGffID(gffline->ID!=NULL); |
1017 |
|
if (keepAttr) { |
1018 |
|
if (prevgfo->gffobj->attrs!=NULL) prevgfo->gffobj->attrs->Clear(); |
1019 |
|
prevgfo->gffobj->parseAttrs(prevgfo->gffobj->attrs, gffline->info); |
1090 |
|
} |
1091 |
|
|
1092 |
|
//have to parse the whole file because exons can be scattered all over |
1093 |
+ |
//trans-splicing and fusions are only accepted in proper GFF3 format, with a single parent feature ID entry |
1094 |
|
void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) { |
1095 |
|
bool validation_errors = false; |
1096 |
|
//loc_debug=false; |
1100 |
|
//seen this gff ID before? |
1101 |
|
GfoHolder* prevseen=NULL; |
1102 |
|
if (gffline->ID && gffline->exontype==0) //GFF3 parent-like feature (mRNA, gene, etc.) |
1103 |
+ |
//look for same ID on the same chromosome |
1104 |
|
prevseen=gfoFind(gffline->ID, gffline->gseqname); |
1105 |
|
if (prevseen!=NULL) { |
1106 |
< |
if (prevseen->gffobj->createdByExon()) { |
1107 |
< |
//just in case the exon was found before (shouldn't happen) |
1106 |
> |
//found same ID/chromosome combo |
1107 |
> |
if (prevseen->gffobj->createdByExon() && |
1108 |
> |
prevseen->gffobj->start>=gffline->fstart && prevseen->gffobj->end<=gffline->fend) { |
1109 |
> |
//an exon of this ID was given before |
1110 |
> |
//this line has the main attributes for this ID |
1111 |
|
updateGffRec(prevseen, gffline, keepAttr); |
1112 |
|
} |
1113 |
|
else { |
1178 |
|
// all gff records are now loaded in GList gflst |
1179 |
|
// so we can free the hash |
1180 |
|
phash.Clear(); |
1181 |
< |
tids.Clear(); |
1181 |
> |
//tids.Clear(); |
1182 |
|
if (validation_errors) { |
1183 |
|
exit(1); |
1184 |
|
} |
1227 |
|
} //for each exon |
1228 |
|
} |
1229 |
|
//attribute reduction for GTF records |
1230 |
< |
if (keepAttrs && !noExonAttr && !fromGff3() |
1230 |
> |
if (keepAttrs && !noExonAttr && !hasGffID() |
1231 |
|
&& exons.Count()>0 && exons[0]->attrs!=NULL) { |
1232 |
|
bool attrs_discarded=false; |
1233 |
|
for (int a=0;a<exons[0]->attrs->Count();a++) { |