ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 52 | Line 52
52               else return (int)(g1.gseq_id-g2.gseq_id);
53   }
54  
55 < char* GffLine::extractAttr(const char* pre, bool caseStrict, bool enforce_GTF2) {
55 > char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) {
56   //parse a key attribute and remove it from the info string
57   //(only works for attributes that have values following them after ' ' or '=')
58   static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n";
59 < int lpre=strlen(pre);
60 < char cend=pre[lpre-1];
61 < char* pos = (caseStrict) ? strstr(info, pre) : strifind(info, pre);
62 < if (pos==NULL) return NULL;
63 < char* findstart=info;
64 < //require word boundary on the left:
65 < while (pos!=NULL && pos!=info && *(pos-1)!=';' && *(pos-1)!=' ') {
66 <    findstart=pos+lpre;
67 <    pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
68 <    }
69 < if (pos==NULL) return NULL;
70 < if (cend!=' ' && cend!='=') {
71 <    //require word boundary on the right:
72 <    while (pos!=NULL && *(pos+lpre)!=' ' && *(pos+lpre)!='=') {
73 <       findstart=pos+lpre;
74 <       pos = (caseStrict) ? strstr(findstart, pre) : strifind(findstart, pre);
75 <       }
76 <    }
77 < if (pos==NULL) return NULL;
78 < char* vp=pos+lpre;
59 > int attrlen=strlen(attr);
60 > char cend=attr[attrlen-1];
61 > //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr);
62 > //must make sure attr is not found in quoted text
63 > char* pos=info;
64 > char prevch=0;
65 > bool in_str=false;
66 > bool notfound=true;
67 > int (*strcmpfn)(const char*, const char*, int) = caseStrict ? Gstrcmp : Gstricmp;
68 > while (notfound && *pos) {
69 >   char ch=*pos;
70 >   if (ch=='"') {
71 >     in_str=!in_str;
72 >     pos++;
73 >     prevch=ch;
74 >     continue;
75 >     }
76 >   if (!in_str && (prevch==0 || prevch==' ' || prevch == ';')
77 >          && strcmpfn(attr, pos, attrlen)==0) {
78 >      //attr match found
79 >      //check for word boundary on right
80 >      char* epos=pos+attrlen;
81 >      if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') {
82 >        notfound=false;
83 >        break;
84 >        }
85 >      //not a perfect match, move on
86 >      pos=epos;
87 >      prevch=*(pos-1);
88 >      continue;
89 >      }
90 >   //not a match or in_str
91 >   prevch=ch;
92 >   pos++;
93 >   }
94 > if (notfound) return NULL;
95 > char* vp=pos+attrlen;
96   while (*vp==' ') vp++;
97   if (*vp==';' || *vp==0)
98 <      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", pre, dupline);
98 >      GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline);
99   bool dq_enclosed=false; //value string enclosed by double quotes
100   if (*vp=='"') {
101       dq_enclosed=true;
102       vp++;
103       }
104   if (enforce_GTF2 && !dq_enclosed)
105 <      GError(GTF2_ERR,pre, dupline);
105 >      GError(GTF2_ERR,attr, dupline);
106   char* vend=vp;
107   if (dq_enclosed) {
108      while (*vend!='"' && *vend!=';' && *vend!=0) vend++;
# Line 94 | Line 111
111      while (*vend!=';' && *vend!=0) vend++;
112      }
113   if (enforce_GTF2 && *vend!='"')
114 <     GError(GTF2_ERR, pre, dupline);
114 >     GError(GTF2_ERR, attr, dupline);
115   char *r=Gstrdup(vp, vend-1);
116   //-- now remove this attribute from the info string
117   while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++;
# Line 234 | Line 251
251            }
252          return; //skip this line, unwanted feature name
253          }
254 < ID=extractAttr("ID=");
255 < char* Parent=extractAttr("Parent=");
254 > ID=extractAttr("ID=",true);
255 > char* Parent=extractAttr("Parent=",true);
256   is_gff3=(ID!=NULL || Parent!=NULL);
257   if (is_gff3) {
258     //parse as GFF3
259      if (ID!=NULL) {
260         //has ID attr so it's likely to be a parent feature
261         //look for explicit gene name
262 <       gene_name=extractAttr("gene_name=",false);
262 >       gene_name=extractAttr("gene_name=");
263         if (gene_name==NULL) {
264 <           gene_name=extractAttr("geneName=",false);
264 >           gene_name=extractAttr("geneName=");
265             if (gene_name==NULL) {
266 <               gene_name=extractAttr("gene_sym=",false);
266 >               gene_name=extractAttr("gene_sym=");
267                 if (gene_name==NULL) {
268 <                   gene_name=extractAttr("gene=",false);
268 >                   gene_name=extractAttr("gene=");
269                     }
270                 }
271             }
272 <       gene_id=extractAttr("geneID=",false);
272 >       gene_id=extractAttr("geneID=");
273         if (gene_id==NULL) {
274 <          gene_id=extractAttr("gene_id=",false);
274 >          gene_id=extractAttr("gene_id=");
275            }
276         if (is_gene) {
277           //special case: keep the Name and ID attributes of the gene feature
# Line 299 | Line 316
316           } //has Parent field
317     } //GFF3
318    else { // GTF-like expected
319 <   Parent=extractAttr("transcript_id");
319 >   Parent=extractAttr("transcript_id",true);
320     if (Parent!=NULL) { //GTF2 format detected
321       if (is_transcript) {
322           // atypical GTF with a parent transcript line declared
# Line 307 | Line 324
324           Parent=NULL;
325           }
326       gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID
327 +     if (gene_id==NULL)
328 +       gene_id=extractAttr("geneid");
329       gene_name=extractAttr("gene_name");
330       if (gene_name==NULL) {
331 +
332             gene_name=extractAttr("gene_sym");
333 <           if (gene_name==NULL)
333 >           if (gene_name==NULL) {
334                 gene_name=extractAttr("gene");
335 +               if (gene_name==NULL)
336 +                  gene_name=extractAttr("genesymbol");
337 +               }
338             }
339       //prepare for parseAttr by adding '=' character instead of spaces for all attributes
340       //after the attribute name

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old revision)
> Changed lines (new revision)