ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/AceParser.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 4 months ago) by gpertea
File size: 11632 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #include "AceParser.h"
2 #include <ctype.h>
3
4 bool AceParser::open() { //also checks if it looks like a valid ACE file
5 if (f==stdin) return true;
6 if ((f=fopen(fname,"rb"))==NULL) return false;
7 char first3[4];
8 first3[3]='\0';
9 if (fread(first3, 1,3, f)==3) {
10 f_pos=3;
11 return (strcmp(first3, "AS ")==0 || strcmp(first3, "CO ")==0);
12 }
13 return false;
14 }
15
16
17 LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
18 LytSeqInfo* seq;
19 if (!startsWith(s, "AF ", 3))
20 return NULL;
21 s+=3;
22 char* p=strchrs(s," \t");
23 if (p==NULL) return NULL;
24 p++;
25 char c;
26 int offs;
27 if (sscanf(p,"%c %d", &c, &offs)!=2) return NULL;
28 p--;
29 *p='\0';
30 if ((seq=seqinfo.Find(s))!=NULL) {
31 GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
32 " so it cannot be added for contig '%s (%d)'\n",
33 s, seq->contig->name, seq->contig->len,
34 ctg->name, ctg->len);
35 return NULL;
36 }
37 seq = new LytSeqInfo(s, ctg, offs, (c=='C') ? 1 : 0);
38 seqinfo.shkAdd(seq->name, seq);
39 ctg->seqs.Add(seq);
40 return seq;
41 }
42
43
44 bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
45
46 bool forgetCtg = false;
47 if (ctgidx>=contigs.Count())
48 GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
49 LytCtgData* ctgdata=contigs[ctgidx];
50 if (re_pos && currentContig!=NULL) { //free previously loaded contig data
51 currentContig->seqs.Clear(); // unless it was a parse() call
52 seqinfo.Clear();
53 }
54 currentContig=ctgdata;
55 int ctg_numSeqs=ctgdata->numseqs;
56
57 if (re_pos) {
58 seek(ctgdata->fpos); //position right where the contig definition starts
59 char *r = linebuf->getLine(f,f_pos);
60 if (r==NULL) return false;
61 }
62
63 if (seqfn!=NULL) { //process the contig sequence!
64 char* ctgseq=readSeq();
65 forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
66 GFREE(ctgseq); //obviously the caller should have made a copy
67 }
68 //now look for all the component sequences
69 if (fskipTo("AF ")<0) {
70 GMessage("AceParser: error finding sequence offsets (AF)"
71 " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
72 return false;
73 }
74 int numseqs=0;
75 while (startsWith(linebuf->chars(), "AF ",3)) {
76 if (addSeq(linebuf->chars(), ctgdata)==NULL) {
77 GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
78 return false;
79 }
80 numseqs++;
81 //read next line:
82 linebuf->getLine(f,f_pos);
83 }
84 if (numseqs!=ctg_numSeqs) {
85 GMessage("Invalid number of AF entries found (%d) for contig '%s' "
86 "(length %d, numseqs %d)\n", numseqs,
87 ctgdata->name, ctgdata->len, ctg_numSeqs);
88 return false;
89 }
90 //now read each sequence entry
91 off_t seqpos=fskipTo("RD ");
92 numseqs=0; //count again, now the RD entries
93 if (seqpos<0) {
94 GMessage("AceParser: error locating first RD entry for contig '%s'\n",
95 ctgdata->name);
96 return false;
97 }
98 //int numseqs=0;
99 //reading the actual component sequence details
100 while (startsWith(linebuf->chars(), "RD ",3)) {
101 char* s=linebuf->chars()+3;
102 char* p=strchrs(s, " \t");
103 LytSeqInfo* seq;
104 if (p==NULL) {
105 GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
106 return false;
107 }
108 *p='\0';
109 if ((seq=seqinfo.Find(s))==NULL) {
110 GMessage("AceParser: unknown RD encountered: '%s'\n", s);
111 return false;
112 }
113 p++; //now p is in linebuf after the RD name
114 seq->fpos=seqpos;
115 int len;
116 if (sscanf(p, "%d", &len)!=1) {
117 GMessage("AceParser: cannot parse RD length for '%s'\n", s);
118 return false;
119 }
120 seq->setLength(len);
121 //read the sequence data here if a callback fn was given:
122 char* sseq=NULL;
123 if (seqfn!=NULL)
124 sseq=readSeq(seq); //read full sequence here
125 if (fskipTo("QA ")<0) {
126 GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
127 return false;
128 }
129 //parse QA entry:
130 int tmpa, tmpb;
131 if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
132 seq->left<=0 || seq->right<=0) {
133 GMessage("AceParser: Error parsing QA entry.\n");
134 return false;
135 }
136 /*
137 if (fskipTo("DS")<0) {
138 GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
139 return false;
140 }
141 */
142 seqpos=getFilePos()+1;
143 bool forgetSeq=false;
144 if (seqfn!=NULL) {
145 forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
146 GFREE(sseq);
147 }
148 if (forgetSeq) { //parsing the whole stream -- aceconv)
149 ctg_numSeqs--;
150 seqinfo.Remove(seq->name);
151 ctgdata->seqs.RemovePtr(seq);
152 }
153 numseqs++;
154 if (numseqs<ctgdata->numseqs)
155 seqpos=fskipTo("RD ", "CO "); //more sequences left to read
156 }
157 if (numseqs!=ctgdata->numseqs) {
158 GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
159 "(length %d, numseqs %d)\n", numseqs,
160 ctgdata->name, ctgdata->len, ctg_numSeqs);
161 return false;
162 }
163 if (forgetCtg) {
164 ctgIDs.Remove(ctgdata->name);
165 ctgdata->seqs.Clear();
166 seqinfo.Clear();
167 contigs.RemovePtr(ctgdata);
168 }
169 return true;
170 }
171
172 //read only the contig headers from the file
173 //also checks for duplicate seqnames (just in case)
174 bool AceParser::parseContigs() {
175 if (f!=stdin) seek(0);
176 off_t ctgpos;
177 numContigs=0;
178 while ((ctgpos=fskipTo("CO "))>=0) {
179 numContigs++;
180 LytCtgData* ctgdata=new LytCtgData(ctgpos);
181 char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
182 if (p==NULL) {
183 GMessage("AceParser: error parsing contig name!\n");
184 return false;
185 }
186 int ctglen, ctg_numSeqs, numseqs;
187 //p must be after contig name within linebuf!
188 if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
189 GMessage("Error parsing contig len and seq count at:\n%s\n",
190 linebuf->chars());
191 }
192 ctg_numSeqs=numseqs;
193 ctgdata->len=ctglen;
194 ctgdata->numseqs=numseqs;
195 ctgdata->offs = 1;
196 contigs.Add(ctgdata);
197 //ctgpos=fskipTo("CO ");
198 }
199 if (ctgpos==-2) return false; //parsing failed: line too long ?!?
200 contigs.setSorted(true);
201 return true;
202 }
203
204
205 bool AceParser::parse(fnLytSeq* seqfn) {
206 //read all seqs and their positions from the file
207 //also checks for duplicate seqnames (just in case)
208 if (f!=stdin) seek(0);
209 ctgIDs.Clear();
210 //GHash<int> ctgIDs; //contig IDs, to make them unique!
211 //
212 off_t ctgpos;
213 numContigs=0;
214 while ((ctgpos=fskipTo("CO "))>=0) {
215 numContigs++;
216 LytCtgData* ctgdata=new LytCtgData(ctgpos);
217 char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
218 if (p==NULL) {
219 GMessage("AceParser: error parsing contig name!\n");
220 return false;
221 }
222 int ctglen, numseqs;
223 //p must be after contig name within linebuf!
224 if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
225 GMessage("Error parsing contig len and seq count at:\n%s\n",
226 linebuf->chars());
227 }
228 ctgdata->len=ctglen;
229 ctgdata->numseqs = numseqs;
230 ctgdata->offs = 1;
231 int ctgidx=contigs.Add(ctgdata);
232 loadContig(ctgidx, seqfn, false);
233 } //while contigs
234 if (ctgpos==-2) return false; //parsing failed: line too long ?!?
235 contigs.setSorted(true);
236 return true;
237 }
238
239
240 void add_intron(char*& buf,int accrd, int& rgpos,
241 const char* numstr, LytSeqInfo* seqinfo, char lspl, char rspl) {
242 int n=atoi(numstr);
243 if (n<=0) return;
244 if (seqinfo!=NULL) {
245 char sL = 0;
246 char sR = 0;
247 if (lspl>0) sL=(lspl=='['?'S':'s');
248 if (rspl>0) sR=(rspl==']'?'S':'s');
249 seqinfo->addInterSeg(seqinfo->offs+rgpos-1,seqinfo->offs+rgpos+n,
250 0,0,sL,sR,accrd);
251 }
252 rgpos+=n;
253 }
254
255 char* AceParser::readSeq(LytSeqInfo* seqinfo) {
256 //assumes the next line is where a sequence starts!
257 //stops at the next empty line encountered
258 char* buf;
259 static char rlenbuf[12]={0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
260 int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
261 int buflen=512;
262 GMALLOC(buf, buflen); //this MUST be freed by the caller
263 buf[0]='\0';
264 int accrd=0; //accumulated read length so far -- excludes interseg gaps!
265 int rgpos=0; //accumulated offset including interseg gaps!
266 char *r = linebuf->getLine(f,f_pos);
267 int linelen=linebuf->length();
268 char r_splice=0, l_splice=0;
269 while (linelen>0) {
270 if (r==NULL) {
271 GMessage("AceParser: error reading sequence data\n");
272 return NULL;
273 }
274 //-- pass the line content for accumulation
275 int i=0;
276 while (r[i]) {
277 while (r[i] && isdigit(r[i])) {
278 rlenbuf[rlenbufacc]=r[i];
279 rlenbufacc++;
280 i++;
281 }
282 if (r[i]==0) break; //end of line reached already
283 //now r[i] is surely a non-digit char
284 if (rlenbufacc>0) { //have we just had a number before?
285 rlenbuf[rlenbufacc]=0;
286 if (r[i]=='=' || r[i]=='-') {
287 i++;
288 if (r[i]==0) break;
289 } else { //check for splice site markers for this introns
290 if (r[i]=='('||r[i]=='[') {
291 l_splice=r[i];
292 i++;
293 if (r[i]==0) break;
294 }
295 if (r[i]==')'||r[i]==']') {
296 r_splice=r[i];
297 i++;
298 if (r[i]==0) break;
299 }
300 }//splice check
301 add_intron(buf, accrd, rgpos, rlenbuf, seqinfo, l_splice, r_splice);
302 rlenbufacc=0;
303 r_splice=0;l_splice=0;
304 //i++;//skip the gap character
305 }
306 //check for digits here and break the linebuf as needed
307 int bi=i; //start of non-digit run
308 while (r[i] && !isdigit(r[i])) i++;
309 int nl=(i-bi); //length of non-digit run
310 if (nl>0) {
311 int si=accrd;
312 accrd+=nl;
313 rgpos+=nl;
314 if (accrd>=buflen-1) {
315 buflen=accrd+512;
316 GREALLOC(buf,buflen);
317 }
318 //append these non-digit chars
319 for(int b=0;b<nl;b++) {
320 buf[si+b]=r[bi+b];
321 }
322 }//non-digit run
323 } //while line chars
324
325 /*
326 //-- append the line to buf
327 accrd+=linelen;
328 if (accrd>=buflen) {
329 buflen+=1024;
330 GREALLOC(buf,buflen);
331 }
332 strcat(buf, r);
333 */
334
335 r=linebuf->getLine(f,f_pos);
336 linelen=linebuf->length();
337 }//while linelen>0
338
339 //add the 0-ending
340 buf[accrd]=0;
341 return buf;
342 }
343
344 char* AceParser::getSeq(LytSeqInfo* seq) {
345 if (f==stdin || seek(seq->fpos)!=0) {
346 GMessage("AceParser: error seeking seq '%s'\n", seq->name);
347 return NULL;
348 }
349 //skip the contig header:
350 char* r=linebuf->getLine(f,f_pos);
351 if (r==NULL || !startsWith(linebuf->chars(), "RD ", 3)) {
352 GMessage("AceParser: error seeking seq '%s'\n"
353 " (no RD entry found at location %d)\n", seq->name, seq->fpos);
354 return NULL;
355 }
356 return readSeq(seq);
357 }
358
359 char* AceParser::getContigSeq(LytCtgData* ctg) {
360 if (f==stdin || seek(ctg->fpos)!=0) {
361 GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
362 return NULL;
363 }
364 //skip the contig header:
365 char* r=linebuf->getLine(f,f_pos);
366 if (r==NULL || !startsWith(linebuf->chars(), "CO ", 3)) {
367 GMessage("AceParser: error seeking contig '%s'\n"
368 " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
369 return NULL;
370 }
371 return readSeq();
372 }
373
374