ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/LayoutParser.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 1 month ago) by gpertea
File size: 11710 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #include "LayoutParser.h"
2
3 bool LayoutParser::startsWith(const char* s, const char* start, int tlen) {
4 bool found=true;
5 for (int i=0;i<tlen;i++)
6 if (s[i]!=start[i])
7 return false;
8 return found;
9 }
10
11
12
13 bool LayoutParser::open() { //also checks if it looks like a valid LYT file
14 if (f==stdin) return true;
15 if ((f=fopen(fname,"rb"))==NULL) return false;
16 char first1[2];
17 first1[1]='\0';
18 if (fread(first1, 1,1, f)==1) {
19 f_pos=1;
20 return (strcmp(first1, "#")==0 || strcmp(first1, ">")==0);
21 }
22 return false;
23 }
24
25
26 void LayoutParser::close() {
27 if (f!=NULL) {
28 fclose(f);
29 f=NULL;
30 }
31 }
32
33 off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
34 /* reads the file from the current position
35 until the next occurence of a line
36 starting with linestart
37 returns the line in buf and the file offset
38 of the beginning of the line;
39 the file offset is -1 if linestart was not found
40 or -2 if an unwanted line-start came out of order
41 */
42 off_t lastpos=getFilePos();
43 int tlen=strlen(linestart);
44 int nlen=(butnot==NULL) ? 0 : strlen(butnot);
45 while (linebuf->getLine(f, f_pos)!=NULL) {
46 if (nlen>0 && startsWith(linebuf->line(), butnot, nlen)) {
47 GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
48 linebuf->line(), linestart);
49 return -2;
50 }
51 if (startsWith(linebuf->chars(), linestart, tlen)) return lastpos;
52 lastpos=getFilePos();
53 }
54 return -1;
55 }
56
57 LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
58 LytSeqInfo* seq;
59 //s must be the line with sequence data
60 char* p=strchrs(s," \t");
61 if (p==NULL) return NULL;
62 p++;
63 char c;
64 int slen, soffs, clpL, clpR;
65 clpL=0;clpR=0;
66 if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
67 p--;
68 *p='\0';
69 if ((seq=seqinfo.Find(s))!=NULL) {
70 GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
71 " so it cannot be added for contig '%s (%d nt)'\n",
72 s, seq->contig->name, seq->contig->len,
73 ctg->name, ctg->len);
74 return NULL;
75 }
76 seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
77 seqinfo.shkAdd(seq->name, seq);
78 ctg->seqs.Add(seq);
79 //parse optional extensions, if any
80 p+=strlen(s); //position p after the seqname
81 char* m=NULL;
82 int segEnd, segRclip,nextsegStart, nextsegLclip, prevSegStart;
83 char segSplice, nextsegSplice;
84 while ((m=strchr(p,':'))!=NULL) {
85 switch (*(m-1)) {
86 case 'G': //segmenting info
87 prevSegStart=soffs+clpL-1;
88 p=m+1; //p to the beginning of G: data
89 //accumulate the total length in lenSegs
90 while (*p>='1' && *p<='9') {
91 segEnd=0;
92 segRclip=0;
93 nextsegStart=0;
94 nextsegLclip=0;
95 segSplice=0;
96 nextsegSplice=0;
97 if (!parseInt(p,segEnd))
98 GError("Error [segment] at LayoutParser for %s at: %s\n",
99 s, m-1);
100 if (*p=='c') {
101 p++;
102 if (!parseInt(p,segRclip))
103 GError("Error [segment] at LayoutParser for %s at: %s\n",
104 s, m-1);
105 }
106 if (*p=='S' || *p=='s') {
107 segSplice=*p; p++;
108 }
109 if (*p!='-')
110 GError("Error [segment] at LayoutParser for %s at: %s\n",
111 s, m-1);
112 else p++;
113 if (!parseInt(p,nextsegStart))
114 GError("Error [segment] at LayoutParser for %s at: %s\n",
115 s, m-1);
116 if (*p=='c') {
117 p++;
118 if (!parseInt(p,nextsegLclip))
119 GError("Error [segment] at LayoutParser for %s at: %s\n",
120 s, m-1);
121 }
122 if (*p=='S' || *p=='s') {
123 nextsegSplice=*p; p++;
124 }
125 seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
126 segSplice, nextsegSplice);
127 prevSegStart=nextsegStart;
128 //
129 if (*p==',') p++;
130 else break;
131 } //while inter-segment parsing
132 break; // 'G:' case
133 case 'L': //clone mates list
134 p=m+1; //p to the beginning of L: data
135 break;
136 case 'D': //difference sequence
137 p=m+1; //p to the beginning of D: data
138 break;
139 case 'S': //actual sequence
140 p=m+1; //p to the beginning of S: data
141 break;
142 default:
143 p=m+1;//next attribute
144 }
145 }
146
147 return seq;
148 }
149
150 char* LytCtgData::readName(char* s, GHash<int>& names) {
151 char* p=strchrs(s, " \t");
152 if (p!=NULL) {
153 char* tmp;
154 char* tmp2;
155 GMALLOC(tmp, (p-s+30)*sizeof(char));
156 strncpy(tmp, s,p-s);
157 tmp[p-s]='\0';
158 GMALLOC(tmp2, (p-s+30)*sizeof(char));
159 strcpy(tmp2, tmp);
160 //make it unique (by simple versioning)
161 int v=0;
162 while (names.hasKey(tmp2)) {
163 v++;
164 sprintf(tmp2, "%s.%d", tmp, v);
165 }
166 name=Gstrdup(tmp2);
167 GFREE(tmp);
168 GFREE(tmp2);
169 names.shkAdd(name, new int(1));
170 p++;
171 }
172 else {
173 GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s);
174 }
175 return p;
176 }
177
178 char* LytSeqInfo::expandGaps(char* s) {
179 if (!hasIntrons()) return s;
180 char* r=NULL;
181 GMALLOC(r, xlen+1);
182 int srcOfs=0;
183 char* rpos=s; //source reader position
184 char* wpos=r; //target writer position
185 for (int i=0;i<numisegs;i++) {
186 int wlen=intersegs[i].nextSegSeq-srcOfs;
187 strncpy(wpos,rpos,wlen);
188 wpos+=wlen;
189 rpos+=wlen;
190 srcOfs=intersegs[i].nextSegSeq;
191 //now expand the gap
192 wlen=intersegs[i].length();
193 memset((void*)wpos, '-',wlen);
194 wpos+=wlen;
195 }
196 //add last exon:
197 strcpy(wpos, rpos);
198 return r;
199 }
200
201
202 bool LayoutParser::parse(fnLytSeq* seqfn) {
203 //read all seqs and their positions from the file
204 //also checks for duplicate seqnames (just in case)
205 if (f!=stdin) seek(0);
206 ctgIDs.Clear();
207 //GHash<int> ctgIDs; //contig IDs, to make them unique!
208 //
209 int ctgpos;
210 numContigs=0;
211 while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
212 numContigs++;
213 LytCtgData* ctgdata=new LytCtgData(ctgpos);
214 char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
215 if (p==NULL) {
216 GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
217 return false;
218 }
219 int ctg_lpos, ctg_rpos, numseqs;
220 //p must be after contig name within linebuf!
221 ctg_lpos=0;ctg_rpos=0;
222 if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
223 GMessage("Error parsing contig len and seq count at:\n%s\n",
224 p);
225 return false;
226 }
227 //ctg_numSeqs=numseqs;
228 ctgdata->numseqs=numseqs;
229 ctgdata->rpos=ctg_rpos;
230 ctgdata->lpos=ctg_lpos;
231 ctgdata->len=ctg_rpos-ctg_lpos+1;
232 ctgdata->offs=ctg_lpos;
233 int ctgidx=contigs.Add(ctgdata);
234 //now look and load all the component sequences
235 loadContig(ctgidx, seqfn, false);
236 } //while lines
237 contigs.setSorted(true);
238 return true;
239 }
240
241 //==============================================
242 /*
243 Load contig data; can be called by parse - and then no fseek is needed and
244 the file position if right after parsing the contig summary data
245 */
246 bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
247 bool forgetCtg=false;
248 char* r=NULL;
249 if (ctgidx>=contigs.Count())
250 GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
251
252 LytCtgData* ctgdata=contigs[ctgidx];
253 if (re_pos && currentContig!=NULL) { //free previous contig data
254 //unless it was a parse() call
255 currentContig->seqs.Clear();
256 seqinfo.Clear();
257 }
258 currentContig=ctgdata;
259 if (re_pos) {
260 seek(ctgdata->fpos); //position right where the contig definition starts
261 r=linebuf->getLine(f,f_pos);//skip the first line
262 if (r==NULL) return false;
263 }
264 if (seqfn!=NULL)
265 forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
266 int ctg_numSeqs=ctgdata->numseqs;
267 int numseqs=0;
268 while ((r=linebuf->getLine(f,f_pos))!=NULL) {
269 if (linebuf->length()<4) continue;
270 if (linebuf->chars()[0]=='>') {
271 linebuf->pushBack();
272 break; //reached next contig
273 }
274 //sequence data parsing
275
276 bool forgetSeq=false;
277 LytSeqInfo* seq=NULL;
278 if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
279 GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
280 return false;
281 }
282 /*
283 // Weird -- why would I MODIFY the given clipping of a sequence?
284 //--
285 bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
286 if (ctg_clipping) {
287 if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
288 seq->left = ctgdata->lpos - seq->offs+1;
289 if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
290 seq->right = ctgdata->rpos-seq->offs+1;
291 } */
292 if (seqfn!=NULL)
293 forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
294 if (forgetSeq) {
295 ctg_numSeqs--;
296 seqinfo.Remove(seq->name);
297 ctgdata->seqs.RemovePtr(seq);
298 }
299 else {
300 numseqs++;
301 }
302 } //while sequences
303 if (forgetCtg) {
304 ctgIDs.Remove(ctgdata->name);
305 contigs.RemovePtr(ctgdata);
306 }
307 if (numseqs!=ctg_numSeqs) {
308 GMessage("Mismatching number of sequences found (%d) for contig '%s' "
309 "(length %d, numseqs %d)\n", numseqs,
310 ctgdata->name, ctgdata->len, ctg_numSeqs);
311 return false;
312 }
313 return true;
314 }
315
316
317 bool LayoutParser::parseContigs() { //load all the file offsets for contigs
318 if (f!=stdin) seek(0);
319 ctgIDs.Clear();
320 //GHash<int> ctgIDs; //contig IDs, to make them unique!
321 //
322 int ctgpos; //locate the first contig line
323 numContigs=0;
324 while ((ctgpos=fskipTo(">"))>=0) {
325 numContigs++;
326 LytCtgData* ctgdata=new LytCtgData(ctgpos);
327 char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
328 if (p==NULL) {
329 GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
330 return false;
331 }
332 int ctg_lpos, ctg_rpos, numseqs;
333 //p must be after contig name within linebuf!
334 ctg_lpos=0;ctg_rpos=0;
335 if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
336 GMessage("Error parsing contig len and seq count at:\n%s\n",
337 p);
338 return false;
339 }
340 ctgdata->len=ctg_rpos-ctg_lpos+1;
341 ctgdata->numseqs=numseqs;
342 ctgdata->lpos=ctg_lpos;
343 ctgdata->rpos=ctg_rpos;
344 ctgdata->offs=ctg_lpos;
345 contigs.Add(ctgdata);
346 //ctgpos=fskipTo(">");
347 } //while lines
348 if (ctgpos==-2) return false;
349 contigs.setSorted(true);
350 return true;
351 }
352
353 //-- compare functions for contigs
354 int ctgByLen(void* p1, void* p2) {
355 int c1=((LytCtgData*)p1)->len;
356 int c2=((LytCtgData*)p2)->len;
357 return (c1>c2)?-1:((c1<c2)?1:0);
358 }
359
360 int ctgByNumSeqs(void* p1, void* p2) {
361 int c1=((LytCtgData*)p1)->numseqs;
362 int c2=((LytCtgData*)p2)->numseqs;
363 return (c1>c2)?-1:((c1<c2)?1:0);
364 }
365
366 void LayoutParser::contigsByName() {
367 contigs.setSorted(true);
368 }
369
370 void LayoutParser::contigsByLen() {
371 contigs.setSorted(&ctgByLen);
372 }
373
374 void LayoutParser::contigsByNumSeqs() {
375 contigs.setSorted(&ctgByNumSeqs);
376 }