ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/LayoutParser.cpp
(Generate patch)
# Line 1 | Line 1
1 < #include "LayoutParser.h"
2 <
3 < bool LayoutParser::startsWith(const char* s, const char* start, int tlen) {
4 < bool found=true;
5 < for (int i=0;i<tlen;i++)
6 <  if (s[i]!=start[i])
7 <     return false;
8 < return found;
9 < }
10 <
11 <
12 <
13 < bool LayoutParser::open() { //also checks if it looks like a valid LYT file
14 <   if (f==stdin) return true;
15 <   if ((f=fopen(fname,"rb"))==NULL) return false;
16 <     char first1[2];
17 <     first1[1]='\0';
18 <     if (fread(first1, 1,1, f)==1) {
19 <       f_pos=1;
20 <       return (strcmp(first1, "#")==0 || strcmp(first1, ">")==0);
21 <       }
22 <   return false;
23 <   }
24 <
25 <
26 < void LayoutParser::close() {
27 <   if (f!=NULL) {
28 <        fclose(f);
29 <        f=NULL;
30 <        }
31 <   }
32 <
33 < off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
34 <  /* reads the file from the current position
35 <  until the next occurence of a line
36 <  starting with linestart
37 <  returns the line in buf and the file offset
38 <  of the beginning of the line;
39 <  the file offset is -1 if linestart was not found
40 <  or -2 if an unwanted line-start came out of order
41 <  */
42 <    off_t lastpos=getFilePos();
43 <    int tlen=strlen(linestart);
44 <    int nlen=(butnot==NULL) ? 0 : strlen(butnot);
45 <    while (linebuf->getLine(f, f_pos)!=NULL) {
46 <     if (nlen>0 && startsWith(linebuf->line(), butnot, nlen)) {
47 <        GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
48 <          linebuf->line(), linestart);
49 <        return -2;
50 <        }
51 <     if (startsWith(linebuf->chars(), linestart, tlen)) return lastpos;
52 <     lastpos=getFilePos();
53 <     }
54 < return -1;
55 < }
56 <
57 < LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
58 < LytSeqInfo* seq;
59 < //s must be the line with sequence data
60 < char* p=strchrs(s," \t");
61 < if (p==NULL) return NULL;
62 < p++;
63 < char c;
64 < int slen, soffs, clpL, clpR;
65 < clpL=0;clpR=0;
66 < if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
67 < p--;
68 < *p='\0';
69 < if ((seq=seqinfo.Find(s))!=NULL) {
70 <   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
71 <      " so it cannot be added for contig '%s (%d nt)'\n",
72 <     s, seq->contig->name, seq->contig->len,
73 <     ctg->name, ctg->len);
74 <   return NULL;
75 <   }
76 < seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
77 < seqinfo.shkAdd(seq->name, seq);
78 < ctg->seqs.Add(seq);
79 < //parse optional extensions, if any
80 < p+=strlen(s); //position p after the seqname
81 < char* m=NULL;
82 < int segEnd, segRclip,nextsegStart, nextsegLclip, prevSegStart;
83 < char segSplice, nextsegSplice;
84 < while ((m=strchr(p,':'))!=NULL) {
85 <  switch (*(m-1)) {
86 <    case 'G': //segmenting info
87 <       prevSegStart=soffs+clpL-1;
88 <       p=m+1;  //p to the beginning of G: data
89 <       //accumulate the total length in lenSegs
90 <       while (*p>='1' && *p<='9') {
91 <         segEnd=0;
92 <         segRclip=0;
93 <         nextsegStart=0;
94 <         nextsegLclip=0;
95 <         segSplice=0;
96 <         nextsegSplice=0;
97 <         if (!parseInt(p,segEnd))
98 <            GError("Error [segment] at LayoutParser for %s at: %s\n",
99 <                 s, m-1);
100 <         if (*p=='c') {
101 <            p++;
102 <            if (!parseInt(p,segRclip))
103 <              GError("Error [segment] at LayoutParser for %s at: %s\n",
104 <                      s, m-1);
105 <            }
106 <         if (*p=='S' || *p=='s') {
107 <            segSplice=*p; p++;
108 <            }
109 <         if (*p!='-')
110 <                GError("Error [segment] at LayoutParser for %s at: %s\n",
111 <                      s, m-1);
112 <            else p++;
113 <         if (!parseInt(p,nextsegStart))
114 <            GError("Error [segment] at LayoutParser for %s at: %s\n",
115 <                 s, m-1);
116 <         if (*p=='c') {
117 <            p++;
118 <            if (!parseInt(p,nextsegLclip))
119 <              GError("Error [segment] at LayoutParser for %s at: %s\n",
120 <                      s, m-1);
121 <            }
122 <         if (*p=='S' || *p=='s') {
123 <            nextsegSplice=*p; p++;
124 <            }
125 <         seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
126 <                                         segSplice, nextsegSplice);
127 <         prevSegStart=nextsegStart;
128 <         //
129 <         if (*p==',') p++;
130 <             else break;
131 <         } //while inter-segment parsing
132 <       break; // 'G:' case
133 <    case 'L': //clone mates list
134 <       p=m+1; //p to the beginning of L: data
135 <       break;
136 <    case 'D': //difference sequence
137 <       p=m+1; //p to the beginning of D: data
138 <       break;
139 <    case 'S': //actual sequence
140 <       p=m+1; //p to the beginning of S: data
141 <       break;
142 <    default:
143 <       p=m+1;//next attribute
144 <    }
145 <  }
146 <
147 < return seq;
148 < }
149 <
150 < char* LytCtgData::readName(char* s, GHash<int>& names) {
151 <   char* p=strchrs(s, " \t");
152 <   if (p!=NULL) {
153 <       char* tmp;
154 <       char* tmp2;
155 <       GMALLOC(tmp, (p-s+30)*sizeof(char));
156 <       strncpy(tmp, s,p-s);
157 <       tmp[p-s]='\0';
158 <       GMALLOC(tmp2, (p-s+30)*sizeof(char));
159 <       strcpy(tmp2, tmp);
160 <       //make it unique (by simple versioning)
161 <       int v=0;
162 <       while (names.hasKey(tmp2)) {
163 <         v++;
164 <         sprintf(tmp2, "%s.%d", tmp, v);
165 <         }
166 <       name=Gstrdup(tmp2);
167 <       GFREE(tmp);
168 <       GFREE(tmp2);
169 <       names.shkAdd(name, new int(1));
170 <       p++;
171 <       }
172 <      else {
173 <       GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s);
174 <       }
175 <     return p;
176 <     }
177 <
178 < char* LytSeqInfo::expandGaps(char* s) {
179 < if (!hasIntrons()) return s;
180 < char* r=NULL;
181 < GMALLOC(r, xlen+1);
182 < int srcOfs=0;
183 < char* rpos=s; //source reader position
184 < char* wpos=r; //target writer position
185 < for (int i=0;i<numisegs;i++) {
186 <    int wlen=intersegs[i].nextSegSeq-srcOfs;
187 <    strncpy(wpos,rpos,wlen);
188 <    wpos+=wlen;
189 <    rpos+=wlen;
190 <    srcOfs=intersegs[i].nextSegSeq;
191 <    //now expand the gap
192 <    wlen=intersegs[i].length();
193 <    memset((void*)wpos, '-',wlen);
194 <    wpos+=wlen;
195 <   }
196 < //add last exon:
197 < strcpy(wpos, rpos);
198 < return r;
199 < }
200 <
201 <
202 < bool LayoutParser::parse(fnLytSeq* seqfn) {
203 <  //read all seqs and their positions from the file
204 <  //also checks for duplicate seqnames (just in case)
205 <  if (f!=stdin) seek(0);
206 <  ctgIDs.Clear();
207 <  //GHash<int> ctgIDs; //contig IDs, to make them unique!
208 <  //
209 <  int ctgpos;
210 <  numContigs=0;
211 <  while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
212 <    numContigs++;
213 <    LytCtgData* ctgdata=new LytCtgData(ctgpos);
214 <    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
215 <    if (p==NULL) {
216 <       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
217 <       return false;
218 <       }
219 <    int ctg_lpos, ctg_rpos, numseqs;
220 <    //p must be after contig name within linebuf!
221 <    ctg_lpos=0;ctg_rpos=0;
222 <    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
223 <      GMessage("Error parsing contig len and seq count at:\n%s\n",
224 <         p);
225 <      return false;
226 <      }
227 <    //ctg_numSeqs=numseqs;
228 <    ctgdata->numseqs=numseqs;
229 <    ctgdata->rpos=ctg_rpos;
230 <    ctgdata->lpos=ctg_lpos;
231 <    ctgdata->len=ctg_rpos-ctg_lpos+1;
232 <    ctgdata->offs=ctg_lpos;
233 <    int ctgidx=contigs.Add(ctgdata);
234 <    //now look and load all the component sequences
235 <    loadContig(ctgidx, seqfn, false);
236 <    } //while lines
237 <  contigs.setSorted(true);
238 <  return true;
239 <  }
240 <
241 < //==============================================
242 < /*
243 < Load contig data; can be called by parse - and then no fseek is needed and
244 < the file position if right after parsing the contig summary data
245 < */
246 < bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
247 <    bool forgetCtg=false;
248 <    char* r=NULL;
249 <    if (ctgidx>=contigs.Count())
250 <      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
251 <
252 <    LytCtgData* ctgdata=contigs[ctgidx];
253 <    if (re_pos && currentContig!=NULL) { //free previous contig data
254 <                                          //unless it was a parse() call
255 <      currentContig->seqs.Clear();
256 <      seqinfo.Clear();
257 <      }
258 <    currentContig=ctgdata;
259 <    if (re_pos) {
260 <       seek(ctgdata->fpos); //position right where the contig definition starts
261 <       r=linebuf->getLine(f,f_pos);//skip the first line
262 <       if (r==NULL) return false;
263 <       }
264 <    if (seqfn!=NULL)
265 <       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
266 <    int ctg_numSeqs=ctgdata->numseqs;
267 <    int numseqs=0;
268 <    while ((r=linebuf->getLine(f,f_pos))!=NULL) {
269 <       if (linebuf->length()<4) continue;
270 <       if (linebuf->chars()[0]=='>') {
271 <            linebuf->pushBack();
272 <            break; //reached next contig
273 <            }
274 <       //sequence data parsing
275 <
276 <       bool forgetSeq=false;
277 <       LytSeqInfo* seq=NULL;
278 <       if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
279 <         GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
280 <         return false;
281 <         }
282 <        /*
283 <        // Weird -- why would I MODIFY the given clipping of a sequence?
284 <        //--
285 <        bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
286 <        if (ctg_clipping) {
287 <          if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
288 <             seq->left = ctgdata->lpos - seq->offs+1;
289 <            if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
290 <             seq->right = ctgdata->rpos-seq->offs+1;
291 <          } */
292 <        if (seqfn!=NULL)
293 <          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
294 <        if (forgetSeq) {
295 <            ctg_numSeqs--;
296 <            seqinfo.Remove(seq->name);
297 <            ctgdata->seqs.RemovePtr(seq);
298 <            }
299 <          else {
300 <            numseqs++;
301 <            }
302 <       } //while sequences
303 <     if (forgetCtg) {
304 <      ctgIDs.Remove(ctgdata->name);
305 <      contigs.RemovePtr(ctgdata);
306 <      }
307 <    if (numseqs!=ctg_numSeqs) {
308 <       GMessage("Mismatching number of sequences found (%d) for contig '%s' "
309 <         "(length %d, numseqs %d)\n", numseqs,
310 <                ctgdata->name, ctgdata->len, ctg_numSeqs);
311 <       return false;
312 <       }
313 < return true;
314 < }
315 <
316 <
317 < bool LayoutParser::parseContigs() { //load all the file offsets for contigs
318 <  if (f!=stdin) seek(0);
319 <  ctgIDs.Clear();
320 <  //GHash<int> ctgIDs; //contig IDs, to make them unique!
321 <  //
322 <  int ctgpos; //locate the first contig line
323 <  numContigs=0;
324 <  while ((ctgpos=fskipTo(">"))>=0) {
325 <    numContigs++;
326 <    LytCtgData* ctgdata=new LytCtgData(ctgpos);
327 <    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
328 <    if (p==NULL) {
329 <       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
330 <       return false;
331 <       }
332 <    int ctg_lpos, ctg_rpos, numseqs;
333 <    //p must be after contig name within linebuf!
334 <    ctg_lpos=0;ctg_rpos=0;
335 <    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
336 <      GMessage("Error parsing contig len and seq count at:\n%s\n",
337 <         p);
338 <      return false;
339 <      }
340 <    ctgdata->len=ctg_rpos-ctg_lpos+1;
341 <    ctgdata->numseqs=numseqs;
342 <    ctgdata->lpos=ctg_lpos;
343 <    ctgdata->rpos=ctg_rpos;
344 <    ctgdata->offs=ctg_lpos;
345 <    contigs.Add(ctgdata);
346 <    //ctgpos=fskipTo(">");
347 <    } //while lines
348 <  if (ctgpos==-2) return false;
349 <  contigs.setSorted(true);
350 <  return true;
351 < }
352 <
353 < //-- compare functions for contigs
354 < int ctgByLen(void* p1, void* p2) {
355 < int c1=((LytCtgData*)p1)->len;
356 < int c2=((LytCtgData*)p2)->len;
357 < return (c1>c2)?-1:((c1<c2)?1:0);
358 < }
359 <
360 < int ctgByNumSeqs(void* p1, void* p2) {
361 < int c1=((LytCtgData*)p1)->numseqs;
362 < int c2=((LytCtgData*)p2)->numseqs;
363 < return (c1>c2)?-1:((c1<c2)?1:0);
364 < }
365 <
366 < void LayoutParser::contigsByName() {
367 < contigs.setSorted(true);
368 < }
369 <
370 < void LayoutParser::contigsByLen() {
371 < contigs.setSorted(&ctgByLen);
372 < }
373 <
374 < void LayoutParser::contigsByNumSeqs() {
375 < contigs.setSorted(&ctgByNumSeqs);
376 < }
1 > #include "LayoutParser.h"
2 >
3 > bool LayoutParser::startsWith(const char* s, const char* start, int tlen) {
4 > bool found=true;
5 > for (int i=0;i<tlen;i++)
6 >  if (s[i]!=start[i])
7 >     return false;
8 > return found;
9 > }
10 >
11 >
12 >
13 > bool LayoutParser::open() { //also checks if it looks like a valid LYT file
14 >   if (f==stdin) return true;
15 >   if ((f=fopen(fname,"rb"))==NULL) return false;
16 >     char first1[2];
17 >     first1[1]='\0';
18 >     if (fread(first1, 1,1, f)==1) {
19 >       f_pos=1;
20 >       return (strcmp(first1, "#")==0 || strcmp(first1, ">")==0);
21 >       }
22 >   return false;
23 >   }
24 >
25 >
26 > void LayoutParser::close() {
27 >   if (f!=NULL) {
28 >        fclose(f);
29 >        f=NULL;
30 >        }
31 >   }
32 >
33 > off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
34 >  /* reads the file from the current position
35 >  until the next occurence of a line
36 >  starting with linestart
37 >  returns the line in buf and the file offset
38 >  of the beginning of the line;
39 >  the file offset is -1 if linestart was not found
40 >  or -2 if an unwanted line-start came out of order
41 >  */
42 >    off_t lastpos=getFilePos();
43 >    int tlen=strlen(linestart);
44 >    int nlen=(butnot==NULL) ? 0 : strlen(butnot);
45 >    while (linebuf->getLine(f, f_pos)!=NULL) {
46 >     if (nlen>0 && startsWith(linebuf->line(), butnot, nlen)) {
47 >        GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
48 >          linebuf->line(), linestart);
49 >        return -2;
50 >        }
51 >     if (startsWith(linebuf->chars(), linestart, tlen)) return lastpos;
52 >     lastpos=getFilePos();
53 >     }
54 > return -1;
55 > }
56 >
57 > LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
58 > LytSeqInfo* seq;
59 > //s must be the line with sequence data
60 > char* p=strchrs(s," \t");
61 > if (p==NULL) return NULL;
62 > p++;
63 > char c;
64 > int slen, soffs, clpL, clpR;
65 > clpL=0;clpR=0;
66 > if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
67 > p--;
68 > *p='\0';
69 > if ((seq=seqinfo.Find(s))!=NULL) {
70 >   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
71 >      " so it cannot be added for contig '%s (%d nt)'\n",
72 >     s, seq->contig->name, seq->contig->len,
73 >     ctg->name, ctg->len);
74 >   return NULL;
75 >   }
76 > seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
77 > seqinfo.shkAdd(seq->name, seq);
78 > ctg->seqs.Add(seq);
79 > //parse optional extensions, if any
80 > p+=strlen(s); //position p after the seqname
81 > char* m=NULL;
82 > int segEnd, segRclip,nextsegStart, nextsegLclip, prevSegStart;
83 > char segSplice, nextsegSplice;
84 > while ((m=strchr(p,':'))!=NULL) {
85 >  switch (*(m-1)) {
86 >    case 'G': //segmenting info
87 >       prevSegStart=soffs+clpL-1;
88 >       p=m+1;  //p to the beginning of G: data
89 >       //accumulate the total length in lenSegs
90 >       while (*p>='1' && *p<='9') {
91 >         segEnd=0;
92 >         segRclip=0;
93 >         nextsegStart=0;
94 >         nextsegLclip=0;
95 >         segSplice=0;
96 >         nextsegSplice=0;
97 >         if (!parseInt(p,segEnd))
98 >            GError("Error [segment] at LayoutParser for %s at: %s\n",
99 >                 s, m-1);
100 >         if (*p=='c') {
101 >            p++;
102 >            if (!parseInt(p,segRclip))
103 >              GError("Error [segment] at LayoutParser for %s at: %s\n",
104 >                      s, m-1);
105 >            }
106 >         if (*p=='S' || *p=='s') {
107 >            segSplice=*p; p++;
108 >            }
109 >         if (*p!='-')
110 >                GError("Error [segment] at LayoutParser for %s at: %s\n",
111 >                      s, m-1);
112 >            else p++;
113 >         if (!parseInt(p,nextsegStart))
114 >            GError("Error [segment] at LayoutParser for %s at: %s\n",
115 >                 s, m-1);
116 >         if (*p=='c') {
117 >            p++;
118 >            if (!parseInt(p,nextsegLclip))
119 >              GError("Error [segment] at LayoutParser for %s at: %s\n",
120 >                      s, m-1);
121 >            }
122 >         if (*p=='S' || *p=='s') {
123 >            nextsegSplice=*p; p++;
124 >            }
125 >         seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
126 >                                         segSplice, nextsegSplice);
127 >         prevSegStart=nextsegStart;
128 >         //
129 >         if (*p==',') p++;
130 >             else break;
131 >         } //while inter-segment parsing
132 >       break; // 'G:' case
133 >    case 'L': //clone mates list
134 >       p=m+1; //p to the beginning of L: data
135 >       break;
136 >    case 'D': //difference sequence
137 >       p=m+1; //p to the beginning of D: data
138 >       break;
139 >    case 'S': //actual sequence
140 >       p=m+1; //p to the beginning of S: data
141 >       break;
142 >    default:
143 >       p=m+1;//next attribute
144 >    }
145 >  }
146 >
147 > return seq;
148 > }
149 >
150 > char* LytCtgData::readName(char* s, GHash<int>& names) {
151 >   char* p=strchrs(s, " \t");
152 >   if (p!=NULL) {
153 >       char* tmp;
154 >       char* tmp2;
155 >       GMALLOC(tmp, (p-s+30)*sizeof(char));
156 >       strncpy(tmp, s,p-s);
157 >       tmp[p-s]='\0';
158 >       GMALLOC(tmp2, (p-s+30)*sizeof(char));
159 >       strcpy(tmp2, tmp);
160 >       //make it unique (by simple versioning)
161 >       int v=0;
162 >       while (names.hasKey(tmp2)) {
163 >         v++;
164 >         sprintf(tmp2, "%s.%d", tmp, v);
165 >         }
166 >       name=Gstrdup(tmp2);
167 >       GFREE(tmp);
168 >       GFREE(tmp2);
169 >       names.shkAdd(name, new int(1));
170 >       p++;
171 >       }
172 >      else {
173 >       GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s);
174 >       }
175 >     return p;
176 >     }
177 >
178 > char* LytSeqInfo::expandGaps(char* s) {
179 > if (!hasIntrons()) return s;
180 > char* r=NULL;
181 > GMALLOC(r, xlen+1);
182 > int srcOfs=0;
183 > char* rpos=s; //source reader position
184 > char* wpos=r; //target writer position
185 > for (int i=0;i<numisegs;i++) {
186 >    int wlen=intersegs[i].nextSegSeq-srcOfs;
187 >    strncpy(wpos,rpos,wlen);
188 >    wpos+=wlen;
189 >    rpos+=wlen;
190 >    srcOfs=intersegs[i].nextSegSeq;
191 >    //now expand the gap
192 >    wlen=intersegs[i].length();
193 >    memset((void*)wpos, '-',wlen);
194 >    wpos+=wlen;
195 >   }
196 > //add last exon:
197 > strcpy(wpos, rpos);
198 > return r;
199 > }
200 >
201 >
202 > bool LayoutParser::parse(fnLytSeq* seqfn) {
203 >  //read all seqs and their positions from the file
204 >  //also checks for duplicate seqnames (just in case)
205 >  if (f!=stdin) seek(0);
206 >  ctgIDs.Clear();
207 >  //GHash<int> ctgIDs; //contig IDs, to make them unique!
208 >  //
209 >  int ctgpos;
210 >  numContigs=0;
211 >  while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
212 >    numContigs++;
213 >    LytCtgData* ctgdata=new LytCtgData(ctgpos);
214 >    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
215 >    if (p==NULL) {
216 >       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
217 >       return false;
218 >       }
219 >    int ctg_lpos, ctg_rpos, numseqs;
220 >    //p must be after contig name within linebuf!
221 >    ctg_lpos=0;ctg_rpos=0;
222 >    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
223 >      GMessage("Error parsing contig len and seq count at:\n%s\n",
224 >         p);
225 >      return false;
226 >      }
227 >    //ctg_numSeqs=numseqs;
228 >    ctgdata->numseqs=numseqs;
229 >    ctgdata->rpos=ctg_rpos;
230 >    ctgdata->lpos=ctg_lpos;
231 >    ctgdata->len=ctg_rpos-ctg_lpos+1;
232 >    ctgdata->offs=ctg_lpos;
233 >    int ctgidx=contigs.Add(ctgdata);
234 >    //now look and load all the component sequences
235 >    loadContig(ctgidx, seqfn, false);
236 >    } //while lines
237 >  contigs.setSorted(true);
238 >  return true;
239 >  }
240 >
241 > //==============================================
242 > /*
243 > Load contig data; can be called by parse - and then no fseek is needed and
244 > the file position if right after parsing the contig summary data
245 > */
246 > bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
247 >    bool forgetCtg=false;
248 >    char* r=NULL;
249 >    if (ctgidx>=contigs.Count())
250 >      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
251 >
252 >    LytCtgData* ctgdata=contigs[ctgidx];
253 >    if (re_pos && currentContig!=NULL) { //free previous contig data
254 >                                          //unless it was a parse() call
255 >      currentContig->seqs.Clear();
256 >      seqinfo.Clear();
257 >      }
258 >    currentContig=ctgdata;
259 >    if (re_pos) {
260 >       seek(ctgdata->fpos); //position right where the contig definition starts
261 >       r=linebuf->getLine(f,f_pos);//skip the first line
262 >       if (r==NULL) return false;
263 >       }
264 >    if (seqfn!=NULL)
265 >       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
266 >    int ctg_numSeqs=ctgdata->numseqs;
267 >    int numseqs=0;
268 >    while ((r=linebuf->getLine(f,f_pos))!=NULL) {
269 >       if (linebuf->length()<4) continue;
270 >       if (linebuf->chars()[0]=='>') {
271 >            linebuf->pushBack();
272 >            break; //reached next contig
273 >            }
274 >       //sequence data parsing
275 >
276 >       bool forgetSeq=false;
277 >       LytSeqInfo* seq=NULL;
278 >       if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
279 >         GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
280 >         return false;
281 >         }
282 >        /*
283 >        // Weird -- why would I MODIFY the given clipping of a sequence?
284 >        //--
285 >        bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
286 >        if (ctg_clipping) {
287 >          if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
288 >             seq->left = ctgdata->lpos - seq->offs+1;
289 >            if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
290 >             seq->right = ctgdata->rpos-seq->offs+1;
291 >          } */
292 >        if (seqfn!=NULL)
293 >          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
294 >        if (forgetSeq) {
295 >            ctg_numSeqs--;
296 >            seqinfo.Remove(seq->name);
297 >            ctgdata->seqs.RemovePtr(seq);
298 >            }
299 >          else {
300 >            numseqs++;
301 >            }
302 >       } //while sequences
303 >     if (forgetCtg) {
304 >      ctgIDs.Remove(ctgdata->name);
305 >      contigs.RemovePtr(ctgdata);
306 >      }
307 >    if (numseqs!=ctg_numSeqs) {
308 >       GMessage("Mismatching number of sequences found (%d) for contig '%s' "
309 >         "(length %d, numseqs %d)\n", numseqs,
310 >                ctgdata->name, ctgdata->len, ctg_numSeqs);
311 >       return false;
312 >       }
313 > return true;
314 > }
315 >
316 >
317 > bool LayoutParser::parseContigs() { //load all the file offsets for contigs
318 >  if (f!=stdin) seek(0);
319 >  ctgIDs.Clear();
320 >  //GHash<int> ctgIDs; //contig IDs, to make them unique!
321 >  //
322 >  int ctgpos; //locate the first contig line
323 >  numContigs=0;
324 >  while ((ctgpos=fskipTo(">"))>=0) {
325 >    numContigs++;
326 >    LytCtgData* ctgdata=new LytCtgData(ctgpos);
327 >    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
328 >    if (p==NULL) {
329 >       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
330 >       return false;
331 >       }
332 >    int ctg_lpos, ctg_rpos, numseqs;
333 >    //p must be after contig name within linebuf!
334 >    ctg_lpos=0;ctg_rpos=0;
335 >    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
336 >      GMessage("Error parsing contig len and seq count at:\n%s\n",
337 >         p);
338 >      return false;
339 >      }
340 >    ctgdata->len=ctg_rpos-ctg_lpos+1;
341 >    ctgdata->numseqs=numseqs;
342 >    ctgdata->lpos=ctg_lpos;
343 >    ctgdata->rpos=ctg_rpos;
344 >    ctgdata->offs=ctg_lpos;
345 >    contigs.Add(ctgdata);
346 >    //ctgpos=fskipTo(">");
347 >    } //while lines
348 >  if (ctgpos==-2) return false;
349 >  contigs.setSorted(true);
350 >  return true;
351 > }
352 >
353 > //-- compare functions for contigs
354 > int ctgByLen(void* p1, void* p2) {
355 > int c1=((LytCtgData*)p1)->len;
356 > int c2=((LytCtgData*)p2)->len;
357 > return (c1>c2)?-1:((c1<c2)?1:0);
358 > }
359 >
360 > int ctgByNumSeqs(void* p1, void* p2) {
361 > int c1=((LytCtgData*)p1)->numseqs;
362 > int c2=((LytCtgData*)p2)->numseqs;
363 > return (c1>c2)?-1:((c1<c2)?1:0);
364 > }
365 >
366 > void LayoutParser::contigsByName() {
367 > contigs.setSorted(true);
368 > }
369 >
370 > void LayoutParser::contigsByLen() {
371 > contigs.setSorted(&ctgByLen);
372 > }
373 >
374 > void LayoutParser::contigsByNumSeqs() {
375 > contigs.setSorted(&ctgByNumSeqs);
376 > }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines