ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/LayoutParser.cpp
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (8 years ago) by gpertea
File size: 11334 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 16 #include "LayoutParser.h"
2    
3     bool LayoutParser::startsWith(const char* s, const char* start, int tlen) {
4     bool found=true;
5     for (int i=0;i<tlen;i++)
6     if (s[i]!=start[i])
7     return false;
8     return found;
9     }
10    
11    
12    
13     bool LayoutParser::open() { //also checks if it looks like a valid LYT file
14     if (f==stdin) return true;
15     if ((f=fopen(fname,"rb"))==NULL) return false;
16     char first1[2];
17     first1[1]='\0';
18     if (fread(first1, 1,1, f)==1) {
19     f_pos=1;
20     return (strcmp(first1, "#")==0 || strcmp(first1, ">")==0);
21     }
22     return false;
23     }
24    
25    
26     void LayoutParser::close() {
27     if (f!=NULL) {
28     fclose(f);
29     f=NULL;
30     }
31     }
32    
33     off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
34     /* reads the file from the current position
35     until the next occurence of a line
36     starting with linestart
37     returns the line in buf and the file offset
38     of the beginning of the line;
39     the file offset is -1 if linestart was not found
40     or -2 if an unwanted line-start came out of order
41     */
42     off_t lastpos=getFilePos();
43     int tlen=strlen(linestart);
44     int nlen=(butnot==NULL) ? 0 : strlen(butnot);
45     while (linebuf->getLine(f, f_pos)!=NULL) {
46     if (nlen>0 && startsWith(linebuf->line(), butnot, nlen)) {
47     GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
48     linebuf->line(), linestart);
49     return -2;
50     }
51     if (startsWith(linebuf->chars(), linestart, tlen)) return lastpos;
52     lastpos=getFilePos();
53     }
54     return -1;
55     }
56    
57     LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
58     LytSeqInfo* seq;
59     //s must be the line with sequence data
60     char* p=strchrs(s," \t");
61     if (p==NULL) return NULL;
62     p++;
63     char c;
64     int slen, soffs, clpL, clpR;
65     clpL=0;clpR=0;
66     if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
67     p--;
68     *p='\0';
69     if ((seq=seqinfo.Find(s))!=NULL) {
70     GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
71     " so it cannot be added for contig '%s (%d nt)'\n",
72     s, seq->contig->name, seq->contig->len,
73     ctg->name, ctg->len);
74     return NULL;
75     }
76     seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
77     seqinfo.shkAdd(seq->name, seq);
78     ctg->seqs.Add(seq);
79     //parse optional extensions, if any
80     p+=strlen(s); //position p after the seqname
81     char* m=NULL;
82     int segEnd, segRclip,nextsegStart, nextsegLclip, prevSegStart;
83     char segSplice, nextsegSplice;
84     while ((m=strchr(p,':'))!=NULL) {
85     switch (*(m-1)) {
86     case 'G': //segmenting info
87     prevSegStart=soffs+clpL-1;
88     p=m+1; //p to the beginning of G: data
89     //accumulate the total length in lenSegs
90     while (*p>='1' && *p<='9') {
91     segEnd=0;
92     segRclip=0;
93     nextsegStart=0;
94     nextsegLclip=0;
95     segSplice=0;
96     nextsegSplice=0;
97     if (!parseInt(p,segEnd))
98     GError("Error [segment] at LayoutParser for %s at: %s\n",
99     s, m-1);
100     if (*p=='c') {
101     p++;
102     if (!parseInt(p,segRclip))
103     GError("Error [segment] at LayoutParser for %s at: %s\n",
104     s, m-1);
105     }
106     if (*p=='S' || *p=='s') {
107     segSplice=*p; p++;
108     }
109     if (*p!='-')
110     GError("Error [segment] at LayoutParser for %s at: %s\n",
111     s, m-1);
112     else p++;
113     if (!parseInt(p,nextsegStart))
114     GError("Error [segment] at LayoutParser for %s at: %s\n",
115     s, m-1);
116     if (*p=='c') {
117     p++;
118     if (!parseInt(p,nextsegLclip))
119     GError("Error [segment] at LayoutParser for %s at: %s\n",
120     s, m-1);
121     }
122     if (*p=='S' || *p=='s') {
123     nextsegSplice=*p; p++;
124     }
125     seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
126     segSplice, nextsegSplice);
127     prevSegStart=nextsegStart;
128     //
129     if (*p==',') p++;
130     else break;
131     } //while inter-segment parsing
132     break; // 'G:' case
133     case 'L': //clone mates list
134     p=m+1; //p to the beginning of L: data
135     break;
136     case 'D': //difference sequence
137     p=m+1; //p to the beginning of D: data
138     break;
139     case 'S': //actual sequence
140     p=m+1; //p to the beginning of S: data
141     break;
142     default:
143     p=m+1;//next attribute
144     }
145     }
146    
147     return seq;
148     }
149    
150     char* LytCtgData::readName(char* s, GHash<int>& names) {
151     char* p=strchrs(s, " \t");
152     if (p!=NULL) {
153     char* tmp;
154     char* tmp2;
155     GMALLOC(tmp, (p-s+30)*sizeof(char));
156     strncpy(tmp, s,p-s);
157     tmp[p-s]='\0';
158     GMALLOC(tmp2, (p-s+30)*sizeof(char));
159     strcpy(tmp2, tmp);
160     //make it unique (by simple versioning)
161     int v=0;
162     while (names.hasKey(tmp2)) {
163     v++;
164     sprintf(tmp2, "%s.%d", tmp, v);
165     }
166     name=Gstrdup(tmp2);
167     GFREE(tmp);
168     GFREE(tmp2);
169     names.shkAdd(name, new int(1));
170     p++;
171     }
172     else {
173     GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s);
174     }
175     return p;
176     }
177    
178     char* LytSeqInfo::expandGaps(char* s) {
179     if (!hasIntrons()) return s;
180     char* r=NULL;
181     GMALLOC(r, xlen+1);
182     int srcOfs=0;
183     char* rpos=s; //source reader position
184     char* wpos=r; //target writer position
185     for (int i=0;i<numisegs;i++) {
186     int wlen=intersegs[i].nextSegSeq-srcOfs;
187     strncpy(wpos,rpos,wlen);
188     wpos+=wlen;
189     rpos+=wlen;
190     srcOfs=intersegs[i].nextSegSeq;
191     //now expand the gap
192     wlen=intersegs[i].length();
193     memset((void*)wpos, '-',wlen);
194     wpos+=wlen;
195     }
196     //add last exon:
197     strcpy(wpos, rpos);
198     return r;
199     }
200    
201    
202     bool LayoutParser::parse(fnLytSeq* seqfn) {
203     //read all seqs and their positions from the file
204     //also checks for duplicate seqnames (just in case)
205     if (f!=stdin) seek(0);
206     ctgIDs.Clear();
207     //GHash<int> ctgIDs; //contig IDs, to make them unique!
208     //
209     int ctgpos;
210     numContigs=0;
211     while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
212     numContigs++;
213     LytCtgData* ctgdata=new LytCtgData(ctgpos);
214     char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
215     if (p==NULL) {
216     GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
217     return false;
218     }
219     int ctg_lpos, ctg_rpos, numseqs;
220     //p must be after contig name within linebuf!
221     ctg_lpos=0;ctg_rpos=0;
222     if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
223     GMessage("Error parsing contig len and seq count at:\n%s\n",
224     p);
225     return false;
226     }
227     //ctg_numSeqs=numseqs;
228     ctgdata->numseqs=numseqs;
229     ctgdata->rpos=ctg_rpos;
230     ctgdata->lpos=ctg_lpos;
231     ctgdata->len=ctg_rpos-ctg_lpos+1;
232     ctgdata->offs=ctg_lpos;
233     int ctgidx=contigs.Add(ctgdata);
234     //now look and load all the component sequences
235     loadContig(ctgidx, seqfn, false);
236     } //while lines
237     contigs.setSorted(true);
238     return true;
239     }
240    
241     //==============================================
242     /*
243     Load contig data; can be called by parse - and then no fseek is needed and
244     the file position if right after parsing the contig summary data
245     */
246     bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
247     bool forgetCtg=false;
248     char* r=NULL;
249     if (ctgidx>=contigs.Count())
250     GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
251    
252     LytCtgData* ctgdata=contigs[ctgidx];
253     if (re_pos && currentContig!=NULL) { //free previous contig data
254     //unless it was a parse() call
255     currentContig->seqs.Clear();
256     seqinfo.Clear();
257     }
258     currentContig=ctgdata;
259     if (re_pos) {
260     seek(ctgdata->fpos); //position right where the contig definition starts
261     r=linebuf->getLine(f,f_pos);//skip the first line
262     if (r==NULL) return false;
263     }
264     if (seqfn!=NULL)
265     forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
266     int ctg_numSeqs=ctgdata->numseqs;
267     int numseqs=0;
268     while ((r=linebuf->getLine(f,f_pos))!=NULL) {
269     if (linebuf->length()<4) continue;
270     if (linebuf->chars()[0]=='>') {
271     linebuf->pushBack();
272     break; //reached next contig
273     }
274     //sequence data parsing
275    
276     bool forgetSeq=false;
277     LytSeqInfo* seq=NULL;
278     if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
279     GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
280     return false;
281     }
282     /*
283     // Weird -- why would I MODIFY the given clipping of a sequence?
284     //--
285     bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
286     if (ctg_clipping) {
287     if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
288     seq->left = ctgdata->lpos - seq->offs+1;
289     if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
290     seq->right = ctgdata->rpos-seq->offs+1;
291     } */
292     if (seqfn!=NULL)
293     forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
294     if (forgetSeq) {
295     ctg_numSeqs--;
296     seqinfo.Remove(seq->name);
297     ctgdata->seqs.RemovePtr(seq);
298     }
299     else {
300     numseqs++;
301     }
302     } //while sequences
303     if (forgetCtg) {
304     ctgIDs.Remove(ctgdata->name);
305     contigs.RemovePtr(ctgdata);
306     }
307     if (numseqs!=ctg_numSeqs) {
308     GMessage("Mismatching number of sequences found (%d) for contig '%s' "
309     "(length %d, numseqs %d)\n", numseqs,
310     ctgdata->name, ctgdata->len, ctg_numSeqs);
311     return false;
312     }
313     return true;
314     }
315    
316    
317     bool LayoutParser::parseContigs() { //load all the file offsets for contigs
318     if (f!=stdin) seek(0);
319     ctgIDs.Clear();
320     //GHash<int> ctgIDs; //contig IDs, to make them unique!
321     //
322     int ctgpos; //locate the first contig line
323     numContigs=0;
324     while ((ctgpos=fskipTo(">"))>=0) {
325     numContigs++;
326     LytCtgData* ctgdata=new LytCtgData(ctgpos);
327     char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
328     if (p==NULL) {
329     GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
330     return false;
331     }
332     int ctg_lpos, ctg_rpos, numseqs;
333     //p must be after contig name within linebuf!
334     ctg_lpos=0;ctg_rpos=0;
335     if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
336     GMessage("Error parsing contig len and seq count at:\n%s\n",
337     p);
338     return false;
339     }
340     ctgdata->len=ctg_rpos-ctg_lpos+1;
341     ctgdata->numseqs=numseqs;
342     ctgdata->lpos=ctg_lpos;
343     ctgdata->rpos=ctg_rpos;
344     ctgdata->offs=ctg_lpos;
345     contigs.Add(ctgdata);
346     //ctgpos=fskipTo(">");
347     } //while lines
348     if (ctgpos==-2) return false;
349     contigs.setSorted(true);
350     return true;
351     }
352    
353     //-- compare functions for contigs
354     int ctgByLen(void* p1, void* p2) {
355     int c1=((LytCtgData*)p1)->len;
356     int c2=((LytCtgData*)p2)->len;
357     return (c1>c2)?-1:((c1<c2)?1:0);
358     }
359    
360     int ctgByNumSeqs(void* p1, void* p2) {
361     int c1=((LytCtgData*)p1)->numseqs;
362     int c2=((LytCtgData*)p2)->numseqs;
363     return (c1>c2)?-1:((c1<c2)?1:0);
364     }
365    
366     void LayoutParser::contigsByName() {
367     contigs.setSorted(true);
368     }
369    
370     void LayoutParser::contigsByLen() {
371     contigs.setSorted(&ctgByLen);
372     }
373    
374     void LayoutParser::contigsByNumSeqs() {
375     contigs.setSorted(&ctgByNumSeqs);
376     }