ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
Revision: 144
Committed: Thu Dec 22 19:15:46 2011 UTC (7 years, 6 months ago) by gpertea
File size: 10196 byte(s)
Log Message:
added GBitVec.h; removed operator> requirement for sorted vectors; swap() is now template Gswap()

Line User Rev File contents
1 gpertea 2 #include "GFaSeqGet.h"
2     #include "gdna.h"
3     #include <ctype.h>
4    
5 gpertea 16 void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6 gpertea 2 if (sovl==0) {
7     GFREE(sq);
8     sqstart=sstart;
9 gpertea 16 uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10     sqlen = (slen==0 ? max_len : slen);
11 gpertea 2 GMALLOC(sq, sqlen);
12     return;
13     }
14     //overlap -- copy the overlapping region
15     char* newsq=NULL;
16     GMALLOC(newsq, slen);
17     memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18     GFREE(sq);
19     sq=newsq;
20     sqstart=sstart;
21     sqlen=slen;
22     }
23    
24     void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25     fh=fopen(fn,"rb");
26     if (fh==NULL) {
27     GError("Error (GFaSeqGet) opening file '%s'\n",fn);
28     }
29     fname=Gstrdup(fn);
30     initialParse(fofs, validate);
31     lastsub=new GSubSeq();
32     }
33    
34 gpertea 16 GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35     //for GFastaIndex use mostly -- the important difference is that
36     //the file offset is to the sequence, not to the defline
37     fh=fopen(faname,"rb");
38     if (fh==NULL) {
39     GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40     }
41     fname=Gstrdup(faname);
42     line_len=l_len;
43     line_blen=l_blen;
44     seq_len=seqlen;
45     if (line_blen<line_len)
46     GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47     line_len, line_blen);
48     fseqstart=fseqofs;
49     lastsub=new GSubSeq();
50     }
51    
52 gpertea 2 GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53     if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
54 gpertea 16 seq_len=0;
55 gpertea 2 fh=f;
56     initialParse(fofs, validate);
57     lastsub=new GSubSeq();
58     }
59    
60     void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
61     static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
62 gpertea 16 if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
63 gpertea 2 //read the first two lines to determine fasta parameters
64     fseqstart=fofs;
65     int c=getc(fh);
66     fseqstart++;
67     if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
68     while ((c=getc(fh))!=EOF) {
69     fseqstart++;
70     if (c=='\n' || c=='\r') { break; } //end of defline
71     }
72    
73     if (c==EOF) GError(gfa_ERRPARSE);
74 gpertea 16 line_len=0;
75     int lendlen=0;
76 gpertea 2 while ((c=getc(fh))!=EOF) {
77     if (c=='\n' || c=='\r') { //end of line encountered
78 gpertea 16 if (line_len>0) { //end of the first "sequence" line
79 gpertea 2 lendlen++;
80     break;
81     }
82 gpertea 16 else {// another EoL char at the end of defline
83 gpertea 2 fseqstart++;
84     continue;
85     }
86     }// end-of-line characters
87 gpertea 16 line_len++;
88 gpertea 2 }
89     //we are at the end of first sequence line
90     while ((c=getc(fh))!=EOF) {
91     if (c=='\n' || c=='\r') lendlen++;
92     else {
93     ungetc(c,fh);
94     break;
95     }
96     }
97 gpertea 16 line_blen=line_len+lendlen;
98 gpertea 2 if (c==EOF) return;
99     // -- you don't need to check it all if you're sure it's safe
100     if (checkall) { //validate the rest of the FASTA record
101     int llen=0; //last line length
102     int elen=0; //length of last line ending
103     bool waseol=true;
104     while ((c=getc(fh))!=EOF) {
105     if (c=='>' && waseol) { ungetc(c,fh); break; }
106     if (c=='\n' || c=='\r') {
107     // eol char
108     elen++;
109     if (waseol) continue; //2nd eol char
110     waseol=true;
111     elen=1;
112     continue;
113     }
114     if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
115     //--- on a seq char here:
116     if (waseol) {//beginning of a seq line
117 gpertea 16 if (elen && (llen!=line_len || elen!=lendlen))
118 gpertea 2 //GError(gfa_ERRPARSE);
119     GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
120     the sequence lines have the same length (except for the last line)");
121     waseol=false;
122     llen=0;
123     elen=0;
124     }
125     llen++;
126     } //while reading chars
127     }// FASTA checking was requested
128 gpertea 16 fseeko(fh,fseqstart,SEEK_SET);
129 gpertea 2 }
130    
131     const char* GFaSeqGet::subseq(uint cstart, int& clen) {
132     //cstart is 1-based genomic coordinate within current fasta sequence
133 gpertea 16 int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
134     //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
135     if (clen>maxlen) {
136 gpertea 2 GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
137     return NULL;
138     }
139 gpertea 16 if (seq_len>0 && clen+cstart-1>seq_len) {
140     GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
141     }
142 gpertea 2 if (lastsub->sq==NULL || lastsub->sqlen==0) {
143 gpertea 16 lastsub->setup(cstart, clen, 0,0,0,seq_len);
144 gpertea 2 loadsubseq(cstart, clen);
145     lastsub->sqlen=clen;
146     return (const char*)lastsub->sq;
147     }
148     //allow extension up to MAX_FASUBSEQ
149     uint bstart=lastsub->sqstart;
150     uint bend=lastsub->sqstart+lastsub->sqlen-1;
151     uint cend=cstart+clen-1;
152     int qlen=0; //only the extra len to be allocated/appended/prepended
153 gpertea 16 uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
154 gpertea 2 int newlen=0; //the new total length of the buffered sequence lastsub->sq
155     int kovl=0;
156 gpertea 16 int czfrom=0;//0-based offsets for copying a previously read sequence chunk
157     int czto=0;
158 gpertea 2 uint newstart=cstart;
159 gpertea 16 if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
160 gpertea 2 return (const char*) &(lastsub->sq[cstart-bstart]) ;
161     }
162     //extend downward
163     uint newend=GMAX(cend, bend);
164 gpertea 16 if (cstart<bstart) { //requested start < old buffer start
165 gpertea 2 newstart=cstart;
166     newlen=(newend-newstart+1);
167     if (newlen>MAX_FASUBSEQ) {
168     newlen=MAX_FASUBSEQ;
169 gpertea 16 newend=cstart+newlen-1; //keep newstart, set newend
170 gpertea 2 }
171 gpertea 16 qlen=bstart-cstart;
172 gpertea 2 if (newend>bstart) { //overlap
173 gpertea 16 if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
174 gpertea 2 kovl=bend-bstart+1;
175 gpertea 16 czfrom=0;
176     czto=bstart-cstart;
177     lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
178 gpertea 2 qlen=bstart-cstart;
179     loadsubseq(newstart, qlen);
180     qlen=newend-bend;
181 gpertea 16 int toread=qlen;
182 gpertea 2 loadsubseq(bend+1, qlen);
183 gpertea 16 clen-=(toread-qlen);
184 gpertea 2 lastsub->sqlen=clen;
185     return (const char*)lastsub->sq;
186     }
187 gpertea 16 //newend<=bend
188 gpertea 2 kovl=newend-bstart+1;
189     }
190 gpertea 16 else { //no overlap with previous buffer
191     if (newend>bend) kovl=bend-bstart+1;
192     else kovl=newend-bstart+1;
193 gpertea 2 }
194 gpertea 16 qlen=bstart-cstart;
195     czfrom=0;
196     czto=qlen;
197 gpertea 2 } //cstart<bstart
198 gpertea 16 else { //cstart>=bstart, possibly extend upwards
199 gpertea 2 newstart=bstart;
200     newlen=(newend-newstart+1);
201     if (newlen>MAX_FASUBSEQ) {
202 gpertea 16 newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
203 gpertea 2 newlen=MAX_FASUBSEQ;
204 gpertea 16 if (newstart<=bend) { //overlap with old buffer
205     kovl=bend-newstart+1;
206     czfrom=newstart-bstart;
207     czto=0;
208     }
209     else { //not overlapping old buffer
210     kovl=0;
211     }
212     } //newstart reassigned
213     else { //we can extend the buffer to include the old one
214     qlen=newend-bend; //how much to read from file
215     qstart=bend+1;
216     kovl=bend-bstart+1;
217     czfrom=0;
218     czto=0;
219     }
220 gpertea 2 }
221 gpertea 16 lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
222 gpertea 2 lastsub->sqlen-=qlen; //appending may result in a premature eof
223 gpertea 16 int toread=qlen;
224 gpertea 2 loadsubseq(qstart, qlen); //read the missing chunk, if any
225 gpertea 16 clen-=(toread-qlen);
226 gpertea 2 lastsub->sqlen+=qlen;
227     return (const char*)(lastsub->sq+(cstart-newstart));
228     }
229    
230     char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
231 gpertea 144 if (cstart>cend) { Gswap(cstart, cend); }
232 gpertea 2 int clen=cend-cstart+1;
233     const char* gs=subseq(cstart, clen);
234     if (gs==NULL) return NULL;
235     char* r=NULL;
236     GMALLOC(r,clen+1);
237     r[clen]=0;
238     memcpy((void*)r,(void*)gs, clen);
239     if (revCmpl) reverseComplement(r,clen);
240     if (upCase) {
241     for (int i=0;i<clen;i++)
242     r[i]=toupper(r[i]);
243     }
244     return r;
245     }
246    
247     const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
248     //assumes enough lastsub->sq space allocated previously
249     //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
250     int sofs=cstart-lastsub->sqstart;
251 gpertea 16 int lendlen=line_blen-line_len;
252 gpertea 2 char* seqp=lastsub->sq+sofs;
253     //find the proper file offset and read the appropriate lines
254     uint seqofs=cstart-1;
255 gpertea 16 uint startlno = seqofs/line_len;
256     int lineofs = seqofs % line_len;
257     off_t fstart=fseqstart + (startlno*line_blen);
258 gpertea 2 fstart+=lineofs;
259 gpertea 16
260     fseeko(fh, fstart, SEEK_SET);
261     int toread=clen;
262     int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
263     if (toread==0) toread=maxlen; //read max allowed, or to the end of file
264 gpertea 2 int actualrlen=0;
265     int sublen=0;
266     if (lineofs>0) { //read the partial first line
267 gpertea 16 int reqrlen=line_len-lineofs;
268 gpertea 2 if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
269     actualrlen=fread((void*)seqp, 1, reqrlen, fh);
270     if (actualrlen<reqrlen) { //eof reached prematurely
271     while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
272 gpertea 16 //check for new sequences in between
273 gpertea 2 clen=actualrlen;
274     sublen+=actualrlen;
275     return (const char*)seqp;
276     }
277     toread-=reqrlen;
278     sublen+=reqrlen;
279 gpertea 16 fseeko(fh, lendlen, SEEK_CUR);
280 gpertea 2 }
281     //read the rest of the lines
282 gpertea 16 while (toread>=line_len) {
283     char* rseqp=&(seqp[sublen]);
284     actualrlen=fread((void*)rseqp, 1, line_len, fh);
285     /*
286     char dbuf[256];dbuf[255]=0;
287     strncpy(dbuf,rseqp, actualrlen);
288     dbuf[actualrlen]=0;
289     GMessage("<<<read line: %s\n",dbuf);
290     */
291     if (actualrlen<line_len) {
292     while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
293 gpertea 2 sublen+=actualrlen;
294     clen=sublen;
295     return (const char*)seqp;
296     }
297     toread-=actualrlen;
298     sublen+=actualrlen;
299 gpertea 16 fseeko(fh, lendlen, SEEK_CUR);
300 gpertea 2 }
301     // read the last partial line, if any
302     if (toread>0) {
303 gpertea 16 char* rseqp=&(seqp[sublen]);
304     actualrlen=fread((void*)rseqp, 1, toread, fh);
305     if (actualrlen<toread) {
306     while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
307 gpertea 2 actualrlen--;
308     }
309     sublen+=actualrlen;
310     }
311     //lastsub->sqlen+=sublen;
312     clen=sublen;
313 gpertea 16
314 gpertea 2 return (const char*)seqp;
315     }