ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 7 months ago) by gpertea
File size: 8697 byte(s)
Log Message:
added my gclib source files

Line File contents
1 #include "GFaSeqGet.h"
2 #include "gdna.h"
3 #include <ctype.h>
4
5
6 void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto) {
7 if (sovl==0) {
8 GFREE(sq);
9 sqstart=sstart;
10 sqlen = (slen==0 ? MAX_FASUBSEQ : slen);
11 GMALLOC(sq, sqlen);
12 return;
13 }
14 //overlap -- copy the overlapping region
15 char* newsq=NULL;
16 GMALLOC(newsq, slen);
17 memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18 GFREE(sq);
19 sq=newsq;
20 sqstart=sstart;
21 sqlen=slen;
22 }
23
24
25 void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
26 fh=fopen(fn,"rb");
27 if (fh==NULL) {
28 GError("Error (GFaSeqGet) opening file '%s'\n",fn);
29 }
30 fname=Gstrdup(fn);
31 initialParse(fofs, validate);
32 lastsub=new GSubSeq();
33 }
34
35 GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
36 if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
37 fh=f;
38 initialParse(fofs, validate);
39 lastsub=new GSubSeq();
40 }
41
42
43 void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
44 static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
45 if (fofs!=0) { fseek(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
46 //read the first two lines to determine fasta parameters
47 fseqstart=fofs;
48 int c=getc(fh);
49 fseqstart++;
50 if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
51 while ((c=getc(fh))!=EOF) {
52 fseqstart++;
53 if (c=='\n' || c=='\r') { break; } //end of defline
54 }
55
56 if (c==EOF) GError(gfa_ERRPARSE);
57 linelen=0;
58 lendlen=0;
59 lendch=0;
60 while ((c=getc(fh))!=EOF) {
61 if (c=='\n' || c=='\r') { //end of line encountered
62 if (linelen>0) { //end of the first "sequence" line
63 lendch=c;
64 lendlen++;
65 break;
66 }
67 else {// another eol char at the end of defline
68 fseqstart++;
69 continue;
70 }
71 }// end-of-line characters
72 linelen++;
73 }
74 //we are at the end of first sequence line
75 while ((c=getc(fh))!=EOF) {
76 if (c=='\n' || c=='\r') lendlen++;
77 else {
78 ungetc(c,fh);
79 break;
80 }
81 }
82 if (c==EOF) return;
83 // -- you don't need to check it all if you're sure it's safe
84 if (checkall) { //validate the rest of the FASTA record
85 int llen=0; //last line length
86 int elen=0; //length of last line ending
87 bool waseol=true;
88 while ((c=getc(fh))!=EOF) {
89 if (c=='>' && waseol) { ungetc(c,fh); break; }
90 if (c=='\n' || c=='\r') {
91 // eol char
92 elen++;
93 if (waseol) continue; //2nd eol char
94 waseol=true;
95 elen=1;
96 continue;
97 }
98 if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
99 //--- on a seq char here:
100 if (waseol) {//beginning of a seq line
101 if (elen && (llen!=linelen || elen!=lendlen))
102 //GError(gfa_ERRPARSE);
103 GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
104 the sequence lines have the same length (except for the last line)");
105 waseol=false;
106 llen=0;
107 elen=0;
108 }
109 llen++;
110 } //while reading chars
111 }// FASTA checking was requested
112 fseek(fh,fseqstart,SEEK_SET);
113 }
114
115 const char* GFaSeqGet::subseq(uint cstart, int& clen) {
116 //cstart is 1-based genomic coordinate within current fasta sequence
117 if (clen>MAX_FASUBSEQ) {
118 unsigned int maxlen=MAX_FASUBSEQ;
119 GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
120 return NULL;
121 }
122 if (lastsub->sq==NULL || lastsub->sqlen==0) {
123 lastsub->setup(cstart, clen);
124 loadsubseq(cstart, clen);
125 lastsub->sqlen=clen;
126 return (const char*)lastsub->sq;
127 }
128 //allow extension up to MAX_FASUBSEQ
129 uint bstart=lastsub->sqstart;
130 uint bend=lastsub->sqstart+lastsub->sqlen-1;
131 uint cend=cstart+clen-1;
132 int qlen=0; //only the extra len to be allocated/appended/prepended
133 uint qstart=1; //start coordinate of the new seq block of length qlen to be read from file
134 int newlen=0; //the new total length of the buffered sequence lastsub->sq
135 int kovl=0;
136 int kfrom=0;//0-based offsets for copying
137 int kto=0; // a previously read sequence chunk
138 uint newstart=cstart;
139 if (cstart>=bstart && cend<=bend) { //new req included in exising buffer
140 return (const char*) &(lastsub->sq[cstart-bstart]) ;
141 }
142 //extend downward
143 uint newend=GMAX(cend, bend);
144
145 if (cstart<bstart) {
146 newstart=cstart;
147 newlen=(newend-newstart+1);
148 if (newlen>MAX_FASUBSEQ) {
149 newlen=MAX_FASUBSEQ;
150 newend=cstart+newlen-1;
151 }
152 qstart=cstart;
153 if (newend>bstart) { //overlap
154 if (newend>bend) {// new req is larger, two regions to update
155 kovl=bend-bstart+1;
156 lastsub->setup(newstart, newlen, kovl, 0, bstart-cstart); //this should realloc and copy the kovl subseq
157 qlen=bstart-cstart;
158 loadsubseq(newstart, qlen);
159 clen=qlen+kovl;
160 qlen=newend-bend;
161 loadsubseq(bend+1, qlen);
162 clen+=qlen;
163 lastsub->sqlen=clen;
164 return (const char*)lastsub->sq;
165 }
166 qlen=bstart-cstart;
167 kovl=newend-bstart+1;
168 //kfrom=0;
169 kto=bstart-cstart;
170 }
171 else { // non overlapping new segment
172 qlen=newlen;
173 }
174 } //cstart<bstart
175 else { //possibly extend upwards
176 //bstart<=cstart && cend>bend
177 newstart=bstart;
178 newlen=(newend-newstart+1);
179 if (newlen>MAX_FASUBSEQ) {
180 newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend
181 newlen=MAX_FASUBSEQ;
182 }
183 if (newstart<bend) { //overlap
184 qlen=newend-bend; //to read from file
185 qstart=bend+1;
186 kovl=bend-newstart+1;
187 kfrom=newstart-bstart;
188 //kto=0;
189 }
190 else { //non-overlapping new segment
191 qlen=newlen;
192 qstart=newstart; //read the whole thing anew
193 }
194 }
195
196 lastsub->setup(newstart, newlen, kovl, kfrom, kto); //this should realloc but copy any overlapping region
197 lastsub->sqlen-=qlen; //appending may result in a premature eof
198 loadsubseq(qstart, qlen); //read the missing chunk, if any
199 lastsub->sqlen+=qlen;
200 return (const char*)(lastsub->sq+(cstart-newstart));
201 }
202
203 char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
204 if (cstart>cend) { swap(cstart, cend); }
205 int clen=cend-cstart+1;
206 const char* gs=subseq(cstart, clen);
207 if (gs==NULL) return NULL;
208 char* r=NULL;
209 GMALLOC(r,clen+1);
210 r[clen]=0;
211 memcpy((void*)r,(void*)gs, clen);
212 if (revCmpl) reverseComplement(r,clen);
213 if (upCase) {
214 for (int i=0;i<clen;i++)
215 r[i]=toupper(r[i]);
216 }
217 return r;
218 }
219
220 const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
221 //assumes enough lastsub->sq space allocated previously
222 //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
223 int sofs=cstart-lastsub->sqstart;
224 bool appending=(sofs>0);
225 char* seqp=lastsub->sq+sofs;
226 //find the proper file offset and read the appropriate lines
227 uint seqofs=cstart-1;
228 uint startlno = seqofs/linelen;
229 int lineofs = seqofs % linelen;
230 off_t fstart=fseqstart+startlno * (linelen+lendlen);
231 fstart+=lineofs;
232 fseek(fh, fstart, SEEK_SET);
233 int toread=(int)clen;
234 if (toread==0) toread=MAX_FASUBSEQ; //read max allowed, or to the end of file
235 int actualrlen=0;
236 int sublen=0;
237 if (lineofs>0) { //read the partial first line
238 int reqrlen=linelen-lineofs;
239 if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
240 actualrlen=fread((void*)seqp, 1, reqrlen, fh);
241 //if (appending && actualrlen<reqrlen) { //eof reached prematurely
242 if (actualrlen<reqrlen) { //eof reached prematurely
243 while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
244 clen=actualrlen;
245 sublen+=actualrlen;
246 return (const char*)seqp;
247 }
248 toread-=reqrlen;
249 sublen+=reqrlen;
250 fseek(fh, lendlen, SEEK_CUR);
251 }
252 //read the rest of the lines
253 while (toread>=linelen) {
254 actualrlen=fread((void*)&(seqp[sublen]), 1, linelen, fh);
255 //if (appending && actualrlen<linelen) {
256 if (actualrlen<linelen) {
257 while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
258 sublen+=actualrlen;
259 clen=sublen;
260 return (const char*)seqp;
261 }
262 toread-=actualrlen;
263 sublen+=actualrlen;
264 fseek(fh, lendlen, SEEK_CUR);
265 }
266 // read the last partial line, if any
267 if (toread>0) {
268 actualrlen=fread((void*)&(seqp[sublen]), 1, toread, fh);
269 if (appending || actualrlen<toread) {
270 while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r')
271 actualrlen--;
272 }
273 toread-=actualrlen;
274 sublen+=actualrlen;
275 }
276 //lastsub->sqlen+=sublen;
277 clen=sublen;
278 //GMessage(" sublen = %d\n", sublen);
279 return (const char*)seqp;
280 }