ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 3 months ago) by gpertea
File size: 8697 byte(s)
Log Message:
added my gclib source files

Line User Rev File contents
1 gpertea 2 #include "GFaSeqGet.h"
2     #include "gdna.h"
3     #include <ctype.h>
4    
5    
6     void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto) {
7     if (sovl==0) {
8     GFREE(sq);
9     sqstart=sstart;
10     sqlen = (slen==0 ? MAX_FASUBSEQ : slen);
11     GMALLOC(sq, sqlen);
12     return;
13     }
14     //overlap -- copy the overlapping region
15     char* newsq=NULL;
16     GMALLOC(newsq, slen);
17     memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18     GFREE(sq);
19     sq=newsq;
20     sqstart=sstart;
21     sqlen=slen;
22     }
23    
24    
25     void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
26     fh=fopen(fn,"rb");
27     if (fh==NULL) {
28     GError("Error (GFaSeqGet) opening file '%s'\n",fn);
29     }
30     fname=Gstrdup(fn);
31     initialParse(fofs, validate);
32     lastsub=new GSubSeq();
33     }
34    
35     GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
36     if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
37     fh=f;
38     initialParse(fofs, validate);
39     lastsub=new GSubSeq();
40     }
41    
42    
43     void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
44     static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
45     if (fofs!=0) { fseek(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
46     //read the first two lines to determine fasta parameters
47     fseqstart=fofs;
48     int c=getc(fh);
49     fseqstart++;
50     if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
51     while ((c=getc(fh))!=EOF) {
52     fseqstart++;
53     if (c=='\n' || c=='\r') { break; } //end of defline
54     }
55    
56     if (c==EOF) GError(gfa_ERRPARSE);
57     linelen=0;
58     lendlen=0;
59     lendch=0;
60     while ((c=getc(fh))!=EOF) {
61     if (c=='\n' || c=='\r') { //end of line encountered
62     if (linelen>0) { //end of the first "sequence" line
63     lendch=c;
64     lendlen++;
65     break;
66     }
67     else {// another eol char at the end of defline
68     fseqstart++;
69     continue;
70     }
71     }// end-of-line characters
72     linelen++;
73     }
74     //we are at the end of first sequence line
75     while ((c=getc(fh))!=EOF) {
76     if (c=='\n' || c=='\r') lendlen++;
77     else {
78     ungetc(c,fh);
79     break;
80     }
81     }
82     if (c==EOF) return;
83     // -- you don't need to check it all if you're sure it's safe
84     if (checkall) { //validate the rest of the FASTA record
85     int llen=0; //last line length
86     int elen=0; //length of last line ending
87     bool waseol=true;
88     while ((c=getc(fh))!=EOF) {
89     if (c=='>' && waseol) { ungetc(c,fh); break; }
90     if (c=='\n' || c=='\r') {
91     // eol char
92     elen++;
93     if (waseol) continue; //2nd eol char
94     waseol=true;
95     elen=1;
96     continue;
97     }
98     if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
99     //--- on a seq char here:
100     if (waseol) {//beginning of a seq line
101     if (elen && (llen!=linelen || elen!=lendlen))
102     //GError(gfa_ERRPARSE);
103     GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
104     the sequence lines have the same length (except for the last line)");
105     waseol=false;
106     llen=0;
107     elen=0;
108     }
109     llen++;
110     } //while reading chars
111     }// FASTA checking was requested
112     fseek(fh,fseqstart,SEEK_SET);
113     }
114    
115     const char* GFaSeqGet::subseq(uint cstart, int& clen) {
116     //cstart is 1-based genomic coordinate within current fasta sequence
117     if (clen>MAX_FASUBSEQ) {
118     unsigned int maxlen=MAX_FASUBSEQ;
119     GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
120     return NULL;
121     }
122     if (lastsub->sq==NULL || lastsub->sqlen==0) {
123     lastsub->setup(cstart, clen);
124     loadsubseq(cstart, clen);
125     lastsub->sqlen=clen;
126     return (const char*)lastsub->sq;
127     }
128     //allow extension up to MAX_FASUBSEQ
129     uint bstart=lastsub->sqstart;
130     uint bend=lastsub->sqstart+lastsub->sqlen-1;
131     uint cend=cstart+clen-1;
132     int qlen=0; //only the extra len to be allocated/appended/prepended
133     uint qstart=1; //start coordinate of the new seq block of length qlen to be read from file
134     int newlen=0; //the new total length of the buffered sequence lastsub->sq
135     int kovl=0;
136     int kfrom=0;//0-based offsets for copying
137     int kto=0; // a previously read sequence chunk
138     uint newstart=cstart;
139     if (cstart>=bstart && cend<=bend) { //new req included in exising buffer
140     return (const char*) &(lastsub->sq[cstart-bstart]) ;
141     }
142     //extend downward
143     uint newend=GMAX(cend, bend);
144    
145     if (cstart<bstart) {
146     newstart=cstart;
147     newlen=(newend-newstart+1);
148     if (newlen>MAX_FASUBSEQ) {
149     newlen=MAX_FASUBSEQ;
150     newend=cstart+newlen-1;
151     }
152     qstart=cstart;
153     if (newend>bstart) { //overlap
154     if (newend>bend) {// new req is larger, two regions to update
155     kovl=bend-bstart+1;
156     lastsub->setup(newstart, newlen, kovl, 0, bstart-cstart); //this should realloc and copy the kovl subseq
157     qlen=bstart-cstart;
158     loadsubseq(newstart, qlen);
159     clen=qlen+kovl;
160     qlen=newend-bend;
161     loadsubseq(bend+1, qlen);
162     clen+=qlen;
163     lastsub->sqlen=clen;
164     return (const char*)lastsub->sq;
165     }
166     qlen=bstart-cstart;
167     kovl=newend-bstart+1;
168     //kfrom=0;
169     kto=bstart-cstart;
170     }
171     else { // non overlapping new segment
172     qlen=newlen;
173     }
174     } //cstart<bstart
175     else { //possibly extend upwards
176     //bstart<=cstart && cend>bend
177     newstart=bstart;
178     newlen=(newend-newstart+1);
179     if (newlen>MAX_FASUBSEQ) {
180     newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend
181     newlen=MAX_FASUBSEQ;
182     }
183     if (newstart<bend) { //overlap
184     qlen=newend-bend; //to read from file
185     qstart=bend+1;
186     kovl=bend-newstart+1;
187     kfrom=newstart-bstart;
188     //kto=0;
189     }
190     else { //non-overlapping new segment
191     qlen=newlen;
192     qstart=newstart; //read the whole thing anew
193     }
194     }
195    
196     lastsub->setup(newstart, newlen, kovl, kfrom, kto); //this should realloc but copy any overlapping region
197     lastsub->sqlen-=qlen; //appending may result in a premature eof
198     loadsubseq(qstart, qlen); //read the missing chunk, if any
199     lastsub->sqlen+=qlen;
200     return (const char*)(lastsub->sq+(cstart-newstart));
201     }
202    
203     char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
204     if (cstart>cend) { swap(cstart, cend); }
205     int clen=cend-cstart+1;
206     const char* gs=subseq(cstart, clen);
207     if (gs==NULL) return NULL;
208     char* r=NULL;
209     GMALLOC(r,clen+1);
210     r[clen]=0;
211     memcpy((void*)r,(void*)gs, clen);
212     if (revCmpl) reverseComplement(r,clen);
213     if (upCase) {
214     for (int i=0;i<clen;i++)
215     r[i]=toupper(r[i]);
216     }
217     return r;
218     }
219    
220     const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
221     //assumes enough lastsub->sq space allocated previously
222     //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
223     int sofs=cstart-lastsub->sqstart;
224     bool appending=(sofs>0);
225     char* seqp=lastsub->sq+sofs;
226     //find the proper file offset and read the appropriate lines
227     uint seqofs=cstart-1;
228     uint startlno = seqofs/linelen;
229     int lineofs = seqofs % linelen;
230     off_t fstart=fseqstart+startlno * (linelen+lendlen);
231     fstart+=lineofs;
232     fseek(fh, fstart, SEEK_SET);
233     int toread=(int)clen;
234     if (toread==0) toread=MAX_FASUBSEQ; //read max allowed, or to the end of file
235     int actualrlen=0;
236     int sublen=0;
237     if (lineofs>0) { //read the partial first line
238     int reqrlen=linelen-lineofs;
239     if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
240     actualrlen=fread((void*)seqp, 1, reqrlen, fh);
241     //if (appending && actualrlen<reqrlen) { //eof reached prematurely
242     if (actualrlen<reqrlen) { //eof reached prematurely
243     while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
244     clen=actualrlen;
245     sublen+=actualrlen;
246     return (const char*)seqp;
247     }
248     toread-=reqrlen;
249     sublen+=reqrlen;
250     fseek(fh, lendlen, SEEK_CUR);
251     }
252     //read the rest of the lines
253     while (toread>=linelen) {
254     actualrlen=fread((void*)&(seqp[sublen]), 1, linelen, fh);
255     //if (appending && actualrlen<linelen) {
256     if (actualrlen<linelen) {
257     while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
258     sublen+=actualrlen;
259     clen=sublen;
260     return (const char*)seqp;
261     }
262     toread-=actualrlen;
263     sublen+=actualrlen;
264     fseek(fh, lendlen, SEEK_CUR);
265     }
266     // read the last partial line, if any
267     if (toread>0) {
268     actualrlen=fread((void*)&(seqp[sublen]), 1, toread, fh);
269     if (appending || actualrlen<toread) {
270     while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r')
271     actualrlen--;
272     }
273     toread-=actualrlen;
274     sublen+=actualrlen;
275     }
276     //lastsub->sqlen+=sublen;
277     clen=sublen;
278     //GMessage(" sublen = %d\n", sublen);
279     return (const char*)seqp;
280     }