ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "gdna.h"
3   #include <ctype.h>
4  
5 <
6 < void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto) {
5 > void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6       if (sovl==0) {
7         GFREE(sq);
8         sqstart=sstart;
9 <       sqlen = (slen==0 ? MAX_FASUBSEQ : slen);
9 >       uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10 >       sqlen = (slen==0 ? max_len : slen);
11         GMALLOC(sq, sqlen);
12         return;
13         }
# Line 21 | Line 21
21    sqlen=slen;
22   }
23  
24
24   void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25   fh=fopen(fn,"rb");
26   if (fh==NULL) {
# Line 32 | Line 31
31   lastsub=new GSubSeq();
32   }
33  
34 + GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35 + //for GFastaIndex use mostly -- the important difference is that
36 + //the file offset is to the sequence, not to the defline
37 +  fh=fopen(faname,"rb");
38 +  if (fh==NULL) {
39 +    GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40 +    }
41 +  fname=Gstrdup(faname);
42 +  line_len=l_len;
43 +  line_blen=l_blen;
44 +  seq_len=seqlen;
45 +  if (line_blen<line_len)
46 +       GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47 +              line_len, line_blen);
48 +  fseqstart=fseqofs;
49 +  lastsub=new GSubSeq();
50 + }
51 +
52   GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53   if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
54 + seq_len=0;
55   fh=f;
56   initialParse(fofs, validate);
57   lastsub=new GSubSeq();
58   }
59  
42
60   void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
61   static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
62 < if (fofs!=0) { fseek(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
62 > if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
63   //read the first two lines to determine fasta parameters
64   fseqstart=fofs;
65   int c=getc(fh);
# Line 54 | Line 71
71     }
72  
73   if (c==EOF) GError(gfa_ERRPARSE);
74 < linelen=0;
75 < lendlen=0;
59 < lendch=0;
74 > line_len=0;
75 > int lendlen=0;
76   while ((c=getc(fh))!=EOF) {
77    if (c=='\n' || c=='\r') { //end of line encountered
78 <     if (linelen>0) { //end of the first "sequence" line
63 <        lendch=c;
78 >     if (line_len>0) { //end of the first "sequence" line
79          lendlen++;
80          break;
81          }
82 <      else {// another eol char at the end of defline
82 >      else {// another EoL char at the end of defline
83          fseqstart++;
84          continue;
85          }
86       }// end-of-line characters
87 <  linelen++;
87 >  line_len++;
88    }
89   //we are at the end of first sequence line
90   while ((c=getc(fh))!=EOF) {
# Line 79 | Line 94
94         break;
95         }
96     }
97 + line_blen=line_len+lendlen;
98   if (c==EOF) return;
99   // -- you don't need to check it all if you're sure it's safe
100   if (checkall) { //validate the rest of the FASTA record
# Line 98 | Line 114
114       if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
115       //--- on a seq char here:
116       if (waseol) {//beginning of a seq line
117 <       if (elen && (llen!=linelen || elen!=lendlen))
117 >       if (elen && (llen!=line_len || elen!=lendlen))
118             //GError(gfa_ERRPARSE);
119           GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
120    the sequence lines have the same length (except for the last line)");
# Line 109 | Line 125
125       llen++;
126       } //while reading chars
127     }// FASTA checking was requested
128 < fseek(fh,fseqstart,SEEK_SET);
128 > fseeko(fh,fseqstart,SEEK_SET);
129   }
130  
131   const char* GFaSeqGet::subseq(uint cstart, int& clen) {
132    //cstart is 1-based genomic coordinate within current fasta sequence
133 <  if (clen>MAX_FASUBSEQ) {
134 <    unsigned int maxlen=MAX_FASUBSEQ;
133 >   int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
134 >   //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
135 >  if (clen>maxlen) {
136      GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
137      return NULL;
138      }
139 +  if (seq_len>0 && clen+cstart-1>seq_len) {
140 +     GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
141 +     }
142    if (lastsub->sq==NULL || lastsub->sqlen==0) {
143 <    lastsub->setup(cstart, clen);
143 >    lastsub->setup(cstart, clen, 0,0,0,seq_len);
144      loadsubseq(cstart, clen);
145      lastsub->sqlen=clen;
146      return (const char*)lastsub->sq;
# Line 130 | Line 150
150    uint bend=lastsub->sqstart+lastsub->sqlen-1;
151    uint cend=cstart+clen-1;
152    int qlen=0; //only the extra len to be allocated/appended/prepended
153 <  uint qstart=1; //start coordinate of the new seq block of length qlen to be read from file
153 >  uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
154    int newlen=0; //the new total length of the buffered sequence lastsub->sq
155    int kovl=0;
156 <  int kfrom=0;//0-based offsets for copying
157 <  int kto=0;  //  a previously read sequence chunk
156 >  int czfrom=0;//0-based offsets for copying a previously read sequence chunk
157 >  int czto=0;
158    uint newstart=cstart;
159 <  if (cstart>=bstart && cend<=bend) { //new req included in exising buffer
159 >  if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
160       return (const char*) &(lastsub->sq[cstart-bstart]) ;
161      }
162    //extend downward
163    uint newend=GMAX(cend, bend);
164 <
145 <  if (cstart<bstart) {
164 >  if (cstart<bstart) { //requested start < old buffer start
165      newstart=cstart;
166      newlen=(newend-newstart+1);
167      if (newlen>MAX_FASUBSEQ) {
168         newlen=MAX_FASUBSEQ;
169 <       newend=cstart+newlen-1;
169 >       newend=cstart+newlen-1; //keep newstart, set newend
170         }
171 <    qstart=cstart;
171 >    qlen=bstart-cstart;
172      if (newend>bstart) { //overlap
173 <       if (newend>bend) {// new req is larger, two regions to update
173 >       if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
174           kovl=bend-bstart+1;
175 <         lastsub->setup(newstart, newlen, kovl, 0, bstart-cstart); //this should realloc and copy the kovl subseq
175 >         czfrom=0;
176 >         czto=bstart-cstart;
177 >         lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
178           qlen=bstart-cstart;
179           loadsubseq(newstart, qlen);
159         clen=qlen+kovl;
180           qlen=newend-bend;
181 +         int toread=qlen;
182           loadsubseq(bend+1, qlen);
183 <         clen+=qlen;
183 >         clen-=(toread-qlen);
184           lastsub->sqlen=clen;
185           return (const char*)lastsub->sq;
186           }
187 <       qlen=bstart-cstart;
187 >        //newend<=bend
188         kovl=newend-bstart+1;
168       //kfrom=0;
169       kto=bstart-cstart;
189         }
190 <      else { // non overlapping new segment
191 <       qlen=newlen;
190 >     else { //no overlap with previous buffer
191 >       if (newend>bend) kovl=bend-bstart+1;
192 >                   else kovl=newend-bstart+1;
193         }
194 +     qlen=bstart-cstart;
195 +     czfrom=0;
196 +     czto=qlen;
197      } //cstart<bstart
198 <   else { //possibly extend upwards
176 <    //bstart<=cstart && cend>bend
198 >   else { //cstart>=bstart, possibly extend upwards
199      newstart=bstart;
200      newlen=(newend-newstart+1);
201      if (newlen>MAX_FASUBSEQ) {
202 <       newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend
202 >       newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
203         newlen=MAX_FASUBSEQ;
204 <       }
205 <    if (newstart<bend) { //overlap
206 <       qlen=newend-bend; //to read from file
207 <       qstart=bend+1;
208 <       kovl=bend-newstart+1;
209 <       kfrom=newstart-bstart;
210 <       //kto=0;
211 <       }
212 <      else { //non-overlapping new segment
213 <       qlen=newlen;
214 <       qstart=newstart; //read the whole thing anew
215 <       }
204 >       if (newstart<=bend) { //overlap with old buffer
205 >          kovl=bend-newstart+1;
206 >          czfrom=newstart-bstart;
207 >          czto=0;
208 >          }
209 >       else { //not overlapping old buffer
210 >         kovl=0;
211 >         }
212 >       } //newstart reassigned
213 >    else { //we can extend the buffer to include the old one
214 >      qlen=newend-bend; //how much to read from file
215 >      qstart=bend+1;
216 >      kovl=bend-bstart+1;
217 >      czfrom=0;
218 >      czto=0;
219 >      }
220      }
221 <
196 <  lastsub->setup(newstart, newlen, kovl, kfrom, kto); //this should realloc but copy any overlapping region
221 >  lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
222    lastsub->sqlen-=qlen; //appending may result in a premature eof
223 +  int toread=qlen;
224    loadsubseq(qstart, qlen); //read the missing chunk, if any
225 +  clen-=(toread-qlen);
226    lastsub->sqlen+=qlen;
227    return (const char*)(lastsub->sq+(cstart-newstart));
228   }
229  
230   char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
231 <  if (cstart>cend) { swap(cstart, cend); }
231 >  if (cstart>cend) { Gswap(cstart, cend); }
232    int clen=cend-cstart+1;
233    const char* gs=subseq(cstart, clen);
234    if (gs==NULL) return NULL;
# Line 221 | Line 248
248    //assumes enough lastsub->sq space allocated previously
249    //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
250    int sofs=cstart-lastsub->sqstart;
251 <  bool appending=(sofs>0);
251 >  int lendlen=line_blen-line_len;
252    char* seqp=lastsub->sq+sofs;
253    //find the proper file offset and read the appropriate lines
254    uint seqofs=cstart-1;
255 <  uint startlno = seqofs/linelen;
256 <  int lineofs = seqofs % linelen;
257 <  off_t fstart=fseqstart+startlno * (linelen+lendlen);
255 >  uint startlno = seqofs/line_len;
256 >  int lineofs = seqofs % line_len;
257 >  off_t fstart=fseqstart + (startlno*line_blen);
258    fstart+=lineofs;
259 <  fseek(fh, fstart, SEEK_SET);
260 <  int toread=(int)clen;
261 <  if (toread==0) toread=MAX_FASUBSEQ; //read max allowed, or to the end of file
259 >  
260 >  fseeko(fh, fstart, SEEK_SET);
261 >  int toread=clen;
262 >  int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
263 >  if (toread==0) toread=maxlen; //read max allowed, or to the end of file
264    int actualrlen=0;
265    int sublen=0;
266    if (lineofs>0) { //read the partial first line
267 <    int reqrlen=linelen-lineofs;
267 >    int reqrlen=line_len-lineofs;
268      if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
269      actualrlen=fread((void*)seqp, 1, reqrlen, fh);
241    //if (appending && actualrlen<reqrlen) { //eof reached prematurely
270      if (actualrlen<reqrlen) { //eof reached prematurely
271        while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
272 +      //check for new sequences in between
273        clen=actualrlen;
274        sublen+=actualrlen;
275        return (const char*)seqp;
276        }
277      toread-=reqrlen;
278      sublen+=reqrlen;
279 <    fseek(fh, lendlen, SEEK_CUR);
279 >    fseeko(fh, lendlen, SEEK_CUR);
280      }
281    //read the rest of the lines
282 <  while (toread>=linelen) {
283 <    actualrlen=fread((void*)&(seqp[sublen]), 1, linelen, fh);
284 <    //if (appending && actualrlen<linelen) {
285 <    if (actualrlen<linelen) {
286 <      while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
282 >  while (toread>=line_len) {
283 >    char* rseqp=&(seqp[sublen]);
284 >    actualrlen=fread((void*)rseqp, 1, line_len, fh);
285 >    /*
286 >    char dbuf[256];dbuf[255]=0;
287 >    strncpy(dbuf,rseqp, actualrlen);
288 >    dbuf[actualrlen]=0;
289 >    GMessage("<<<read line: %s\n",dbuf);
290 >    */
291 >    if (actualrlen<line_len) {
292 >      while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
293        sublen+=actualrlen;
294        clen=sublen;
295        return (const char*)seqp;
296        }
297      toread-=actualrlen;
298      sublen+=actualrlen;
299 <    fseek(fh, lendlen, SEEK_CUR);
299 >    fseeko(fh, lendlen, SEEK_CUR);
300      }
301    // read the last partial line, if any
302    if (toread>0) {
303 <    actualrlen=fread((void*)&(seqp[sublen]), 1, toread, fh);
304 <    if (appending || actualrlen<toread) {
305 <      while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r')
303 >    char* rseqp=&(seqp[sublen]);
304 >    actualrlen=fread((void*)rseqp, 1, toread, fh);
305 >    if (actualrlen<toread) {
306 >      while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
307            actualrlen--;
308        }
273    toread-=actualrlen;
309      sublen+=actualrlen;
310      }
311    //lastsub->sqlen+=sublen;
312    clen=sublen;
313 <  //GMessage(" sublen = %d\n", sublen);
313 >  
314    return (const char*)seqp;
315    }
316  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines