ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (8 years, 3 months ago) by gpertea
File size: 3485 byte(s)
Log Message:
sync with local source

Line User Rev File contents
1 gpertea 2 #ifndef GFASEQGET_H
2     #define GFASEQGET_H
3    
4     #include "GList.hh"
5    
6     #define MAX_FASUBSEQ 0x20000000
7     //max 512MB sequence data held in memory at a time
8    
9     class GSubSeq {
10     public:
11     uint sqstart; //1-based coord of subseq start on sequence
12     uint sqlen; //length of subseq loaded
13     char* sq; //actual subsequence data will be stored here
14     // (with end-of-line characters removed)
15    
16     /*char* xseq; //the exposed pointer to the last requested subsequence start
17     off_t xstart; //the coordinate start for the last requested subseq
18     off_t xlen; //the last requested subseq len*/
19     GSubSeq() {
20     sqstart=0;
21     sqlen=0;
22     sq=NULL;
23     /* xseq=NULL;
24     xstart=0;
25     xlen=0;*/
26     }
27     ~GSubSeq() {
28     GFREE(sq);
29     }
30     // genomic, 1-based coordinates:
31 gpertea 16 void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
32 gpertea 2 //check for overlap with previous window and realloc/extend appropriately
33     //returns offset from seq that corresponds to sstart
34     // the window will keep extending until MAX_FASUBSEQ is reached
35     };
36    
37     class GFaSeqGet {
38     char* fname;
39     FILE* fh;
40     //raw offset in the file where the sequence actually starts:
41     off_t fseqstart;
42 gpertea 16 uint seq_len; //total sequence length, if known (when created from GFastaIndex)
43     int line_len; //length of each line of text
44     int line_blen; //binary length of each line
45     // = line_len + number of EOL character(s)
46 gpertea 2 GSubSeq* lastsub;
47     void initialParse(off_t fofs=0, bool checkall=true);
48     const char* loadsubseq(uint cstart, int& clen);
49     void finit(const char* fn, off_t fofs, bool validate);
50     public:
51     GFaSeqGet() {
52 gpertea 16 fh=NULL;
53 gpertea 2 fseqstart=0;
54 gpertea 16 seq_len=0;
55     line_len=0;
56     line_blen=0;
57 gpertea 2 fname=NULL;
58     lastsub=NULL;
59     }
60 gpertea 16 GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
61     seq_len=0;
62 gpertea 2 finit(fn,fofs,validate);
63     }
64     GFaSeqGet(const char* fn, bool validate=false) {
65 gpertea 16 seq_len=0;
66 gpertea 2 finit(fn,0,validate);
67     }
68 gpertea 16
69     GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
70     //constructor from GFastaIndex record
71    
72 gpertea 2 GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
73 gpertea 16
74 gpertea 2 ~GFaSeqGet() {
75     if (fname!=NULL) {
76     GFREE(fname);
77     fclose(fh);
78     }
79     delete lastsub;
80     }
81     const char* subseq(uint cstart, int& clen);
82 gpertea 16 const char* getRange(uint cstart=1, uint cend=0) {
83     if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
84 gpertea 2 if (cstart>cend) { swap(cstart, cend); }
85     int clen=cend-cstart+1;
86     //int rdlen=clen;
87     return subseq(cstart, clen);
88     }
89 gpertea 16
90 gpertea 2 char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
91 gpertea 16 //caller is responsible for deallocating the return string
92 gpertea 2
93 gpertea 16 void loadall(uint32 max_len=0) {
94     //TODO: must read the whole sequence differently here - line by line
95     //so when EOF or another '>' line is found, the reading stops!
96     int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
97 gpertea 2 subseq(1, clen);
98     }
99     void load(uint cstart, uint cend) {
100     //cache as much as possible
101 gpertea 16 if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
102 gpertea 2 int clen=cend-cstart+1;
103     subseq(cstart, clen);
104     }
105     int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
106     off_t getseqofs() { return fseqstart; }
107 gpertea 16 int getLineLen() { return line_len; }
108     int getLineBLen() { return line_blen; }
109 gpertea 2 //reads a subsequence starting at genomic coordinate cstart (1-based)
110     };
111    
112    
113     #endif