ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.h
Revision: 153
Committed: Fri Jan 20 22:35:32 2012 UTC (7 years, 8 months ago) by gpertea
File size: 3485 byte(s)
Log Message:
still working on trans-splicing/fusions in gff

Line User Rev File contents
1 gpertea 2 #ifndef GFASEQGET_H
2     #define GFASEQGET_H
3     #include "GList.hh"
4    
5     #define MAX_FASUBSEQ 0x20000000
6     //max 512MB sequence data held in memory at a time
7    
8     class GSubSeq {
9     public:
10     uint sqstart; //1-based coord of subseq start on sequence
11     uint sqlen; //length of subseq loaded
12     char* sq; //actual subsequence data will be stored here
13     // (with end-of-line characters removed)
14    
15     /*char* xseq; //the exposed pointer to the last requested subsequence start
16     off_t xstart; //the coordinate start for the last requested subseq
17     off_t xlen; //the last requested subseq len*/
18     GSubSeq() {
19     sqstart=0;
20     sqlen=0;
21     sq=NULL;
22     /* xseq=NULL;
23     xstart=0;
24     xlen=0;*/
25     }
26     ~GSubSeq() {
27     GFREE(sq);
28     }
29     // genomic, 1-based coordinates:
30 gpertea 16 void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
31 gpertea 2 //check for overlap with previous window and realloc/extend appropriately
32     //returns offset from seq that corresponds to sstart
33     // the window will keep extending until MAX_FASUBSEQ is reached
34     };
35    
36     class GFaSeqGet {
37     char* fname;
38     FILE* fh;
39     //raw offset in the file where the sequence actually starts:
40     off_t fseqstart;
41 gpertea 16 uint seq_len; //total sequence length, if known (when created from GFastaIndex)
42     int line_len; //length of each line of text
43     int line_blen; //binary length of each line
44     // = line_len + number of EOL character(s)
45 gpertea 2 GSubSeq* lastsub;
46     void initialParse(off_t fofs=0, bool checkall=true);
47     const char* loadsubseq(uint cstart, int& clen);
48     void finit(const char* fn, off_t fofs, bool validate);
49     public:
50     GFaSeqGet() {
51 gpertea 16 fh=NULL;
52 gpertea 2 fseqstart=0;
53 gpertea 16 seq_len=0;
54     line_len=0;
55     line_blen=0;
56 gpertea 2 fname=NULL;
57     lastsub=NULL;
58     }
59 gpertea 16 GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
60     seq_len=0;
61 gpertea 2 finit(fn,fofs,validate);
62     }
63     GFaSeqGet(const char* fn, bool validate=false) {
64 gpertea 16 seq_len=0;
65 gpertea 2 finit(fn,0,validate);
66     }
67 gpertea 16
68     GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
69     //constructor from GFastaIndex record
70    
71 gpertea 2 GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
72 gpertea 16
73 gpertea 2 ~GFaSeqGet() {
74     if (fname!=NULL) {
75     GFREE(fname);
76     fclose(fh);
77     }
78     delete lastsub;
79     }
80     const char* subseq(uint cstart, int& clen);
81 gpertea 16 const char* getRange(uint cstart=1, uint cend=0) {
82     if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
83 gpertea 144 if (cstart>cend) { Gswap(cstart, cend); }
84 gpertea 2 int clen=cend-cstart+1;
85     //int rdlen=clen;
86     return subseq(cstart, clen);
87     }
88 gpertea 16
89 gpertea 2 char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
90 gpertea 16 //caller is responsible for deallocating the return string
91 gpertea 2
92 gpertea 16 void loadall(uint32 max_len=0) {
93     //TODO: must read the whole sequence differently here - line by line
94     //so when EOF or another '>' line is found, the reading stops!
95     int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
96 gpertea 2 subseq(1, clen);
97     }
98     void load(uint cstart, uint cend) {
99     //cache as much as possible
100 gpertea 16 if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
101 gpertea 2 int clen=cend-cstart+1;
102     subseq(cstart, clen);
103     }
104     int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
105     off_t getseqofs() { return fseqstart; }
106 gpertea 16 int getLineLen() { return line_len; }
107     int getLineBLen() { return line_blen; }
108 gpertea 2 //reads a subsequence starting at genomic coordinate cstart (1-based)
109     };
110    
111    
112     #endif