ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFaSeqGet.h
Revision: 2
Committed: Mon Mar 22 22:03:27 2010 UTC (9 years, 4 months ago) by gpertea
File size: 3084 byte(s)
Log Message:
added my gclib source files

Line User Rev File contents
1 gpertea 2 #ifndef GFASEQGET_H
2     #define GFASEQGET_H
3    
4     #include "GBase.h"
5     #include "GList.hh"
6    
7    
8     #define MAX_FASUBSEQ 0x20000000
9     //upper limit (0x20000000 bytes = 512MB) on the sequence data held in memory at a time; also the length GFaSeqGet::loadall() requests
10    
11     class GSubSeq {
12     public:
13     uint sqstart; //1-based coord of subseq start on sequence
14     uint sqlen; //length of subseq loaded
15     char* sq; //actual subsequence data will be stored here
16     // (with end-of-line characters removed)
17    
18     /*char* xseq; //the exposed pointer to the last requested subsequence start
19     off_t xstart; //the coordinate start for the last requested subseq
20     off_t xlen; //the last requested subseq len*/
21     GSubSeq() {
22     sqstart=0;
23     sqlen=0;
24     sq=NULL;
25     /* xseq=NULL;
26     xstart=0;
27     xlen=0;*/
28     }
29     ~GSubSeq() {
30     GFREE(sq);
31     }
32     // genomic, 1-based coordinates:
33     void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0);
34     //check for overlap with previous window and realloc/extend appropriately
35     //returns offset from seq that corresponds to sstart
36     // the window will keep extending until MAX_FASUBSEQ is reached
37     };
38    
39     class GFaSeqGet {
40     char* fname;
41     FILE* fh;
42     //raw offset in the file where the sequence actually starts:
43     off_t fseqstart;
44     int linelen; //length of each sequence line (assumed fixed)
45     char lendlen; //length of end-of-line characters between lines
46     //(assumed fixed)
47     char lendch; //end-of-line signal character (can only be '\n' or '\r')
48     GSubSeq* lastsub;
49     void initialParse(off_t fofs=0, bool checkall=true);
50     const char* loadsubseq(uint cstart, int& clen);
51     void finit(const char* fn, off_t fofs, bool validate);
52     public:
53     GFaSeqGet() {
54     fseqstart=0;
55     linelen=0;
56     lendch='\0';
57     fname=NULL;
58     lastsub=NULL;
59     }
60     GFaSeqGet(const char* fn, off_t fofs, bool validate=false) {
61     finit(fn,fofs,validate);
62     }
63     GFaSeqGet(const char* fn, bool validate=false) {
64     finit(fn,0,validate);
65     }
66     /*
67     GFaSeqGet(bool readAll, const char* fn, off_t fofs=0);
68     GFaSeqGet(bool readAll, FILE* f, off_t fofs=0);
69     */
70     GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
71     ~GFaSeqGet() {
72     if (fname!=NULL) {
73     GFREE(fname);
74     fclose(fh);
75     }
76     delete lastsub;
77     }
78     const char* subseq(uint cstart, int& clen);
79     const char* getRange(uint cstart, uint cend) {
80     if (cstart>cend) { swap(cstart, cend); }
81     int clen=cend-cstart+1;
82     //int rdlen=clen;
83     return subseq(cstart, clen);
84     }
85     //caller is responsible for deallocating copyRange() return string
86     char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
87    
88     void loadall() {
89     int clen=MAX_FASUBSEQ;
90     subseq(1, clen);
91     }
92     void load(uint cstart, uint cend) {
93     //cache as much as possible
94     int clen=cend-cstart+1;
95     subseq(cstart, clen);
96     }
97     int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
98     off_t getseqofs() { return fseqstart; }
99     int getlinelen() { return linelen; }
100     int getlendlen() { return lendlen; }
101     //reads a subsequence starting at genomic coordinate cstart (1-based)
102     };
103    
104    
105     #endif