1 |
#ifndef LayoutParser_H |
2 |
#define LayoutParser_H |
3 |
|
4 |
#include "GBase.h" |
5 |
#include "GList.hh" |
6 |
#include "GHash.hh" |
7 |
#include <stdio.h> |
8 |
// Hash data associated with a contig/sequence name.
// A contig name key is always stored as its name followed by ".<length>".
10 |
|
11 |
class LytCtgData; |
12 |
|
13 |
// Plain record describing one gap between two consecutive segments of a
// sequence: where the segment before the gap ends, where the next segment
// starts, and the clip/splice values recorded on each side of the gap.
// The record holds no pointers, so it can be copied member-wise.
struct LytSeqInterSeg {
  int segEnd, segRClip;          // end / right-clip value of the segment before the gap
  int nextStart, nextLClip;      // start / left-clip value of the segment after the gap
  char segRSplice, nextLSplice;  // splice markers on either side (0 when not given)
  int nextSegSeq;                // value passed as `seqpos` (presumably the position of
                                 // the next segment within the sequence -- confirm with callers)

  // Stores the arguments unchanged; every optional argument defaults to 0.
  LytSeqInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0,
                 char segrsplice=0, char nextlsplice=0, int seqpos=0)
      : segEnd(end), segRClip(rclip),
        nextStart(nextstart), nextLClip(nextlclip),
        segRSplice(segrsplice), nextLSplice(nextlsplice),
        nextSegSeq(seqpos) { }

  // Number of positions strictly between segEnd and nextStart.
  // Declared const so it can be used on const objects and references.
  int length() const {
    return (nextStart-segEnd-1);
  }
};
30 |
|
31 |
class LytSeqInfo { //info for a sequence within the file |
32 |
int xlen; //total sequence length (with all the added * within contig) |
33 |
int interseglen; |
34 |
public: |
35 |
char *name; |
36 |
LytCtgData* contig; //contig data containing this sequence, as above |
37 |
bool segmented; |
38 |
int numisegs; // number of intersegs[] |
39 |
LytSeqInterSeg* intersegs; |
40 |
off_t fpos; //file position for the sequence data |
41 |
unsigned char reversed; |
42 |
int offs; //offset in contig (of the very left end) |
43 |
int left,right; //clear range (relative to sequence itself, max 1..xlen) |
44 |
LytSeqInfo(char* seqid, LytCtgData* ctg, int pos=0, unsigned char minus=0, |
45 |
int slen=0, int clpL=0, int clpR=0) { |
46 |
contig=ctg; |
47 |
offs=pos; |
48 |
reversed=minus; |
49 |
fpos=0; |
50 |
interseglen=0; |
51 |
xlen=slen; |
52 |
left=clpL+1; //1 if no clpL given |
53 |
right=xlen-clpR; //0 if no len given |
54 |
segmented=false; |
55 |
numisegs=0; |
56 |
name=Gstrdup(seqid); |
57 |
intersegs=NULL; |
58 |
} |
59 |
~LytSeqInfo() { |
60 |
GFREE(name); |
61 |
GFREE(intersegs); |
62 |
} |
63 |
bool hasIntrons() { return (numisegs>0); } |
64 |
void addInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0, |
65 |
char splice=0, char nextsplice=0, int seqofs=0) { |
66 |
GREALLOC(intersegs,(numisegs+1)*sizeof(LytSeqInterSeg)); |
67 |
interseglen+=nextstart-end-1; |
68 |
intersegs[numisegs].segEnd=end; intersegs[numisegs].segRClip=rclip; |
69 |
intersegs[numisegs].nextStart=nextstart; intersegs[numisegs].nextLClip=nextlclip; |
70 |
intersegs[numisegs].segRSplice=splice; |
71 |
intersegs[numisegs].nextLSplice=nextsplice; |
72 |
intersegs[numisegs].nextSegSeq=seqofs; |
73 |
numisegs++; |
74 |
} |
75 |
|
76 |
void setLength(int len) { |
77 |
//should only be called BEFORE setting the real clipping coordinates |
78 |
//(left,right) |
79 |
xlen=len; |
80 |
left=1; |
81 |
right=xlen; |
82 |
} |
83 |
int length() { return xlen; } //xtended span, including introns |
84 |
int seglen() { return xlen-interseglen; } //segments only, no introns |
85 |
bool operator==(const LytSeqInfo& s) { |
86 |
return (offs+left-1==s.offs+s.left-1); |
87 |
} |
88 |
bool operator>(const LytSeqInfo& s) { |
89 |
return (offs+left-1>s.offs+s.left-1); |
90 |
} |
91 |
bool operator<(const LytSeqInfo& s) { |
92 |
return (offs+left-1<s.offs+s.left-1); |
93 |
} |
94 |
char* expandGaps(char* s); |
95 |
}; |
96 |
|
97 |
|
98 |
class LytCtgData { |
99 |
public: |
100 |
char* name; //contig name, as stored in file |
101 |
unsigned int len; //contig length (lsequence, from ACE file) |
102 |
int lpos, rpos; |
103 |
int numseqs; |
104 |
int offs; //some other type of user data that might be of use |
105 |
off_t fpos; //position in file for this contig's entry |
106 |
GList<LytSeqInfo> seqs; |
107 |
LytCtgData(off_t pos=0):seqs(false,false,false) { |
108 |
name=NULL; |
109 |
offs=0; |
110 |
len=0; |
111 |
numseqs=0; |
112 |
fpos=pos; |
113 |
} |
114 |
~LytCtgData() { |
115 |
GFREE(name); |
116 |
seqs.Clear(); |
117 |
} |
118 |
|
119 |
char* readName(char* s, GHash<int>& names); |
120 |
|
121 |
bool operator==(const LytCtgData& s) { |
122 |
return (strcmp(name,s.name)==0); |
123 |
} |
124 |
bool operator>(const LytCtgData& s) { |
125 |
return (strcmp(name,s.name)>0); |
126 |
} |
127 |
bool operator<(const LytCtgData& s) { |
128 |
return (strcmp(name,s.name)<0); |
129 |
} |
130 |
}; |
131 |
//callback -- called after a read or contig sequence is loaded |
132 |
typedef bool fnLytSeq(int ctgno, LytCtgData* d, LytSeqInfo* s, char* seq); |
133 |
|
134 |
class LayoutParser { |
135 |
protected: |
136 |
FILE* f; //file stream |
137 |
off_t f_pos; |
138 |
char* fname; |
139 |
LytCtgData* currentContig; // currently loaded contig -- for browsing/loading |
140 |
int numContigs; //total number of contigs found in this file |
141 |
//int numSeqs; //total number of (distinct) sequences found in this file |
142 |
GHash<LytSeqInfo> seqinfo; //sequence locations in the file |
143 |
GHash<int> ctgIDs; //contig IDs, to make them unique! |
144 |
|
145 |
GList<LytCtgData> contigs; //list of contig names with their size, |
146 |
//number of sequences and filepos |
147 |
protected: |
148 |
GLineReader* linebuf; //the line buffer |
149 |
off_t fskipTo(const char* linestart, const char* butnot=NULL); |
150 |
bool startsWith(const char* s, const char* start, int tlen); |
151 |
virtual LytSeqInfo* addSeq(char* s, LytCtgData* ctg); |
152 |
int seek(off_t offset) { |
153 |
int r=fseeko(f, offset, SEEK_SET); |
154 |
if (r==0) f_pos=offset; |
155 |
return r; |
156 |
} |
157 |
|
158 |
public: |
159 |
LayoutParser(const char* filename):contigs(false,true) { |
160 |
f=NULL; |
161 |
f_pos=0; |
162 |
numContigs=0; |
163 |
currentContig=NULL; |
164 |
if (filename==NULL) { |
165 |
f=stdin; |
166 |
fname=Gstrdup("stdin"); |
167 |
} |
168 |
else |
169 |
fname=Gstrdup(filename); |
170 |
linebuf=new GLineReader(); |
171 |
} |
172 |
virtual ~LayoutParser() { |
173 |
ctgIDs.Clear(); |
174 |
GFREE(fname); |
175 |
delete linebuf; |
176 |
close(); |
177 |
numContigs=0; |
178 |
seqinfo.Clear(); |
179 |
contigs.Clear(); |
180 |
} |
181 |
virtual bool open(); |
182 |
void close(); |
183 |
virtual bool parse(fnLytSeq* seqfn=NULL); //load all the file offsets |
184 |
virtual bool parseContigs(); //load all the file offsets for contigs |
185 |
virtual bool loadContig(int ctgidx, fnLytSeq* seqfn=NULL, bool re_pos=true); //for loading by browsing |
186 |
//if parsefn is not NULL, it is executed, passing the sequence data(first time, with the contig sequence) |
187 |
//if parserfn returns true, the data is freed after it is processed |
188 |
virtual char getFileType() { return 'L'; } |
189 |
//sequence loading - only by request |
190 |
LytCtgData* getContig(int idx) { return contigs[idx]; } |
191 |
virtual char* getSeq(LytSeqInfo* sqinfo) { return NULL; } |
192 |
virtual char* getContigSeq(LytCtgData* ctgdata) { return NULL; } |
193 |
int getNumContigs() { return numContigs; } |
194 |
//int getNumSeqs() { return numSeqs; } |
195 |
off_t getFilePos() { return f_pos; } |
196 |
//sorting the list of contigs: |
197 |
void contigsByName(); |
198 |
void contigsByLen(); |
199 |
void contigsByNumSeqs(); |
200 |
}; |
201 |
|
202 |
#endif |