ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/LayoutParser.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (8 years, 4 months ago) by gpertea
File size: 6233 byte(s)
Log Message:
sync with local source

Line File contents
1 #ifndef LayoutParser_H
2 #define LayoutParser_H
3
4 #include "GBase.h"
5 #include "GList.hh"
6 #include "GHash.hh"
7 #include <stdio.h>
8 //hash data associated with a contig/sequence name
9 //a contig name key is always stored as its name plus .<length>
10
11 class LytCtgData; //forward declaration: LytSeqInfo (below) keeps a LytCtgData* member before the class itself is defined
12
//One gap between two consecutive segments of a segmented sequence
//(LytSeqInfo calls these gaps "introns").
//Records where the current segment ends and where the next one starts,
//together with the clip/splice values the caller passes for each side.
struct LytSeqInterSeg {
  int segEnd, segRClip;         //end of the current segment / its right clip
  int nextStart, nextLClip;     //start of the next segment / its left clip
  char segRSplice, nextLSplice; //splice markers for the two sides (0 when not given)
  int nextSegSeq;               //value received as 'seqpos' (0 when not given)

  //end, nextstart : coordinates delimiting the gap (both are outside the gap)
  //all remaining arguments are optional and default to 0
  LytSeqInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0,
                 char segrsplice=0, char nextlsplice=0, int seqpos=0):
      segEnd(end), segRClip(rclip),
      nextStart(nextstart), nextLClip(nextlclip),
      segRSplice(segrsplice), nextLSplice(nextlsplice),
      nextSegSeq(seqpos) {
    }

  //number of positions strictly between segEnd and nextStart;
  //const so it can be queried through const objects/references
  int length() const {
    return (nextStart-segEnd-1);
    }
};
30
31 class LytSeqInfo { //info for a sequence within the file
32 int xlen; //total sequence length (with all the added * within contig)
33 int interseglen;
34 public:
35 char *name;
36 LytCtgData* contig; //contig data containing this sequence, as above
37 bool segmented;
38 int numisegs; // number of intersegs[]
39 LytSeqInterSeg* intersegs;
40 off_t fpos; //file position for the sequence data
41 unsigned char reversed;
42 int offs; //offset in contig (of the very left end)
43 int left,right; //clear range (relative to sequence itself, max 1..xlen)
44 LytSeqInfo(char* seqid, LytCtgData* ctg, int pos=0, unsigned char minus=0,
45 int slen=0, int clpL=0, int clpR=0) {
46 contig=ctg;
47 offs=pos;
48 reversed=minus;
49 fpos=0;
50 interseglen=0;
51 xlen=slen;
52 left=clpL+1; //1 if no clpL given
53 right=xlen-clpR; //0 if no len given
54 segmented=false;
55 numisegs=0;
56 name=Gstrdup(seqid);
57 intersegs=NULL;
58 }
59 ~LytSeqInfo() {
60 GFREE(name);
61 GFREE(intersegs);
62 }
63 bool hasIntrons() { return (numisegs>0); }
64 void addInterSeg(int end, int nextstart, int rclip=0, int nextlclip=0,
65 char splice=0, char nextsplice=0, int seqofs=0) {
66 GREALLOC(intersegs,(numisegs+1)*sizeof(LytSeqInterSeg));
67 interseglen+=nextstart-end-1;
68 intersegs[numisegs].segEnd=end; intersegs[numisegs].segRClip=rclip;
69 intersegs[numisegs].nextStart=nextstart; intersegs[numisegs].nextLClip=nextlclip;
70 intersegs[numisegs].segRSplice=splice;
71 intersegs[numisegs].nextLSplice=nextsplice;
72 intersegs[numisegs].nextSegSeq=seqofs;
73 numisegs++;
74 }
75
76 void setLength(int len) {
77 //should only be called BEFORE setting the real clipping coordinates
78 //(left,right)
79 xlen=len;
80 left=1;
81 right=xlen;
82 }
83 int length() { return xlen; } //xtended span, including introns
84 int seglen() { return xlen-interseglen; } //segments only, no introns
85 bool operator==(const LytSeqInfo& s) {
86 return (offs+left-1==s.offs+s.left-1);
87 }
88 bool operator>(const LytSeqInfo& s) {
89 return (offs+left-1>s.offs+s.left-1);
90 }
91 bool operator<(const LytSeqInfo& s) {
92 return (offs+left-1<s.offs+s.left-1);
93 }
94 char* expandGaps(char* s);
95 };
96
97
98 class LytCtgData {
99 public:
100 char* name; //contig name, as stored in file
101 unsigned int len; //contig length (lsequence, from ACE file)
102 int lpos, rpos;
103 int numseqs;
104 int offs; //some other type of user data that might be of use
105 off_t fpos; //position in file for this contig's entry
106 GList<LytSeqInfo> seqs;
107 LytCtgData(off_t pos=0):seqs(false,false,false) {
108 name=NULL;
109 offs=0;
110 len=0;
111 numseqs=0;
112 fpos=pos;
113 }
114 ~LytCtgData() {
115 GFREE(name);
116 seqs.Clear();
117 }
118
119 char* readName(char* s, GHash<int>& names);
120
121 bool operator==(const LytCtgData& s) {
122 return (strcmp(name,s.name)==0);
123 }
124 bool operator>(const LytCtgData& s) {
125 return (strcmp(name,s.name)>0);
126 }
127 bool operator<(const LytCtgData& s) {
128 return (strcmp(name,s.name)<0);
129 }
130 };
131 //callback -- called after a read or contig sequence is loaded
//receives ctgno, the LytCtgData and LytSeqInfo records involved, and seq
//(presumably the loaded sequence text -- confirm in the implementation file);
//the notes next to LayoutParser::loadContig() state that when the callback
//returns true the data is freed after it is processed
132 typedef bool fnLytSeq(int ctgno, LytCtgData* d, LytSeqInfo* s, char* seq);
133
134 class LayoutParser {
135 protected:
136 FILE* f; //file stream
137 off_t f_pos;
138 char* fname;
139 LytCtgData* currentContig; // currently loaded contig -- for browsing/loading
140 int numContigs; //total number of contigs found in this file
141 //int numSeqs; //total number of (distinct) sequences found in this file
142 GHash<LytSeqInfo> seqinfo; //sequence locations in the file
143 GHash<int> ctgIDs; //contig IDs, to make them unique!
144
145 GList<LytCtgData> contigs; //list of contig names with their size,
146 //number of sequences and filepos
147 protected:
148 GLineReader* linebuf; //the line buffer
149 off_t fskipTo(const char* linestart, const char* butnot=NULL);
150 bool startsWith(const char* s, const char* start, int tlen);
151 virtual LytSeqInfo* addSeq(char* s, LytCtgData* ctg);
152 int seek(off_t offset) {
153 int r=fseeko(f, offset, SEEK_SET);
154 if (r==0) f_pos=offset;
155 return r;
156 }
157
158 public:
159 LayoutParser(const char* filename):contigs(false,true) {
160 f=NULL;
161 f_pos=0;
162 numContigs=0;
163 currentContig=NULL;
164 if (filename==NULL) {
165 f=stdin;
166 fname=Gstrdup("stdin");
167 }
168 else
169 fname=Gstrdup(filename);
170 linebuf=new GLineReader();
171 }
172 virtual ~LayoutParser() {
173 ctgIDs.Clear();
174 GFREE(fname);
175 delete linebuf;
176 close();
177 numContigs=0;
178 seqinfo.Clear();
179 contigs.Clear();
180 }
181 virtual bool open();
182 void close();
183 virtual bool parse(fnLytSeq* seqfn=NULL); //load all the file offsets
184 virtual bool parseContigs(); //load all the file offsets for contigs
185 virtual bool loadContig(int ctgidx, fnLytSeq* seqfn=NULL, bool re_pos=true); //for loading by browsing
186 //if parsefn is not NULL, it is executed, passing the sequence data(first time, with the contig sequence)
187 //if parserfn returns true, the data is freed after it is processed
188 virtual char getFileType() { return 'L'; }
189 //sequence loading - only by request
190 LytCtgData* getContig(int idx) { return contigs[idx]; }
191 virtual char* getSeq(LytSeqInfo* sqinfo) { return NULL; }
192 virtual char* getContigSeq(LytCtgData* ctgdata) { return NULL; }
193 int getNumContigs() { return numContigs; }
194 //int getNumSeqs() { return numSeqs; }
195 off_t getFilePos() { return f_pos; }
196 //sorting the list of contigs:
197 void contigsByName();
198 void contigsByLen();
199 void contigsByNumSeqs();
200 };
201
202 #endif