1 |
#ifndef GFF_UTILS_H |
2 |
#define GFF_UTILS_H |
3 |
#include "gff.h" |
4 |
#include "GStr.h" |
5 |
#include "GFastaIndex.h" |
6 |
#include "GFaSeqGet.h" |
7 |
|
8 |
typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd); |
9 |
|
10 |
class GeneInfo { //for Ensembl GTF conversion |
11 |
public: |
12 |
int flag; |
13 |
GffObj* gf; |
14 |
GList<GStr> gene_names; |
15 |
GList<GStr> transcripts; //list of transcript IDs |
16 |
GeneInfo():gene_names(true, true, true), transcripts(true,true,true) { |
17 |
gf=NULL; |
18 |
flag=0; |
19 |
} |
20 |
GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true), |
21 |
transcripts(true,true,true) { |
22 |
flag=0; |
23 |
if (gfrec->getGeneName()) |
24 |
gene_names.Add(new GStr(gfrec->getGeneName())); |
25 |
transcripts.Add(new GStr(gfrec->getID())); |
26 |
create_gf(gfrec, ensembl_convert); |
27 |
} |
28 |
|
29 |
void create_gf(GffObj* gfrec, bool ensembl_convert) { |
30 |
gf=new GffObj(gfrec->getGeneID()); |
31 |
gf->gseq_id=gfrec->gseq_id; |
32 |
gf->track_id=gfrec->track_id; |
33 |
gf->start=gfrec->start; |
34 |
gf->end=gfrec->end; |
35 |
gf->strand=gfrec->strand; |
36 |
gf->setFeatureName("gene"); |
37 |
gf->isGene(true); |
38 |
gf->isUsed(true); |
39 |
gf->uptr=this; |
40 |
gfrec->incLevel(); |
41 |
gfrec->parent=gf; |
42 |
gf->children.Add(gfrec); |
43 |
if (ensembl_convert) { |
44 |
gf->addAttr("type", gf->getTrackName()); |
45 |
} |
46 |
//gf->children.Add(gfrec); |
47 |
} |
48 |
//~GeneInfo() { |
49 |
// } |
50 |
void update(GffObj* gfrec) { |
51 |
if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0) |
52 |
return; |
53 |
gene_names.AddedIfNew(new GStr(gfrec->getGeneName())); |
54 |
if (gf==NULL) { |
55 |
GError("GeneInfo::update() called on uninitialized gf!\n"); |
56 |
//create_gf(gfrec); |
57 |
//return; |
58 |
} |
59 |
gfrec->parent=gf; |
60 |
gf->children.Add(gfrec); |
61 |
gfrec->incLevel(); |
62 |
if (gf->start>gfrec->start) |
63 |
gf->start=gfrec->start; |
64 |
if (gf->end<gfrec->end) |
65 |
gf->end=gfrec->end; |
66 |
} |
67 |
void finalize() { |
68 |
//prepare attributes for printing |
69 |
//must be called right before printing |
70 |
if (gf==NULL || transcripts.Count()==0) return; |
71 |
if (gene_names.Count()>0) { |
72 |
gf->addAttr("Name", gene_names[0]->chars()); |
73 |
/* |
74 |
GStr s(gene_names[0]->chars()); |
75 |
for (int i=1;i<gene_names.Count();i++) { |
76 |
s.append(","); |
77 |
s.append(gene_names[i]->chars()); |
78 |
} |
79 |
gf->addAttr("genes", s.chars()); |
80 |
*/ |
81 |
} //has gene names |
82 |
GStr t(transcripts[0]->chars()); |
83 |
for (int i=1;i<transcripts.Count();i++) { |
84 |
t.append(","); |
85 |
t.append(transcripts[i]->chars()); |
86 |
} |
87 |
gf->addAttr("transcripts", t.chars()); |
88 |
} |
89 |
}; |
90 |
|
91 |
//genomic fasta sequence handling |
92 |
class GFastaDb { |
93 |
public: |
94 |
char* fastaPath; |
95 |
GFastaIndex* faIdx; //could be a cdb .cidx file |
96 |
int last_fetchid; |
97 |
GFaSeqGet* faseq; |
98 |
//GCdbYank* gcdb; |
99 |
char* getFastaFile(int gseq_id) { |
100 |
if (fastaPath==NULL) return NULL; |
101 |
GStr s(fastaPath); |
102 |
s.trimR('/'); |
103 |
s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id)); |
104 |
GStr sbase(s); |
105 |
if (!fileExists(s.chars())) s.append(".fa"); |
106 |
if (!fileExists(s.chars())) s.append("sta"); |
107 |
if (fileExists(s.chars())) return Gstrdup(s.chars()); |
108 |
else { |
109 |
GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars()); |
110 |
return NULL; |
111 |
} |
112 |
} |
113 |
|
114 |
GFastaDb(const char* fpath=NULL) { |
115 |
//gcdb=NULL; |
116 |
fastaPath=NULL; |
117 |
faseq=NULL; |
118 |
faIdx=NULL; |
119 |
init(fpath); |
120 |
} |
121 |
|
122 |
void init(const char* fpath) { |
123 |
if (fpath==NULL || fpath[0]==0) return; |
124 |
last_fetchid=-1; |
125 |
if (!fileExists(fpath)) |
126 |
GError("Error: file/directory %s does not exist!\n",fpath); |
127 |
fastaPath=Gstrdup(fpath); |
128 |
GStr gseqpath(fpath); |
129 |
if (fileExists(fastaPath)>1) { //exists and it's not a directory |
130 |
GStr fainame(fastaPath); |
131 |
if (fainame.rindex(".fai")==fainame.length()-4) { |
132 |
//.fai index file given directly |
133 |
fastaPath[fainame.length()-4]=0; |
134 |
if (!fileExists(fastaPath)) |
135 |
GError("Error: cannot find fasta file for index %s !\n", fastaPath); |
136 |
} |
137 |
else fainame.append(".fai"); |
138 |
//GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars()); |
139 |
faIdx=new GFastaIndex(fastaPath,fainame.chars()); |
140 |
GStr fainamecwd(fainame); |
141 |
int ip=-1; |
142 |
if ((ip=fainamecwd.rindex(CHPATHSEP))>=0) |
143 |
fainamecwd.cut(0,ip+1); |
144 |
if (!faIdx->hasIndex()) { //could not load index |
145 |
//try current directory |
146 |
if (fainame!=fainamecwd) { |
147 |
if (fileExists(fainamecwd.chars())>1) { |
148 |
faIdx->loadIndex(fainamecwd.chars()); |
149 |
} |
150 |
} |
151 |
} //tried to load index |
152 |
if (!faIdx->hasIndex()) { |
153 |
GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath); |
154 |
faIdx->buildIndex(); |
155 |
if (faIdx->getCount()==0) GError("Error: no fasta records found!\n"); |
156 |
GMessage("Fasta index rebuilt.\n"); |
157 |
FILE* fcreate=fopen(fainame.chars(), "w"); |
158 |
if (fcreate==NULL) { |
159 |
GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars()); |
160 |
if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w"); |
161 |
if (fcreate==NULL) |
162 |
GError("Error: cannot create fasta index %s!\n", fainamecwd.chars()); |
163 |
} |
164 |
if (faIdx->storeIndex(fcreate)<faIdx->getCount()) |
165 |
GMessage("Warning: error writing the index file!\n"); |
166 |
} //index created and attempted to store it |
167 |
} //multi-fasta |
168 |
} |
169 |
GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) { |
170 |
if (fastaPath==NULL) return NULL; |
171 |
if (gseq_id==last_fetchid && faseq!=NULL) return faseq; |
172 |
delete faseq; |
173 |
faseq=NULL; |
174 |
last_fetchid=-1; |
175 |
char* gseqname=GffObj::names->gseqs.getName(gseq_id); |
176 |
if (faIdx!=NULL) { //fastaPath was the multi-fasta file name |
177 |
GFastaRec* farec=faIdx->getRecord(gseqname); |
178 |
if (farec!=NULL) { |
179 |
faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos, |
180 |
farec->line_len, farec->line_blen); |
181 |
faseq->loadall(); //just cache the whole sequence, it's faster |
182 |
last_fetchid=gseq_id; |
183 |
} |
184 |
else { |
185 |
GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname); |
186 |
return NULL; |
187 |
} |
188 |
} |
189 |
else { |
190 |
char* sfile=getFastaFile(gseq_id); |
191 |
if (sfile!=NULL) { |
192 |
faseq=new GFaSeqGet(sfile,checkFasta); |
193 |
faseq->loadall(); |
194 |
last_fetchid=gseq_id; |
195 |
GFREE(sfile); |
196 |
} |
197 |
} //one fasta file per contig |
198 |
return faseq; |
199 |
} |
200 |
|
201 |
~GFastaDb() { |
202 |
GFREE(fastaPath); |
203 |
//delete gcdb; |
204 |
delete faIdx; |
205 |
delete faseq; |
206 |
} |
207 |
}; |
208 |
|
209 |
class GffLocus; |
210 |
|
211 |
class GTData { //transcript associated data |
212 |
public: |
213 |
GffObj* rna; |
214 |
GffLocus* locus; |
215 |
GffObj* replaced_by; |
216 |
GeneInfo* geneinfo; |
217 |
int flag; |
218 |
GTData(GffObj* t=NULL) { |
219 |
rna=t; |
220 |
flag=0; |
221 |
locus=NULL; |
222 |
replaced_by=NULL; |
223 |
geneinfo=NULL; |
224 |
if (rna!=NULL) { |
225 |
geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there |
226 |
rna->uptr=this; |
227 |
} |
228 |
} |
229 |
bool operator>(GTData& b) { return (rna > b.rna); } |
230 |
bool operator<(GTData& b) { return (rna < b.rna); } |
231 |
bool operator==(GTData& b) { return (rna==b.rna); } |
232 |
}; |
233 |
|
234 |
class CGeneSym { |
235 |
public: |
236 |
GStr name; |
237 |
int freq; |
238 |
CGeneSym(const char* n=NULL, int f=0):name(n) { |
239 |
freq=f; |
240 |
} |
241 |
bool operator>(CGeneSym& b) { |
242 |
return (freq==b.freq) ? ( (name.length()==b.name.length()) ? (name>b.name) : |
243 |
(name.length()>b.name.length()) ) : ( freq<b.freq ); |
244 |
} |
245 |
bool operator<(CGeneSym& b) { |
246 |
return (freq==b.freq)? ( (name.length()==b.name.length()) ? (name<b.name) : |
247 |
(name.length()<b.name.length()) ) : ( freq>b.freq ); |
248 |
} |
249 |
bool operator==(CGeneSym& b) { return name==b.name; } |
250 |
}; |
251 |
|
252 |
const char* getGeneDescr(const char* gsym); |
253 |
|
254 |
void printLocus(GffLocus* loc, const char* pre=NULL); |
255 |
|
256 |
class GffLocus:public GSeg { |
257 |
public: |
258 |
int gseq_id; //id of underlying genomic sequence |
259 |
int locus_num; |
260 |
bool is_mrna; |
261 |
bool startChanged; //prompt reordering of container list |
262 |
char strand; |
263 |
GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript) |
264 |
GList<GffObj> rnas; //list of transcripts (isoforms) for this locus |
265 |
GArray<GSeg> mexons; //list of merged exons in this region |
266 |
GList<CGeneSym> gene_names; |
267 |
GList<CGeneSym> gene_ids; |
268 |
int v; //user flag/data |
269 |
/* |
270 |
bool operator==(GffLocus& d){ |
271 |
return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end); |
272 |
} |
273 |
bool operator>(GffLocus& d){ |
274 |
if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); |
275 |
if (start==d.start) { |
276 |
if (end==d.end) return (strand>d.strand); |
277 |
else return (end>d.end); |
278 |
} else return (start>d.start); |
279 |
} |
280 |
bool operator<(GffLocus& d){ |
281 |
if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id); |
282 |
if (start==d.start) { |
283 |
if (end==d.end) return strand<d.strand; |
284 |
else return end<d.end; |
285 |
} else return (start<d.start); |
286 |
} |
287 |
*/ |
288 |
const char* getGeneName() { |
289 |
if (gene_names.Count()==0) return NULL; |
290 |
return gene_names.First()->name.chars(); |
291 |
} |
292 |
const char* get_tmax_id() { |
293 |
return t_maxcov->getID(); |
294 |
} |
295 |
const char* get_descr() { |
296 |
if (gene_names.Count()>0) { |
297 |
for (int i=0;i<gene_names.Count();i++) { |
298 |
const char* gn=getGeneDescr(gene_names.First()->name.chars()); |
299 |
if (gn!=NULL) return gn; |
300 |
} |
301 |
} |
302 |
char* s=t_maxcov->getAttr("product"); |
303 |
if (s!=NULL) return s; |
304 |
s=t_maxcov->getAttr("descr"); |
305 |
if (s!=NULL) return s; |
306 |
s=t_maxcov->getAttr("description"); |
307 |
if (s!=NULL) return s; |
308 |
s=t_maxcov->getAttr("info"); |
309 |
if (s!=NULL) return s; |
310 |
return NULL; |
311 |
} |
312 |
|
313 |
GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true), |
314 |
gene_names(true,true,false), gene_ids(true,true,false) { |
315 |
//this will NOT free rnas! |
316 |
t_maxcov=NULL; |
317 |
gseq_id=-1; |
318 |
v=0; |
319 |
locus_num=0; |
320 |
start=0; |
321 |
end=0; |
322 |
strand=0; |
323 |
is_mrna=false; |
324 |
startChanged=false; |
325 |
if (t!=NULL) { |
326 |
start=t->exons.First()->start; |
327 |
end=t->exons.Last()->end;; |
328 |
gseq_id=t->gseq_id; |
329 |
GSeg seg; |
330 |
for (int i=0;i<t->exons.Count();i++) { |
331 |
seg.start=t->exons[i]->start; |
332 |
seg.end=t->exons[i]->end; |
333 |
mexons.Add(seg); |
334 |
} |
335 |
rnas.Add(t); |
336 |
((GTData*)(t->uptr))->locus=this; |
337 |
t_maxcov=t; |
338 |
strand=t->strand; |
339 |
if (t->ftype_id==gff_fid_mRNA) { |
340 |
is_mrna=true; |
341 |
} |
342 |
} |
343 |
} |
344 |
|
345 |
void addMerge(GffLocus& locus, GffObj* lnkrna) { |
346 |
//add all the elements of the other locus (merging) |
347 |
//-- merge mexons |
348 |
GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons |
349 |
int i=0; //index of first mexons with a merge |
350 |
int j=0; //index current mrna exon |
351 |
while (i<mexons.Count() && j<locus.mexons.Count()) { |
352 |
uint istart=mexons[i].start; |
353 |
uint iend=mexons[i].end; |
354 |
uint jstart=locus.mexons[j].start; |
355 |
uint jend=locus.mexons[j].end; |
356 |
if (iend<jstart) { i++; continue; } |
357 |
if (jend<istart) { j++; continue; } |
358 |
ovlexons.Add(j); |
359 |
//extend mexons[i] as needed |
360 |
if (jstart<istart) mexons[i].start=jstart; |
361 |
if (jend>iend) { //mexons[i] end extend |
362 |
mexons[i].end=jend; |
363 |
//now this could overlap the next mexon(s), so we have to merge them all |
364 |
while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { |
365 |
uint nextend=mexons[i+1].end; |
366 |
mexons.Delete(i+1); |
367 |
if (nextend>mexons[i].end) { |
368 |
mexons[i].end=nextend; |
369 |
break; //no need to check next mexons |
370 |
} |
371 |
} //while next mexons merge |
372 |
} // mexons[i] end extend |
373 |
j++; //check the next locus.mexon |
374 |
} |
375 |
//-- add the rest of the non-overlapping mexons: |
376 |
GSeg seg; |
377 |
for (int i=0;i<locus.mexons.Count();i++) { |
378 |
seg.start=locus.mexons[i].start; |
379 |
seg.end=locus.mexons[i].end; |
380 |
if (!ovlexons.Exists(i)) mexons.Add(seg); |
381 |
} |
382 |
// -- add locus.rnas |
383 |
for (int i=0;i<locus.rnas.Count();i++) { |
384 |
((GTData*)(locus.rnas[i]->uptr))->locus=this; |
385 |
if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]); |
386 |
} |
387 |
// -- adjust start/end as needed |
388 |
if (start>locus.start) { |
389 |
startChanged=true; |
390 |
start=locus.start; |
391 |
} |
392 |
if (end<locus.end) end=locus.end; |
393 |
if (locus.is_mrna) is_mrna=true; |
394 |
if (t_maxcov->covlen<locus.t_maxcov->covlen) |
395 |
t_maxcov=locus.t_maxcov; |
396 |
} |
397 |
|
398 |
bool exonOverlap(GffLocus& loc) { |
399 |
//check if any mexons overlap! |
400 |
if (strand!=loc.strand || loc.start>end || start>loc.end) return false; |
401 |
int i=0; |
402 |
int j=0; |
403 |
while (i<mexons.Count() && j<loc.mexons.Count()) { |
404 |
uint istart=mexons[i].start; |
405 |
uint iend=mexons[i].end; |
406 |
uint jstart=loc.mexons[j].start; |
407 |
uint jend=loc.mexons[j].end; |
408 |
if (iend<jstart) { i++; continue; } |
409 |
if (jend<istart) { j++; continue; } |
410 |
//exon overlap found if we're here: |
411 |
return true; |
412 |
} |
413 |
return false; |
414 |
} |
415 |
|
416 |
bool add_RNA(GffObj* t) { |
417 |
//if (rnas.Count()==0) return true; //? should never be called on an empty locus |
418 |
if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end) |
419 |
return false; //rna must be on the same genomic seq |
420 |
//check for exon overlap with existing mexons |
421 |
//also update mexons accordingly if t is to be added |
422 |
bool hasovl=false; |
423 |
int i=0; //index of first mexons with a merge |
424 |
int j=0; //index current t exon |
425 |
GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons |
426 |
while (i<mexons.Count() && j<t->exons.Count()) { |
427 |
uint istart=mexons[i].start; |
428 |
uint iend=mexons[i].end; |
429 |
uint jstart=t->exons[j]->start; |
430 |
uint jend=t->exons[j]->end; |
431 |
if (iend<jstart) { i++; continue; } |
432 |
if (jend<istart) { j++; continue; } |
433 |
//exon overlap found if we're here: |
434 |
ovlexons.Add(j); |
435 |
hasovl=true; |
436 |
//extend mexons[i] as needed |
437 |
if (jstart<istart) mexons[i].start=jstart; |
438 |
if (jend>iend) { //mexon stretch up |
439 |
mexons[i].end=jend; |
440 |
//now this could overlap the next mexon(s), so we have to merge them all |
441 |
while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) { |
442 |
uint nextend=mexons[i+1].end; |
443 |
mexons.Delete(i+1); |
444 |
if (nextend>mexons[i].end) { |
445 |
mexons[i].end=nextend; |
446 |
break; //no need to check next mexons |
447 |
} |
448 |
} //while next mexons merge |
449 |
} //possible mexons merge |
450 |
|
451 |
j++; //check the next t exon |
452 |
}//all vs all exon check loop |
453 |
if (hasovl) { |
454 |
GSeg seg; |
455 |
//add the rest of the non-overlapping exons |
456 |
for (int i=0;i<t->exons.Count();i++) { |
457 |
seg.start=t->exons[i]->start; |
458 |
seg.end=t->exons[i]->end; |
459 |
if (!ovlexons.Exists(i)) mexons.Add(seg); |
460 |
} |
461 |
rnas_add(t); |
462 |
// add to rnas |
463 |
((GTData*)t->uptr)->locus=this; |
464 |
gseq_id=t->gseq_id; |
465 |
} |
466 |
return hasovl; |
467 |
} |
468 |
|
469 |
//simpler,basic adding of a mrna |
470 |
void rnas_add(GffObj* t) { |
471 |
rnas.Add(t); |
472 |
// adjust start/end |
473 |
//if (start==0 || start>t->start) start=t->start; |
474 |
if (start==0) start=t->start; |
475 |
else if (start>t->start) { |
476 |
startChanged=true; |
477 |
start=t->start; |
478 |
} |
479 |
if (end<t->end) end=t->end; |
480 |
if (t_maxcov->covlen<t->covlen) t_maxcov=t; |
481 |
if (strand==0) strand=t->strand; |
482 |
if (t->ftype_id==gff_fid_mRNA) is_mrna=true; |
483 |
} |
484 |
}; |
485 |
|
486 |
class GenomicSeqData { |
487 |
int gseq_id; |
488 |
public: |
489 |
const char* gseq_name; |
490 |
GList<GffObj> gfs; //all non-transcript features -> usually gene features |
491 |
GList<GffObj> rnas; //all transcripts on this genomic sequence |
492 |
GList<GffLocus> loci; //all loci clusters |
493 |
GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here) |
494 |
//GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true), |
495 |
GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false), |
496 |
tdata(false,true,false) { |
497 |
gseq_id=gid; |
498 |
if (gseq_id>=0) |
499 |
gseq_name=GffObj::names->gseqs.getName(gseq_id); |
500 |
|
501 |
} |
502 |
bool operator==(GenomicSeqData& d){ |
503 |
return gseq_id==d.gseq_id; |
504 |
} |
505 |
bool operator>(GenomicSeqData& d){ |
506 |
return (gseq_id>d.gseq_id); |
507 |
} |
508 |
bool operator<(GenomicSeqData& d){ |
509 |
return (gseq_id<d.gseq_id); |
510 |
} |
511 |
}; |
512 |
|
513 |
int gseqCmpName(const pointer p1, const pointer p2); |
514 |
|
515 |
class GSpliceSite { |
516 |
public: |
517 |
char nt[3]; |
518 |
GSpliceSite(const char* c, bool revc=false) { |
519 |
nt[2]=0; |
520 |
if (c==NULL) { |
521 |
nt[0]=0; |
522 |
nt[1]=0; |
523 |
return; |
524 |
} |
525 |
if (revc) { |
526 |
nt[0]=toupper(ntComplement(c[1])); |
527 |
nt[1]=toupper(ntComplement(c[0])); |
528 |
} |
529 |
else { |
530 |
nt[0]=toupper(c[0]); |
531 |
nt[1]=toupper(c[1]); |
532 |
} |
533 |
} |
534 |
|
535 |
GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) { |
536 |
nt[2]=0; |
537 |
if (intron==NULL || intronlen==0) |
538 |
GError("Error: invalid intron or intron len for GSpliceSite()!\n"); |
539 |
const char* c=intron; |
540 |
if (revc) { |
541 |
if (!getAcceptor) c+=intronlen-2; |
542 |
nt[0]=toupper(ntComplement(c[1])); |
543 |
nt[1]=toupper(ntComplement(c[0])); |
544 |
} |
545 |
else { //on forward strand |
546 |
if (getAcceptor) c+=intronlen-2; |
547 |
nt[0]=toupper(c[0]); |
548 |
nt[1]=toupper(c[1]); |
549 |
}//forward strand |
550 |
} |
551 |
|
552 |
GSpliceSite(const char n1, const char n2) { |
553 |
nt[2]=0; |
554 |
nt[0]=toupper(n1); |
555 |
nt[1]=toupper(n2); |
556 |
} |
557 |
bool canonicalDonor() { |
558 |
return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T')); |
559 |
} |
560 |
bool operator==(GSpliceSite& c) { |
561 |
return (c.nt[0]==nt[0] && c.nt[1]==nt[1]); |
562 |
} |
563 |
bool operator==(GSpliceSite* c) { |
564 |
return (c->nt[0]==nt[0] && c->nt[1]==nt[1]); |
565 |
} |
566 |
bool operator==(const char* c) { |
567 |
//return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1])); |
568 |
//assumes given const nucleotides are uppercase already! |
569 |
return (nt[0]==c[0] && nt[1]==c[1]); |
570 |
} |
571 |
bool operator!=(const char* c) { |
572 |
//assumes given const nucleotides are uppercase already! |
573 |
return (nt[0]!=c[0] || nt[1]!=c[1]); |
574 |
} |
575 |
}; |
576 |
|
577 |
struct GffLoader { |
578 |
GStr fname; |
579 |
FILE* f; |
580 |
bool transcriptsOnly; |
581 |
bool fullAttributes; |
582 |
bool noExonAttrs; |
583 |
bool mergeCloseExons; |
584 |
bool showWarnings; |
585 |
void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL, |
586 |
bool doCluster=true, bool doCollapseRedundant=true, bool matchAllIntrons=true, bool fuzzSpan=false); |
587 |
GffLoader(const char* filename):fname(filename) { |
588 |
f=NULL; |
589 |
transcriptsOnly=true; |
590 |
fullAttributes=false; |
591 |
noExonAttrs=false; |
592 |
mergeCloseExons=false; |
593 |
showWarnings=false; |
594 |
if (fname=="-") { |
595 |
f=stdin; |
596 |
fname="stdin"; |
597 |
} |
598 |
else { |
599 |
if ((f=fopen(fname.chars(), "r"))==NULL) { |
600 |
GError("Error: cannot open file %s!\n",fname.chars()); |
601 |
} |
602 |
} |
603 |
} |
604 |
~GffLoader() { |
605 |
if (f!=NULL && f!=stdin) fclose(f); |
606 |
} |
607 |
}; |
608 |
|
609 |
void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1); |
610 |
|
611 |
//"position" a given coordinate x within a list of transcripts sorted by their start (lowest) |
612 |
//coordinate, using quick-search; the returned int is the list index of the closest *higher* |
613 |
//GffObj - i.e. starting right *ABOVE* the given coordinate |
614 |
//Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x) |
615 |
int qsearch_rnas(uint x, GList<GffObj>& rnas); |
616 |
int qsearch_gloci(uint x, GList<GffLocus>& loci); |
617 |
|
618 |
GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false); |
619 |
|
620 |
void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true, |
621 |
bool matchAllIntrons=true, bool fuzzSpan=false); |
622 |
//void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname); |
623 |
|
624 |
void collectLocusData(GList<GenomicSeqData>& ref_data); |
625 |
|
626 |
#endif |