ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.h
Revision: 59
Committed: Fri Sep 9 18:00:23 2011 UTC (9 years, 1 month ago) by gpertea
File size: 21230 byte(s)
Log Message:
finally fixed the placeGf clustering bug

Line File contents
1 #ifndef GFF_UTILS_H
2 #define GFF_UTILS_H
3 #include "gff.h"
4 #include "GStr.h"
5 #include "GFastaIndex.h"
6 #include "GFaSeqGet.h"
7
8 typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
9
10 class GeneInfo { //for Ensembl GTF conversion
11 public:
12 int flag;
13 GffObj* gf;
14 GList<GStr> gene_names;
15 GList<GStr> transcripts; //list of transcript IDs
16 GeneInfo():gene_names(true, true, true), transcripts(true,true,true) {
17 gf=NULL;
18 flag=0;
19 }
20 GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
21 transcripts(true,true,true) {
22 flag=0;
23 if (gfrec->getGeneName())
24 gene_names.Add(new GStr(gfrec->getGeneName()));
25 transcripts.Add(new GStr(gfrec->getID()));
26 create_gf(gfrec, ensembl_convert);
27 }
28
29 void create_gf(GffObj* gfrec, bool ensembl_convert) {
30 gf=new GffObj(gfrec->getGeneID());
31 gf->gseq_id=gfrec->gseq_id;
32 gf->track_id=gfrec->track_id;
33 gf->start=gfrec->start;
34 gf->end=gfrec->end;
35 gf->strand=gfrec->strand;
36 gf->setFeatureName("gene");
37 gf->isGene(true);
38 gf->isUsed(true);
39 gf->uptr=this;
40 gfrec->incLevel();
41 gfrec->parent=gf;
42 gf->children.Add(gfrec);
43 if (ensembl_convert) {
44 gf->addAttr("type", gf->getTrackName());
45 }
46 //gf->children.Add(gfrec);
47 }
48 //~GeneInfo() {
49 // }
50 void update(GffObj* gfrec) {
51 if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
52 return;
53 gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
54 if (gf==NULL) {
55 GError("GeneInfo::update() called on uninitialized gf!\n");
56 //create_gf(gfrec);
57 //return;
58 }
59 gfrec->parent=gf;
60 gf->children.Add(gfrec);
61 gfrec->incLevel();
62 if (gf->start>gfrec->start)
63 gf->start=gfrec->start;
64 if (gf->end<gfrec->end)
65 gf->end=gfrec->end;
66 }
67 void finalize() {
68 //prepare attributes for printing
69 //must be called right before printing
70 if (gf==NULL || transcripts.Count()==0) return;
71 if (gene_names.Count()>0) {
72 gf->addAttr("Name", gene_names[0]->chars());
73 /*
74 GStr s(gene_names[0]->chars());
75 for (int i=1;i<gene_names.Count();i++) {
76 s.append(",");
77 s.append(gene_names[i]->chars());
78 }
79 gf->addAttr("genes", s.chars());
80 */
81 } //has gene names
82 GStr t(transcripts[0]->chars());
83 for (int i=1;i<transcripts.Count();i++) {
84 t.append(",");
85 t.append(transcripts[i]->chars());
86 }
87 gf->addAttr("transcripts", t.chars());
88 }
89 };
90
91 //genomic fasta sequence handling
92 class GFastaDb {
93 public:
94 char* fastaPath;
95 GFastaIndex* faIdx; //could be a cdb .cidx file
96 int last_fetchid;
97 GFaSeqGet* faseq;
98 //GCdbYank* gcdb;
99 char* getFastaFile(int gseq_id) {
100 if (fastaPath==NULL) return NULL;
101 GStr s(fastaPath);
102 s.trimR('/');
103 s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id));
104 GStr sbase(s);
105 if (!fileExists(s.chars())) s.append(".fa");
106 if (!fileExists(s.chars())) s.append("sta");
107 if (fileExists(s.chars())) return Gstrdup(s.chars());
108 else {
109 GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
110 return NULL;
111 }
112 }
113
114 GFastaDb(const char* fpath=NULL) {
115 //gcdb=NULL;
116 fastaPath=NULL;
117 faseq=NULL;
118 faIdx=NULL;
119 init(fpath);
120 }
121
122 void init(const char* fpath) {
123 if (fpath==NULL || fpath[0]==0) return;
124 last_fetchid=-1;
125 if (!fileExists(fpath))
126 GError("Error: file/directory %s does not exist!\n",fpath);
127 fastaPath=Gstrdup(fpath);
128 GStr gseqpath(fpath);
129 if (fileExists(fastaPath)>1) { //exists and it's not a directory
130 GStr fainame(fastaPath);
131 if (fainame.rindex(".fai")==fainame.length()-4) {
132 //.fai index file given directly
133 fastaPath[fainame.length()-4]=0;
134 if (!fileExists(fastaPath))
135 GError("Error: cannot find fasta file for index %s !\n", fastaPath);
136 }
137 else fainame.append(".fai");
138 //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
139 faIdx=new GFastaIndex(fastaPath,fainame.chars());
140 GStr fainamecwd(fainame);
141 int ip=-1;
142 if ((ip=fainamecwd.rindex(CHPATHSEP))>=0)
143 fainamecwd.cut(0,ip+1);
144 if (!faIdx->hasIndex()) { //could not load index
145 //try current directory
146 if (fainame!=fainamecwd) {
147 if (fileExists(fainamecwd.chars())>1) {
148 faIdx->loadIndex(fainamecwd.chars());
149 }
150 }
151 } //tried to load index
152 if (!faIdx->hasIndex()) {
153 GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
154 faIdx->buildIndex();
155 if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
156 GMessage("Fasta index rebuilt.\n");
157 FILE* fcreate=fopen(fainame.chars(), "w");
158 if (fcreate==NULL) {
159 GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
160 if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
161 if (fcreate==NULL)
162 GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
163 }
164 if (faIdx->storeIndex(fcreate)<faIdx->getCount())
165 GMessage("Warning: error writing the index file!\n");
166 } //index created and attempted to store it
167 } //multi-fasta
168 }
169 GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
170 if (fastaPath==NULL) return NULL;
171 if (gseq_id==last_fetchid && faseq!=NULL) return faseq;
172 delete faseq;
173 faseq=NULL;
174 last_fetchid=-1;
175 char* gseqname=GffObj::names->gseqs.getName(gseq_id);
176 if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
177 GFastaRec* farec=faIdx->getRecord(gseqname);
178 if (farec!=NULL) {
179 faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
180 farec->line_len, farec->line_blen);
181 faseq->loadall(); //just cache the whole sequence, it's faster
182 last_fetchid=gseq_id;
183 }
184 else {
185 GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
186 return NULL;
187 }
188 }
189 else {
190 char* sfile=getFastaFile(gseq_id);
191 if (sfile!=NULL) {
192 faseq=new GFaSeqGet(sfile,checkFasta);
193 faseq->loadall();
194 last_fetchid=gseq_id;
195 GFREE(sfile);
196 }
197 } //one fasta file per contig
198 return faseq;
199 }
200
201 ~GFastaDb() {
202 GFREE(fastaPath);
203 //delete gcdb;
204 delete faIdx;
205 delete faseq;
206 }
207 };
208
209 class GffLocus;
210
211 class GTData { //transcript associated data
212 public:
213 GffObj* rna;
214 GffLocus* locus;
215 GffObj* replaced_by;
216 GeneInfo* geneinfo;
217 int flag;
218 GTData(GffObj* t=NULL) {
219 rna=t;
220 flag=0;
221 locus=NULL;
222 replaced_by=NULL;
223 geneinfo=NULL;
224 if (rna!=NULL) {
225 geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
226 rna->uptr=this;
227 }
228 }
229 bool operator>(GTData& b) { return (rna > b.rna); }
230 bool operator<(GTData& b) { return (rna < b.rna); }
231 bool operator==(GTData& b) { return (rna==b.rna); }
232 };
233
234 class CGeneSym {
235 public:
236 GStr name;
237 int freq;
238 CGeneSym(const char* n=NULL, int f=0):name(n) {
239 freq=f;
240 }
241 bool operator>(CGeneSym& b) {
242 return (freq==b.freq) ? ( (name.length()==b.name.length()) ? (name>b.name) :
243 (name.length()>b.name.length()) ) : ( freq<b.freq );
244 }
245 bool operator<(CGeneSym& b) {
246 return (freq==b.freq)? ( (name.length()==b.name.length()) ? (name<b.name) :
247 (name.length()<b.name.length()) ) : ( freq>b.freq );
248 }
249 bool operator==(CGeneSym& b) { return name==b.name; }
250 };
251
252 const char* getGeneDescr(const char* gsym);
253
254 void printLocus(GffLocus* loc, const char* pre=NULL);
255
256 class GffLocus:public GSeg {
257 public:
258 int gseq_id; //id of underlying genomic sequence
259 int locus_num;
260 bool is_mrna;
261 bool startChanged; //prompt reordering of container list
262 char strand;
263 GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript)
264 GList<GffObj> rnas; //list of transcripts (isoforms) for this locus
265 GArray<GSeg> mexons; //list of merged exons in this region
266 GList<CGeneSym> gene_names;
267 GList<CGeneSym> gene_ids;
268 int v; //user flag/data
269 /*
270 bool operator==(GffLocus& d){
271 return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end);
272 }
273 bool operator>(GffLocus& d){
274 if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
275 if (start==d.start) {
276 if (end==d.end) return (strand>d.strand);
277 else return (end>d.end);
278 } else return (start>d.start);
279 }
280 bool operator<(GffLocus& d){
281 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
282 if (start==d.start) {
283 if (end==d.end) return strand<d.strand;
284 else return end<d.end;
285 } else return (start<d.start);
286 }
287 */
288 const char* getGeneName() {
289 if (gene_names.Count()==0) return NULL;
290 return gene_names.First()->name.chars();
291 }
292 const char* get_tmax_id() {
293 return t_maxcov->getID();
294 }
295 const char* get_descr() {
296 if (gene_names.Count()>0) {
297 for (int i=0;i<gene_names.Count();i++) {
298 const char* gn=getGeneDescr(gene_names.First()->name.chars());
299 if (gn!=NULL) return gn;
300 }
301 }
302 char* s=t_maxcov->getAttr("product");
303 if (s!=NULL) return s;
304 s=t_maxcov->getAttr("descr");
305 if (s!=NULL) return s;
306 s=t_maxcov->getAttr("description");
307 if (s!=NULL) return s;
308 s=t_maxcov->getAttr("info");
309 if (s!=NULL) return s;
310 return NULL;
311 }
312
313 GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true),
314 gene_names(true,true,false), gene_ids(true,true,false) {
315 //this will NOT free rnas!
316 t_maxcov=NULL;
317 gseq_id=-1;
318 v=0;
319 locus_num=0;
320 start=0;
321 end=0;
322 strand=0;
323 is_mrna=false;
324 startChanged=false;
325 if (t!=NULL) {
326 start=t->exons.First()->start;
327 end=t->exons.Last()->end;;
328 gseq_id=t->gseq_id;
329 GSeg seg;
330 for (int i=0;i<t->exons.Count();i++) {
331 seg.start=t->exons[i]->start;
332 seg.end=t->exons[i]->end;
333 mexons.Add(seg);
334 }
335 rnas.Add(t);
336 ((GTData*)(t->uptr))->locus=this;
337 t_maxcov=t;
338 strand=t->strand;
339 if (t->ftype_id==gff_fid_mRNA) {
340 is_mrna=true;
341 }
342 }
343 }
344
345 void addMerge(GffLocus& locus, GffObj* lnkrna) {
346 //add all the elements of the other locus (merging)
347 //-- merge mexons
348 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
349 int i=0; //index of first mexons with a merge
350 int j=0; //index current mrna exon
351 while (i<mexons.Count() && j<locus.mexons.Count()) {
352 uint istart=mexons[i].start;
353 uint iend=mexons[i].end;
354 uint jstart=locus.mexons[j].start;
355 uint jend=locus.mexons[j].end;
356 if (iend<jstart) { i++; continue; }
357 if (jend<istart) { j++; continue; }
358 ovlexons.Add(j);
359 //extend mexons[i] as needed
360 if (jstart<istart) mexons[i].start=jstart;
361 if (jend>iend) { //mexons[i] end extend
362 mexons[i].end=jend;
363 //now this could overlap the next mexon(s), so we have to merge them all
364 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
365 uint nextend=mexons[i+1].end;
366 mexons.Delete(i+1);
367 if (nextend>mexons[i].end) {
368 mexons[i].end=nextend;
369 break; //no need to check next mexons
370 }
371 } //while next mexons merge
372 } // mexons[i] end extend
373 j++; //check the next locus.mexon
374 }
375 //-- add the rest of the non-overlapping mexons:
376 GSeg seg;
377 for (int i=0;i<locus.mexons.Count();i++) {
378 seg.start=locus.mexons[i].start;
379 seg.end=locus.mexons[i].end;
380 if (!ovlexons.Exists(i)) mexons.Add(seg);
381 }
382 // -- add locus.rnas
383 for (int i=0;i<locus.rnas.Count();i++) {
384 ((GTData*)(locus.rnas[i]->uptr))->locus=this;
385 if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]);
386 }
387 // -- adjust start/end as needed
388 if (start>locus.start) {
389 startChanged=true;
390 start=locus.start;
391 }
392 if (end<locus.end) end=locus.end;
393 if (locus.is_mrna) is_mrna=true;
394 if (t_maxcov->covlen<locus.t_maxcov->covlen)
395 t_maxcov=locus.t_maxcov;
396 }
397
398 bool exonOverlap(GffLocus& loc) {
399 //check if any mexons overlap!
400 if (strand!=loc.strand || loc.start>end || start>loc.end) return false;
401 int i=0;
402 int j=0;
403 while (i<mexons.Count() && j<loc.mexons.Count()) {
404 uint istart=mexons[i].start;
405 uint iend=mexons[i].end;
406 uint jstart=loc.mexons[j].start;
407 uint jend=loc.mexons[j].end;
408 if (iend<jstart) { i++; continue; }
409 if (jend<istart) { j++; continue; }
410 //exon overlap found if we're here:
411 return true;
412 }
413 return false;
414 }
415
416 bool add_RNA(GffObj* t) {
417 //if (rnas.Count()==0) return true; //? should never be called on an empty locus
418 if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end)
419 return false; //rna must be on the same genomic seq
420 //check for exon overlap with existing mexons
421 //also update mexons accordingly if t is to be added
422 bool hasovl=false;
423 int i=0; //index of first mexons with a merge
424 int j=0; //index current t exon
425 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
426 while (i<mexons.Count() && j<t->exons.Count()) {
427 uint istart=mexons[i].start;
428 uint iend=mexons[i].end;
429 uint jstart=t->exons[j]->start;
430 uint jend=t->exons[j]->end;
431 if (iend<jstart) { i++; continue; }
432 if (jend<istart) { j++; continue; }
433 //exon overlap found if we're here:
434 ovlexons.Add(j);
435 hasovl=true;
436 //extend mexons[i] as needed
437 if (jstart<istart) mexons[i].start=jstart;
438 if (jend>iend) { //mexon stretch up
439 mexons[i].end=jend;
440 //now this could overlap the next mexon(s), so we have to merge them all
441 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
442 uint nextend=mexons[i+1].end;
443 mexons.Delete(i+1);
444 if (nextend>mexons[i].end) {
445 mexons[i].end=nextend;
446 break; //no need to check next mexons
447 }
448 } //while next mexons merge
449 } //possible mexons merge
450
451 j++; //check the next t exon
452 }//all vs all exon check loop
453 if (hasovl) {
454 GSeg seg;
455 //add the rest of the non-overlapping exons
456 for (int i=0;i<t->exons.Count();i++) {
457 seg.start=t->exons[i]->start;
458 seg.end=t->exons[i]->end;
459 if (!ovlexons.Exists(i)) mexons.Add(seg);
460 }
461 rnas_add(t);
462 // add to rnas
463 ((GTData*)t->uptr)->locus=this;
464 gseq_id=t->gseq_id;
465 }
466 return hasovl;
467 }
468
469 //simpler,basic adding of a mrna
470 void rnas_add(GffObj* t) {
471 rnas.Add(t);
472 // adjust start/end
473 //if (start==0 || start>t->start) start=t->start;
474 if (start==0) start=t->start;
475 else if (start>t->start) {
476 startChanged=true;
477 start=t->start;
478 }
479 if (end<t->end) end=t->end;
480 if (t_maxcov->covlen<t->covlen) t_maxcov=t;
481 if (strand==0) strand=t->strand;
482 if (t->ftype_id==gff_fid_mRNA) is_mrna=true;
483 }
484 };
485
486 class GenomicSeqData {
487 int gseq_id;
488 public:
489 const char* gseq_name;
490 GList<GffObj> gfs; //all non-transcript features -> usually gene features
491 GList<GffObj> rnas; //all transcripts on this genomic sequence
492 GList<GffLocus> loci; //all loci clusters
493 GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
494 //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
495 GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
496 tdata(false,true,false) {
497 gseq_id=gid;
498 if (gseq_id>=0)
499 gseq_name=GffObj::names->gseqs.getName(gseq_id);
500
501 }
502 bool operator==(GenomicSeqData& d){
503 return gseq_id==d.gseq_id;
504 }
505 bool operator>(GenomicSeqData& d){
506 return (gseq_id>d.gseq_id);
507 }
508 bool operator<(GenomicSeqData& d){
509 return (gseq_id<d.gseq_id);
510 }
511 };
512
513 int gseqCmpName(const pointer p1, const pointer p2);
514
515 class GSpliceSite {
516 public:
517 char nt[3];
518 GSpliceSite(const char* c, bool revc=false) {
519 nt[2]=0;
520 if (c==NULL) {
521 nt[0]=0;
522 nt[1]=0;
523 return;
524 }
525 if (revc) {
526 nt[0]=toupper(ntComplement(c[1]));
527 nt[1]=toupper(ntComplement(c[0]));
528 }
529 else {
530 nt[0]=toupper(c[0]);
531 nt[1]=toupper(c[1]);
532 }
533 }
534
535 GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
536 nt[2]=0;
537 if (intron==NULL || intronlen==0)
538 GError("Error: invalid intron or intron len for GSpliceSite()!\n");
539 const char* c=intron;
540 if (revc) {
541 if (!getAcceptor) c+=intronlen-2;
542 nt[0]=toupper(ntComplement(c[1]));
543 nt[1]=toupper(ntComplement(c[0]));
544 }
545 else { //on forward strand
546 if (getAcceptor) c+=intronlen-2;
547 nt[0]=toupper(c[0]);
548 nt[1]=toupper(c[1]);
549 }//forward strand
550 }
551
552 GSpliceSite(const char n1, const char n2) {
553 nt[2]=0;
554 nt[0]=toupper(n1);
555 nt[1]=toupper(n2);
556 }
557 bool canonicalDonor() {
558 return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
559 }
560 bool operator==(GSpliceSite& c) {
561 return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
562 }
563 bool operator==(GSpliceSite* c) {
564 return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
565 }
566 bool operator==(const char* c) {
567 //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
568 //assumes given const nucleotides are uppercase already!
569 return (nt[0]==c[0] && nt[1]==c[1]);
570 }
571 bool operator!=(const char* c) {
572 //assumes given const nucleotides are uppercase already!
573 return (nt[0]!=c[0] || nt[1]!=c[1]);
574 }
575 };
576
577 struct GffLoader {
578 GStr fname;
579 FILE* f;
580 bool transcriptsOnly;
581 bool fullAttributes;
582 bool noExonAttrs;
583 bool mergeCloseExons;
584 bool showWarnings;
585 void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
586 bool doCluster=true, bool doCollapseRedundant=true, bool matchAllIntrons=true, bool fuzzSpan=false);
587 GffLoader(const char* filename):fname(filename) {
588 f=NULL;
589 transcriptsOnly=true;
590 fullAttributes=false;
591 noExonAttrs=false;
592 mergeCloseExons=false;
593 showWarnings=false;
594 if (fname=="-") {
595 f=stdin;
596 fname="stdin";
597 }
598 else {
599 if ((f=fopen(fname.chars(), "r"))==NULL) {
600 GError("Error: cannot open file %s!\n",fname.chars());
601 }
602 }
603 }
604 ~GffLoader() {
605 if (f!=NULL && f!=stdin) fclose(f);
606 }
607 };
608
609 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1);
610
611 //"position" a given coordinate x within a list of transcripts sorted by their start (lowest)
612 //coordinate, using quick-search; the returned int is the list index of the closest *higher*
613 //GffObj - i.e. starting right *ABOVE* the given coordinate
614 //Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x)
615 int qsearch_rnas(uint x, GList<GffObj>& rnas);
616 int qsearch_gloci(uint x, GList<GffLocus>& loci);
617
618 GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
619
620 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
621 bool matchAllIntrons=true, bool fuzzSpan=false);
622 //void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
623
624 void collectLocusData(GList<GenomicSeqData>& ref_data);
625
626 #endif