ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/map_report/gff_utils.h
Revision: 162
Committed: Tue Feb 7 19:27:55 2012 UTC (7 years, 7 months ago) by gpertea
File size: 20646 byte(s)
Log Message:
initial map_report commit

Line File contents
1 #ifndef GFF_UTILS_H
2 #define GFF_UTILS_H
3 #include "gff.h"
4 #include "GStr.h"
5 #include "GFastaIndex.h"
6 #include "GFaSeqGet.h"
7
8 typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
9
10 class GeneInfo { //for Ensembl GTF conversion
11 public:
12 int flag;
13 GffObj* gf;
14 GList<GStr> gene_names;
15 GList<GStr> transcripts; //list of transcript IDs
16 GeneInfo():gene_names(true, true, true), transcripts(true,true,true) {
17 gf=NULL;
18 flag=0;
19 }
20 GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
21 transcripts(true,true,true) {
22 flag=0;
23 if (gfrec->getGeneName())
24 gene_names.Add(new GStr(gfrec->getGeneName()));
25 transcripts.Add(new GStr(gfrec->getID()));
26 create_gf(gfrec, ensembl_convert);
27 }
28
29 void create_gf(GffObj* gfrec, bool ensembl_convert) {
30 gf=new GffObj(gfrec->getGeneID());
31 gf->gseq_id=gfrec->gseq_id;
32 gf->track_id=gfrec->track_id;
33 gf->start=gfrec->start;
34 gf->end=gfrec->end;
35 gf->strand=gfrec->strand;
36 gf->setFeatureName("gene");
37 gf->isGene(true);
38 gf->isUsed(true);
39 gf->uptr=this;
40 gfrec->incLevel();
41 gfrec->parent=gf;
42 gf->children.Add(gfrec);
43 if (ensembl_convert) {
44 //gf->addAttr("type", gf->getTrackName());
45 const char* biotype=gfrec->getAttr("type");
46 if (biotype) gf->addAttr("type", biotype);
47 }
48 //gf->children.Add(gfrec);
49 }
50 //~GeneInfo() {
51 // }
52 void update(GffObj* gfrec) {
53 if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
54 return;
55 gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
56 if (gf==NULL) {
57 GError("GeneInfo::update() called on uninitialized gf!\n");
58 //create_gf(gfrec);
59 //return;
60 }
61 gfrec->parent=gf;
62 gf->children.Add(gfrec);
63 gfrec->incLevel();
64 if (gf->start>gfrec->start)
65 gf->start=gfrec->start;
66 if (gf->end<gfrec->end)
67 gf->end=gfrec->end;
68 }
69 void finalize() {
70 //prepare attributes for printing
71 //must be called right before printing
72 if (gf==NULL || transcripts.Count()==0) return;
73 if (gene_names.Count()>0) {
74 gf->addAttr("Name", gene_names[0]->chars());
75 /*
76 GStr s(gene_names[0]->chars());
77 for (int i=1;i<gene_names.Count();i++) {
78 s.append(",");
79 s.append(gene_names[i]->chars());
80 }
81 gf->addAttr("genes", s.chars());
82 */
83 } //has gene names
84 GStr t(transcripts[0]->chars());
85 for (int i=1;i<transcripts.Count();i++) {
86 t.append(",");
87 t.append(transcripts[i]->chars());
88 }
89 gf->addAttr("transcripts", t.chars());
90 }
91 };
92
93 //genomic fasta sequence handling
94 class GFastaDb {
95 public:
96 char* fastaPath;
97 GFastaIndex* faIdx; //could be a cdb .cidx file
98 int last_fetchid;
99 GFaSeqGet* faseq;
100 //GCdbYank* gcdb;
101 char* getFastaFile(int gseq_id) {
102 if (fastaPath==NULL) return NULL;
103 GStr s(fastaPath);
104 s.trimR('/');
105 s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id));
106 GStr sbase(s);
107 if (!fileExists(s.chars())) s.append(".fa");
108 if (!fileExists(s.chars())) s.append("sta");
109 if (fileExists(s.chars())) return Gstrdup(s.chars());
110 else {
111 GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
112 return NULL;
113 }
114 }
115
116 GFastaDb(const char* fpath=NULL) {
117 //gcdb=NULL;
118 fastaPath=NULL;
119 faseq=NULL;
120 faIdx=NULL;
121 init(fpath);
122 }
123
124 void init(const char* fpath) {
125 if (fpath==NULL || fpath[0]==0) return;
126 last_fetchid=-1;
127 if (!fileExists(fpath))
128 GError("Error: file/directory %s does not exist!\n",fpath);
129 fastaPath=Gstrdup(fpath);
130 GStr gseqpath(fpath);
131 if (fileExists(fastaPath)>1) { //exists and it's not a directory
132 GStr fainame(fastaPath);
133 if (fainame.rindex(".fai")==fainame.length()-4) {
134 //.fai index file given directly
135 fastaPath[fainame.length()-4]=0;
136 if (!fileExists(fastaPath))
137 GError("Error: cannot find fasta file for index %s !\n", fastaPath);
138 }
139 else fainame.append(".fai");
140 //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
141 faIdx=new GFastaIndex(fastaPath,fainame.chars());
142 GStr fainamecwd(fainame);
143 int ip=-1;
144 if ((ip=fainamecwd.rindex(CHPATHSEP))>=0)
145 fainamecwd.cut(0,ip+1);
146 if (!faIdx->hasIndex()) { //could not load index
147 //try current directory
148 if (fainame!=fainamecwd) {
149 if (fileExists(fainamecwd.chars())>1) {
150 faIdx->loadIndex(fainamecwd.chars());
151 }
152 }
153 } //tried to load index
154 if (!faIdx->hasIndex()) {
155 GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
156 faIdx->buildIndex();
157 if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
158 GMessage("Fasta index rebuilt.\n");
159 FILE* fcreate=fopen(fainame.chars(), "w");
160 if (fcreate==NULL) {
161 GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
162 if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
163 if (fcreate==NULL)
164 GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
165 }
166 if (faIdx->storeIndex(fcreate)<faIdx->getCount())
167 GMessage("Warning: error writing the index file!\n");
168 } //index created and attempted to store it
169 } //multi-fasta
170 }
171 GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
172 if (fastaPath==NULL) return NULL;
173 if (gseq_id==last_fetchid && faseq!=NULL) return faseq;
174 delete faseq;
175 faseq=NULL;
176 last_fetchid=-1;
177 char* gseqname=GffObj::names->gseqs.getName(gseq_id);
178 if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
179 GFastaRec* farec=faIdx->getRecord(gseqname);
180 if (farec!=NULL) {
181 faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
182 farec->line_len, farec->line_blen);
183 faseq->loadall(); //just cache the whole sequence, it's faster
184 last_fetchid=gseq_id;
185 }
186 else {
187 GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
188 return NULL;
189 }
190 }
191 else {
192 char* sfile=getFastaFile(gseq_id);
193 if (sfile!=NULL) {
194 faseq=new GFaSeqGet(sfile,checkFasta);
195 faseq->loadall();
196 last_fetchid=gseq_id;
197 GFREE(sfile);
198 }
199 } //one fasta file per contig
200 return faseq;
201 }
202
203 ~GFastaDb() {
204 GFREE(fastaPath);
205 //delete gcdb;
206 delete faIdx;
207 delete faseq;
208 }
209 };
210
211 class GffLocus;
212
213 class GTData { //transcript associated data
214 public:
215 GffObj* rna;
216 GffLocus* locus;
217 GffObj* replaced_by;
218 GeneInfo* geneinfo;
219 int flag;
220 GTData(GffObj* t=NULL) {
221 rna=t;
222 flag=0;
223 locus=NULL;
224 replaced_by=NULL;
225 geneinfo=NULL;
226 if (rna!=NULL) {
227 geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
228 rna->uptr=this;
229 }
230 }
231 bool operator<(GTData& b) { return (rna < b.rna); }
232 bool operator==(GTData& b) { return (rna==b.rna); }
233 };
234
235 class CGeneSym {
236 public:
237 GStr name;
238 int freq;
239 CGeneSym(const char* n=NULL, int f=0):name(n) {
240 freq=f;
241 }
242 bool operator<(CGeneSym& b) {
243 return (freq==b.freq)? ( (name.length()==b.name.length()) ? (name<b.name) :
244 (name.length()<b.name.length()) ) : ( freq>b.freq );
245 }
246 bool operator==(CGeneSym& b) { return name==b.name; }
247 };
248
249 const char* getGeneDescr(const char* gsym);
250
251 void printLocus(GffLocus* loc, const char* pre=NULL);
252
253 class GffLocus:public GSeg {
254 public:
255 int gseq_id; //id of underlying genomic sequence
256 int locus_num;
257 bool is_mrna;
258 char strand;
259 GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript)
260 GList<GffObj> rnas; //list of transcripts (isoforms) for this locus
261 GArray<GSeg> mexons; //list of merged exons in this region
262 GList<CGeneSym> gene_names;
263 GList<CGeneSym> gene_ids;
264 int v; //user flag/data
265 /*
266 bool operator==(GffLocus& d){
267 return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end);
268 }
269 bool operator<(GffLocus& d){
270 if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
271 if (start==d.start) {
272 if (end==d.end) return strand<d.strand;
273 else return end<d.end;
274 } else return (start<d.start);
275 }
276 */
277 const char* getGeneName() {
278 if (gene_names.Count()==0) return NULL;
279 return gene_names.First()->name.chars();
280 }
281 const char* get_tmax_id() {
282 return t_maxcov->getID();
283 }
284 const char* get_descr() {
285 if (gene_names.Count()>0) {
286 for (int i=0;i<gene_names.Count();i++) {
287 const char* gn=getGeneDescr(gene_names.First()->name.chars());
288 if (gn!=NULL) return gn;
289 }
290 }
291 char* s=t_maxcov->getAttr("product");
292 if (s!=NULL) return s;
293 s=t_maxcov->getAttr("descr");
294 if (s!=NULL) return s;
295 s=t_maxcov->getAttr("description");
296 if (s!=NULL) return s;
297 s=t_maxcov->getAttr("info");
298 if (s!=NULL) return s;
299 return NULL;
300 }
301
302 GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true),
303 gene_names(true,true,false), gene_ids(true,true,false) {
304 //this will NOT free rnas!
305 t_maxcov=NULL;
306 gseq_id=-1;
307 v=0;
308 locus_num=0;
309 start=0;
310 end=0;
311 strand=0;
312 is_mrna=false;
313 if (t!=NULL) {
314 start=t->exons.First()->start;
315 end=t->exons.Last()->end;;
316 gseq_id=t->gseq_id;
317 GSeg seg;
318 for (int i=0;i<t->exons.Count();i++) {
319 seg.start=t->exons[i]->start;
320 seg.end=t->exons[i]->end;
321 mexons.Add(seg);
322 }
323 rnas.Add(t);
324 ((GTData*)(t->uptr))->locus=this;
325 t_maxcov=t;
326 strand=t->strand;
327 if (t->ftype_id==gff_fid_mRNA) {
328 is_mrna=true;
329 }
330 }
331 }
332
333 void addMerge(GffLocus& locus, GffObj* lnkrna) {
334 //add all the elements of the other locus (merging)
335 //-- merge mexons
336 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
337 int i=0; //index of first mexons with a merge
338 int j=0; //index current mrna exon
339 while (i<mexons.Count() && j<locus.mexons.Count()) {
340 uint istart=mexons[i].start;
341 uint iend=mexons[i].end;
342 uint jstart=locus.mexons[j].start;
343 uint jend=locus.mexons[j].end;
344 if (iend<jstart) { i++; continue; }
345 if (jend<istart) { j++; continue; }
346 ovlexons.Add(j);
347 //extend mexons[i] as needed
348 if (jstart<istart) mexons[i].start=jstart;
349 if (jend>iend) { //mexons[i] end extend
350 mexons[i].end=jend;
351 //now this could overlap the next mexon(s), so we have to merge them all
352 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
353 uint nextend=mexons[i+1].end;
354 mexons.Delete(i+1);
355 if (nextend>mexons[i].end) {
356 mexons[i].end=nextend;
357 break; //no need to check next mexons
358 }
359 } //while next mexons merge
360 } // mexons[i] end extend
361 j++; //check the next locus.mexon
362 }
363 //-- add the rest of the non-overlapping mexons:
364 GSeg seg;
365 for (int i=0;i<locus.mexons.Count();i++) {
366 seg.start=locus.mexons[i].start;
367 seg.end=locus.mexons[i].end;
368 if (!ovlexons.Exists(i)) mexons.Add(seg);
369 }
370 // -- add locus.rnas
371 for (int i=0;i<locus.rnas.Count();i++) {
372 ((GTData*)(locus.rnas[i]->uptr))->locus=this;
373 if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]);
374 }
375 // -- adjust start/end as needed
376 if (start>locus.start) start=locus.start;
377 if (end<locus.end) end=locus.end;
378 if (locus.is_mrna) is_mrna=true;
379 if (t_maxcov->covlen<locus.t_maxcov->covlen)
380 t_maxcov=locus.t_maxcov;
381 }
382
383 bool exonOverlap(GffLocus& loc) {
384 //check if any mexons overlap!
385 if (strand!=loc.strand || loc.start>end || start>loc.end) return false;
386 int i=0;
387 int j=0;
388 while (i<mexons.Count() && j<loc.mexons.Count()) {
389 uint istart=mexons[i].start;
390 uint iend=mexons[i].end;
391 uint jstart=loc.mexons[j].start;
392 uint jend=loc.mexons[j].end;
393 if (iend<jstart) { i++; continue; }
394 if (jend<istart) { j++; continue; }
395 //exon overlap found if we're here:
396 return true;
397 }
398 return false;
399 }
400
401 bool add_RNA(GffObj* t) {
402 //if (rnas.Count()==0) return true; //? should never be called on an empty locus
403 if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end)
404 return false; //rna must be on the same genomic seq
405 //check for exon overlap with existing mexons
406 //also update mexons accordingly if t is to be added
407 bool hasovl=false;
408 int i=0; //index of first mexons with a merge
409 int j=0; //index current t exon
410 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
411 while (i<mexons.Count() && j<t->exons.Count()) {
412 uint istart=mexons[i].start;
413 uint iend=mexons[i].end;
414 uint jstart=t->exons[j]->start;
415 uint jend=t->exons[j]->end;
416 if (iend<jstart) { i++; continue; }
417 if (jend<istart) { j++; continue; }
418 //exon overlap found if we're here:
419 ovlexons.Add(j);
420 hasovl=true;
421 //extend mexons[i] as needed
422 if (jstart<istart) mexons[i].start=jstart;
423 if (jend>iend) { //mexon stretch up
424 mexons[i].end=jend;
425 //now this could overlap the next mexon(s), so we have to merge them all
426 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
427 uint nextend=mexons[i+1].end;
428 mexons.Delete(i+1);
429 if (nextend>mexons[i].end) {
430 mexons[i].end=nextend;
431 break; //no need to check next mexons
432 }
433 } //while next mexons merge
434 } //possible mexons merge
435
436 j++; //check the next t exon
437 }//all vs all exon check loop
438 if (hasovl) {
439 GSeg seg;
440 //add the rest of the non-overlapping exons
441 for (int i=0;i<t->exons.Count();i++) {
442 seg.start=t->exons[i]->start;
443 seg.end=t->exons[i]->end;
444 if (!ovlexons.Exists(i)) mexons.Add(seg);
445 }
446 rnas_add(t);
447 // add to rnas
448 ((GTData*)t->uptr)->locus=this;
449 gseq_id=t->gseq_id;
450 }
451 return hasovl;
452 }
453
454 //simpler,basic adding of a mrna
455 void rnas_add(GffObj* t) {
456 rnas.Add(t);
457 // adjust start/end
458 //if (start==0 || start>t->start) start=t->start;
459 if (start==0) start=t->start;
460 else if (start>t->start) {
461 start=t->start;
462 }
463 if (end<t->end) end=t->end;
464 if (t_maxcov->covlen<t->covlen) t_maxcov=t;
465 if (strand==0) strand=t->strand;
466 if (t->ftype_id==gff_fid_mRNA) is_mrna=true;
467 }
468 };
469
470 class GenomicSeqData {
471 int gseq_id;
472 public:
473 const char* gseq_name;
474 GList<GffObj> gfs; //all non-transcript features -> usually gene features
475 GList<GffObj> rnas; //all transcripts on this genomic sequence
476 GList<GffLocus> loci; //all loci clusters
477 GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
478 //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
479 GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
480 tdata(false,true,false) {
481 gseq_id=gid;
482 if (gseq_id>=0)
483 gseq_name=GffObj::names->gseqs.getName(gseq_id);
484
485 }
486 bool operator==(GenomicSeqData& d){
487 return gseq_id==d.gseq_id;
488 }
489 bool operator<(GenomicSeqData& d){
490 return (gseq_id<d.gseq_id);
491 }
492 };
493
494 int gseqCmpName(const pointer p1, const pointer p2);
495
496 class GSpliceSite {
497 public:
498 char nt[3];
499 GSpliceSite(const char* c, bool revc=false) {
500 nt[2]=0;
501 if (c==NULL) {
502 nt[0]=0;
503 nt[1]=0;
504 return;
505 }
506 if (revc) {
507 nt[0]=toupper(ntComplement(c[1]));
508 nt[1]=toupper(ntComplement(c[0]));
509 }
510 else {
511 nt[0]=toupper(c[0]);
512 nt[1]=toupper(c[1]);
513 }
514 }
515
516 GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
517 nt[2]=0;
518 if (intron==NULL || intronlen==0)
519 GError("Error: invalid intron or intron len for GSpliceSite()!\n");
520 const char* c=intron;
521 if (revc) {
522 if (!getAcceptor) c+=intronlen-2;
523 nt[0]=toupper(ntComplement(c[1]));
524 nt[1]=toupper(ntComplement(c[0]));
525 }
526 else { //on forward strand
527 if (getAcceptor) c+=intronlen-2;
528 nt[0]=toupper(c[0]);
529 nt[1]=toupper(c[1]);
530 }//forward strand
531 }
532
533 GSpliceSite(const char n1, const char n2) {
534 nt[2]=0;
535 nt[0]=toupper(n1);
536 nt[1]=toupper(n2);
537 }
538 bool canonicalDonor() {
539 return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
540 }
541 bool operator==(GSpliceSite& c) {
542 return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
543 }
544 bool operator==(GSpliceSite* c) {
545 return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
546 }
547 bool operator==(const char* c) {
548 //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
549 //assumes given const nucleotides are uppercase already!
550 return (nt[0]==c[0] && nt[1]==c[1]);
551 }
552 bool operator!=(const char* c) {
553 //assumes given const nucleotides are uppercase already!
554 return (nt[0]!=c[0] || nt[1]!=c[1]);
555 }
556 };
557
558 struct GffLoader {
559 GStr fname;
560 FILE* f;
561 bool transcriptsOnly;
562 bool fullAttributes;
563 bool noExonAttrs;
564 bool mergeCloseExons;
565 bool showWarnings;
566 void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
567 bool doCluster=true, bool doCollapseRedundant=true,
568 bool matchAllIntrons=true, bool fuzzSpan=false, bool forceExons=false);
569 GffLoader(const char* filename):fname(filename) {
570 f=NULL;
571 transcriptsOnly=true;
572 fullAttributes=false;
573 noExonAttrs=false;
574 mergeCloseExons=false;
575 showWarnings=false;
576 if (fname=="-" || fname=="stdin") {
577 f=stdin;
578 fname="stdin";
579 }
580 else {
581 if ((f=fopen(fname.chars(), "r"))==NULL) {
582 GError("Error: cannot open gff file %s!\n",fname.chars());
583 }
584 }
585 }
586 ~GffLoader() {
587 if (f!=NULL && f!=stdin) fclose(f);
588 }
589 };
590
591 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1);
592
593 //"position" a given coordinate x within a list of transcripts sorted by their start (lowest)
594 //coordinate, using quick-search; the returned int is the list index of the closest *higher*
595 //GffObj - i.e. starting right *ABOVE* the given coordinate
596 //Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x)
597 int qsearch_rnas(uint x, GList<GffObj>& rnas);
598 int qsearch_gloci(uint x, GList<GffLocus>& loci);
599
600 GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
601
602 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
603 bool matchAllIntrons=true, bool fuzzSpan=false);
604 //void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
605
606 void collectLocusData(GList<GenomicSeqData>& ref_data);
607
608 #endif