ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.h
Revision: 59
Committed: Fri Sep 9 18:00:23 2011 UTC (9 years, 2 months ago) by gpertea
File size: 21230 byte(s)
Log Message:
finally fixed the placeGf clustering bug

Line User Rev File contents
1 gpertea 26 #ifndef GFF_UTILS_H
2     #define GFF_UTILS_H
3     #include "gff.h"
4     #include "GStr.h"
5     #include "GFastaIndex.h"
6     #include "GFaSeqGet.h"
7    
8     typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
9    
10     class GeneInfo { //for Ensembl GTF conversion
11     public:
12     int flag;
13     GffObj* gf;
14     GList<GStr> gene_names;
15     GList<GStr> transcripts; //list of transcript IDs
16     GeneInfo():gene_names(true, true, true), transcripts(true,true,true) {
17     gf=NULL;
18     flag=0;
19     }
20     GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
21     transcripts(true,true,true) {
22     flag=0;
23     if (gfrec->getGeneName())
24     gene_names.Add(new GStr(gfrec->getGeneName()));
25     transcripts.Add(new GStr(gfrec->getID()));
26     create_gf(gfrec, ensembl_convert);
27     }
28    
29     void create_gf(GffObj* gfrec, bool ensembl_convert) {
30     gf=new GffObj(gfrec->getGeneID());
31     gf->gseq_id=gfrec->gseq_id;
32     gf->track_id=gfrec->track_id;
33     gf->start=gfrec->start;
34     gf->end=gfrec->end;
35     gf->strand=gfrec->strand;
36     gf->setFeatureName("gene");
37     gf->isGene(true);
38     gf->isUsed(true);
39     gf->uptr=this;
40     gfrec->incLevel();
41     gfrec->parent=gf;
42     gf->children.Add(gfrec);
43     if (ensembl_convert) {
44     gf->addAttr("type", gf->getTrackName());
45     }
46     //gf->children.Add(gfrec);
47     }
48     //~GeneInfo() {
49     // }
50     void update(GffObj* gfrec) {
51     if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
52     return;
53     gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
54     if (gf==NULL) {
55     GError("GeneInfo::update() called on uninitialized gf!\n");
56     //create_gf(gfrec);
57     //return;
58     }
59     gfrec->parent=gf;
60     gf->children.Add(gfrec);
61     gfrec->incLevel();
62     if (gf->start>gfrec->start)
63     gf->start=gfrec->start;
64     if (gf->end<gfrec->end)
65     gf->end=gfrec->end;
66     }
67     void finalize() {
68     //prepare attributes for printing
69     //must be called right before printing
70     if (gf==NULL || transcripts.Count()==0) return;
71     if (gene_names.Count()>0) {
72     gf->addAttr("Name", gene_names[0]->chars());
73     /*
74     GStr s(gene_names[0]->chars());
75     for (int i=1;i<gene_names.Count();i++) {
76     s.append(",");
77     s.append(gene_names[i]->chars());
78     }
79     gf->addAttr("genes", s.chars());
80     */
81     } //has gene names
82     GStr t(transcripts[0]->chars());
83     for (int i=1;i<transcripts.Count();i++) {
84     t.append(",");
85     t.append(transcripts[i]->chars());
86     }
87     gf->addAttr("transcripts", t.chars());
88     }
89     };
90    
91     //genomic fasta sequence handling
92     class GFastaDb {
93     public:
94     char* fastaPath;
95     GFastaIndex* faIdx; //could be a cdb .cidx file
96     int last_fetchid;
97     GFaSeqGet* faseq;
98     //GCdbYank* gcdb;
99     char* getFastaFile(int gseq_id) {
100     if (fastaPath==NULL) return NULL;
101     GStr s(fastaPath);
102     s.trimR('/');
103     s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id));
104     GStr sbase(s);
105     if (!fileExists(s.chars())) s.append(".fa");
106     if (!fileExists(s.chars())) s.append("sta");
107     if (fileExists(s.chars())) return Gstrdup(s.chars());
108     else {
109     GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
110     return NULL;
111     }
112     }
113    
114     GFastaDb(const char* fpath=NULL) {
115     //gcdb=NULL;
116     fastaPath=NULL;
117     faseq=NULL;
118     faIdx=NULL;
119     init(fpath);
120     }
121    
122     void init(const char* fpath) {
123     if (fpath==NULL || fpath[0]==0) return;
124     last_fetchid=-1;
125     if (!fileExists(fpath))
126     GError("Error: file/directory %s does not exist!\n",fpath);
127     fastaPath=Gstrdup(fpath);
128     GStr gseqpath(fpath);
129     if (fileExists(fastaPath)>1) { //exists and it's not a directory
130     GStr fainame(fastaPath);
131     if (fainame.rindex(".fai")==fainame.length()-4) {
132     //.fai index file given directly
133     fastaPath[fainame.length()-4]=0;
134     if (!fileExists(fastaPath))
135     GError("Error: cannot find fasta file for index %s !\n", fastaPath);
136     }
137     else fainame.append(".fai");
138     //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
139     faIdx=new GFastaIndex(fastaPath,fainame.chars());
140     GStr fainamecwd(fainame);
141     int ip=-1;
142     if ((ip=fainamecwd.rindex(CHPATHSEP))>=0)
143     fainamecwd.cut(0,ip+1);
144     if (!faIdx->hasIndex()) { //could not load index
145     //try current directory
146     if (fainame!=fainamecwd) {
147     if (fileExists(fainamecwd.chars())>1) {
148     faIdx->loadIndex(fainamecwd.chars());
149     }
150     }
151     } //tried to load index
152     if (!faIdx->hasIndex()) {
153     GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
154     faIdx->buildIndex();
155     if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
156     GMessage("Fasta index rebuilt.\n");
157     FILE* fcreate=fopen(fainame.chars(), "w");
158     if (fcreate==NULL) {
159     GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
160     if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
161     if (fcreate==NULL)
162     GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
163     }
164     if (faIdx->storeIndex(fcreate)<faIdx->getCount())
165     GMessage("Warning: error writing the index file!\n");
166     } //index created and attempted to store it
167     } //multi-fasta
168     }
169     GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
170     if (fastaPath==NULL) return NULL;
171     if (gseq_id==last_fetchid && faseq!=NULL) return faseq;
172     delete faseq;
173     faseq=NULL;
174     last_fetchid=-1;
175     char* gseqname=GffObj::names->gseqs.getName(gseq_id);
176     if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
177     GFastaRec* farec=faIdx->getRecord(gseqname);
178     if (farec!=NULL) {
179     faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
180     farec->line_len, farec->line_blen);
181     faseq->loadall(); //just cache the whole sequence, it's faster
182     last_fetchid=gseq_id;
183     }
184     else {
185     GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
186     return NULL;
187     }
188     }
189     else {
190     char* sfile=getFastaFile(gseq_id);
191     if (sfile!=NULL) {
192     faseq=new GFaSeqGet(sfile,checkFasta);
193     faseq->loadall();
194     last_fetchid=gseq_id;
195     GFREE(sfile);
196     }
197     } //one fasta file per contig
198     return faseq;
199     }
200    
201     ~GFastaDb() {
202     GFREE(fastaPath);
203     //delete gcdb;
204     delete faIdx;
205     delete faseq;
206     }
207     };
208    
209     class GffLocus;
210    
211     class GTData { //transcript associated data
212     public:
213     GffObj* rna;
214     GffLocus* locus;
215     GffObj* replaced_by;
216     GeneInfo* geneinfo;
217     int flag;
218     GTData(GffObj* t=NULL) {
219     rna=t;
220     flag=0;
221     locus=NULL;
222     replaced_by=NULL;
223     geneinfo=NULL;
224     if (rna!=NULL) {
225     geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
226     rna->uptr=this;
227     }
228     }
229     bool operator>(GTData& b) { return (rna > b.rna); }
230     bool operator<(GTData& b) { return (rna < b.rna); }
231     bool operator==(GTData& b) { return (rna==b.rna); }
232     };
233    
234     class CGeneSym {
235     public:
236     GStr name;
237     int freq;
238     CGeneSym(const char* n=NULL, int f=0):name(n) {
239     freq=f;
240     }
241     bool operator>(CGeneSym& b) {
242     return (freq==b.freq) ? ( (name.length()==b.name.length()) ? (name>b.name) :
243     (name.length()>b.name.length()) ) : ( freq<b.freq );
244     }
245     bool operator<(CGeneSym& b) {
246     return (freq==b.freq)? ( (name.length()==b.name.length()) ? (name<b.name) :
247     (name.length()<b.name.length()) ) : ( freq>b.freq );
248     }
249     bool operator==(CGeneSym& b) { return name==b.name; }
250     };
251    
252     const char* getGeneDescr(const char* gsym);
253    
254 gpertea 59 void printLocus(GffLocus* loc, const char* pre=NULL);
255    
256 gpertea 26 class GffLocus:public GSeg {
257     public:
258     int gseq_id; //id of underlying genomic sequence
259     int locus_num;
260     bool is_mrna;
261 gpertea 59 bool startChanged; //prompt reordering of container list
262 gpertea 26 char strand;
263     GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript)
264     GList<GffObj> rnas; //list of transcripts (isoforms) for this locus
265     GArray<GSeg> mexons; //list of merged exons in this region
266     GList<CGeneSym> gene_names;
267     GList<CGeneSym> gene_ids;
268     int v; //user flag/data
269 gpertea 59 /*
270 gpertea 26 bool operator==(GffLocus& d){
271     return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end);
272     }
273     bool operator>(GffLocus& d){
274     if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
275     if (start==d.start) {
276     if (end==d.end) return (strand>d.strand);
277     else return (end>d.end);
278     } else return (start>d.start);
279     }
280     bool operator<(GffLocus& d){
281     if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
282     if (start==d.start) {
283     if (end==d.end) return strand<d.strand;
284     else return end<d.end;
285     } else return (start<d.start);
286     }
287 gpertea 59 */
288 gpertea 26 const char* getGeneName() {
289     if (gene_names.Count()==0) return NULL;
290     return gene_names.First()->name.chars();
291     }
292     const char* get_tmax_id() {
293     return t_maxcov->getID();
294     }
295     const char* get_descr() {
296     if (gene_names.Count()>0) {
297     for (int i=0;i<gene_names.Count();i++) {
298     const char* gn=getGeneDescr(gene_names.First()->name.chars());
299     if (gn!=NULL) return gn;
300     }
301     }
302     char* s=t_maxcov->getAttr("product");
303     if (s!=NULL) return s;
304     s=t_maxcov->getAttr("descr");
305     if (s!=NULL) return s;
306     s=t_maxcov->getAttr("description");
307     if (s!=NULL) return s;
308     s=t_maxcov->getAttr("info");
309     if (s!=NULL) return s;
310     return NULL;
311     }
312    
313     GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true),
314     gene_names(true,true,false), gene_ids(true,true,false) {
315     //this will NOT free rnas!
316     t_maxcov=NULL;
317     gseq_id=-1;
318     v=0;
319     locus_num=0;
320     start=0;
321     end=0;
322     strand=0;
323     is_mrna=false;
324 gpertea 59 startChanged=false;
325 gpertea 26 if (t!=NULL) {
326     start=t->exons.First()->start;
327     end=t->exons.Last()->end;;
328     gseq_id=t->gseq_id;
329     GSeg seg;
330     for (int i=0;i<t->exons.Count();i++) {
331     seg.start=t->exons[i]->start;
332     seg.end=t->exons[i]->end;
333     mexons.Add(seg);
334     }
335     rnas.Add(t);
336     ((GTData*)(t->uptr))->locus=this;
337     t_maxcov=t;
338     strand=t->strand;
339     if (t->ftype_id==gff_fid_mRNA) {
340     is_mrna=true;
341     }
342     }
343     }
344    
345     void addMerge(GffLocus& locus, GffObj* lnkrna) {
346     //add all the elements of the other locus (merging)
347     //-- merge mexons
348     GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
349     int i=0; //index of first mexons with a merge
350     int j=0; //index current mrna exon
351     while (i<mexons.Count() && j<locus.mexons.Count()) {
352     uint istart=mexons[i].start;
353     uint iend=mexons[i].end;
354     uint jstart=locus.mexons[j].start;
355     uint jend=locus.mexons[j].end;
356     if (iend<jstart) { i++; continue; }
357     if (jend<istart) { j++; continue; }
358     ovlexons.Add(j);
359     //extend mexons[i] as needed
360     if (jstart<istart) mexons[i].start=jstart;
361     if (jend>iend) { //mexons[i] end extend
362     mexons[i].end=jend;
363     //now this could overlap the next mexon(s), so we have to merge them all
364     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
365     uint nextend=mexons[i+1].end;
366     mexons.Delete(i+1);
367     if (nextend>mexons[i].end) {
368     mexons[i].end=nextend;
369     break; //no need to check next mexons
370     }
371     } //while next mexons merge
372     } // mexons[i] end extend
373     j++; //check the next locus.mexon
374     }
375     //-- add the rest of the non-overlapping mexons:
376     GSeg seg;
377     for (int i=0;i<locus.mexons.Count();i++) {
378     seg.start=locus.mexons[i].start;
379     seg.end=locus.mexons[i].end;
380     if (!ovlexons.Exists(i)) mexons.Add(seg);
381     }
382     // -- add locus.rnas
383     for (int i=0;i<locus.rnas.Count();i++) {
384     ((GTData*)(locus.rnas[i]->uptr))->locus=this;
385     if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]);
386     }
387     // -- adjust start/end as needed
388 gpertea 59 if (start>locus.start) {
389     startChanged=true;
390     start=locus.start;
391     }
392 gpertea 26 if (end<locus.end) end=locus.end;
393     if (locus.is_mrna) is_mrna=true;
394     if (t_maxcov->covlen<locus.t_maxcov->covlen)
395     t_maxcov=locus.t_maxcov;
396     }
397    
398     bool exonOverlap(GffLocus& loc) {
399     //check if any mexons overlap!
400     if (strand!=loc.strand || loc.start>end || start>loc.end) return false;
401     int i=0;
402     int j=0;
403     while (i<mexons.Count() && j<loc.mexons.Count()) {
404     uint istart=mexons[i].start;
405     uint iend=mexons[i].end;
406     uint jstart=loc.mexons[j].start;
407     uint jend=loc.mexons[j].end;
408     if (iend<jstart) { i++; continue; }
409     if (jend<istart) { j++; continue; }
410     //exon overlap found if we're here:
411     return true;
412     }
413     return false;
414     }
415    
416     bool add_RNA(GffObj* t) {
417     //if (rnas.Count()==0) return true; //? should never be called on an empty locus
418     if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end)
419     return false; //rna must be on the same genomic seq
420     //check for exon overlap with existing mexons
421     //also update mexons accordingly if t is to be added
422     bool hasovl=false;
423     int i=0; //index of first mexons with a merge
424     int j=0; //index current t exon
425     GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
426     while (i<mexons.Count() && j<t->exons.Count()) {
427     uint istart=mexons[i].start;
428     uint iend=mexons[i].end;
429     uint jstart=t->exons[j]->start;
430     uint jend=t->exons[j]->end;
431     if (iend<jstart) { i++; continue; }
432     if (jend<istart) { j++; continue; }
433     //exon overlap found if we're here:
434     ovlexons.Add(j);
435     hasovl=true;
436     //extend mexons[i] as needed
437     if (jstart<istart) mexons[i].start=jstart;
438     if (jend>iend) { //mexon stretch up
439     mexons[i].end=jend;
440     //now this could overlap the next mexon(s), so we have to merge them all
441     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
442     uint nextend=mexons[i+1].end;
443     mexons.Delete(i+1);
444     if (nextend>mexons[i].end) {
445     mexons[i].end=nextend;
446     break; //no need to check next mexons
447     }
448     } //while next mexons merge
449     } //possible mexons merge
450    
451     j++; //check the next t exon
452     }//all vs all exon check loop
453     if (hasovl) {
454     GSeg seg;
455     //add the rest of the non-overlapping exons
456     for (int i=0;i<t->exons.Count();i++) {
457     seg.start=t->exons[i]->start;
458     seg.end=t->exons[i]->end;
459     if (!ovlexons.Exists(i)) mexons.Add(seg);
460     }
461     rnas_add(t);
462     // add to rnas
463     ((GTData*)t->uptr)->locus=this;
464     gseq_id=t->gseq_id;
465     }
466     return hasovl;
467     }
468    
469     //simpler,basic adding of a mrna
470     void rnas_add(GffObj* t) {
471     rnas.Add(t);
472     // adjust start/end
473 gpertea 59 //if (start==0 || start>t->start) start=t->start;
474     if (start==0) start=t->start;
475     else if (start>t->start) {
476     startChanged=true;
477     start=t->start;
478     }
479 gpertea 26 if (end<t->end) end=t->end;
480     if (t_maxcov->covlen<t->covlen) t_maxcov=t;
481     if (strand==0) strand=t->strand;
482     if (t->ftype_id==gff_fid_mRNA) is_mrna=true;
483     }
484     };
485    
486     class GenomicSeqData {
487     int gseq_id;
488     public:
489     const char* gseq_name;
490     GList<GffObj> gfs; //all non-transcript features -> usually gene features
491     GList<GffObj> rnas; //all transcripts on this genomic sequence
492     GList<GffLocus> loci; //all loci clusters
493     GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
494     //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
495     GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
496     tdata(false,true,false) {
497     gseq_id=gid;
498     if (gseq_id>=0)
499     gseq_name=GffObj::names->gseqs.getName(gseq_id);
500    
501     }
502     bool operator==(GenomicSeqData& d){
503     return gseq_id==d.gseq_id;
504     }
505     bool operator>(GenomicSeqData& d){
506     return (gseq_id>d.gseq_id);
507     }
508     bool operator<(GenomicSeqData& d){
509     return (gseq_id<d.gseq_id);
510     }
511     };
512    
513     int gseqCmpName(const pointer p1, const pointer p2);
514    
515     class GSpliceSite {
516     public:
517     char nt[3];
518     GSpliceSite(const char* c, bool revc=false) {
519     nt[2]=0;
520     if (c==NULL) {
521     nt[0]=0;
522     nt[1]=0;
523     return;
524     }
525     if (revc) {
526     nt[0]=toupper(ntComplement(c[1]));
527     nt[1]=toupper(ntComplement(c[0]));
528     }
529     else {
530     nt[0]=toupper(c[0]);
531     nt[1]=toupper(c[1]);
532     }
533     }
534    
535     GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
536     nt[2]=0;
537     if (intron==NULL || intronlen==0)
538     GError("Error: invalid intron or intron len for GSpliceSite()!\n");
539     const char* c=intron;
540     if (revc) {
541     if (!getAcceptor) c+=intronlen-2;
542     nt[0]=toupper(ntComplement(c[1]));
543     nt[1]=toupper(ntComplement(c[0]));
544     }
545     else { //on forward strand
546     if (getAcceptor) c+=intronlen-2;
547     nt[0]=toupper(c[0]);
548     nt[1]=toupper(c[1]);
549     }//forward strand
550     }
551    
552     GSpliceSite(const char n1, const char n2) {
553     nt[2]=0;
554     nt[0]=toupper(n1);
555     nt[1]=toupper(n2);
556     }
557     bool canonicalDonor() {
558     return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
559     }
560     bool operator==(GSpliceSite& c) {
561     return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
562     }
563     bool operator==(GSpliceSite* c) {
564     return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
565     }
566     bool operator==(const char* c) {
567     //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
568     //assumes given const nucleotides are uppercase already!
569     return (nt[0]==c[0] && nt[1]==c[1]);
570     }
571     bool operator!=(const char* c) {
572     //assumes given const nucleotides are uppercase already!
573     return (nt[0]!=c[0] || nt[1]!=c[1]);
574     }
575     };
576    
577     struct GffLoader {
578     GStr fname;
579     FILE* f;
580     bool transcriptsOnly;
581     bool fullAttributes;
582     bool noExonAttrs;
583     bool mergeCloseExons;
584     bool showWarnings;
585     void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
586     bool doCluster=true, bool doCollapseRedundant=true, bool matchAllIntrons=true, bool fuzzSpan=false);
587     GffLoader(const char* filename):fname(filename) {
588     f=NULL;
589     transcriptsOnly=true;
590     fullAttributes=false;
591     noExonAttrs=false;
592     mergeCloseExons=false;
593     showWarnings=false;
594     if (fname=="-") {
595     f=stdin;
596     fname="stdin";
597     }
598     else {
599     if ((f=fopen(fname.chars(), "r"))==NULL) {
600     GError("Error: cannot open file %s!\n",fname.chars());
601     }
602     }
603     }
604     ~GffLoader() {
605     if (f!=NULL && f!=stdin) fclose(f);
606     }
607     };
608    
609     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1);
610    
611     //"position" a given coordinate x within a list of transcripts sorted by their start (lowest)
612     //coordinate, using quick-search; the returned int is the list index of the closest *higher*
613     //GffObj - i.e. starting right *ABOVE* the given coordinate
614     //Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x)
615     int qsearch_rnas(uint x, GList<GffObj>& rnas);
616     int qsearch_gloci(uint x, GList<GffLocus>& loci);
617    
618     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
619    
620     void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
621     bool matchAllIntrons=true, bool fuzzSpan=false);
622     //void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
623    
624     void collectLocusData(GList<GenomicSeqData>& ref_data);
625    
626     #endif