ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.cpp
Revision: 26
Committed: Wed Jul 27 19:39:27 2011 UTC (8 years, 1 month ago) by gpertea
File size: 18929 byte(s)
Log Message:
added missing gff_utils source

Line User Rev File contents
1 gpertea 26 #include "gff_utils.h"
2    
3     extern bool verbose;
4     extern bool debugMode;
5    
6     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
7     if (seq==NULL) return;
8     int len=(seqlen>0)?seqlen:strlen(seq);
9     if (len<=0) return;
10     if (!defline.is_empty())
11     fprintf(f, ">%s\n",defline.chars());
12     int ilen=0;
13     for (int i=0; i < len; i++, ilen++) {
14     if (ilen == 70) {
15     fputc('\n', f);
16     ilen = 0;
17     }
18     putc(seq[i], f);
19     } //for
20     fputc('\n', f);
21     }
22    
23     int qsearch_gloci(uint x, GList<GffLocus>& loci) {
24     //binary search
25     //do the simplest tests first:
26     if (loci[0]->start>x) return 0;
27     if (loci.Last()->start<x) return -1;
28     uint istart=0;
29     int i=0;
30     int idx=-1;
31     int maxh=loci.Count()-1;
32     int l=0;
33     int h = maxh;
34     while (l <= h) {
35     i = (l+h)>>1;
36     istart=loci[i]->start;
37     if (istart < x) l = i + 1;
38     else {
39     if (istart == x) { //found matching coordinate here
40     idx=i;
41     while (idx<=maxh && loci[idx]->start==x) {
42     idx++;
43     }
44     return (idx>maxh) ? -1 : idx;
45     }
46     h = i - 1;
47     }
48     } //while
49     idx = l;
50     while (idx<=maxh && loci[idx]->start<=x) {
51     idx++;
52     }
53     return (idx>maxh) ? -1 : idx;
54     }
55    
56     int qsearch_rnas(uint x, GList<GffObj>& rnas) {
57     //binary search
58     //do the simplest tests first:
59     if (rnas[0]->start>x) return 0;
60     if (rnas.Last()->start<x) return -1;
61     uint istart=0;
62     int i=0;
63     int idx=-1;
64     int maxh=rnas.Count()-1;
65     int l=0;
66     int h = maxh;
67     while (l <= h) {
68     i = (l+h)>>1;
69     istart=rnas[i]->start;
70     if (istart < x) l = i + 1;
71     else {
72     if (istart == x) { //found matching coordinate here
73     idx=i;
74     while (idx<=maxh && rnas[idx]->start==x) {
75     idx++;
76     }
77     return (idx>maxh) ? -1 : idx;
78     }
79     h = i - 1;
80     }
81     } //while
82     idx = l;
83     while (idx<=maxh && rnas[idx]->start<=x) {
84     idx++;
85     }
86     return (idx>maxh) ? -1 : idx;
87     }
88    
89     int cmpRedundant(GffObj& a, GffObj& b) {
90     if (a.exons.Count()==b.exons.Count()) {
91     if (a.covlen==b.covlen) {
92     return strcmp(a.getID(), b.getID());
93     }
94     else return (a.covlen>b.covlen)? 1 : -1;
95     }
96     else return (a.exons.Count()>b.exons.Count())? 1: -1;
97     }
98    
99    
100     bool tMatch(GffObj& a, GffObj& b) {
101     //strict intron chain match, or single-exon perfect match
102     int imax=a.exons.Count()-1;
103     int jmax=b.exons.Count()-1;
104     int ovlen=0;
105     if (imax!=jmax) return false; //different number of introns
106    
107     if (imax==0) { //single-exon mRNAs
108     //if (equnspl) {
109     //fuzz match for single-exon transfrags:
110     // it's a match if they overlap at least 80% of max len
111     ovlen=a.exons[0]->overlapLen(b.exons[0]);
112     int maxlen=GMAX(a.covlen,b.covlen);
113     return (ovlen>=maxlen*0.8);
114     /*}
115     else {
116     //only exact match
117     ovlen=a.covlen;
118     return (a.exons[0]->start==b.exons[0]->start &&
119     a.exons[0]->end==b.exons[0]->end);
120    
121     }*/
122     }
123     //check intron overlaps
124     ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
125     ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
126     for (int i=1;i<=imax;i++) {
127     if (i<imax) ovlen+=a.exons[i]->len();
128     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
129     (a.exons[i]->start!=b.exons[i]->start)) {
130     return false; //intron mismatch
131     }
132     }
133     return true;
134     }
135    
136    
137     bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
138     //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
139     //but it does not cross any intron-exon boundary of tj
140     int imax=ti.exons.Count()-1;
141     int jmax=tj.exons.Count()-1;
142     if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
143     int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
144     if (fuzzSpan) {
145     for (int j=0;j<=jmax;j++) {
146     //must NOT overlap the introns
147     if ((j>0 && ti.start<tj.exons[j]->start)
148     || (j<jmax && ti.end>tj.exons[j]->end))
149     return false;
150     if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
151     return true;
152     }
153     } else {
154     for (int j=0;j<=jmax;j++) {
155     //must NOT overlap the introns
156     if ((j>0 && ti.start<tj.exons[j]->start)
157     || (j<jmax && ti.end>tj.exons[j]->end))
158     return false;
159     //strict containment
160     if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
161     return true;
162     }
163     }
164     return false;
165     }
166    
167     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
168     // matchAllIntrons==true: transcripts are considered "redundant" only if
169     // they have the exact same number of introns and same splice sites (or none)
170     // (single-exon transcripts can be also fully contained to be considered matching)
171     // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
172     // as long as no intron-exon boundaries are violated; also, a single-exon
173     // transcript will be collapsed if it's contained in one of the exons of the other
174     // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
175     // span of the other
176     //
177     // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
178     // (i.e. they may extend each-other in opposite directions)
179    
180     //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
181     if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
182     int imax=ti.exons.Count()-1;
183     int jmax=tj.exons.Count()-1;
184     GffObj* bigger=NULL;
185     GffObj* smaller=NULL;
186     if (matchAllIntrons) {
187     if (imax!=jmax) return false;
188     if (ti.covlen>tj.covlen) {
189     bigger=&ti;
190     if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
191     }
192     else { //ti.covlen<=tj.covlen
193     bigger=&tj;
194     if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
195     }
196     //check that all introns really match
197     for (int i=0;i<imax;i++) {
198     if (ti.exons[i]->end!=tj.exons[i]->end ||
199     ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
200     }
201     return bigger;
202     }
203     //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
204     int maxlen=0;
205     int minlen=0;
206     if (ti.covlen>tj.covlen) {
207     if (tj.exons.Count()>ti.exons.Count()) {
208     //exon count override
209     bigger=&tj;
210     smaller=&ti;
211     }
212     else {
213     bigger=&ti;
214     smaller=&tj;
215     }
216     maxlen=ti.covlen;
217     minlen=tj.covlen;
218     }
219     else { //tj has more bases
220     if (ti.exons.Count()>tj.exons.Count()) {
221     //exon count override
222     bigger=&ti;
223     smaller=&tj;
224     }
225     else {
226     bigger=&tj;
227     smaller=&ti;
228     }
229     maxlen=tj.covlen;
230     minlen=ti.covlen;
231     }
232     if (imax==0 && jmax==0) {
233     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
234     if (fuzzSpan) {
235     return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
236     }
237     else {
238     return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
239     }
240     }
241     //containment is also considered redundancy
242     if (smaller->exons.Count()==1) {
243     //check if this single exon is contained in any of tj exons
244     //without violating any intron-exon boundaries
245     return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
246     }
247    
248     //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
249     if (ti.exons[imax]->start<tj.exons[0]->end ||
250     tj.exons[jmax]->start<ti.exons[0]->end )
251     return NULL; //intron chains do not overlap at all
252    
253    
254     //checking full intron chain containment
255     uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
256     int i=1; //exon idx to the right of the current intron of ti
257     int j=1; //exon idx to the right of the current intron of tj
258     //find the first intron overlap:
259     while (i<=imax && j<=jmax) {
260     eistart=ti.exons[i-1]->end;
261     eiend=ti.exons[i]->start;
262     ejstart=tj.exons[j-1]->end;
263     ejend=tj.exons[j]->start;
264     if (ejend<eistart) { j++; continue; }
265     if (eiend<ejstart) { i++; continue; }
266     //we found an intron overlap
267     break;
268     }
269     if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
270     if ((i>1 && j>1) || i>imax || j>jmax) {
271     return NULL; //either no intron overlaps found at all
272     //or it's not the first intron for at least one of the transcripts
273     }
274     if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
275     if (j>i) {
276     //i==1, ti's start must not conflict with the previous intron of tj
277     if (ti.start<tj.exons[j-1]->start) return NULL;
278     //so i's first intron starts AFTER j's first intron
279     // then j must contain i, so i's last intron must end with or before j's last intron
280     if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
281     //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
282     }
283     else if (i>j) {
284     //j==1, tj's start must not conflict with the previous intron of ti
285     if (tj.start<ti.exons[i-1]->start) return NULL;
286     //so j's intron chain starts AFTER i's
287     // then i must contain j, so j's last intron must end with or before j's last intron
288     if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
289     //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
290     }
291     //now check if the rest of the introns overlap, in the same sequence
292     i++;
293     j++;
294     while (i<=imax && j<=jmax) {
295     if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
296     ti.exons[i]->start!=tj.exons[j]->start) return NULL;
297     i++;
298     j++;
299     }
300     i--;
301     j--;
302     if (i==imax && j<jmax) {
303     // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
304     if (ti.end>tj.exons[j]->end) return NULL;
305     }
306     else if (j==jmax && i<imax) {
307     if (tj.end>ti.exons[i]->end) return NULL;
308     }
309     return bigger;
310     }
311    
312    
313     int gseqCmpName(const pointer p1, const pointer p2) {
314     return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
315     }
316    
317    
318     void printLocus(GffLocus* loc, const char* pre=NULL) {
319     if (pre!=NULL) fprintf(stderr, "%s", pre);
320     GMessage(" [%d-%d] : ", loc->start, loc->end);
321     GMessage("%s",loc->rnas[0]->getID());
322     for (int i=1;i<loc->rnas.Count();i++) {
323     GMessage(",%s",loc->rnas[i]->getID());
324     }
325     GMessage("\n");
326     }
327    
328     void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
329     //GMessage(">>Placing transcript %s\n", t->getID());
330     GTData* tdata=new GTData(t);
331     gdata->tdata.Add(tdata);
332     int tidx=-1;
333     if (t->exons.Count()>0)
334     tidx=gdata->rnas.Add(t); //added it in sorted order
335     else {
336     gdata->gfs.Add(t);
337     return; //nothing to do with these non-transcript objects
338     }
339     if (!doCluster) return;
340     if (gdata->loci.Count()==0) {
341     gdata->loci.Add(new GffLocus(t));
342     //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
343     return;
344     }
345     //DEBUG: show available loci:
346     // GMessage(" [%d loci already:\n", gdata->loci.Count());
347     //for (int l=0;l<gdata->loci.Count();l++) {
348     // printLocus(gdata->loci[l]);
349     // }
350     int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
351     //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
352     if (nidx==0) {
353     //cannot have any overlapping loci
354     //GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
355     gdata->loci.Add(new GffLocus(t));
356     return;
357     }
358     if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
359     int lfound=0; //count of parent loci
360     GArray<int> mrgloci(false);
361     GList<GffLocus> tloci(true); //candidate parent loci to adopt this
362     //GMessage("\tchecking all loci from %d to 0\n",nidx-1);
363     for (int l=nidx-1;l>=0;l--) {
364     GffLocus& loc=*(gdata->loci[l]);
365     if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
366     if (t->start>loc.end) {
367     if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
368     continue;
369     }
370     if (loc.start>t->end) continue;
371     //this should never be the case if nidx was found correctly
372     //GMessage(" !range overlap found with locus ");
373     //printLocus(&loc);
374     if (loc.add_RNA(t)) {
375     //will add this transcript to loc
376     lfound++;
377     mrgloci.Add(l);
378     if (collapseRedundant) {
379     //compare to every single transcript in this locus
380     for (int ti=0;ti<loc.rnas.Count();ti++) {
381     if (loc.rnas[ti]==t) continue;
382     GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
383     //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
384     GffObj* container=NULL;
385     if (odata->replaced_by==NULL &&
386     (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
387     if (container==t) {
388     odata->replaced_by=t;
389     }
390     else {
391     tdata->replaced_by=loc.rnas[ti];
392     }
393     }
394     }//for each transcript in the exon-overlapping locus
395     } //if doCollapseRedundant
396     } //overlapping locus
397     }
398     if (lfound==0) {
399     //overlapping loci not found, create a locus with only this mRNA
400     //GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
401     int addidx=gdata->loci.Add(new GffLocus(t));
402     if (addidx<0) {
403     GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
404     }
405     }
406     else if (lfound>1) {
407     //more than one loci found parenting this mRNA, merge loci
408     //if (lfound>2) GMessage(" merging %d loci \n",lfound);
409     lfound--;
410     for (int l=0;l<lfound;l++) {
411     int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
412     gdata->loci[mrgloci[lfound]]->addMerge(*(gdata->loci[mlidx]), t);
413     gdata->loci.Delete(mlidx);
414     }
415     }
416     }
417    
418     void collectLocusData(GList<GenomicSeqData>& ref_data) {
419     int locus_num=0;
420     for (int g=0;g<ref_data.Count();g++) {
421     GenomicSeqData* gdata=ref_data[g];
422     for (int l=0;l<gdata->loci.Count();l++) {
423     GffLocus& loc=*(gdata->loci[l]);
424     GHash<int> gnames(true); //gene names in this locus
425     GHash<int> geneids(true); //Entrez GeneID: numbers
426     for (int i=0;i<loc.rnas.Count();i++) {
427     GffObj& t=*(loc.rnas[i]);
428     GStr gname(t.getGeneName());
429     if (!gname.is_empty()) {
430     gname.upper();
431     int* prevg=gnames.Find(gname.chars());
432     if (prevg!=NULL) (*prevg)++;
433     else gnames.Add(gname, new int(1));
434     }
435     //parse GeneID xrefs, if any:
436     GStr xrefs(t.getAttr("xrefs"));
437     if (!xrefs.is_empty()) {
438     xrefs.startTokenize(",");
439     GStr token;
440     while (xrefs.nextToken(token)) {
441     token.upper();
442     if (token.startsWith("GENEID:")) {
443     token.cut(0,token.index(':')+1);
444     int* prevg=geneids.Find(token.chars());
445     if (prevg!=NULL) (*prevg)++;
446     else geneids.Add(token, new int(1));
447     }
448     } //for each xref
449     } //xrefs parsing
450     }//for each transcript
451     locus_num++;
452     loc.locus_num=locus_num;
453     if (gnames.Count()>0) { //collect all gene names associated to this locus
454     gnames.startIterate();
455     int* gfreq=NULL;
456     char* key=NULL;
457     while ((gfreq=gnames.NextData(key))!=NULL) {
458     loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
459     }
460     } //added collected gene_names
461     if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
462     geneids.startIterate();
463     int* gfreq=NULL;
464     char* key=NULL;
465     while ((gfreq=geneids.NextData(key))!=NULL) {
466     loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
467     }
468     }
469     } //for each locus
470     }//for each genomic sequence
471     }
472    
473    
474     void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
475     bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
476     GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
477     gffr->showWarnings(this->showWarnings);
478     // keepAttrs mergeCloseExons noExonAttr
479     gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
480     //int redundant=0; //redundant annotation discarded
481     if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
482     //int rna_deleted=0;
483     //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
484     for (int k=0;k<gffr->gflst.Count();k++) {
485     GffObj* m=gffr->gflst[k];
486     if (strcmp(m->getFeatureName(), "locus")==0 &&
487     m->getAttr("transcripts")!=NULL) {
488     continue;
489     }
490    
491     char* rloc=m->getAttr("locus");
492     if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
493     m->removeAttr("locus", rloc);
494     }
495     if (m->exons.Count()==0 && m->children.Count()==0) {
496     //a non-mRNA feature with no subfeatures
497     //add a dummy exon just to have the generic exon checking work
498     m->addExon(m->start,m->end);
499     }
500     GList<GffObj> gfadd(false,false);
501     if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
502     continue;
503     }
504     m->isUsed(true); //so the gffreader won't destroy it
505     int i=-1;
506     GenomicSeqData f(m->gseq_id);
507     GenomicSeqData* gdata=NULL;
508    
509     if (seqdata.Found(&f,i)) gdata=seqdata[i];
510     else { //entry not created yet for this genomic seq
511     gdata=new GenomicSeqData(m->gseq_id);
512     seqdata.Add(gdata);
513     }
514     for (int k=0;k<gfadd.Count();k++) {
515     placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
516     }
517     placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
518     } //for each read gffObj
519     //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
520     if (f!=stdin) { fclose(f); f=NULL; }
521     delete gffr;
522     }