ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.cpp
Revision: 56
Committed: Thu Sep 8 05:47:09 2011 UTC (8 years ago) by gpertea
File size: 19545 byte(s)
Log Message:
RPS20P29, ENSG00000241828 not collapsing

Line User Rev File contents
1 gpertea 26 #include "gff_utils.h"
2    
3     extern bool verbose;
4     extern bool debugMode;
5    
6     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
7     if (seq==NULL) return;
8     int len=(seqlen>0)?seqlen:strlen(seq);
9     if (len<=0) return;
10     if (!defline.is_empty())
11     fprintf(f, ">%s\n",defline.chars());
12     int ilen=0;
13     for (int i=0; i < len; i++, ilen++) {
14     if (ilen == 70) {
15     fputc('\n', f);
16     ilen = 0;
17     }
18     putc(seq[i], f);
19     } //for
20     fputc('\n', f);
21     }
22    
23     int qsearch_gloci(uint x, GList<GffLocus>& loci) {
24     //binary search
25     //do the simplest tests first:
26     if (loci[0]->start>x) return 0;
27     if (loci.Last()->start<x) return -1;
28     uint istart=0;
29     int i=0;
30     int idx=-1;
31     int maxh=loci.Count()-1;
32     int l=0;
33     int h = maxh;
34     while (l <= h) {
35     i = (l+h)>>1;
36     istart=loci[i]->start;
37     if (istart < x) l = i + 1;
38     else {
39     if (istart == x) { //found matching coordinate here
40     idx=i;
41     while (idx<=maxh && loci[idx]->start==x) {
42     idx++;
43     }
44     return (idx>maxh) ? -1 : idx;
45     }
46     h = i - 1;
47     }
48     } //while
49     idx = l;
50     while (idx<=maxh && loci[idx]->start<=x) {
51     idx++;
52     }
53     return (idx>maxh) ? -1 : idx;
54     }
55    
56     int qsearch_rnas(uint x, GList<GffObj>& rnas) {
57     //binary search
58     //do the simplest tests first:
59     if (rnas[0]->start>x) return 0;
60     if (rnas.Last()->start<x) return -1;
61     uint istart=0;
62     int i=0;
63     int idx=-1;
64     int maxh=rnas.Count()-1;
65     int l=0;
66     int h = maxh;
67     while (l <= h) {
68     i = (l+h)>>1;
69     istart=rnas[i]->start;
70     if (istart < x) l = i + 1;
71     else {
72     if (istart == x) { //found matching coordinate here
73     idx=i;
74     while (idx<=maxh && rnas[idx]->start==x) {
75     idx++;
76     }
77     return (idx>maxh) ? -1 : idx;
78     }
79     h = i - 1;
80     }
81     } //while
82     idx = l;
83     while (idx<=maxh && rnas[idx]->start<=x) {
84     idx++;
85     }
86     return (idx>maxh) ? -1 : idx;
87     }
88    
89     int cmpRedundant(GffObj& a, GffObj& b) {
90     if (a.exons.Count()==b.exons.Count()) {
91     if (a.covlen==b.covlen) {
92     return strcmp(a.getID(), b.getID());
93     }
94     else return (a.covlen>b.covlen)? 1 : -1;
95     }
96     else return (a.exons.Count()>b.exons.Count())? 1: -1;
97     }
98    
99    
100     bool tMatch(GffObj& a, GffObj& b) {
101     //strict intron chain match, or single-exon perfect match
102     int imax=a.exons.Count()-1;
103     int jmax=b.exons.Count()-1;
104     int ovlen=0;
105     if (imax!=jmax) return false; //different number of introns
106    
107     if (imax==0) { //single-exon mRNAs
108     //if (equnspl) {
109     //fuzz match for single-exon transfrags:
110     // it's a match if they overlap at least 80% of max len
111     ovlen=a.exons[0]->overlapLen(b.exons[0]);
112     int maxlen=GMAX(a.covlen,b.covlen);
113     return (ovlen>=maxlen*0.8);
114     /*}
115     else {
116     //only exact match
117     ovlen=a.covlen;
118     return (a.exons[0]->start==b.exons[0]->start &&
119     a.exons[0]->end==b.exons[0]->end);
120    
121     }*/
122     }
123     //check intron overlaps
124     ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
125     ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
126     for (int i=1;i<=imax;i++) {
127     if (i<imax) ovlen+=a.exons[i]->len();
128     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
129     (a.exons[i]->start!=b.exons[i]->start)) {
130     return false; //intron mismatch
131     }
132     }
133     return true;
134     }
135    
136    
137     bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
138     //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
139     //but it does not cross any intron-exon boundary of tj
140     int imax=ti.exons.Count()-1;
141     int jmax=tj.exons.Count()-1;
142     if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
143     int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
144     if (fuzzSpan) {
145     for (int j=0;j<=jmax;j++) {
146     //must NOT overlap the introns
147     if ((j>0 && ti.start<tj.exons[j]->start)
148     || (j<jmax && ti.end>tj.exons[j]->end))
149     return false;
150     if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
151     return true;
152     }
153     } else {
154     for (int j=0;j<=jmax;j++) {
155     //must NOT overlap the introns
156     if ((j>0 && ti.start<tj.exons[j]->start)
157     || (j<jmax && ti.end>tj.exons[j]->end))
158     return false;
159     //strict containment
160     if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
161     return true;
162     }
163     }
164     return false;
165     }
166    
167     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
168     // matchAllIntrons==true: transcripts are considered "redundant" only if
169     // they have the exact same number of introns and same splice sites (or none)
170     // (single-exon transcripts can be also fully contained to be considered matching)
171     // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
172     // as long as no intron-exon boundaries are violated; also, a single-exon
173     // transcript will be collapsed if it's contained in one of the exons of the other
174     // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
175     // span of the other
176     //
177     // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
178     // (i.e. they may extend each-other in opposite directions)
179    
180     //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
181     if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
182     int imax=ti.exons.Count()-1;
183     int jmax=tj.exons.Count()-1;
184     GffObj* bigger=NULL;
185     GffObj* smaller=NULL;
186     if (matchAllIntrons) {
187     if (imax!=jmax) return false;
188     if (ti.covlen>tj.covlen) {
189     bigger=&ti;
190     if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
191     }
192     else { //ti.covlen<=tj.covlen
193     bigger=&tj;
194     if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
195     }
196     //check that all introns really match
197     for (int i=0;i<imax;i++) {
198     if (ti.exons[i]->end!=tj.exons[i]->end ||
199     ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
200     }
201     return bigger;
202     }
203     //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
204     int maxlen=0;
205     int minlen=0;
206     if (ti.covlen>tj.covlen) {
207     if (tj.exons.Count()>ti.exons.Count()) {
208     //exon count override
209     bigger=&tj;
210     smaller=&ti;
211     }
212     else {
213     bigger=&ti;
214     smaller=&tj;
215     }
216     maxlen=ti.covlen;
217     minlen=tj.covlen;
218     }
219     else { //tj has more bases
220     if (ti.exons.Count()>tj.exons.Count()) {
221     //exon count override
222     bigger=&ti;
223     smaller=&tj;
224     }
225     else {
226     bigger=&tj;
227     smaller=&ti;
228     }
229     maxlen=tj.covlen;
230     minlen=ti.covlen;
231     }
232     if (imax==0 && jmax==0) {
233     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
234     if (fuzzSpan) {
235     return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
236     }
237     else {
238     return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
239     }
240     }
241     //containment is also considered redundancy
242     if (smaller->exons.Count()==1) {
243     //check if this single exon is contained in any of tj exons
244     //without violating any intron-exon boundaries
245     return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
246     }
247    
248     //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
249     if (ti.exons[imax]->start<tj.exons[0]->end ||
250     tj.exons[jmax]->start<ti.exons[0]->end )
251     return NULL; //intron chains do not overlap at all
252    
253    
254     //checking full intron chain containment
255     uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
256     int i=1; //exon idx to the right of the current intron of ti
257     int j=1; //exon idx to the right of the current intron of tj
258     //find the first intron overlap:
259     while (i<=imax && j<=jmax) {
260     eistart=ti.exons[i-1]->end;
261     eiend=ti.exons[i]->start;
262     ejstart=tj.exons[j-1]->end;
263     ejend=tj.exons[j]->start;
264     if (ejend<eistart) { j++; continue; }
265     if (eiend<ejstart) { i++; continue; }
266     //we found an intron overlap
267     break;
268     }
269     if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
270     if ((i>1 && j>1) || i>imax || j>jmax) {
271     return NULL; //either no intron overlaps found at all
272     //or it's not the first intron for at least one of the transcripts
273     }
274     if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
275     if (j>i) {
276     //i==1, ti's start must not conflict with the previous intron of tj
277     if (ti.start<tj.exons[j-1]->start) return NULL;
278     //so i's first intron starts AFTER j's first intron
279     // then j must contain i, so i's last intron must end with or before j's last intron
280     if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
281     //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
282     }
283     else if (i>j) {
284     //j==1, tj's start must not conflict with the previous intron of ti
285     if (tj.start<ti.exons[i-1]->start) return NULL;
286     //so j's intron chain starts AFTER i's
287     // then i must contain j, so j's last intron must end with or before j's last intron
288     if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
289     //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
290     }
291     //now check if the rest of the introns overlap, in the same sequence
292     i++;
293     j++;
294     while (i<=imax && j<=jmax) {
295     if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
296     ti.exons[i]->start!=tj.exons[j]->start) return NULL;
297     i++;
298     j++;
299     }
300     i--;
301     j--;
302     if (i==imax && j<jmax) {
303     // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
304     if (ti.end>tj.exons[j]->end) return NULL;
305     }
306     else if (j==jmax && i<imax) {
307     if (tj.end>ti.exons[i]->end) return NULL;
308     }
309     return bigger;
310     }
311    
312    
313     int gseqCmpName(const pointer p1, const pointer p2) {
314     return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
315     }
316    
317    
318     void printLocus(GffLocus* loc, const char* pre=NULL) {
319     if (pre!=NULL) fprintf(stderr, "%s", pre);
320     GMessage(" [%d-%d] : ", loc->start, loc->end);
321     GMessage("%s",loc->rnas[0]->getID());
322     for (int i=1;i<loc->rnas.Count();i++) {
323     GMessage(",%s",loc->rnas[i]->getID());
324     }
325     GMessage("\n");
326     }
327    
328 gpertea 48 void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
329     //transfer CDS info to the container t if it's a larger protein
330     if (tfrom->CDstart==0) return;
331     if (t->CDstart) {
332     if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
333     t->CDstart=tfrom->CDstart;
334     if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
335     t->CDend=tfrom->CDend;
336     }
337     else { //no CDS info on container, just copy it from the contained
338 gpertea 56 t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
339 gpertea 48 }
340 gpertea 47 }
341    
342 gpertea 26 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
343     //GMessage(">>Placing transcript %s\n", t->getID());
344     GTData* tdata=new GTData(t);
345     gdata->tdata.Add(tdata);
346     int tidx=-1;
347     if (t->exons.Count()>0)
348     tidx=gdata->rnas.Add(t); //added it in sorted order
349     else {
350     gdata->gfs.Add(t);
351     return; //nothing to do with these non-transcript objects
352     }
353     if (!doCluster) return;
354     if (gdata->loci.Count()==0) {
355     gdata->loci.Add(new GffLocus(t));
356     //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
357     return;
358     }
359     //DEBUG: show available loci:
360     // GMessage(" [%d loci already:\n", gdata->loci.Count());
361     //for (int l=0;l<gdata->loci.Count();l++) {
362     // printLocus(gdata->loci[l]);
363     // }
364     int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
365     //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
366     if (nidx==0) {
367     //cannot have any overlapping loci
368     //GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
369     gdata->loci.Add(new GffLocus(t));
370     return;
371     }
372     if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
373     int lfound=0; //count of parent loci
374     GArray<int> mrgloci(false);
375     GList<GffLocus> tloci(true); //candidate parent loci to adopt this
376     //GMessage("\tchecking all loci from %d to 0\n",nidx-1);
377     for (int l=nidx-1;l>=0;l--) {
378     GffLocus& loc=*(gdata->loci[l]);
379     if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
380     if (t->start>loc.end) {
381     if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
382     continue;
383     }
384     if (loc.start>t->end) continue;
385     //this should never be the case if nidx was found correctly
386     //GMessage(" !range overlap found with locus ");
387     //printLocus(&loc);
388     if (loc.add_RNA(t)) {
389     //will add this transcript to loc
390     lfound++;
391     mrgloci.Add(l);
392     if (collapseRedundant) {
393     //compare to every single transcript in this locus
394     for (int ti=0;ti<loc.rnas.Count();ti++) {
395     if (loc.rnas[ti]==t) continue;
396     GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
397     //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
398     GffObj* container=NULL;
399     if (odata->replaced_by==NULL &&
400     (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
401     if (container==t) {
402     odata->replaced_by=t;
403 gpertea 48 preserveContainedCDS(t, loc.rnas[ti]);
404 gpertea 26 }
405     else {
406     tdata->replaced_by=loc.rnas[ti];
407 gpertea 48 preserveContainedCDS(loc.rnas[ti], t);
408 gpertea 26 }
409     }
410     }//for each transcript in the exon-overlapping locus
411     } //if doCollapseRedundant
412     } //overlapping locus
413     }
414     if (lfound==0) {
415     //overlapping loci not found, create a locus with only this mRNA
416     //GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
417     int addidx=gdata->loci.Add(new GffLocus(t));
418     if (addidx<0) {
419     GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
420     }
421     }
422     else if (lfound>1) {
423     //more than one loci found parenting this mRNA, merge loci
424     //if (lfound>2) GMessage(" merging %d loci \n",lfound);
425     lfound--;
426     for (int l=0;l<lfound;l++) {
427     int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
428     gdata->loci[mrgloci[lfound]]->addMerge(*(gdata->loci[mlidx]), t);
429     gdata->loci.Delete(mlidx);
430     }
431     }
432     }
433    
434     void collectLocusData(GList<GenomicSeqData>& ref_data) {
435     int locus_num=0;
436     for (int g=0;g<ref_data.Count();g++) {
437     GenomicSeqData* gdata=ref_data[g];
438     for (int l=0;l<gdata->loci.Count();l++) {
439     GffLocus& loc=*(gdata->loci[l]);
440     GHash<int> gnames(true); //gene names in this locus
441     GHash<int> geneids(true); //Entrez GeneID: numbers
442     for (int i=0;i<loc.rnas.Count();i++) {
443     GffObj& t=*(loc.rnas[i]);
444     GStr gname(t.getGeneName());
445     if (!gname.is_empty()) {
446     gname.upper();
447     int* prevg=gnames.Find(gname.chars());
448     if (prevg!=NULL) (*prevg)++;
449     else gnames.Add(gname, new int(1));
450     }
451     //parse GeneID xrefs, if any:
452     GStr xrefs(t.getAttr("xrefs"));
453     if (!xrefs.is_empty()) {
454     xrefs.startTokenize(",");
455     GStr token;
456     while (xrefs.nextToken(token)) {
457     token.upper();
458     if (token.startsWith("GENEID:")) {
459     token.cut(0,token.index(':')+1);
460     int* prevg=geneids.Find(token.chars());
461     if (prevg!=NULL) (*prevg)++;
462     else geneids.Add(token, new int(1));
463     }
464     } //for each xref
465     } //xrefs parsing
466     }//for each transcript
467     locus_num++;
468     loc.locus_num=locus_num;
469     if (gnames.Count()>0) { //collect all gene names associated to this locus
470     gnames.startIterate();
471     int* gfreq=NULL;
472     char* key=NULL;
473     while ((gfreq=gnames.NextData(key))!=NULL) {
474     loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
475     }
476     } //added collected gene_names
477     if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
478     geneids.startIterate();
479     int* gfreq=NULL;
480     char* key=NULL;
481     while ((gfreq=geneids.NextData(key))!=NULL) {
482     loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
483     }
484     }
485     } //for each locus
486     }//for each genomic sequence
487     }
488    
489    
490     void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
491     bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
492     GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
493     gffr->showWarnings(this->showWarnings);
494     // keepAttrs mergeCloseExons noExonAttr
495     gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
496     //int redundant=0; //redundant annotation discarded
497     if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
498     //int rna_deleted=0;
499     //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
500     for (int k=0;k<gffr->gflst.Count();k++) {
501     GffObj* m=gffr->gflst[k];
502     if (strcmp(m->getFeatureName(), "locus")==0 &&
503     m->getAttr("transcripts")!=NULL) {
504     continue;
505     }
506    
507     char* rloc=m->getAttr("locus");
508     if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
509     m->removeAttr("locus", rloc);
510     }
511     if (m->exons.Count()==0 && m->children.Count()==0) {
512     //a non-mRNA feature with no subfeatures
513     //add a dummy exon just to have the generic exon checking work
514     m->addExon(m->start,m->end);
515     }
516     GList<GffObj> gfadd(false,false);
517     if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
518     continue;
519     }
520     m->isUsed(true); //so the gffreader won't destroy it
521     int i=-1;
522     GenomicSeqData f(m->gseq_id);
523     GenomicSeqData* gdata=NULL;
524    
525     if (seqdata.Found(&f,i)) gdata=seqdata[i];
526     else { //entry not created yet for this genomic seq
527     gdata=new GenomicSeqData(m->gseq_id);
528     seqdata.Add(gdata);
529     }
530     for (int k=0;k<gfadd.Count();k++) {
531     placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
532     }
533     placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
534     } //for each read gffObj
535     //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
536     if (f!=stdin) { fclose(f); f=NULL; }
537     delete gffr;
538     }