ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.cpp
Revision: 48
Committed: Wed Sep 7 16:07:32 2011 UTC (8 years ago) by gpertea
File size: 19699 byte(s)
Log Message:
longest CDS is now kept on -MK containment

Line User Rev File contents
1 gpertea 26 #include "gff_utils.h"
2    
3     extern bool verbose;
4     extern bool debugMode;
5    
6     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
7     if (seq==NULL) return;
8     int len=(seqlen>0)?seqlen:strlen(seq);
9     if (len<=0) return;
10     if (!defline.is_empty())
11     fprintf(f, ">%s\n",defline.chars());
12     int ilen=0;
13     for (int i=0; i < len; i++, ilen++) {
14     if (ilen == 70) {
15     fputc('\n', f);
16     ilen = 0;
17     }
18     putc(seq[i], f);
19     } //for
20     fputc('\n', f);
21     }
22    
23     int qsearch_gloci(uint x, GList<GffLocus>& loci) {
24     //binary search
25     //do the simplest tests first:
26     if (loci[0]->start>x) return 0;
27     if (loci.Last()->start<x) return -1;
28     uint istart=0;
29     int i=0;
30     int idx=-1;
31     int maxh=loci.Count()-1;
32     int l=0;
33     int h = maxh;
34     while (l <= h) {
35     i = (l+h)>>1;
36     istart=loci[i]->start;
37     if (istart < x) l = i + 1;
38     else {
39     if (istart == x) { //found matching coordinate here
40     idx=i;
41     while (idx<=maxh && loci[idx]->start==x) {
42     idx++;
43     }
44     return (idx>maxh) ? -1 : idx;
45     }
46     h = i - 1;
47     }
48     } //while
49     idx = l;
50     while (idx<=maxh && loci[idx]->start<=x) {
51     idx++;
52     }
53     return (idx>maxh) ? -1 : idx;
54     }
55    
56     int qsearch_rnas(uint x, GList<GffObj>& rnas) {
57     //binary search
58     //do the simplest tests first:
59     if (rnas[0]->start>x) return 0;
60     if (rnas.Last()->start<x) return -1;
61     uint istart=0;
62     int i=0;
63     int idx=-1;
64     int maxh=rnas.Count()-1;
65     int l=0;
66     int h = maxh;
67     while (l <= h) {
68     i = (l+h)>>1;
69     istart=rnas[i]->start;
70     if (istart < x) l = i + 1;
71     else {
72     if (istart == x) { //found matching coordinate here
73     idx=i;
74     while (idx<=maxh && rnas[idx]->start==x) {
75     idx++;
76     }
77     return (idx>maxh) ? -1 : idx;
78     }
79     h = i - 1;
80     }
81     } //while
82     idx = l;
83     while (idx<=maxh && rnas[idx]->start<=x) {
84     idx++;
85     }
86     return (idx>maxh) ? -1 : idx;
87     }
88    
89     int cmpRedundant(GffObj& a, GffObj& b) {
90     if (a.exons.Count()==b.exons.Count()) {
91     if (a.covlen==b.covlen) {
92     return strcmp(a.getID(), b.getID());
93     }
94     else return (a.covlen>b.covlen)? 1 : -1;
95     }
96     else return (a.exons.Count()>b.exons.Count())? 1: -1;
97     }
98    
99    
100     bool tMatch(GffObj& a, GffObj& b) {
101     //strict intron chain match, or single-exon perfect match
102     int imax=a.exons.Count()-1;
103     int jmax=b.exons.Count()-1;
104     int ovlen=0;
105     if (imax!=jmax) return false; //different number of introns
106    
107     if (imax==0) { //single-exon mRNAs
108     //if (equnspl) {
109     //fuzz match for single-exon transfrags:
110     // it's a match if they overlap at least 80% of max len
111     ovlen=a.exons[0]->overlapLen(b.exons[0]);
112     int maxlen=GMAX(a.covlen,b.covlen);
113     return (ovlen>=maxlen*0.8);
114     /*}
115     else {
116     //only exact match
117     ovlen=a.covlen;
118     return (a.exons[0]->start==b.exons[0]->start &&
119     a.exons[0]->end==b.exons[0]->end);
120    
121     }*/
122     }
123     //check intron overlaps
124     ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
125     ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
126     for (int i=1;i<=imax;i++) {
127     if (i<imax) ovlen+=a.exons[i]->len();
128     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
129     (a.exons[i]->start!=b.exons[i]->start)) {
130     return false; //intron mismatch
131     }
132     }
133     return true;
134     }
135    
136    
137     bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
138     //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
139     //but it does not cross any intron-exon boundary of tj
140     int imax=ti.exons.Count()-1;
141     int jmax=tj.exons.Count()-1;
142     if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
143     int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
144     if (fuzzSpan) {
145     for (int j=0;j<=jmax;j++) {
146     //must NOT overlap the introns
147     if ((j>0 && ti.start<tj.exons[j]->start)
148     || (j<jmax && ti.end>tj.exons[j]->end))
149     return false;
150     if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
151     return true;
152     }
153     } else {
154     for (int j=0;j<=jmax;j++) {
155     //must NOT overlap the introns
156     if ((j>0 && ti.start<tj.exons[j]->start)
157     || (j<jmax && ti.end>tj.exons[j]->end))
158     return false;
159     //strict containment
160     if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
161     return true;
162     }
163     }
164     return false;
165     }
166    
167     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
168     // matchAllIntrons==true: transcripts are considered "redundant" only if
169     // they have the exact same number of introns and same splice sites (or none)
170     // (single-exon transcripts can be also fully contained to be considered matching)
171     // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
172     // as long as no intron-exon boundaries are violated; also, a single-exon
173     // transcript will be collapsed if it's contained in one of the exons of the other
174     // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
175     // span of the other
176     //
177     // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
178     // (i.e. they may extend each-other in opposite directions)
179    
180     //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
181     if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
182     int imax=ti.exons.Count()-1;
183     int jmax=tj.exons.Count()-1;
184     GffObj* bigger=NULL;
185     GffObj* smaller=NULL;
186     if (matchAllIntrons) {
187     if (imax!=jmax) return false;
188     if (ti.covlen>tj.covlen) {
189     bigger=&ti;
190     if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
191     }
192     else { //ti.covlen<=tj.covlen
193     bigger=&tj;
194     if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
195     }
196     //check that all introns really match
197     for (int i=0;i<imax;i++) {
198     if (ti.exons[i]->end!=tj.exons[i]->end ||
199     ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
200     }
201     return bigger;
202     }
203     //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
204     int maxlen=0;
205     int minlen=0;
206     if (ti.covlen>tj.covlen) {
207     if (tj.exons.Count()>ti.exons.Count()) {
208     //exon count override
209     bigger=&tj;
210     smaller=&ti;
211     }
212     else {
213     bigger=&ti;
214     smaller=&tj;
215     }
216     maxlen=ti.covlen;
217     minlen=tj.covlen;
218     }
219     else { //tj has more bases
220     if (ti.exons.Count()>tj.exons.Count()) {
221     //exon count override
222     bigger=&ti;
223     smaller=&tj;
224     }
225     else {
226     bigger=&tj;
227     smaller=&ti;
228     }
229     maxlen=tj.covlen;
230     minlen=ti.covlen;
231     }
232     if (imax==0 && jmax==0) {
233     //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
234     if (fuzzSpan) {
235     return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
236     }
237     else {
238     return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
239     }
240     }
241     //containment is also considered redundancy
242     if (smaller->exons.Count()==1) {
243     //check if this single exon is contained in any of tj exons
244     //without violating any intron-exon boundaries
245     return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
246     }
247    
248     //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
249     if (ti.exons[imax]->start<tj.exons[0]->end ||
250     tj.exons[jmax]->start<ti.exons[0]->end )
251     return NULL; //intron chains do not overlap at all
252    
253    
254     //checking full intron chain containment
255     uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
256     int i=1; //exon idx to the right of the current intron of ti
257     int j=1; //exon idx to the right of the current intron of tj
258     //find the first intron overlap:
259     while (i<=imax && j<=jmax) {
260     eistart=ti.exons[i-1]->end;
261     eiend=ti.exons[i]->start;
262     ejstart=tj.exons[j-1]->end;
263     ejend=tj.exons[j]->start;
264     if (ejend<eistart) { j++; continue; }
265     if (eiend<ejstart) { i++; continue; }
266     //we found an intron overlap
267     break;
268     }
269     if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
270     if ((i>1 && j>1) || i>imax || j>jmax) {
271     return NULL; //either no intron overlaps found at all
272     //or it's not the first intron for at least one of the transcripts
273     }
274     if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
275     if (j>i) {
276     //i==1, ti's start must not conflict with the previous intron of tj
277     if (ti.start<tj.exons[j-1]->start) return NULL;
278     //so i's first intron starts AFTER j's first intron
279     // then j must contain i, so i's last intron must end with or before j's last intron
280     if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
281     //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
282     }
283     else if (i>j) {
284     //j==1, tj's start must not conflict with the previous intron of ti
285     if (tj.start<ti.exons[i-1]->start) return NULL;
286     //so j's intron chain starts AFTER i's
287     // then i must contain j, so j's last intron must end with or before j's last intron
288     if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
289     //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
290     }
291     //now check if the rest of the introns overlap, in the same sequence
292     i++;
293     j++;
294     while (i<=imax && j<=jmax) {
295     if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
296     ti.exons[i]->start!=tj.exons[j]->start) return NULL;
297     i++;
298     j++;
299     }
300     i--;
301     j--;
302     if (i==imax && j<jmax) {
303     // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
304     if (ti.end>tj.exons[j]->end) return NULL;
305     }
306     else if (j==jmax && i<imax) {
307     if (tj.end>ti.exons[i]->end) return NULL;
308     }
309     return bigger;
310     }
311    
312    
313     int gseqCmpName(const pointer p1, const pointer p2) {
314     return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
315     }
316    
317    
318     void printLocus(GffLocus* loc, const char* pre=NULL) {
319     if (pre!=NULL) fprintf(stderr, "%s", pre);
320     GMessage(" [%d-%d] : ", loc->start, loc->end);
321     GMessage("%s",loc->rnas[0]->getID());
322     for (int i=1;i<loc->rnas.Count();i++) {
323     GMessage(",%s",loc->rnas[i]->getID());
324     }
325     GMessage("\n");
326     }
327    
328 gpertea 48 void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
329     //transfer CDS info to the container t if it's a larger protein
330     if (tfrom->CDstart==0) return;
331     if (t->CDstart) {
332     if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
333     t->CDstart=tfrom->CDstart;
334     if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
335     t->CDend=tfrom->CDend;
336     }
337     else { //no CDS info on container, just copy it from the contained
338     if (tfrom->CDstart>=t->start) t->CDstart=tfrom->CDstart;
339 gpertea 47 else t->CDstart=t->start;
340 gpertea 48 if (tfrom->CDend<=t->end) t->CDend=tfrom->CDend;
341 gpertea 47 else t->CDend=t->end;
342 gpertea 48 }
343    
344 gpertea 47
345     }
346    
347 gpertea 26 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
348     //GMessage(">>Placing transcript %s\n", t->getID());
349     GTData* tdata=new GTData(t);
350     gdata->tdata.Add(tdata);
351     int tidx=-1;
352     if (t->exons.Count()>0)
353     tidx=gdata->rnas.Add(t); //added it in sorted order
354     else {
355     gdata->gfs.Add(t);
356     return; //nothing to do with these non-transcript objects
357     }
358     if (!doCluster) return;
359     if (gdata->loci.Count()==0) {
360     gdata->loci.Add(new GffLocus(t));
361     //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
362     return;
363     }
364     //DEBUG: show available loci:
365     // GMessage(" [%d loci already:\n", gdata->loci.Count());
366     //for (int l=0;l<gdata->loci.Count();l++) {
367     // printLocus(gdata->loci[l]);
368     // }
369     int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
370     //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
371     if (nidx==0) {
372     //cannot have any overlapping loci
373     //GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
374     gdata->loci.Add(new GffLocus(t));
375     return;
376     }
377     if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
378     int lfound=0; //count of parent loci
379     GArray<int> mrgloci(false);
380     GList<GffLocus> tloci(true); //candidate parent loci to adopt this
381     //GMessage("\tchecking all loci from %d to 0\n",nidx-1);
382     for (int l=nidx-1;l>=0;l--) {
383     GffLocus& loc=*(gdata->loci[l]);
384     if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
385     if (t->start>loc.end) {
386     if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
387     continue;
388     }
389     if (loc.start>t->end) continue;
390     //this should never be the case if nidx was found correctly
391     //GMessage(" !range overlap found with locus ");
392     //printLocus(&loc);
393     if (loc.add_RNA(t)) {
394     //will add this transcript to loc
395     lfound++;
396     mrgloci.Add(l);
397     if (collapseRedundant) {
398     //compare to every single transcript in this locus
399     for (int ti=0;ti<loc.rnas.Count();ti++) {
400     if (loc.rnas[ti]==t) continue;
401     GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
402     //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
403     GffObj* container=NULL;
404     if (odata->replaced_by==NULL &&
405     (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
406     if (container==t) {
407     odata->replaced_by=t;
408 gpertea 48 preserveContainedCDS(t, loc.rnas[ti]);
409 gpertea 26 }
410     else {
411     tdata->replaced_by=loc.rnas[ti];
412 gpertea 48 preserveContainedCDS(loc.rnas[ti], t);
413 gpertea 26 }
414     }
415     }//for each transcript in the exon-overlapping locus
416     } //if doCollapseRedundant
417     } //overlapping locus
418     }
419     if (lfound==0) {
420     //overlapping loci not found, create a locus with only this mRNA
421     //GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
422     int addidx=gdata->loci.Add(new GffLocus(t));
423     if (addidx<0) {
424     GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
425     }
426     }
427     else if (lfound>1) {
428     //more than one loci found parenting this mRNA, merge loci
429     //if (lfound>2) GMessage(" merging %d loci \n",lfound);
430     lfound--;
431     for (int l=0;l<lfound;l++) {
432     int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
433     gdata->loci[mrgloci[lfound]]->addMerge(*(gdata->loci[mlidx]), t);
434     gdata->loci.Delete(mlidx);
435     }
436     }
437     }
438    
439     void collectLocusData(GList<GenomicSeqData>& ref_data) {
440     int locus_num=0;
441     for (int g=0;g<ref_data.Count();g++) {
442     GenomicSeqData* gdata=ref_data[g];
443     for (int l=0;l<gdata->loci.Count();l++) {
444     GffLocus& loc=*(gdata->loci[l]);
445     GHash<int> gnames(true); //gene names in this locus
446     GHash<int> geneids(true); //Entrez GeneID: numbers
447     for (int i=0;i<loc.rnas.Count();i++) {
448     GffObj& t=*(loc.rnas[i]);
449     GStr gname(t.getGeneName());
450     if (!gname.is_empty()) {
451     gname.upper();
452     int* prevg=gnames.Find(gname.chars());
453     if (prevg!=NULL) (*prevg)++;
454     else gnames.Add(gname, new int(1));
455     }
456     //parse GeneID xrefs, if any:
457     GStr xrefs(t.getAttr("xrefs"));
458     if (!xrefs.is_empty()) {
459     xrefs.startTokenize(",");
460     GStr token;
461     while (xrefs.nextToken(token)) {
462     token.upper();
463     if (token.startsWith("GENEID:")) {
464     token.cut(0,token.index(':')+1);
465     int* prevg=geneids.Find(token.chars());
466     if (prevg!=NULL) (*prevg)++;
467     else geneids.Add(token, new int(1));
468     }
469     } //for each xref
470     } //xrefs parsing
471     }//for each transcript
472     locus_num++;
473     loc.locus_num=locus_num;
474     if (gnames.Count()>0) { //collect all gene names associated to this locus
475     gnames.startIterate();
476     int* gfreq=NULL;
477     char* key=NULL;
478     while ((gfreq=gnames.NextData(key))!=NULL) {
479     loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
480     }
481     } //added collected gene_names
482     if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
483     geneids.startIterate();
484     int* gfreq=NULL;
485     char* key=NULL;
486     while ((gfreq=geneids.NextData(key))!=NULL) {
487     loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
488     }
489     }
490     } //for each locus
491     }//for each genomic sequence
492     }
493    
494    
495     void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
496     bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
497     GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
498     gffr->showWarnings(this->showWarnings);
499     // keepAttrs mergeCloseExons noExonAttr
500     gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
501     //int redundant=0; //redundant annotation discarded
502     if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
503     //int rna_deleted=0;
504     //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
505     for (int k=0;k<gffr->gflst.Count();k++) {
506     GffObj* m=gffr->gflst[k];
507     if (strcmp(m->getFeatureName(), "locus")==0 &&
508     m->getAttr("transcripts")!=NULL) {
509     continue;
510     }
511    
512     char* rloc=m->getAttr("locus");
513     if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
514     m->removeAttr("locus", rloc);
515     }
516     if (m->exons.Count()==0 && m->children.Count()==0) {
517     //a non-mRNA feature with no subfeatures
518     //add a dummy exon just to have the generic exon checking work
519     m->addExon(m->start,m->end);
520     }
521     GList<GffObj> gfadd(false,false);
522     if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
523     continue;
524     }
525     m->isUsed(true); //so the gffreader won't destroy it
526     int i=-1;
527     GenomicSeqData f(m->gseq_id);
528     GenomicSeqData* gdata=NULL;
529    
530     if (seqdata.Found(&f,i)) gdata=seqdata[i];
531     else { //entry not created yet for this genomic seq
532     gdata=new GenomicSeqData(m->gseq_id);
533     seqdata.Add(gdata);
534     }
535     for (int k=0;k<gfadd.Count();k++) {
536     placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
537     }
538     placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
539     } //for each read gffObj
540     //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
541     if (f!=stdin) { fclose(f); f=NULL; }
542     delete gffr;
543     }