ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.cpp
Revision: 56
Committed: Thu Sep 8 05:47:09 2011 UTC (8 years, 2 months ago) by gpertea
File size: 19545 byte(s)
Log Message:
RPS20P29, ENSG00000241828 not collapsing

Line File contents
1 #include "gff_utils.h"
2
3 extern bool verbose;
4 extern bool debugMode;
5
6 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
7 if (seq==NULL) return;
8 int len=(seqlen>0)?seqlen:strlen(seq);
9 if (len<=0) return;
10 if (!defline.is_empty())
11 fprintf(f, ">%s\n",defline.chars());
12 int ilen=0;
13 for (int i=0; i < len; i++, ilen++) {
14 if (ilen == 70) {
15 fputc('\n', f);
16 ilen = 0;
17 }
18 putc(seq[i], f);
19 } //for
20 fputc('\n', f);
21 }
22
23 int qsearch_gloci(uint x, GList<GffLocus>& loci) {
24 //binary search
25 //do the simplest tests first:
26 if (loci[0]->start>x) return 0;
27 if (loci.Last()->start<x) return -1;
28 uint istart=0;
29 int i=0;
30 int idx=-1;
31 int maxh=loci.Count()-1;
32 int l=0;
33 int h = maxh;
34 while (l <= h) {
35 i = (l+h)>>1;
36 istart=loci[i]->start;
37 if (istart < x) l = i + 1;
38 else {
39 if (istart == x) { //found matching coordinate here
40 idx=i;
41 while (idx<=maxh && loci[idx]->start==x) {
42 idx++;
43 }
44 return (idx>maxh) ? -1 : idx;
45 }
46 h = i - 1;
47 }
48 } //while
49 idx = l;
50 while (idx<=maxh && loci[idx]->start<=x) {
51 idx++;
52 }
53 return (idx>maxh) ? -1 : idx;
54 }
55
56 int qsearch_rnas(uint x, GList<GffObj>& rnas) {
57 //binary search
58 //do the simplest tests first:
59 if (rnas[0]->start>x) return 0;
60 if (rnas.Last()->start<x) return -1;
61 uint istart=0;
62 int i=0;
63 int idx=-1;
64 int maxh=rnas.Count()-1;
65 int l=0;
66 int h = maxh;
67 while (l <= h) {
68 i = (l+h)>>1;
69 istart=rnas[i]->start;
70 if (istart < x) l = i + 1;
71 else {
72 if (istart == x) { //found matching coordinate here
73 idx=i;
74 while (idx<=maxh && rnas[idx]->start==x) {
75 idx++;
76 }
77 return (idx>maxh) ? -1 : idx;
78 }
79 h = i - 1;
80 }
81 } //while
82 idx = l;
83 while (idx<=maxh && rnas[idx]->start<=x) {
84 idx++;
85 }
86 return (idx>maxh) ? -1 : idx;
87 }
88
89 int cmpRedundant(GffObj& a, GffObj& b) {
90 if (a.exons.Count()==b.exons.Count()) {
91 if (a.covlen==b.covlen) {
92 return strcmp(a.getID(), b.getID());
93 }
94 else return (a.covlen>b.covlen)? 1 : -1;
95 }
96 else return (a.exons.Count()>b.exons.Count())? 1: -1;
97 }
98
99
100 bool tMatch(GffObj& a, GffObj& b) {
101 //strict intron chain match, or single-exon perfect match
102 int imax=a.exons.Count()-1;
103 int jmax=b.exons.Count()-1;
104 int ovlen=0;
105 if (imax!=jmax) return false; //different number of introns
106
107 if (imax==0) { //single-exon mRNAs
108 //if (equnspl) {
109 //fuzz match for single-exon transfrags:
110 // it's a match if they overlap at least 80% of max len
111 ovlen=a.exons[0]->overlapLen(b.exons[0]);
112 int maxlen=GMAX(a.covlen,b.covlen);
113 return (ovlen>=maxlen*0.8);
114 /*}
115 else {
116 //only exact match
117 ovlen=a.covlen;
118 return (a.exons[0]->start==b.exons[0]->start &&
119 a.exons[0]->end==b.exons[0]->end);
120
121 }*/
122 }
123 //check intron overlaps
124 ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
125 ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
126 for (int i=1;i<=imax;i++) {
127 if (i<imax) ovlen+=a.exons[i]->len();
128 if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
129 (a.exons[i]->start!=b.exons[i]->start)) {
130 return false; //intron mismatch
131 }
132 }
133 return true;
134 }
135
136
137 bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
138 //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
139 //but it does not cross any intron-exon boundary of tj
140 int imax=ti.exons.Count()-1;
141 int jmax=tj.exons.Count()-1;
142 if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
143 int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
144 if (fuzzSpan) {
145 for (int j=0;j<=jmax;j++) {
146 //must NOT overlap the introns
147 if ((j>0 && ti.start<tj.exons[j]->start)
148 || (j<jmax && ti.end>tj.exons[j]->end))
149 return false;
150 if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
151 return true;
152 }
153 } else {
154 for (int j=0;j<=jmax;j++) {
155 //must NOT overlap the introns
156 if ((j>0 && ti.start<tj.exons[j]->start)
157 || (j<jmax && ti.end>tj.exons[j]->end))
158 return false;
159 //strict containment
160 if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
161 return true;
162 }
163 }
164 return false;
165 }
166
167 GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
168 // matchAllIntrons==true: transcripts are considered "redundant" only if
169 // they have the exact same number of introns and same splice sites (or none)
170 // (single-exon transcripts can be also fully contained to be considered matching)
171 // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
172 // as long as no intron-exon boundaries are violated; also, a single-exon
173 // transcript will be collapsed if it's contained in one of the exons of the other
174 // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
175 // span of the other
176 //
177 // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
178 // (i.e. they may extend each-other in opposite directions)
179
180 //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
181 if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
182 int imax=ti.exons.Count()-1;
183 int jmax=tj.exons.Count()-1;
184 GffObj* bigger=NULL;
185 GffObj* smaller=NULL;
186 if (matchAllIntrons) {
187 if (imax!=jmax) return false;
188 if (ti.covlen>tj.covlen) {
189 bigger=&ti;
190 if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
191 }
192 else { //ti.covlen<=tj.covlen
193 bigger=&tj;
194 if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
195 }
196 //check that all introns really match
197 for (int i=0;i<imax;i++) {
198 if (ti.exons[i]->end!=tj.exons[i]->end ||
199 ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
200 }
201 return bigger;
202 }
203 //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
204 int maxlen=0;
205 int minlen=0;
206 if (ti.covlen>tj.covlen) {
207 if (tj.exons.Count()>ti.exons.Count()) {
208 //exon count override
209 bigger=&tj;
210 smaller=&ti;
211 }
212 else {
213 bigger=&ti;
214 smaller=&tj;
215 }
216 maxlen=ti.covlen;
217 minlen=tj.covlen;
218 }
219 else { //tj has more bases
220 if (ti.exons.Count()>tj.exons.Count()) {
221 //exon count override
222 bigger=&ti;
223 smaller=&tj;
224 }
225 else {
226 bigger=&tj;
227 smaller=&ti;
228 }
229 maxlen=tj.covlen;
230 minlen=ti.covlen;
231 }
232 if (imax==0 && jmax==0) {
233 //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
234 if (fuzzSpan) {
235 return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
236 }
237 else {
238 return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
239 }
240 }
241 //containment is also considered redundancy
242 if (smaller->exons.Count()==1) {
243 //check if this single exon is contained in any of tj exons
244 //without violating any intron-exon boundaries
245 return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
246 }
247
248 //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
249 if (ti.exons[imax]->start<tj.exons[0]->end ||
250 tj.exons[jmax]->start<ti.exons[0]->end )
251 return NULL; //intron chains do not overlap at all
252
253
254 //checking full intron chain containment
255 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
256 int i=1; //exon idx to the right of the current intron of ti
257 int j=1; //exon idx to the right of the current intron of tj
258 //find the first intron overlap:
259 while (i<=imax && j<=jmax) {
260 eistart=ti.exons[i-1]->end;
261 eiend=ti.exons[i]->start;
262 ejstart=tj.exons[j-1]->end;
263 ejend=tj.exons[j]->start;
264 if (ejend<eistart) { j++; continue; }
265 if (eiend<ejstart) { i++; continue; }
266 //we found an intron overlap
267 break;
268 }
269 if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
270 if ((i>1 && j>1) || i>imax || j>jmax) {
271 return NULL; //either no intron overlaps found at all
272 //or it's not the first intron for at least one of the transcripts
273 }
274 if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
275 if (j>i) {
276 //i==1, ti's start must not conflict with the previous intron of tj
277 if (ti.start<tj.exons[j-1]->start) return NULL;
278 //so i's first intron starts AFTER j's first intron
279 // then j must contain i, so i's last intron must end with or before j's last intron
280 if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
281 //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
282 }
283 else if (i>j) {
284 //j==1, tj's start must not conflict with the previous intron of ti
285 if (tj.start<ti.exons[i-1]->start) return NULL;
286 //so j's intron chain starts AFTER i's
287 // then i must contain j, so j's last intron must end with or before j's last intron
288 if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
289 //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
290 }
291 //now check if the rest of the introns overlap, in the same sequence
292 i++;
293 j++;
294 while (i<=imax && j<=jmax) {
295 if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
296 ti.exons[i]->start!=tj.exons[j]->start) return NULL;
297 i++;
298 j++;
299 }
300 i--;
301 j--;
302 if (i==imax && j<jmax) {
303 // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
304 if (ti.end>tj.exons[j]->end) return NULL;
305 }
306 else if (j==jmax && i<imax) {
307 if (tj.end>ti.exons[i]->end) return NULL;
308 }
309 return bigger;
310 }
311
312
313 int gseqCmpName(const pointer p1, const pointer p2) {
314 return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
315 }
316
317
318 void printLocus(GffLocus* loc, const char* pre=NULL) {
319 if (pre!=NULL) fprintf(stderr, "%s", pre);
320 GMessage(" [%d-%d] : ", loc->start, loc->end);
321 GMessage("%s",loc->rnas[0]->getID());
322 for (int i=1;i<loc->rnas.Count();i++) {
323 GMessage(",%s",loc->rnas[i]->getID());
324 }
325 GMessage("\n");
326 }
327
328 void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
329 //transfer CDS info to the container t if it's a larger protein
330 if (tfrom->CDstart==0) return;
331 if (t->CDstart) {
332 if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
333 t->CDstart=tfrom->CDstart;
334 if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
335 t->CDend=tfrom->CDend;
336 }
337 else { //no CDS info on container, just copy it from the contained
338 t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
339 }
340 }
341
342 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
343 //GMessage(">>Placing transcript %s\n", t->getID());
344 GTData* tdata=new GTData(t);
345 gdata->tdata.Add(tdata);
346 int tidx=-1;
347 if (t->exons.Count()>0)
348 tidx=gdata->rnas.Add(t); //added it in sorted order
349 else {
350 gdata->gfs.Add(t);
351 return; //nothing to do with these non-transcript objects
352 }
353 if (!doCluster) return;
354 if (gdata->loci.Count()==0) {
355 gdata->loci.Add(new GffLocus(t));
356 //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
357 return;
358 }
359 //DEBUG: show available loci:
360 // GMessage(" [%d loci already:\n", gdata->loci.Count());
361 //for (int l=0;l<gdata->loci.Count();l++) {
362 // printLocus(gdata->loci[l]);
363 // }
364 int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
365 //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
366 if (nidx==0) {
367 //cannot have any overlapping loci
368 //GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
369 gdata->loci.Add(new GffLocus(t));
370 return;
371 }
372 if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
373 int lfound=0; //count of parent loci
374 GArray<int> mrgloci(false);
375 GList<GffLocus> tloci(true); //candidate parent loci to adopt this
376 //GMessage("\tchecking all loci from %d to 0\n",nidx-1);
377 for (int l=nidx-1;l>=0;l--) {
378 GffLocus& loc=*(gdata->loci[l]);
379 if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
380 if (t->start>loc.end) {
381 if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
382 continue;
383 }
384 if (loc.start>t->end) continue;
385 //this should never be the case if nidx was found correctly
386 //GMessage(" !range overlap found with locus ");
387 //printLocus(&loc);
388 if (loc.add_RNA(t)) {
389 //will add this transcript to loc
390 lfound++;
391 mrgloci.Add(l);
392 if (collapseRedundant) {
393 //compare to every single transcript in this locus
394 for (int ti=0;ti<loc.rnas.Count();ti++) {
395 if (loc.rnas[ti]==t) continue;
396 GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
397 //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
398 GffObj* container=NULL;
399 if (odata->replaced_by==NULL &&
400 (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
401 if (container==t) {
402 odata->replaced_by=t;
403 preserveContainedCDS(t, loc.rnas[ti]);
404 }
405 else {
406 tdata->replaced_by=loc.rnas[ti];
407 preserveContainedCDS(loc.rnas[ti], t);
408 }
409 }
410 }//for each transcript in the exon-overlapping locus
411 } //if doCollapseRedundant
412 } //overlapping locus
413 }
414 if (lfound==0) {
415 //overlapping loci not found, create a locus with only this mRNA
416 //GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
417 int addidx=gdata->loci.Add(new GffLocus(t));
418 if (addidx<0) {
419 GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
420 }
421 }
422 else if (lfound>1) {
423 //more than one loci found parenting this mRNA, merge loci
424 //if (lfound>2) GMessage(" merging %d loci \n",lfound);
425 lfound--;
426 for (int l=0;l<lfound;l++) {
427 int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
428 gdata->loci[mrgloci[lfound]]->addMerge(*(gdata->loci[mlidx]), t);
429 gdata->loci.Delete(mlidx);
430 }
431 }
432 }
433
434 void collectLocusData(GList<GenomicSeqData>& ref_data) {
435 int locus_num=0;
436 for (int g=0;g<ref_data.Count();g++) {
437 GenomicSeqData* gdata=ref_data[g];
438 for (int l=0;l<gdata->loci.Count();l++) {
439 GffLocus& loc=*(gdata->loci[l]);
440 GHash<int> gnames(true); //gene names in this locus
441 GHash<int> geneids(true); //Entrez GeneID: numbers
442 for (int i=0;i<loc.rnas.Count();i++) {
443 GffObj& t=*(loc.rnas[i]);
444 GStr gname(t.getGeneName());
445 if (!gname.is_empty()) {
446 gname.upper();
447 int* prevg=gnames.Find(gname.chars());
448 if (prevg!=NULL) (*prevg)++;
449 else gnames.Add(gname, new int(1));
450 }
451 //parse GeneID xrefs, if any:
452 GStr xrefs(t.getAttr("xrefs"));
453 if (!xrefs.is_empty()) {
454 xrefs.startTokenize(",");
455 GStr token;
456 while (xrefs.nextToken(token)) {
457 token.upper();
458 if (token.startsWith("GENEID:")) {
459 token.cut(0,token.index(':')+1);
460 int* prevg=geneids.Find(token.chars());
461 if (prevg!=NULL) (*prevg)++;
462 else geneids.Add(token, new int(1));
463 }
464 } //for each xref
465 } //xrefs parsing
466 }//for each transcript
467 locus_num++;
468 loc.locus_num=locus_num;
469 if (gnames.Count()>0) { //collect all gene names associated to this locus
470 gnames.startIterate();
471 int* gfreq=NULL;
472 char* key=NULL;
473 while ((gfreq=gnames.NextData(key))!=NULL) {
474 loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
475 }
476 } //added collected gene_names
477 if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
478 geneids.startIterate();
479 int* gfreq=NULL;
480 char* key=NULL;
481 while ((gfreq=geneids.NextData(key))!=NULL) {
482 loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
483 }
484 }
485 } //for each locus
486 }//for each genomic sequence
487 }
488
489
490 void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
491 bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) {
492 GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
493 gffr->showWarnings(this->showWarnings);
494 // keepAttrs mergeCloseExons noExonAttr
495 gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
496 //int redundant=0; //redundant annotation discarded
497 if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
498 //int rna_deleted=0;
499 //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
500 for (int k=0;k<gffr->gflst.Count();k++) {
501 GffObj* m=gffr->gflst[k];
502 if (strcmp(m->getFeatureName(), "locus")==0 &&
503 m->getAttr("transcripts")!=NULL) {
504 continue;
505 }
506
507 char* rloc=m->getAttr("locus");
508 if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
509 m->removeAttr("locus", rloc);
510 }
511 if (m->exons.Count()==0 && m->children.Count()==0) {
512 //a non-mRNA feature with no subfeatures
513 //add a dummy exon just to have the generic exon checking work
514 m->addExon(m->start,m->end);
515 }
516 GList<GffObj> gfadd(false,false);
517 if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
518 continue;
519 }
520 m->isUsed(true); //so the gffreader won't destroy it
521 int i=-1;
522 GenomicSeqData f(m->gseq_id);
523 GenomicSeqData* gdata=NULL;
524
525 if (seqdata.Found(&f,i)) gdata=seqdata[i];
526 else { //entry not created yet for this genomic seq
527 gdata=new GenomicSeqData(m->gseq_id);
528 seqdata.Add(gdata);
529 }
530 for (int k=0;k<gfadd.Count();k++) {
531 placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
532 }
533 placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
534 } //for each read gffObj
535 //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
536 if (f!=stdin) { fclose(f); f=NULL; }
537 delete gffr;
538 }