ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/map_report/gff_utils.cpp
Revision: 162
Committed: Tue Feb 7 19:27:55 2012 UTC (7 years, 7 months ago) by gpertea
File size: 20605 byte(s)
Log Message:
initial map_report commit

Line File contents
1 #include "gff_utils.h"
2
3 extern bool verbose;
4 extern bool debugMode;
5
6 //bool debugState=false;
7
8 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
9 if (seq==NULL) return;
10 int len=(seqlen>0)?seqlen:strlen(seq);
11 if (len<=0) return;
12 if (!defline.is_empty())
13 fprintf(f, ">%s\n",defline.chars());
14 int ilen=0;
15 for (int i=0; i < len; i++, ilen++) {
16 if (ilen == 70) {
17 fputc('\n', f);
18 ilen = 0;
19 }
20 putc(seq[i], f);
21 } //for
22 fputc('\n', f);
23 }
24
25 int qsearch_gloci(uint x, GList<GffLocus>& loci) {
26 //binary search
27 //do the simplest tests first:
28 if (loci[0]->start>x) return 0;
29 if (loci.Last()->start<x) return -1;
30 uint istart=0;
31 int i=0;
32 int idx=-1;
33 int maxh=loci.Count()-1;
34 int l=0;
35 int h = maxh;
36 while (l <= h) {
37 i = (l+h)>>1;
38 istart=loci[i]->start;
39 if (istart < x) l = i + 1;
40 else {
41 if (istart == x) { //found matching coordinate here
42 idx=i;
43 while (idx<=maxh && loci[idx]->start==x) {
44 idx++;
45 }
46 return (idx>maxh) ? -1 : idx;
47 }
48 h = i - 1;
49 }
50 } //while
51 idx = l;
52 while (idx<=maxh && loci[idx]->start<=x) {
53 idx++;
54 }
55 return (idx>maxh) ? -1 : idx;
56 }
57
58 int qsearch_rnas(uint x, GList<GffObj>& rnas) {
59 //binary search
60 //do the simplest tests first:
61 if (rnas[0]->start>x) return 0;
62 if (rnas.Last()->start<x) return -1;
63 uint istart=0;
64 int i=0;
65 int idx=-1;
66 int maxh=rnas.Count()-1;
67 int l=0;
68 int h = maxh;
69 while (l <= h) {
70 i = (l+h)>>1;
71 istart=rnas[i]->start;
72 if (istart < x) l = i + 1;
73 else {
74 if (istart == x) { //found matching coordinate here
75 idx=i;
76 while (idx<=maxh && rnas[idx]->start==x) {
77 idx++;
78 }
79 return (idx>maxh) ? -1 : idx;
80 }
81 h = i - 1;
82 }
83 } //while
84 idx = l;
85 while (idx<=maxh && rnas[idx]->start<=x) {
86 idx++;
87 }
88 return (idx>maxh) ? -1 : idx;
89 }
90
91 int cmpRedundant(GffObj& a, GffObj& b) {
92 if (a.exons.Count()==b.exons.Count()) {
93 if (a.covlen==b.covlen) {
94 return strcmp(a.getID(), b.getID());
95 }
96 else return (a.covlen>b.covlen)? 1 : -1;
97 }
98 else return (a.exons.Count()>b.exons.Count())? 1: -1;
99 }
100
101
102 bool tMatch(GffObj& a, GffObj& b) {
103 //strict intron chain match, or single-exon perfect match
104 int imax=a.exons.Count()-1;
105 int jmax=b.exons.Count()-1;
106 int ovlen=0;
107 if (imax!=jmax) return false; //different number of introns
108
109 if (imax==0) { //single-exon mRNAs
110 //if (equnspl) {
111 //fuzz match for single-exon transfrags:
112 // it's a match if they overlap at least 80% of max len
113 ovlen=a.exons[0]->overlapLen(b.exons[0]);
114 int maxlen=GMAX(a.covlen,b.covlen);
115 return (ovlen>=maxlen*0.8);
116 /*}
117 else {
118 //only exact match
119 ovlen=a.covlen;
120 return (a.exons[0]->start==b.exons[0]->start &&
121 a.exons[0]->end==b.exons[0]->end);
122
123 }*/
124 }
125 //check intron overlaps
126 ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
127 ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
128 for (int i=1;i<=imax;i++) {
129 if (i<imax) ovlen+=a.exons[i]->len();
130 if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
131 (a.exons[i]->start!=b.exons[i]->start)) {
132 return false; //intron mismatch
133 }
134 }
135 return true;
136 }
137
138
139 bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
140 //returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons
141 //but it does not cross any intron-exon boundary of tj
142 int imax=ti.exons.Count()-1;
143 int jmax=tj.exons.Count()-1;
144 if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
145 int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
146 if (fuzzSpan) {
147 for (int j=0;j<=jmax;j++) {
148 //must NOT overlap the introns
149 if ((j>0 && ti.start<tj.exons[j]->start)
150 || (j<jmax && ti.end>tj.exons[j]->end))
151 return false;
152 if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl)
153 return true;
154 }
155 } else {
156 for (int j=0;j<=jmax;j++) {
157 //must NOT overlap the introns
158 if ((j>0 && ti.start<tj.exons[j]->start)
159 || (j<jmax && ti.end>tj.exons[j]->end))
160 return false;
161 //strict containment
162 if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start)
163 return true;
164 }
165 }
166 return false;
167 }
168
169 GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) {
170 // matchAllIntrons==true: transcripts are considered "redundant" only if
171 // they have the exact same number of introns and same splice sites (or none)
172 // (single-exon transcripts can be also fully contained to be considered matching)
173 // matchAllIntrons==false: an intron chain could be a subset of a "container" chain,
174 // as long as no intron-exon boundaries are violated; also, a single-exon
175 // transcript will be collapsed if it's contained in one of the exons of the other
176 // fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic
177 // span of the other
178 //
179 // fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained
180 // (i.e. they may extend each-other in opposite directions)
181
182 //if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned)
183 if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all
184 int imax=ti.exons.Count()-1;
185 int jmax=tj.exons.Count()-1;
186 GffObj* bigger=NULL;
187 GffObj* smaller=NULL;
188 if (matchAllIntrons) {
189 if (imax!=jmax) return false;
190 if (ti.covlen>tj.covlen) {
191 bigger=&ti;
192 if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL;
193 }
194 else { //ti.covlen<=tj.covlen
195 bigger=&tj;
196 if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL;
197 }
198 //check that all introns really match
199 for (int i=0;i<imax;i++) {
200 if (ti.exons[i]->end!=tj.exons[i]->end ||
201 ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL;
202 }
203 return bigger;
204 }
205 //--- matchAllIntrons==false: intron-chain containment is also considered redundancy
206 //int maxlen=0;
207 int minlen=0;
208 if (ti.covlen>tj.covlen) {
209 if (tj.exons.Count()>ti.exons.Count()) {
210 //exon count override
211 bigger=&tj;
212 smaller=&ti;
213 }
214 else {
215 bigger=&ti;
216 smaller=&tj;
217 }
218 //maxlen=ti.covlen;
219 minlen=tj.covlen;
220 }
221 else { //tj has more bases
222 if (ti.exons.Count()>tj.exons.Count()) {
223 //exon count override
224 bigger=&ti;
225 smaller=&tj;
226 }
227 else {
228 bigger=&tj;
229 smaller=&ti;
230 }
231 //maxlen=tj.covlen;
232 minlen=ti.covlen;
233 }
234 if (imax==0 && jmax==0) {
235 //single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other
236 if (fuzzSpan) {
237 return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL;
238 }
239 else {
240 return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL;
241 }
242 }
243 //containment is also considered redundancy
244 if (smaller->exons.Count()==1) {
245 //check if this single exon is contained in any of tj exons
246 //without violating any intron-exon boundaries
247 return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL);
248 }
249
250 //--from here on: both are multi-exon transcripts, imax>0 && jmax>0
251 if (ti.exons[imax]->start<tj.exons[0]->end ||
252 tj.exons[jmax]->start<ti.exons[0]->end )
253 return NULL; //intron chains do not overlap at all
254
255
256 //checking full intron chain containment
257 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
258 int i=1; //exon idx to the right of the current intron of ti
259 int j=1; //exon idx to the right of the current intron of tj
260 //find the first intron overlap:
261 while (i<=imax && j<=jmax) {
262 eistart=ti.exons[i-1]->end;
263 eiend=ti.exons[i]->start;
264 ejstart=tj.exons[j-1]->end;
265 ejend=tj.exons[j]->start;
266 if (ejend<eistart) { j++; continue; }
267 if (eiend<ejstart) { i++; continue; }
268 //we found an intron overlap
269 break;
270 }
271 if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL;
272 if ((i>1 && j>1) || i>imax || j>jmax) {
273 return NULL; //either no intron overlaps found at all
274 //or it's not the first intron for at least one of the transcripts
275 }
276 if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match
277 if (j>i) {
278 //i==1, ti's start must not conflict with the previous intron of tj
279 if (ti.start<tj.exons[j-1]->start) return NULL;
280 //so i's first intron starts AFTER j's first intron
281 // then j must contain i, so i's last intron must end with or before j's last intron
282 if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL;
283 //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
284 }
285 else if (i>j) {
286 //j==1, tj's start must not conflict with the previous intron of ti
287 if (tj.start<ti.exons[i-1]->start) return NULL;
288 //so j's intron chain starts AFTER i's
289 // then i must contain j, so j's last intron must end with or before j's last intron
290 if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL;
291 //comment out the line above for just "intronCompatible()" check (allowing extension of intron chain)
292 }
293 //now check if the rest of the introns overlap, in the same sequence
294 i++;
295 j++;
296 while (i<=imax && j<=jmax) {
297 if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
298 ti.exons[i]->start!=tj.exons[j]->start) return NULL;
299 i++;
300 j++;
301 }
302 i--;
303 j--;
304 if (i==imax && j<jmax) {
305 // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
306 if (ti.end>tj.exons[j]->end) return NULL;
307 }
308 else if (j==jmax && i<imax) {
309 if (tj.end>ti.exons[i]->end) return NULL;
310 }
311 return bigger;
312 }
313
314
315 int gseqCmpName(const pointer p1, const pointer p2) {
316 return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name);
317 }
318
319
320 void printLocus(GffLocus* loc, const char* pre) {
321 if (pre!=NULL) fprintf(stderr, "%s", pre);
322 GMessage(" [%d-%d] : ", loc->start, loc->end);
323 GMessage("%s",loc->rnas[0]->getID());
324 for (int i=1;i<loc->rnas.Count();i++) {
325 GMessage(",%s",loc->rnas[i]->getID());
326 }
327 GMessage("\n");
328 }
329
330 void preserveContainedCDS(GffObj* t, GffObj* tfrom) {
331 //transfer CDS info to the container t if it's a larger protein
332 if (tfrom->CDstart==0) return;
333 if (t->CDstart) {
334 if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start)
335 t->CDstart=tfrom->CDstart;
336 if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end)
337 t->CDend=tfrom->CDend;
338 }
339 else { //no CDS info on container, just copy it from the contained
340 t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase);
341 }
342 }
343
344 void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
345 bool matchAllIntrons, bool fuzzSpan) {
346 GTData* tdata=new GTData(t);
347 gdata->tdata.Add(tdata);
348 int tidx=-1;
349 /*
350 if (debug) {
351 GMessage(">>Placing transcript %s\n", t->getID());
352 debugState=true;
353 }
354 else debugState=false;
355 */
356 if (t->exons.Count()>0)
357 tidx=gdata->rnas.Add(t); //added it in sorted order
358 else {
359 gdata->gfs.Add(t);
360 return; //nothing to do with these non-transcript objects
361 }
362 if (!doCluster) return;
363 if (gdata->loci.Count()==0) {
364 gdata->loci.Add(new GffLocus(t));
365 //GMessage(" <<make it first locus %d-%d \n",t->start, t->end);
366 return;
367 }
368 /*
369 //DEBUG: show available loci:
370 if (debug) {
371 GMessage(" [%d loci already:\n", gdata->loci.Count());
372 for (int l=0;l<gdata->loci.Count();l++) {
373 printLocus(gdata->loci[l]);
374 }
375 }
376 */
377 int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
378 //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
379 if (nidx==0) {
380 //cannot have any overlapping loci
381 //if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end);
382 gdata->loci.Add(new GffLocus(t));
383 return;
384 }
385 if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
386 int lfound=0; //count of parent loci
387 GArray<int> mrgloci(false);
388 GList<GffLocus> tloci(true); //candidate parent loci to adopt this
389 //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
390 for (int l=nidx-1;l>=0;l--) {
391 GffLocus& loc=*(gdata->loci[l]);
392 if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
393 if (t->start>loc.end) {
394 if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
395 continue;
396 }
397 if (loc.start>t->end) {
398 //this should never be the case if nidx was found correctly
399 GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
400 continue;
401 }
402 /*
403 if (debug) {
404 GMessage(" !range overlap found with locus ");
405 printLocus(&loc);
406 }
407 */
408 if (loc.add_RNA(t)) {
409 //will add this transcript to loc
410 lfound++;
411 mrgloci.Add(l);
412 if (collapseRedundant) {
413 //compare to every single transcript in this locus
414 for (int ti=0;ti<loc.rnas.Count();ti++) {
415 if (loc.rnas[ti]==t) continue;
416 GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
417 //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
418 GffObj* container=NULL;
419 if (odata->replaced_by==NULL &&
420 (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
421 if (container==t) {
422 odata->replaced_by=t;
423 preserveContainedCDS(t, loc.rnas[ti]);
424 }
425 else {
426 tdata->replaced_by=loc.rnas[ti];
427 preserveContainedCDS(loc.rnas[ti], t);
428 }
429 }
430 }//for each transcript in the exon-overlapping locus
431 } //if doCollapseRedundant
432 } //overlapping locus
433 } //for each existing locus
434 if (lfound==0) {
435 //overlapping loci not found, create a locus with only this mRNA
436 /* if (debug) {
437 GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end);
438 }
439 */
440 int addidx=gdata->loci.Add(new GffLocus(t));
441 if (addidx<0) {
442 //should never be the case!
443 GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
444 }
445 }
446 else { //found at least one overlapping locus
447 lfound--;
448 int locidx=mrgloci[lfound];
449 GffLocus& loc=*(gdata->loci[locidx]);
450 //last locus index found is also the smallest index
451 if (lfound>0) {
452 //more than one loci found parenting this mRNA, merge loci
453 /* if (debug)
454 GMessage(" merging %d loci \n",lfound);
455 */
456 for (int l=0;l<lfound;l++) {
457 int mlidx=mrgloci[l];
458 loc.addMerge(*(gdata->loci[mlidx]), t);
459 gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
460 }
461 }
462 int i=locidx;
463 while (i>0 && loc<*(gdata->loci[i-1])) {
464 //bubble down until it's in the proper order
465 i--;
466 gdata->loci.Swap(i,i+1);
467 }
468 }//found at least one overlapping locus
469 }
470
471 void collectLocusData(GList<GenomicSeqData>& ref_data) {
472 int locus_num=0;
473 for (int g=0;g<ref_data.Count();g++) {
474 GenomicSeqData* gdata=ref_data[g];
475 for (int l=0;l<gdata->loci.Count();l++) {
476 GffLocus& loc=*(gdata->loci[l]);
477 GHash<int> gnames(true); //gene names in this locus
478 GHash<int> geneids(true); //Entrez GeneID: numbers
479 for (int i=0;i<loc.rnas.Count();i++) {
480 GffObj& t=*(loc.rnas[i]);
481 GStr gname(t.getGeneName());
482 if (!gname.is_empty()) {
483 gname.upper();
484 int* prevg=gnames.Find(gname.chars());
485 if (prevg!=NULL) (*prevg)++;
486 else gnames.Add(gname, new int(1));
487 }
488 //parse GeneID xrefs, if any:
489 GStr xrefs(t.getAttr("xrefs"));
490 if (!xrefs.is_empty()) {
491 xrefs.startTokenize(",");
492 GStr token;
493 while (xrefs.nextToken(token)) {
494 token.upper();
495 if (token.startsWith("GENEID:")) {
496 token.cut(0,token.index(':')+1);
497 int* prevg=geneids.Find(token.chars());
498 if (prevg!=NULL) (*prevg)++;
499 else geneids.Add(token, new int(1));
500 }
501 } //for each xref
502 } //xrefs parsing
503 }//for each transcript
504 locus_num++;
505 loc.locus_num=locus_num;
506 if (gnames.Count()>0) { //collect all gene names associated to this locus
507 gnames.startIterate();
508 int* gfreq=NULL;
509 char* key=NULL;
510 while ((gfreq=gnames.NextData(key))!=NULL) {
511 loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
512 }
513 } //added collected gene_names
514 if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus
515 geneids.startIterate();
516 int* gfreq=NULL;
517 char* key=NULL;
518 while ((gfreq=geneids.NextData(key))!=NULL) {
519 loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
520 }
521 }
522 } //for each locus
523 }//for each genomic sequence
524 }
525
526
527 void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
528 bool doCluster, bool doCollapseRedundant,
529 bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
530 GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
531 gffr->showWarnings(this->showWarnings);
532 // keepAttrs mergeCloseExons noExonAttr
533 gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
534 //int redundant=0; //redundant annotation discarded
535 if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
536 //int rna_deleted=0;
537 //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
538 for (int k=0;k<gffr->gflst.Count();k++) {
539 GffObj* m=gffr->gflst[k];
540 if (strcmp(m->getFeatureName(), "locus")==0 &&
541 m->getAttr("transcripts")!=NULL) {
542 continue; //discard locus meta-features
543 }
544
545 char* rloc=m->getAttr("locus");
546 if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
547 m->removeAttr("locus", rloc);
548 }
549 if (m->exons.Count()==0 && m->children.Count()==0) {
550 //a non-mRNA feature with no subfeatures
551 //add a dummy exon just to have the generic exon checking work
552 m->addExon(m->start,m->end);
553 }
554 if (forceExons && m->children.Count()==0) {
555 m->exon_ftype_id=gff_fid_exon;
556 }
557 GList<GffObj> gfadd(false,false);
558 if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) {
559 continue;
560 }
561 m->isUsed(true); //so the gffreader won't destroy it
562 int i=-1;
563 GenomicSeqData f(m->gseq_id);
564 GenomicSeqData* gdata=NULL;
565
566 if (seqdata.Found(&f,i)) gdata=seqdata[i];
567 else { //entry not created yet for this genomic seq
568 gdata=new GenomicSeqData(m->gseq_id);
569 seqdata.Add(gdata);
570 }
571 for (int k=0;k<gfadd.Count();k++) {
572 placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
573 }
574 placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
575 } //for each read gffObj
576 //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
577 if (f!=stdin) { fclose(f); f=NULL; }
578 delete gffr;
579 }