ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/gtf_tracking.h
Revision: 118
Committed: Sun Nov 27 21:14:57 2011 UTC (7 years, 10 months ago) by gpertea
File size: 45593 byte(s)
Log Message:
fix locus Sn calculation

Line File contents
1 #ifndef GTF_TRACKING_H
2 #define GTF_TRACKING_H
3 /*
4 * gtf_tracking.h
5 * cufflinks
6 *
7 * Created by Cole Trapnell on 9/5/09.
8 * Copyright 2009 Geo Pertea. All rights reserved.
9 *
10 */
11
12 #include "gff.h"
13 #include "GFaSeqGet.h"
14 #include "GFastaIndex.h"
15 #include "GStr.h"
16
17
18 #define MAX_QFILES 500
19
20 extern bool gtf_tracking_verbose;
21
22 extern bool gtf_tracking_largeScale;
23 //many input files, no accuracy stats are generated, no *.tmap
24 // and exon attributes are discarded
25
26 int cmpByPtr(const pointer p1, const pointer p2);
27
28 bool t_contains(GffObj& a, GffObj& b);
29 //returns true only IF b has fewer exons than a AND a "contains" b
30
31 char* getGSeqName(int gseq_id);
32
33 //genomic fasta sequence handling
34 class GFastaHandler {
35 public:
36 char* fastaPath;
37 GFastaIndex* faIdx;
38 char* getFastaFile(int gseq_id) {
39 if (fastaPath==NULL) return NULL;
40 GStr s(fastaPath);
41 s.trimR('/');
42 s.appendfmt("/%s",getGSeqName(gseq_id));
43 GStr sbase(s);
44 if (!fileExists(s.chars())) s.append(".fa");
45 if (!fileExists(s.chars())) s.append("sta");
46 if (fileExists(s.chars())) return Gstrdup(s.chars());
47 else {
48 GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
49 return NULL;
50 }
51 }
52
53 GFastaHandler(const char* fpath=NULL) {
54 fastaPath=NULL;
55 faIdx=NULL;
56 if (fpath!=NULL && fpath[0]!=0) init(fpath);
57 }
58
59 void init(const char* fpath) {
60 if (fpath==NULL || fpath[0]==0) return;
61 if (!fileExists(fpath))
62 GError("Error: file/directory %s does not exist!\n",fpath);
63 fastaPath=Gstrdup(fpath);
64 if (fastaPath!=NULL) {
65 if (fileExists(fastaPath)>1) { //exists and it's not a directory
66 GStr fainame(fastaPath);
67 //the .fai name might have been given directly
68 if (fainame.rindex(".fai")==fainame.length()-4) {
69 //.fai index file given directly
70 fastaPath[fainame.length()-4]=0;
71 if (!fileExists(fastaPath))
72 GError("Error: cannot find fasta file for index %s !\n", fastaPath);
73 }
74 else fainame.append(".fai");
75 //fainame.append(".fai");
76 faIdx=new GFastaIndex(fastaPath,fainame.chars());
77 GStr fainamecwd(fainame);
78 int ip=-1;
79 if ((ip=fainamecwd.rindex('/'))>=0)
80 fainamecwd.cut(0,ip+1);
81 if (!faIdx->hasIndex()) { //could not load index
82 //try current directory
83 if (fainame!=fainamecwd) {
84 if (fileExists(fainamecwd.chars())>1) {
85 faIdx->loadIndex(fainamecwd.chars());
86 }
87 }
88 } //tried to load index
89 if (!faIdx->hasIndex()) {
90 GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
91 faIdx->buildIndex();
92 if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
93 GMessage("Fasta index rebuilt.\n");
94 FILE* fcreate=fopen(fainame.chars(), "w");
95 if (fcreate==NULL) {
96 GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
97 if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
98 if (fcreate==NULL)
99 GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
100 }
101 if (faIdx->storeIndex(fcreate)<faIdx->getCount())
102 GMessage("Warning: error writing the index file!\n");
103 } //index created and attempted to store it
104 } //multi-fasta
105 } //genomic sequence given
106 }
107 GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
108 if (fastaPath==NULL) return NULL;
109 //genomic sequence given
110 GFaSeqGet* faseq=NULL;
111 if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
112 char* gseqname=getGSeqName(gseq_id);
113 GFastaRec* farec=faIdx->getRecord(gseqname);
114 if (farec!=NULL) {
115 faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
116 farec->line_len, farec->line_blen);
117 faseq->loadall(); //just cache the whole sequence, it's faster
118 }
119 else {
120 GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
121 return NULL;
122 }
123 }
124 else //if (fileExists(fastaPath)==1)
125 {
126 char* sfile=getFastaFile(gseq_id);
127 if (sfile!=NULL) {
128 //if (gtf_tracking_verbose)
129 // GMessage("Processing sequence from fasta file '%s'\n",sfile);
130 faseq=new GFaSeqGet(sfile,checkFasta);
131 faseq->loadall();
132 GFREE(sfile);
133 }
134 } //one fasta file per contig
135 return faseq;
136 }
137
138 ~GFastaHandler() {
139 GFREE(fastaPath);
140 delete faIdx;
141 }
142 };
143
144
145
146 bool betterRef(GffObj* a, GffObj* b); //for better CovLink reference ranking
147
148 class GLocus;
149
150 class COvLink {
151 public:
152 static int coderank(char c) {
153 switch (c) {
154 case '=': return 0; //ichain match
155 case 'c': return 2; //containment (ichain fragment)
156 case 'j': return 4; // overlap with at least a junction match
157 case 'e': return 6; // single exon transfrag overlapping an intron of reference (possible pre-mRNA)
158 case 'o': return 8; // generic exon overlap
159 case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand
160 case 'x': return 18; // exon overlap on opposite strand (usually wrong strand mapping)
161 case 'i': return 20; // intra-intron
162 case 'p': return 90; //polymerase run
163 case 'r': return 92; //repeats
164 case 'u': return 94; //intergenic
165 case 0 : return 100;
166 default: return 96;
167 }
168 }
169 char code;
170 int rank;
171 GffObj* mrna;
172 int ovlen;
173 COvLink(char c=0,GffObj* m=NULL, int ovl=0) {
174 code=c;
175 mrna=m;
176 ovlen=ovl;
177 rank=coderank(c);
178 }
179 bool operator<(COvLink& b) {
180 if (rank==b.rank)
181 return (ovlen==b.ovlen)? betterRef(mrna, b.mrna) : (ovlen>b.ovlen);
182 else return rank<b.rank;
183 }
184 bool operator>(COvLink& b) {
185 if (rank==b.rank)
186 return (ovlen==b.ovlen)? betterRef(b.mrna, mrna) : (ovlen<b.ovlen);
187 else return rank>b.rank;
188 }
189 bool operator==(COvLink& b) {
190 return (rank==b.rank && mrna==b.mrna);
191 }
192 };
193
194 class GISeg: public GSeg {
195 public:
196 GffObj* t; //pointer to the largest transcript with a segment this exact exon coordinates
197 GISeg(uint s=0,uint e=0, GffObj* ot=NULL):GSeg(s,e) { t=ot; }
198 };
199
200 class GIArray:public GArray<GISeg> {
201 public:
202 GIArray(bool uniq=true):GArray<GISeg>(true,uniq) { }
203 int IAdd(GISeg* item) {
204 if (item==NULL) return -1;
205 int result=-1;
206 if (Found(*item, result)) {
207 if (fUnique) {
208 //cannot add a duplicate, return index of existing item
209 if (item->t!=NULL && fArray[result].t!=NULL &&
210 item->t->covlen>fArray[result].t->covlen)
211 fArray[result].t=item->t;
212 return result;
213 }
214 }
215 //Found sets result to the position where the item should be
216 idxInsert(result, *item);
217 return result;
218 }
219
220 };
221
222 class CEqList: public GList<GffObj> {
223 public:
224 GffObj* head;
225 CEqList():GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true) {
226 head=NULL;
227 }
228 };
229
230 class CTData { //transcript associated data
231 public:
232 GffObj* mrna; //owner transcript
233 GLocus* locus;
234 GList<COvLink> ovls; //overlaps with other transcripts (ref vs query)
235 //-- just for ichain match tracking:
236 GffObj* eqref; //ref transcript having an ichain match
237 int qset; //qry set index (qfidx), -1 means reference dataset
238 //GffObj* eqnext; //next GffObj in the linked list of matching transfrags
239 CEqList* eqlist; //keep track of matching transfrags
240 //int eqdata; // flags for EQ list (is it a list head?)
241 // Cufflinks specific data:
242 double FPKM;
243 double conf_hi;
244 double conf_lo;
245 double cov;
246 char classcode; //the best/final classcode
247 CTData(GffObj* m=NULL, GLocus* l=NULL):ovls(true,true,true) {
248 mrna=m;
249 if (mrna!=NULL) mrna->uptr=this;
250 locus=l;
251 classcode=0;
252 eqref=NULL;
253 //eqnext=NULL;
254 eqlist=NULL;
255 //eqdata=0;
256 qset=-2;
257 FPKM=0;
258 conf_lo=0;
259 conf_hi=0;
260 cov=0;
261 }
262
263 ~CTData() {
264 ovls.Clear();
265 //if ((eqdata & EQHEAD_TAG)!=0) delete eqlist;
266 if (isEqHead()) delete eqlist;
267 }
268
269 //inline bool eqHead() { return ((eqdata & EQHEAD_TAG)!=0); }
270 bool isEqHead() {
271 if (eqlist==NULL) return false;
272 return (eqlist->head==this->mrna);
273 }
274
275 void joinEqList(GffObj* m) { //add list from m
276 //list head is set to the transfrag with the lower qset#
277 CTData* md=(CTData*)(m->uptr);
278 //ASSERT(md);
279 if (eqlist==NULL) {
280 if (md->eqlist!=NULL) {
281 eqlist=md->eqlist;
282 eqlist->Add(this->mrna);
283 CTData* md_head_d=(CTData*)(md->eqlist->head->uptr);
284 if (this->qset < md_head_d->qset)
285 eqlist->head=this->mrna;
286 }
287 else { //m was not in an EQ list
288 //eqlist=new GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true);
289 eqlist=new CEqList();
290 eqlist->Add(this->mrna);
291 eqlist->Add(m);
292 md->eqlist=eqlist;
293 if (qset<md->qset) eqlist->head=this->mrna;
294 else eqlist->head=m;
295 }
296 }//no eqlist before
297 else { //merge two eqlists
298 if (eqlist==md->eqlist) //already in the same eqlist, nothing to do
299 return;
300 if (md->eqlist!=NULL) { //copy elements of m's eqlist
301 //copy the smaller list into the larger one
302 CEqList* srclst, *destlst;
303 if (md->eqlist->Count()<eqlist->Count()) {
304 srclst=md->eqlist;
305 destlst=eqlist;
306 }
307 else {
308 srclst=eqlist;
309 destlst=md->eqlist;
310 }
311 for (int i=0;i<srclst->Count();i++) {
312 destlst->Add(srclst->Get(i));
313 CTData* od=(CTData*)((*srclst)[i]->uptr);
314 od->eqlist=destlst;
315 //od->eqdata=od->qset+1;
316 }
317 this->eqlist=destlst;
318 CTData* s_head_d=(CTData*)(srclst->head->uptr);
319 CTData* d_head_d=(CTData*)(destlst->head->uptr);
320 if (s_head_d->qset < d_head_d->qset )
321 this->eqlist->head=srclst->head;
322 delete srclst;
323 }
324 else { //md->eqlist==NULL
325 eqlist->Add(m);
326 md->eqlist=eqlist;
327 CTData* head_d=(CTData*)(eqlist->head->uptr);
328 if (md->qset<head_d->qset)
329 eqlist->head=m;
330 }
331 }
332 }
333
334 void addOvl(char code,GffObj* target=NULL, int ovlen=0) {
335 ovls.AddIfNew(new COvLink(code, target, ovlen));
336 }
337 char getBestCode() {
338 return (ovls.Count()>0) ? ovls[0]->code : 0 ;
339 }
340 bool operator>(CTData& b) { return (mrna > b.mrna); }
341 bool operator<(CTData& b) { return (mrna < b.mrna); }
342 bool operator==(CTData& b) { return (mrna==b.mrna); }
343 };
344
345 class GSuperLocus;
346 class GTrackLocus;
347 class GXLocus;
348
349 //Data structure holding a query locus data (overlapping mRNAs on the same strand)
350 // and also the accuracy data of all mRNAs of a query locus
351 // (against all reference loci overlapping the same region)
352 class GLocus:public GSeg {
353 public:
354 int gseq_id; //id of underlying genomic sequence
355 int qfidx; // for locus tracking
356 GTrackLocus* t_ptr; //for locus tracking cluster
357 GffObj* mrna_maxcov; //transcript with maximum coverage (for main "ref" transcript)
358 GffObj* mrna_maxscore; //transcript with maximum gscore (for major isoform)
359 GList<GffObj> mrnas; //list of transcripts (isoforms) for this locus
360 GArray<GSeg> uexons; //list of unique exons (covered segments) in this region
361 GArray<GSeg> mexons; //list of merged exons in this region
362 GIArray introns;
363 GList<GLocus> cmpovl; //temp list of overlapping qry/ref loci to compare to (while forming superloci)
364
365 //only for reference loci --> keep track of all superloci found for each qry dataset
366 // which contain this reference locus
367 GList<GSuperLocus>* superlst;
368 GXLocus* xlocus; //superlocus formed by exon overlaps across all qry datasets
369 // -- if genomic sequence was given:
370 int spl_major; // number of GT-AG splice site consensi
371 int spl_rare; // number of GC-AG, AT-AC and other rare splice site consensi
372 int spl_wrong; //number of "wrong" (unrecognized) splice site consensi
373 int ichains; //number of multi-exon mrnas
374 int ichainTP;
375 int ichainATP;
376 int mrnaTP;
377 int mrnaATP;
378 int v; //user flag/data
379 GLocus(GffObj* mrna=NULL, int qidx=-1):mrnas(true,false,false),uexons(true,true),mexons(true,true),
380 introns(), cmpovl(true,false,true) {
381 //this will NOT free mrnas!
382 ichains=0;
383 gseq_id=-1;
384 qfidx=qidx;
385 t_ptr=NULL;
386 creset();
387 xlocus=NULL;
388 mrna_maxcov=NULL;
389 mrna_maxscore=NULL;
390 superlst=new GList<GSuperLocus>(true,false,false);
391 if (mrna!=NULL) {
392 start=mrna->exons.First()->start;
393 end=mrna->exons.Last()->end;;
394 gseq_id=mrna->gseq_id;
395 GISeg seg;
396 for (int i=0;i<mrna->exons.Count();i++) {
397 seg.start=mrna->exons[i]->start;
398 seg.end=mrna->exons[i]->end;
399 uexons.Add(seg);
400 mexons.Add(seg);
401 if (i>0) {
402 seg.start=mrna->exons[i-1]->end+1;
403 seg.end=mrna->exons[i]->start-1;
404 seg.t=mrna;
405 introns.Add(seg);
406 }
407 }
408 mrnas.Add(mrna);
409 if (mrna->exons.Count()>1) ichains++;
410 ((CTData*)(mrna->uptr))->locus=this;
411 mrna_maxscore=mrna;
412 mrna_maxcov=mrna;
413 }
414 }
415 ~GLocus() {
416 delete superlst;
417 }
418 void creset() {
419 spl_major=0;spl_rare=0;spl_wrong=0;
420 v=0; //visited/other data
421 ichainTP=0;
422 ichainATP=0;
423 mrnaTP=0;
424 mrnaATP=0;
425 cmpovl.Clear();
426 }
427
428 void addMerge(GLocus& locus, GffObj* lnkmrna) {
429 //add all the elements of the other locus (merging)
430 //-- merge mexons
431 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
432 int i=0; //index of first mexons with a merge
433 int j=0; //index current mrna exon
434 while (i<mexons.Count() && j<locus.mexons.Count()) {
435 uint istart=mexons[i].start;
436 uint iend=mexons[i].end;
437 uint jstart=locus.mexons[j].start;
438 uint jend=locus.mexons[j].end;
439 if (iend<jstart) { i++; continue; }
440 if (jend<istart) { j++; continue; }
441 //if (mexons[i].overlap(jstart, jend)) {
442 //exon overlap was found :
443 ovlexons.Add(j);
444 //extend mexons[i] as needed
445 if (jstart<istart) mexons[i].start=jstart;
446 if (jend>iend) { //mexons[i] end extend
447 mexons[i].end=jend;
448 //now this could overlap the next mexon(s), so we have to merge them all
449 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
450 uint nextend=mexons[i+1].end;
451 mexons.Delete(i+1);
452 if (nextend>mexons[i].end) {
453 mexons[i].end=nextend;
454 break; //no need to check next mexons
455 }
456 } //while next mexons merge
457 } // mexons[i] end extend
458 // } //exon overlap
459 j++; //check the next locus.mexon
460 }
461 //-- add the rest of the non-overlapping mexons:
462 GSeg seg;
463 for (int i=0;i<locus.mexons.Count();i++) {
464 seg.start=locus.mexons[i].start;
465 seg.end=locus.mexons[i].end;
466 if (!ovlexons.Exists(i)) mexons.Add(seg);
467 }
468 // -- merge uexons
469 //add to uexons:
470 for (int i=0;i<locus.uexons.Count();i++) {
471 uexons.Add(locus.uexons[i]);
472 }
473 for (int i=0;i<locus.introns.Count();i++) {
474 introns.IAdd(&(locus.introns[i]));
475 }
476
477 // -- add locus.mrnas
478 for (int i=0;i<locus.mrnas.Count();i++) {
479 ((CTData*)(locus.mrnas[i]->uptr))->locus=this;
480 if (locus.mrnas[i]!=lnkmrna) {
481 mrnas.Add(locus.mrnas[i]);
482 if (locus.mrnas[i]->exons.Count()>1) ichains++;
483 }
484 }
485 // -- adjust start/end as needed
486 if (start>locus.start) start=locus.start;
487 if (end<locus.end) end=locus.end;
488 if (mrna_maxcov->covlen<locus.mrna_maxcov->covlen)
489 mrna_maxcov=locus.mrna_maxcov;
490 if (mrna_maxscore->gscore<locus.mrna_maxscore->gscore)
491 mrna_maxscore=locus.mrna_maxscore;
492 }
493
494
495 bool exonOverlap(GLocus& loc) {
496 //check if any mexons overlap!
497 int i=0;
498 int j=0;
499 while (i<mexons.Count() && j<loc.mexons.Count()) {
500 uint istart=mexons[i].start;
501 uint iend=mexons[i].end;
502 uint jstart=loc.mexons[j].start;
503 uint jend=loc.mexons[j].end;
504 if (iend<jstart) { i++; continue; }
505 if (jend<istart) { j++; continue; }
506 //exon overlap found if we're here:
507 return true;
508 }
509 return false;
510 }
511
512 bool add_mRNA(GffObj* mrna) {
513 if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq
514 //check for exon overlap with existing mexons
515 //also update uexons and mexons accordingly, if mrna is added
516 uint mrna_start=mrna->exons.First()->start;
517 uint mrna_end=mrna->exons.Last()->end;
518 if (mrna_start>end || start>mrna_end) return false;
519 bool hasovl=false;
520 int i=0; //index of first mexons with a merge
521 int j=0; //index current mrna exon
522 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
523 while (i<mexons.Count() && j<mrna->exons.Count()) {
524 uint istart=mexons[i].start;
525 uint iend=mexons[i].end;
526 uint jstart=mrna->exons[j]->start;
527 uint jend=mrna->exons[j]->end;
528 if (iend<jstart) { i++; continue; }
529 if (jend<istart) { j++; continue; }
530 //exon overlap found if we're here:
531 ovlexons.Add(j);
532 hasovl=true;
533 //extend mexons[i] as needed
534 if (jstart<istart) mexons[i].start=jstart;
535 if (jend>iend) { //mexon stretch up
536 mexons[i].end=jend;
537 //now this could overlap the next mexon(s), so we have to merge them all
538 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
539 uint nextend=mexons[i+1].end;
540 mexons.Delete(i+1);
541 if (nextend>mexons[i].end) {
542 mexons[i].end=nextend;
543 break; //no need to check next mexons
544 }
545 } //while next mexons merge
546 } //possible mexons merge
547
548 j++; //check the next mrna exon
549 }//all vs all exon check loop
550 if (hasovl) {
551 GSeg seg;
552 //add the rest of the non-overlapping exons,
553 // and also to uexons etc.
554 for (int i=0;i<mrna->exons.Count();i++) {
555 seg.start=mrna->exons[i]->start;
556 seg.end=mrna->exons[i]->end;
557 if (!ovlexons.Exists(i)) mexons.Add(seg);
558 uexons.Add(seg);
559 GISeg iseg;
560 if (i>0) {
561 iseg.start=mrna->exons[i-1]->end+1;
562 iseg.end=mrna->exons[i]->start-1;
563 iseg.t=mrna;
564 introns.Add(iseg);
565 }
566 }
567
568 mrnas_add(mrna);
569 // add to mrnas
570 ((CTData*)mrna->uptr)->locus=this;
571 gseq_id=mrna->gseq_id;
572 if (mrna->exons.Count()>1) ichains++;
573 }
574 return hasovl;
575 }
576
577 //simpler,basic adding of a mrna
578 void mrnas_add(GffObj* mrna) {
579 mrnas.Add(mrna);
580 // adjust start/end
581 if (start>mrna->start) start=mrna->start;
582 if (end<mrna->end) end=mrna->end;
583 if (mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
584 if (mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
585 }
586 };
587
588 class GSuperLocus;
589 class GTrackLocus;
590
591 class GSuperLocus : public GSeg {
592 public:
593 int qfidx; //index of query dataset/file for which this superlocus was created
594 GList<GLocus> qloci;
595 GList<GLocus> rloci;
596 GList<GffObj> qmrnas; //list of transcripts (isoforms) for this locus
597 GArray<GSeg> qmexons; //list of merged exons in this region
598 GArray<GSeg> quexons; //list of unique exons (covered segments) in this region
599 GIArray qintrons; //list of unique exons (covered segments) in this region
600 //same lists for reference:
601 GList<GffObj> rmrnas; //list of transcripts (isoforms) for this locus
602 GArray<GSeg> rmexons; //list of merged exons in this region
603 GArray<GSeg> ruexons; //list of unique exons (covered segments) in this region
604 GArray<GISeg> rintrons; //list of unique exons (covered segments) in this region
605 // store problematic introns for printing:
606 GIArray i_missed; //missed reference introns (not overlapped by any qry intron)
607 GIArray i_notp; //wrong ref introns (one or both ends not matching any qry intron)
608 //
609 GIArray i_qwrong; //totally wrong qry introns (not overlapped by any ref intron)
610 GIArray i_qnotp; //imperfect qry introns (may overlap but has no "perfect" match)
611
612
613 int qbases_all;
614 int rbases_all; //in fact, it's all ref bases overlapping any query loci
615 int in_rmrnas; //count of ALL ref mrnas and loci given for this region
616 int in_rloci; //not just those overlapping qry data
617 // this will keep track of total qry loci, mrnas and exons in an area
618 int total_superloci;
619 int total_qloci;
620 int total_qloci_alt; //total qloci with multiple transcripts
621
622 int total_qmrnas;
623 int total_qichains; //multi exon mrnas
624 int total_qexons; //unique exons
625 int total_qmexons;
626 int total_qintrons; //unique introns
627 // these ref totals are in fact only limited to data from
628 // loci overlapping any of qry loci
629 int total_rmexons;
630 int total_richains; //multi exon mrnas
631 int total_rloci;
632 int total_rmrnas;
633 int total_rexons;
634 int total_rintrons; //unique introns
635
636 //--- accuracy data after compared to ref loci:
637 int locusQTP;
638 int locusTP;
639 int locusAQTP;
640 int locusATP; // 1 if ichainATP + mrnaATP > 0
641 int locusFP;
642 int locusAFP;
643 int locusAFN;
644 int locusFN;
645 //---transcript level accuracy -- all exon coordinates should match (most stringent)
646 int mrnaTP; // number of qry mRNAs with perfect match with ref transcripts
647 int mrnaFP; // number of qry mRNAs with no perfect match with a ref transcript
648 int mrnaFN; // number of ref mRNAs in this region having no perfect match with a qry transcript
649 int mrnaATP;
650 int mrnaAFN;
651 int mrnaAFP;
652 //---intron level accuracy (comparing the ordered set of splice sites):
653 int ichainTP; // number of qry intron chains covering a reference intron chain
654 // (covering meaning that the ordered set of reference splice sites
655 // is the same with a ordered subset of the query splice sites)
656 int ichainFP; // number of qry intron chains not covering a reference intron chain
657 int ichainFN; // number of ref intron chains in this region not being covered by a reference intron chain
658 // same as above, but approximate -- allowing a 10bp distance error for splice sites
659 int ichainATP;
660 int ichainAFP;
661 int ichainAFN;
662 //---projected features ---
663 //---exon level accuracy:
664 int exonTP; //number of perfectly overlapping exons (true positives)
665 int exonFP; //number of exons of query with no perfect match with a reference exon
666 int exonFN; //number of exons of reference with no perfect match with a query exon
667 // same as the above but with acceptable approximation (10bp error window):
668 int exonATP;
669 int exonAFP;
670 int exonAFN;
671
672 int intronTP; //number of perfectly overlapping introns (true positives)
673 int intronFP; //number of introns of query with no perfect match with a reference intron
674 int intronFN; //number of introns of reference with no perfect match with a query intron
675 // same as the above but with acceptable approximation (10bp error window):
676 int intronATP;
677 int intronAFP;
678 int intronAFN;
679
680 //-- EGASP added these too:
681 int m_exons; //number of exons totally missed (not overlapped *at all* by any query exon)
682 int w_exons; //numer of totally wrong exons (query exons not overlapping *at all* any reference exon)
683 int m_introns; //number of introns totally missed (not overlapped *at all* by any query intron)
684 int w_introns; //numer of totally wrong introns (query introns not overlapping *at all* any reference intron)
685 int m_loci; //missed loci
686 int w_loci; //novel/wrong loci
687 //---base level accuracy
688 int baseTP; //number of overlapping bases
689 int baseFP; //number of qry bases not overlapping reference
690 int baseFN; //number of ref bases not overlapping qry
691 // sorted,free,unique sorted,unique
692 GSuperLocus(uint lstart=0,uint lend=0):qloci(true,false,false),rloci(true,false,false),
693 qmrnas(true,false,false), qmexons(true,false), quexons(true,false), qintrons(false),
694 rmrnas(true,false,false), rmexons(true,false), ruexons(true,false), rintrons(false),
695 i_missed(false),i_notp(false), i_qwrong(false), i_qnotp(false){
696 qfidx=-1;
697 start=lstart;
698 end=lend;
699 qbases_all=0;
700 rbases_all=0;
701 baseTP=0;baseFP=0;baseFN=0;
702 locusTP=0;locusQTP=0; locusAQTP=0; locusATP=0;
703 locusFP=0;locusAFP=0;locusAFN=0;
704 locusFN=0;
705 in_rmrnas=0;
706 in_rloci=0;
707 w_loci=0;
708 m_loci=0;
709 total_superloci=0;
710 mrnaTP=0;mrnaFP=0;mrnaFN=0;
711 mrnaATP=0;mrnaAFP=0;mrnaAFN=0;
712 ichainTP=0;ichainFP=0;ichainFN=0;
713 ichainATP=0;ichainAFP=0;ichainAFN=0;
714 exonTP=0;exonFP=0;exonFN=0;
715 exonATP=0;exonAFP=0;exonAFN=0;
716 intronTP=0;intronFP=0;intronFN=0;
717 intronATP=0;intronAFP=0;intronAFN=0;
718 total_rmexons=0;
719 total_qmexons=0;
720 total_qexons=0;total_qloci=0;total_qmrnas=0;
721 total_qloci_alt=0;
722 total_qintrons=0;total_qichains=0;
723 total_rexons=0;total_rloci=0;total_rmrnas=0;
724 total_rintrons=0;total_richains=0;
725 w_exons=0;
726 m_exons=0;
727 w_introns=0;
728 m_introns=0;
729 }
730 void addQlocus(GLocus& loc) {
731 if (start==0 || start>loc.start) start=loc.start;
732 if (end<loc.end) end=loc.end;
733 qloci.Add(&loc);
734 total_qloci++;
735 if (loc.ichains>0 && loc.mrnas.Count()>1)
736 total_qloci_alt++;
737 qmrnas.Add(loc.mrnas);
738 total_qmrnas+=loc.mrnas.Count();
739 total_qichains+=loc.ichains;
740 qmexons.Add(loc.mexons);
741 total_qmexons+=loc.mexons.Count();
742 quexons.Add(loc.uexons);
743 total_qexons+=loc.uexons.Count();
744 qintrons.Add(loc.introns);
745 total_qintrons+=loc.introns.Count();
746 }
747 void addRlocus(GLocus& loc) {
748 if (start==0 || start>loc.start) start=loc.start;
749 if (end<loc.end) end=loc.end;
750 rloci.Add(&loc);
751 total_rloci++;
752 rmrnas.Add(loc.mrnas);
753 total_rmrnas+=loc.mrnas.Count();
754 total_richains+=loc.ichains;
755 rmexons.Add(loc.mexons);
756 total_rmexons+=loc.mexons.Count();
757 ruexons.Add(loc.uexons);
758 total_rexons+=loc.uexons.Count();
759 rintrons.Add(loc.introns);
760 total_rintrons+=loc.introns.Count();
761 }
762
763 void calcF() {
764 // base level
765 baseFP=qbases_all-baseTP;
766 baseFN=rbases_all-baseTP;
767 //exon level:
768 exonAFP=total_qexons-exonATP;
769 exonFP=total_qexons-exonTP;
770 exonAFN=total_rexons-exonATP;
771 exonFN=total_rexons-exonTP;
772 //intron stats
773 intronAFP=total_qintrons-intronATP;
774 intronFP=total_qintrons-intronTP;
775 intronAFN=total_rintrons-intronATP;
776 intronFN=total_rintrons-intronTP;
777
778 // ichain and transcript levels:
779 ichainAFP=total_qichains-ichainATP;
780 ichainFP=total_qichains-ichainTP;
781 ichainAFN=total_richains-ichainATP;
782 ichainFN=total_richains-ichainTP;
783 mrnaFP=total_qmrnas-mrnaTP;
784 mrnaFN=total_rmrnas-mrnaTP;
785 mrnaAFP=total_qmrnas-mrnaATP;
786 mrnaAFN=total_rmrnas-mrnaATP;
787 // locus/gene level:
788 locusAFP=total_qloci-locusAQTP;
789 locusFP=total_qloci-locusQTP;
790 locusAFN=total_rloci-locusATP;
791 locusFN=total_rloci-locusTP;
792 }
793
794 void addStats(GSuperLocus& s) {
795 in_rmrnas+=s.in_rmrnas;
796 in_rloci+=s.in_rloci;
797 baseTP+=s.baseTP;
798 exonTP+=s.exonTP;
799 exonATP+=s.exonATP;
800 intronTP+=s.intronTP;
801 intronATP+=s.intronATP;
802 ichainTP+=s.ichainTP;
803 ichainATP+=s.ichainATP;
804 mrnaTP+=s.mrnaTP;
805 mrnaATP+=s.mrnaATP;
806 locusTP+=s.locusTP;
807 locusQTP+=s.locusQTP;
808 locusATP+=s.locusATP;
809 locusAQTP+=s.locusAQTP;
810 m_exons+=s.m_exons;
811 w_exons+=s.w_exons;
812 m_introns+=s.m_introns;
813 w_introns+=s.w_introns;
814 if (s.total_superloci==0 && s.qloci.Count()>0) s.total_superloci=1;
815 total_superloci+=s.total_superloci;
816 qbases_all+=s.qbases_all;
817 rbases_all+=s.rbases_all;
818 m_loci+=s.m_loci;
819 w_loci+=s.w_loci;
820 total_qexons+=s.total_qexons;
821 total_qintrons+=s.total_qintrons;
822 total_qmexons+=s.total_qmexons;
823 total_rexons+=s.total_rexons;
824 total_rintrons+=s.total_rintrons;
825 total_rmexons+=s.total_rmexons;
826 total_qmrnas+=s.total_qmrnas;
827 total_qichains+=s.total_qichains;
828 total_rmrnas+=s.total_rmrnas;
829 total_richains+=s.total_richains;
830 total_qloci+=s.total_qloci;
831 total_qloci_alt+=s.total_qloci_alt;
832 total_rloci+=s.total_rloci;
833 }
834 };
835
836 class GSeqData {
837 int gseq_id;
838 public:
839 const char* gseq_name;
840 GList<GffObj> refs_f; //forward strand mRNAs
841 GList<GffObj> refs_r; //reverse strand mRNAs
842 GList<GffObj> mrnas_f; //forward strand mRNAs
843 GList<GffObj> mrnas_r; //reverse strand mRNAs
844 GList<GLocus> loci_f; //forward strand loci
845 GList<GLocus> loci_r; //reverse strand loci
846 //--> the fields below are not used by reference data --
847 GList<GSuperLocus> gstats_f; //stats for forward strand superloci
848 GList<GSuperLocus> gstats_r; //stats for reverse strand superloci
849 GList<GLocus> nloci_f; //"novel" loci on forward strand
850 GList<GLocus> nloci_r; //"novel" loci on reverse strand
851 GList<GffObj> umrnas; //unknown orientation mrnas
852 GList<GLocus> nloci_u; //"novel" loci with no orientation found
853
854 GList<CTData> tdata; //transcript data (uptr holder for all mrnas here)
855
856 int get_gseqid() { return gseq_id; }
857
858 //--<
859 GSeqData(int gid=-1):mrnas_f(true,true,false),mrnas_r(true,true,false),
860 loci_f(true,true,true),loci_r(true,true,true),
861 gstats_f(true,true,false),gstats_r(true,true,false),
862 nloci_f(true,false,true), nloci_r(true,false,true),
863 umrnas(true,true,false), nloci_u(true,true,true), tdata(false,true,false) {
864 gseq_id=gid;
865 if (gseq_id>=0)
866 gseq_name=GffObj::names->gseqs.getName(gseq_id);
867 }
868 bool operator==(GSeqData& d){
869 return (gseq_id==d.gseq_id);
870 }
871 bool operator>(GSeqData& d){
872 return (gseq_id>d.gseq_id);
873 }
874 bool operator<(GSeqData& d){
875 return (gseq_id<d.gseq_id);
876 }
877 };
878
879
880 // a group of qry loci and a transcript cluster for a single qry dataset
881 class GQCluster : public GList<GffObj> {
882 public:
883 GffObj* mrna_maxcov; //transcript with maximum coverage (for largest transcript)
884 GffObj* mrna_maxscore; //transcript with maximum gscore ( = major isoform for Cufflinks)
885 uint start;
886 uint end;
887 GList<GLocus> qloci;
888 //GCluster cl; //just a more compact way of keeping all transcripts in these loci
889 GQCluster(GList<GLocus>* loci=NULL):GList<GffObj>(true,false,false),
890 qloci(true,false,false) {
891 mrna_maxcov=NULL;
892 mrna_maxscore=NULL;
893 start=0;
894 end=0;
895 if (loci!=NULL) {
896 qloci.Add(*loci);
897 for (int i=0;i<loci->Count();i++) {
898 addLocus(loci->Get(i),false);
899 }
900 }
901 }
902 void addLocus(GLocus* loc, bool toLoci=true) {
903 //check so we don't add locus duplicates
904 if (toLoci) {
905 for (int i=0;i<qloci.Count();i++) {
906 if (loc==qloci[i]) return;
907 }
908 qloci.Add(loc);
909 }
910 for (int m=0;m<loc->mrnas.Count();m++) {
911 GffObj* mrna=loc->mrnas[m];
912 Add(mrna);
913 if (start==0 || start>mrna->start) start=mrna->start;
914 if (end<mrna->end) end=mrna->end;
915 if (mrna_maxcov==NULL || mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
916 if (mrna_maxscore==NULL || mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
917 }
918 }
919 };
920
921 //track a set of clustered qloci across multiple qry datasets
922 // the qloci in qcls[] overlap but not necessarily at exon level
923 // (so there can be multiple genes here in fact)
924 class GTrackLocus:public GSeg {
925 public:
926 char strand;
927 bool hasQloci;
928 //GLocus* rloc; //corresponding reference locus, if available
929 GList<GLocus> rloci; //ref loci found overlapping this region
930 GQCluster* qcls[MAX_QFILES]; //all qloci for this superlocus, grouped by dataset
931 GTrackLocus(GLocus* qloc=NULL, int q=-1):GSeg(0,0),rloci(true,false,true) {
932 strand='.';
933 for (int i=0;i<MAX_QFILES;i++) qcls[i]=NULL;
934 if (qloc!=NULL) addQLocus(qloc,q);
935 }
936
937 void addRLocus(GLocus* rl) {
938 if (rl==NULL) return;
939 if (rl->qfidx>=0)
940 GError("Error: GTrackLocus::addRLocus called with a query locus (set# %d)\n",
941 rl->qfidx+1);
942 if (strand=='.') strand=rl->mrna_maxcov->strand;
943 if (start==0 || start>rl->start) start=rl->start;
944 if (end==0 || end<rl->end) end=rl->end;
945 rl->t_ptr=this;
946 rloci.Add(rl);
947 }
948
949 void addQLocus(GLocus* loc, int q=-1) { //adding qry locus
950 if (loc==NULL) return;
951 if (strand=='.' && loc->mrna_maxcov->strand!='.')
952 strand=loc->mrna_maxcov->strand;
953 if (loc->qfidx<0 && q<0)
954 GError("Error at GTrackLocus::addQLocus(): locus.qfidx not set and index not given!\n");
955 if (q>=0) loc->qfidx=q;
956 else q=loc->qfidx;
957 if (start==0 || start>loc->start) start=loc->start;
958 if (end==0 || end<loc->end) end=loc->end;
959 if (qcls[q]==NULL) qcls[q]=new GQCluster();
960 hasQloci=true;
961 loc->t_ptr = this;
962 qcls[q]->addLocus(loc);
963 }
964
965 bool add_Locus(GLocus* loc) {
966 if (start==0 || overlap(*loc)) { //simple range overlap, not exon overlap
967 if (loc->qfidx<0) addRLocus(loc);
968 else addQLocus(loc);
969 return true;
970 }
971 return false;
972 }
973
974
975 void addQCl(int q, GQCluster* qcl, GLocus* lnkloc) {
976 for (int i=0;i<qcl->qloci.Count();i++) {
977 GLocus* loc=qcl->qloci[i];
978 if (loc==lnkloc || loc->t_ptr==this) continue;
979 hasQloci=true;
980 loc->t_ptr=this;
981 qcls[q]->addLocus(loc);
982 }
983 }
984
985 void addMerge(GTrackLocus* loctrack, int qcount, GLocus* lnkloc) {
986 if (loctrack==NULL) return;
987 //merge qloci
988 for (int q=0; q < qcount; q++) {
989 if (qcls[q]==NULL) {
990 if (loctrack->qcls[q]!=NULL) {
991 qcls[q]=loctrack->qcls[q];
992 loctrack->qcls[q]=NULL; //just move pointer here
993 //set all t_ptr pointers for moved loci
994 for (int ql = 0; ql < qcls[q]->qloci.Count(); ql++) {
995 qcls[q]->qloci[ql]->t_ptr=this;
996 }
997 hasQloci=true;
998 }
999 }
1000 else //existing qloci at q
1001 if (loctrack->qcls[q]!=NULL) { //merge elements
1002 addQCl(q, loctrack->qcls[q], lnkloc);
1003 }
1004 }//for each qset
1005 //merge rloci, if any
1006 if (loctrack->rloci.Count()>0) {
1007 for (int l=0;l<loctrack->rloci.Count();l++) {
1008 if (loctrack->rloci[l]!=lnkloc && loctrack->rloci[l]->t_ptr!=this) {
1009 rloci.Add(loctrack->rloci[l]);
1010 loctrack->rloci[l]->t_ptr=this;
1011 }
1012 }
1013 }
1014 if (loctrack->start<start) start=loctrack->start;
1015 if (loctrack->end>end) end=loctrack->end;
1016 if (strand=='.' && loctrack->strand!='.') strand=loctrack->strand;
1017 }
1018
1019 /*
1020 void add_QLoci(GList<GLocus>* loci, int q, GLocus& r) {
1021 // only add loci overlapping given refloc
1022 //rloc=&r;
1023 if (loci==NULL) return;
1024 for (int i=0;i<loci->Count();i++) {
1025 GLocus* loc=loci->Get(i);
1026 // if (!loc->exonOverlap(r)) continue; //do we really needed exon overlap?
1027 if (!loc->overlap(r)) continue;
1028 if (start==0 || start>loc->start) start=loc->start;
1029 if (end==0 || end<loc->end) end=loc->end;
1030 loc->t_ptr=this;
1031 loc->qfidx=q;
1032 if (qcls[q]==NULL) qcls[q]=new GQCluster();
1033 qcls[q]->addLocus(loc);
1034 }
1035 strand=r.mrnas[0]->strand;
1036 }
1037 */
1038 ~GTrackLocus() {
1039 for (int q=0;q<MAX_QFILES;q++)
1040 if (qcls[q]!=NULL) { delete qcls[q]; qcls[q]=NULL; }
1041 }
1042
1043 GQCluster* operator[](int q) {
1044 if (q<0 || q>=MAX_QFILES)
1045 GError("Error: qfidx index out of bounds (%d) for GTrackLocus!\n",q);
1046 return qcls[q];
1047 }
1048 };
1049
1050 class GXConsensus:public GSeg {
1051 public:
1052 static int count;
1053 int id; //XConsensus ID
1054 int tss_id; //group id for those xconsensi with shared first exon
1055 int p_id; //group id for those xconsensi with "similar" protein
1056 GffObj* tcons; //longest transcript to represent the combined "consensus" structure
1057 GffObj* ref; //overlapping reference transcript
1058 char refcode; // the code for ref relationship (like in the tracking file)
1059 char* aa;
1060 int aalen;
1061 GXConsensus* contained; //if contained into another GXConsensus
1062 //list of ichain-matching query (cufflinks) transcripts that contributed to this consensus
1063 GList<GffObj> qchain;
1064 GXConsensus(GffObj* c, CEqList* qlst, GffObj* r=NULL, char rcode=0)
1065 :qchain(false,false,false) {
1066 ref=r;
1067 refcode=rcode;
1068 tcons=c;
1069 if (qlst!=NULL) qchain.Add(*((GList<GffObj>*)qlst));
1070 else qchain.Add(c);
1071 count++;
1072 tss_id=0;
1073 p_id=0;
1074 aalen=0;
1075 id=count;
1076 aa=NULL;
1077 start=tcons->start;
1078 end=tcons->end;
1079 contained=NULL;
1080 }
1081 ~GXConsensus() {
1082 if (aa!=NULL) GFREE(aa);
1083 }
1084 };
1085
1086 class GXLocus:public GSeg {
1087 public:
1088 int id;
1089 int num_mtcons; //number of multi-exon "consensus" transcripts in this locus
1090 char strand;
1091 GList<GLocus> rloci; //list of ref loci overlapping any of the mexons
1092 GList<GLocus> qloci; //loci from all qry datasets that have overlapping exons with this region
1093 GArray<GSeg> mexons; //list of merged exonic regions for this locus
1094 GList<GXConsensus> tcons;
1095 GXLocus(GLocus* lfirst=NULL):GSeg(0,0),
1096 rloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1097 qloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1098 mexons(true,true), tcons(true,true,false) {
1099 strand='.';
1100 num_mtcons=0;
1101 if (lfirst!=NULL) {
1102 add_Locus(lfirst);
1103 }
1104 id=0;
1105 }
1106
1107 bool add_Locus(GLocus* loc) {
1108 if (mexons.Count()>0 && (end<loc->start || start > loc->end))
1109 return false; //no chance for overlapping exons
1110 if (mexons.Count()==0) {
1111 mexons.Add(loc->mexons);
1112 start=loc->start;
1113 end=loc->end;
1114 if (loc->qfidx<0) rloci.Add(loc);
1115 else qloci.Add(loc);
1116 strand=loc->mrna_maxcov->strand;
1117 loc->xlocus=this;
1118 return true;
1119 }
1120 int f=0;
1121 if (loc->qfidx<0) {
1122 if (rloci.Found(loc,f)) return false;
1123 }
1124 else if (qloci.Found(loc,f)) return false;
1125
1126 // -- merge mexons
1127 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
1128 int i=0; //index of first mexons with a merge
1129 int j=0; //index current mrna exon
1130 while (i<mexons.Count() && j<loc->mexons.Count()) {
1131 uint istart=mexons[i].start;
1132 uint iend=mexons[i].end;
1133 uint jstart=loc->mexons[j].start;
1134 uint jend=loc->mexons[j].end;
1135 if (iend<jstart) { i++; continue; }
1136 if (jend<istart) { j++; continue; }
1137 //if (mexons[i].overlap(jstart, jend)) {
1138 //exon overlap was found :
1139 ovlexons.Add(j);
1140 //extend mexons[i] as needed
1141 if (jstart<istart) mexons[i].start=jstart;
1142 if (jend>iend) { //mexons[i] end extend
1143 mexons[i].end=jend;
1144 //now this could overlap the next mexon(s), so we have to merge them all
1145 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1146 uint nextend=mexons[i+1].end;
1147 mexons.Delete(i+1);
1148 if (nextend>mexons[i].end) {
1149 mexons[i].end=nextend;
1150 break; //no need to check next mexons
1151 }
1152 } //while next mexons merge
1153 } // mexons[i] end extend
1154 // } //exon overlap
1155 j++; //check the next locus.mexon
1156 }//while mexons
1157 if (ovlexons.Count()==0) return false;
1158 if (strand=='.' && loc->mrna_maxcov->strand!='.')
1159 strand=loc->mrna_maxcov->strand;
1160 //have exon overlap:
1161 //-- add the rest of the non-overlapping mexons:
1162 GSeg seg;
1163 for (int i=0;i<loc->mexons.Count();i++) {
1164 seg.start=loc->mexons[i].start;
1165 seg.end=loc->mexons[i].end;
1166 if (!ovlexons.Exists(i)) mexons.Add(seg);
1167 }
1168 // -- adjust start/end as needed
1169 if (start>loc->start) start=loc->start;
1170 if (end<loc->end) end=loc->end;
1171 loc->xlocus=this;
1172 if (loc->qfidx<0) rloci.Add(loc);
1173 else qloci.Add(loc);
1174 return true;
1175 }
1176
1177 void addMerge(GXLocus& oxloc) {
1178 GArray<int> ovlexons(true,true); //list of oxloc.mexons indexes overlapping existing mexons
1179 int i=0; //index of first mexons with a merge
1180 int j=0; //index current mrna exon
1181 while (i<mexons.Count() && j<oxloc.mexons.Count()) {
1182 uint istart=mexons[i].start;
1183 uint iend=mexons[i].end;
1184 uint jstart=oxloc.mexons[j].start;
1185 uint jend=oxloc.mexons[j].end;
1186 if (iend<jstart) { i++; continue; }
1187 if (jend<istart) { j++; continue; }
1188 //if (mexons[i].overlap(jstart, jend)) {
1189 //exon overlap was found :
1190 ovlexons.Add(j);
1191 //extend mexons[i] as needed
1192 if (jstart<istart) mexons[i].start=jstart;
1193 if (jend>iend) { //mexons[i] end extend
1194 mexons[i].end=jend;
1195 //now this could overlap the next mexon(s), so we have to merge them all
1196 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1197 uint nextend=mexons[i+1].end;
1198 mexons.Delete(i+1);
1199 if (nextend>mexons[i].end) {
1200 mexons[i].end=nextend;
1201 break; //no need to check next mexons
1202 }
1203 } //while next mexons merge
1204 } // mexons[i] end extend
1205 // } //exon overlap
1206 j++; //check the next oxloc.mexon
1207 }
1208 if (ovlexons.Count()==0) {
1209 GError("Error: attempt to merge GXLoci with non-overlapping exons!\n");
1210 }
1211 //-- add the rest of the non-overlapping mexons:
1212 GSeg seg;
1213 for (int i=0;i<oxloc.mexons.Count();i++) {
1214 seg.start=oxloc.mexons[i].start;
1215 seg.end=oxloc.mexons[i].end;
1216 if (!ovlexons.Exists(i)) mexons.Add(seg);
1217 }
1218 if (start>oxloc.start) start=oxloc.start;
1219 if (end<oxloc.end) end=oxloc.end;
1220 if (strand=='.') strand=oxloc.strand;
1221 //-- steal all qloci and rloci
1222 for (int i=0;i<oxloc.qloci.Count();i++) {
1223 if (oxloc.qloci[i]->xlocus==this) continue;
1224 qloci.Add(oxloc.qloci[i]);
1225 oxloc.qloci[i]->xlocus=this;
1226 }
1227 for (int i=0;i<oxloc.rloci.Count();i++) {
1228 if (oxloc.rloci[i]->xlocus==this) continue;
1229 rloci.Add(oxloc.rloci[i]);
1230 oxloc.rloci[i]->xlocus=this;
1231 }
1232 } //::addMerge()
1233
1234
1235 void checkContainment() {
1236 //checking containment
1237 for (int j=0;j<tcons.Count()-1;j++) {
1238 GXConsensus* t=tcons[j];
1239 for (int i=j+1;i<tcons.Count();i++) {
1240 if (tcons[i]->contained!=NULL && t->tcons->exons.Count()>1) continue; //will check the container later anyway
1241 int c_status=checkXConsContain(t->tcons, tcons[i]->tcons);
1242 if (c_status==0) continue; //no containment relationship between t and tcons[i]
1243 if (c_status>0) { //t is a container for tcons[i]
1244 tcons[i]->contained=t;
1245 }
1246 else { //contained into exising XCons
1247 t->contained=tcons[i];
1248 break;
1249 }
1250 }
1251 }
1252 }
1253
1254 int checkXConsContain(GffObj* a, GffObj* b) {
1255 // returns 1 if a is the container of b
1256 // -1 if a is contained in b
1257 // 0 if no
1258 if (a->end<b->start || b->end<a->start) return 0;
1259 if (a->exons.Count()==b->exons.Count()) {
1260 if (a->exons.Count()>1) return 0; //same number of exons - no containment possible
1261 //because equivalence was already tested
1262 else { //single exon containment testing
1263 //this is fuzzy and messy (end result may vary depending on the testing order)
1264 int ovlen=a->exons[0]->overlapLen(b->exons[0]);
1265 int minlen=GMIN(a->covlen, b->covlen);
1266 if (ovlen>=minlen*0.8) { //if at least 80% of the shorter one is covered, it is contained
1267 return ((a->covlen>b->covlen) ? 1 : -1);
1268 }
1269 else return 0;
1270 //if (a->start<=b->start+10 && a->end+10>=b->end) return 1;
1271 // else { if (b->start<=a->start+10 && b->end+10>=a->end) return -1;
1272 // else return 0;
1273 //}
1274 }
1275 }
1276 //different number of exons:
1277 if (a->exons.Count()>b->exons.Count()) return t_contains(*a, *b) ? 1:0;
1278 else return t_contains(*b, *a) ? -1 : 0;
1279 }
1280
1281 void addXCons(GXConsensus* t) {
1282 tcons.Add(t);
1283 }
1284
1285 }; //GXLocus
1286
1287
1288
1289 int parse_mRNAs(GfList& mrnas,
1290 GList<GSeqData>& glstdata,
1291 bool is_ref_set=true,
1292 bool check_for_dups=false,
1293 int qfidx=-1, bool only_multiexon=false);
1294
1295 //reading a mRNAs from a gff file and grouping them into loci
1296 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data=NULL,
1297 bool check_for_dups=false, int qfidx=-1, const char* fname=NULL,
1298 bool only_multiexon=false);
1299
1300 void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs=true);
1301 void sort_GSeqs_byName(GList<GSeqData>& seqdata);
1302
1303
1304 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl=false, bool contain_only=false);
1305
1306 //use qsearch to "position" a given coordinate x within a list of transcripts sorted
1307 //by their start (lowest) coordinate; the returned int is the list index of the
1308 //closest GffObj starting just *ABOVE* coordinate x
1309 //Convention: returns -1 if there is no such GffObj (i.e. last GffObj start <= x)
1310 int qsearch_mrnas(uint x, GList<GffObj>& mrnas);
1311 int qsearch_loci(uint x, GList<GLocus>& segs); // same as above, but for GSeg lists
1312
1313 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data); //returns reference GSeqData for a specific genomic sequence
1314
1315 #endif

Properties

Name Value
svn:executable *