ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/gtf_tracking.h
Revision: 118
Committed: Sun Nov 27 21:14:57 2011 UTC (8 years, 4 months ago) by gpertea
File size: 45593 byte(s)
Log Message:
fix locus Sn calculation

Line User Rev File contents
1 gpertea 20 #ifndef GTF_TRACKING_H
2     #define GTF_TRACKING_H
3     /*
4     * gtf_tracking.h
5     * cufflinks
6     *
7     * Created by Cole Trapnell on 9/5/09.
8     * Copyright 2009 Geo Pertea. All rights reserved.
9     *
10     */
11    
12     #include "gff.h"
13     #include "GFaSeqGet.h"
14     #include "GFastaIndex.h"
15     #include "GStr.h"
16    
17    
18     #define MAX_QFILES 500
19    
20     extern bool gtf_tracking_verbose;
21    
22     extern bool gtf_tracking_largeScale;
23     //many input files, no accuracy stats are generated, no *.tmap
24     // and exon attributes are discarded
25    
26     int cmpByPtr(const pointer p1, const pointer p2);
27    
28     bool t_contains(GffObj& a, GffObj& b);
29     //returns true only IF b has fewer exons than a AND a "contains" b
30    
31     char* getGSeqName(int gseq_id);
32    
33     //genomic fasta sequence handling
34     class GFastaHandler {
35     public:
36     char* fastaPath;
37     GFastaIndex* faIdx;
38     char* getFastaFile(int gseq_id) {
39     if (fastaPath==NULL) return NULL;
40     GStr s(fastaPath);
41     s.trimR('/');
42     s.appendfmt("/%s",getGSeqName(gseq_id));
43     GStr sbase(s);
44     if (!fileExists(s.chars())) s.append(".fa");
45     if (!fileExists(s.chars())) s.append("sta");
46     if (fileExists(s.chars())) return Gstrdup(s.chars());
47     else {
48     GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
49     return NULL;
50     }
51     }
52    
53     GFastaHandler(const char* fpath=NULL) {
54     fastaPath=NULL;
55     faIdx=NULL;
56     if (fpath!=NULL && fpath[0]!=0) init(fpath);
57     }
58    
59     void init(const char* fpath) {
60     if (fpath==NULL || fpath[0]==0) return;
61     if (!fileExists(fpath))
62     GError("Error: file/directory %s does not exist!\n",fpath);
63     fastaPath=Gstrdup(fpath);
64     if (fastaPath!=NULL) {
65     if (fileExists(fastaPath)>1) { //exists and it's not a directory
66     GStr fainame(fastaPath);
67     //the .fai name might have been given directly
68     if (fainame.rindex(".fai")==fainame.length()-4) {
69     //.fai index file given directly
70     fastaPath[fainame.length()-4]=0;
71     if (!fileExists(fastaPath))
72     GError("Error: cannot find fasta file for index %s !\n", fastaPath);
73     }
74     else fainame.append(".fai");
75     //fainame.append(".fai");
76     faIdx=new GFastaIndex(fastaPath,fainame.chars());
77     GStr fainamecwd(fainame);
78     int ip=-1;
79     if ((ip=fainamecwd.rindex('/'))>=0)
80     fainamecwd.cut(0,ip+1);
81     if (!faIdx->hasIndex()) { //could not load index
82     //try current directory
83     if (fainame!=fainamecwd) {
84     if (fileExists(fainamecwd.chars())>1) {
85     faIdx->loadIndex(fainamecwd.chars());
86     }
87     }
88     } //tried to load index
89     if (!faIdx->hasIndex()) {
90     GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
91     faIdx->buildIndex();
92     if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
93     GMessage("Fasta index rebuilt.\n");
94     FILE* fcreate=fopen(fainame.chars(), "w");
95     if (fcreate==NULL) {
96     GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
97     if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
98     if (fcreate==NULL)
99     GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
100     }
101     if (faIdx->storeIndex(fcreate)<faIdx->getCount())
102     GMessage("Warning: error writing the index file!\n");
103     } //index created and attempted to store it
104     } //multi-fasta
105     } //genomic sequence given
106     }
107     GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
108     if (fastaPath==NULL) return NULL;
109     //genomic sequence given
110     GFaSeqGet* faseq=NULL;
111     if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
112     char* gseqname=getGSeqName(gseq_id);
113     GFastaRec* farec=faIdx->getRecord(gseqname);
114     if (farec!=NULL) {
115     faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
116     farec->line_len, farec->line_blen);
117     faseq->loadall(); //just cache the whole sequence, it's faster
118     }
119     else {
120     GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
121     return NULL;
122     }
123     }
124     else //if (fileExists(fastaPath)==1)
125     {
126     char* sfile=getFastaFile(gseq_id);
127     if (sfile!=NULL) {
128     //if (gtf_tracking_verbose)
129     // GMessage("Processing sequence from fasta file '%s'\n",sfile);
130     faseq=new GFaSeqGet(sfile,checkFasta);
131     faseq->loadall();
132     GFREE(sfile);
133     }
134     } //one fasta file per contig
135     return faseq;
136     }
137    
138     ~GFastaHandler() {
139     GFREE(fastaPath);
140     delete faIdx;
141     }
142     };
143    
144    
145    
146     bool betterRef(GffObj* a, GffObj* b); //for better CovLink reference ranking
147    
148     class GLocus;
149    
150     class COvLink {
151     public:
152     static int coderank(char c) {
153     switch (c) {
154     case '=': return 0; //ichain match
155     case 'c': return 2; //containment (ichain fragment)
156     case 'j': return 4; // overlap with at least a junction match
157     case 'e': return 6; // single exon transfrag overlapping an intron of reference (possible pre-mRNA)
158     case 'o': return 8; // generic exon overlap
159     case 's': return 16; //"shadow" - an intron overlaps with a ref intron on the opposite strand
160     case 'x': return 18; // exon overlap on opposite strand (usually wrong strand mapping)
161     case 'i': return 20; // intra-intron
162     case 'p': return 90; //polymerase run
163     case 'r': return 92; //repeats
164     case 'u': return 94; //intergenic
165     case 0 : return 100;
166     default: return 96;
167     }
168     }
169     char code;
170     int rank;
171     GffObj* mrna;
172     int ovlen;
173     COvLink(char c=0,GffObj* m=NULL, int ovl=0) {
174     code=c;
175     mrna=m;
176     ovlen=ovl;
177     rank=coderank(c);
178     }
179     bool operator<(COvLink& b) {
180     if (rank==b.rank)
181     return (ovlen==b.ovlen)? betterRef(mrna, b.mrna) : (ovlen>b.ovlen);
182     else return rank<b.rank;
183     }
184     bool operator>(COvLink& b) {
185     if (rank==b.rank)
186     return (ovlen==b.ovlen)? betterRef(b.mrna, mrna) : (ovlen<b.ovlen);
187     else return rank>b.rank;
188     }
189     bool operator==(COvLink& b) {
190     return (rank==b.rank && mrna==b.mrna);
191     }
192     };
193    
194     class GISeg: public GSeg {
195     public:
196     GffObj* t; //pointer to the largest transcript with a segment this exact exon coordinates
197     GISeg(uint s=0,uint e=0, GffObj* ot=NULL):GSeg(s,e) { t=ot; }
198     };
199    
200     class GIArray:public GArray<GISeg> {
201     public:
202     GIArray(bool uniq=true):GArray<GISeg>(true,uniq) { }
203     int IAdd(GISeg* item) {
204     if (item==NULL) return -1;
205     int result=-1;
206     if (Found(*item, result)) {
207     if (fUnique) {
208     //cannot add a duplicate, return index of existing item
209     if (item->t!=NULL && fArray[result].t!=NULL &&
210     item->t->covlen>fArray[result].t->covlen)
211     fArray[result].t=item->t;
212     return result;
213     }
214     }
215     //Found sets result to the position where the item should be
216     idxInsert(result, *item);
217     return result;
218     }
219    
220     };
221    
222     class CEqList: public GList<GffObj> {
223     public:
224     GffObj* head;
225     CEqList():GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true) {
226     head=NULL;
227     }
228     };
229    
230     class CTData { //transcript associated data
231     public:
232     GffObj* mrna; //owner transcript
233     GLocus* locus;
234     GList<COvLink> ovls; //overlaps with other transcripts (ref vs query)
235     //-- just for ichain match tracking:
236     GffObj* eqref; //ref transcript having an ichain match
237     int qset; //qry set index (qfidx), -1 means reference dataset
238     //GffObj* eqnext; //next GffObj in the linked list of matching transfrags
239     CEqList* eqlist; //keep track of matching transfrags
240     //int eqdata; // flags for EQ list (is it a list head?)
241     // Cufflinks specific data:
242     double FPKM;
243     double conf_hi;
244     double conf_lo;
245     double cov;
246     char classcode; //the best/final classcode
247     CTData(GffObj* m=NULL, GLocus* l=NULL):ovls(true,true,true) {
248     mrna=m;
249     if (mrna!=NULL) mrna->uptr=this;
250     locus=l;
251     classcode=0;
252     eqref=NULL;
253     //eqnext=NULL;
254     eqlist=NULL;
255     //eqdata=0;
256     qset=-2;
257     FPKM=0;
258     conf_lo=0;
259     conf_hi=0;
260     cov=0;
261     }
262    
263     ~CTData() {
264     ovls.Clear();
265     //if ((eqdata & EQHEAD_TAG)!=0) delete eqlist;
266     if (isEqHead()) delete eqlist;
267     }
268    
269     //inline bool eqHead() { return ((eqdata & EQHEAD_TAG)!=0); }
270     bool isEqHead() {
271     if (eqlist==NULL) return false;
272     return (eqlist->head==this->mrna);
273     }
274    
275     void joinEqList(GffObj* m) { //add list from m
276     //list head is set to the transfrag with the lower qset#
277     CTData* md=(CTData*)(m->uptr);
278     //ASSERT(md);
279     if (eqlist==NULL) {
280     if (md->eqlist!=NULL) {
281     eqlist=md->eqlist;
282     eqlist->Add(this->mrna);
283     CTData* md_head_d=(CTData*)(md->eqlist->head->uptr);
284     if (this->qset < md_head_d->qset)
285     eqlist->head=this->mrna;
286     }
287     else { //m was not in an EQ list
288     //eqlist=new GList<GffObj>((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true);
289     eqlist=new CEqList();
290     eqlist->Add(this->mrna);
291     eqlist->Add(m);
292     md->eqlist=eqlist;
293     if (qset<md->qset) eqlist->head=this->mrna;
294     else eqlist->head=m;
295     }
296     }//no eqlist before
297     else { //merge two eqlists
298     if (eqlist==md->eqlist) //already in the same eqlist, nothing to do
299     return;
300     if (md->eqlist!=NULL) { //copy elements of m's eqlist
301     //copy the smaller list into the larger one
302     CEqList* srclst, *destlst;
303     if (md->eqlist->Count()<eqlist->Count()) {
304     srclst=md->eqlist;
305     destlst=eqlist;
306     }
307     else {
308     srclst=eqlist;
309     destlst=md->eqlist;
310     }
311     for (int i=0;i<srclst->Count();i++) {
312     destlst->Add(srclst->Get(i));
313     CTData* od=(CTData*)((*srclst)[i]->uptr);
314     od->eqlist=destlst;
315     //od->eqdata=od->qset+1;
316     }
317     this->eqlist=destlst;
318     CTData* s_head_d=(CTData*)(srclst->head->uptr);
319     CTData* d_head_d=(CTData*)(destlst->head->uptr);
320     if (s_head_d->qset < d_head_d->qset )
321     this->eqlist->head=srclst->head;
322     delete srclst;
323     }
324     else { //md->eqlist==NULL
325     eqlist->Add(m);
326     md->eqlist=eqlist;
327     CTData* head_d=(CTData*)(eqlist->head->uptr);
328     if (md->qset<head_d->qset)
329     eqlist->head=m;
330     }
331     }
332     }
333    
334     void addOvl(char code,GffObj* target=NULL, int ovlen=0) {
335     ovls.AddIfNew(new COvLink(code, target, ovlen));
336     }
337     char getBestCode() {
338     return (ovls.Count()>0) ? ovls[0]->code : 0 ;
339     }
340     bool operator>(CTData& b) { return (mrna > b.mrna); }
341     bool operator<(CTData& b) { return (mrna < b.mrna); }
342     bool operator==(CTData& b) { return (mrna==b.mrna); }
343     };
344    
345     class GSuperLocus;
346     class GTrackLocus;
347     class GXLocus;
348    
349     //Data structure holding a query locus data (overlapping mRNAs on the same strand)
350     // and also the accuracy data of all mRNAs of a query locus
351     // (against all reference loci overlapping the same region)
352     class GLocus:public GSeg {
353     public:
354     int gseq_id; //id of underlying genomic sequence
355     int qfidx; // for locus tracking
356     GTrackLocus* t_ptr; //for locus tracking cluster
357     GffObj* mrna_maxcov; //transcript with maximum coverage (for main "ref" transcript)
358     GffObj* mrna_maxscore; //transcript with maximum gscore (for major isoform)
359     GList<GffObj> mrnas; //list of transcripts (isoforms) for this locus
360     GArray<GSeg> uexons; //list of unique exons (covered segments) in this region
361     GArray<GSeg> mexons; //list of merged exons in this region
362     GIArray introns;
363     GList<GLocus> cmpovl; //temp list of overlapping qry/ref loci to compare to (while forming superloci)
364    
365     //only for reference loci --> keep track of all superloci found for each qry dataset
366     // which contain this reference locus
367     GList<GSuperLocus>* superlst;
368     GXLocus* xlocus; //superlocus formed by exon overlaps across all qry datasets
369     // -- if genomic sequence was given:
370     int spl_major; // number of GT-AG splice site consensi
371     int spl_rare; // number of GC-AG, AT-AC and other rare splice site consensi
372     int spl_wrong; //number of "wrong" (unrecognized) splice site consensi
373     int ichains; //number of multi-exon mrnas
374     int ichainTP;
375     int ichainATP;
376     int mrnaTP;
377     int mrnaATP;
378     int v; //user flag/data
379     GLocus(GffObj* mrna=NULL, int qidx=-1):mrnas(true,false,false),uexons(true,true),mexons(true,true),
380     introns(), cmpovl(true,false,true) {
381     //this will NOT free mrnas!
382     ichains=0;
383     gseq_id=-1;
384     qfidx=qidx;
385     t_ptr=NULL;
386     creset();
387     xlocus=NULL;
388     mrna_maxcov=NULL;
389     mrna_maxscore=NULL;
390     superlst=new GList<GSuperLocus>(true,false,false);
391     if (mrna!=NULL) {
392     start=mrna->exons.First()->start;
393     end=mrna->exons.Last()->end;;
394     gseq_id=mrna->gseq_id;
395     GISeg seg;
396     for (int i=0;i<mrna->exons.Count();i++) {
397     seg.start=mrna->exons[i]->start;
398     seg.end=mrna->exons[i]->end;
399     uexons.Add(seg);
400     mexons.Add(seg);
401     if (i>0) {
402     seg.start=mrna->exons[i-1]->end+1;
403     seg.end=mrna->exons[i]->start-1;
404     seg.t=mrna;
405     introns.Add(seg);
406     }
407     }
408     mrnas.Add(mrna);
409     if (mrna->exons.Count()>1) ichains++;
410     ((CTData*)(mrna->uptr))->locus=this;
411     mrna_maxscore=mrna;
412     mrna_maxcov=mrna;
413     }
414     }
415     ~GLocus() {
416     delete superlst;
417     }
418     void creset() {
419     spl_major=0;spl_rare=0;spl_wrong=0;
420     v=0; //visited/other data
421     ichainTP=0;
422     ichainATP=0;
423     mrnaTP=0;
424     mrnaATP=0;
425     cmpovl.Clear();
426     }
427    
428     void addMerge(GLocus& locus, GffObj* lnkmrna) {
429     //add all the elements of the other locus (merging)
430     //-- merge mexons
431     GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
432     int i=0; //index of first mexons with a merge
433     int j=0; //index current mrna exon
434     while (i<mexons.Count() && j<locus.mexons.Count()) {
435     uint istart=mexons[i].start;
436     uint iend=mexons[i].end;
437     uint jstart=locus.mexons[j].start;
438     uint jend=locus.mexons[j].end;
439     if (iend<jstart) { i++; continue; }
440     if (jend<istart) { j++; continue; }
441     //if (mexons[i].overlap(jstart, jend)) {
442     //exon overlap was found :
443     ovlexons.Add(j);
444     //extend mexons[i] as needed
445     if (jstart<istart) mexons[i].start=jstart;
446     if (jend>iend) { //mexons[i] end extend
447     mexons[i].end=jend;
448     //now this could overlap the next mexon(s), so we have to merge them all
449     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
450     uint nextend=mexons[i+1].end;
451     mexons.Delete(i+1);
452     if (nextend>mexons[i].end) {
453     mexons[i].end=nextend;
454     break; //no need to check next mexons
455     }
456     } //while next mexons merge
457     } // mexons[i] end extend
458     // } //exon overlap
459     j++; //check the next locus.mexon
460     }
461     //-- add the rest of the non-overlapping mexons:
462     GSeg seg;
463     for (int i=0;i<locus.mexons.Count();i++) {
464     seg.start=locus.mexons[i].start;
465     seg.end=locus.mexons[i].end;
466     if (!ovlexons.Exists(i)) mexons.Add(seg);
467     }
468     // -- merge uexons
469     //add to uexons:
470     for (int i=0;i<locus.uexons.Count();i++) {
471     uexons.Add(locus.uexons[i]);
472     }
473     for (int i=0;i<locus.introns.Count();i++) {
474     introns.IAdd(&(locus.introns[i]));
475     }
476    
477     // -- add locus.mrnas
478     for (int i=0;i<locus.mrnas.Count();i++) {
479     ((CTData*)(locus.mrnas[i]->uptr))->locus=this;
480     if (locus.mrnas[i]!=lnkmrna) {
481     mrnas.Add(locus.mrnas[i]);
482     if (locus.mrnas[i]->exons.Count()>1) ichains++;
483     }
484     }
485     // -- adjust start/end as needed
486     if (start>locus.start) start=locus.start;
487     if (end<locus.end) end=locus.end;
488     if (mrna_maxcov->covlen<locus.mrna_maxcov->covlen)
489     mrna_maxcov=locus.mrna_maxcov;
490     if (mrna_maxscore->gscore<locus.mrna_maxscore->gscore)
491     mrna_maxscore=locus.mrna_maxscore;
492     }
493    
494    
495     bool exonOverlap(GLocus& loc) {
496     //check if any mexons overlap!
497     int i=0;
498     int j=0;
499     while (i<mexons.Count() && j<loc.mexons.Count()) {
500     uint istart=mexons[i].start;
501     uint iend=mexons[i].end;
502     uint jstart=loc.mexons[j].start;
503     uint jend=loc.mexons[j].end;
504     if (iend<jstart) { i++; continue; }
505     if (jend<istart) { j++; continue; }
506     //exon overlap found if we're here:
507     return true;
508     }
509     return false;
510     }
511    
512     bool add_mRNA(GffObj* mrna) {
513     if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq
514     //check for exon overlap with existing mexons
515     //also update uexons and mexons accordingly, if mrna is added
516     uint mrna_start=mrna->exons.First()->start;
517     uint mrna_end=mrna->exons.Last()->end;
518     if (mrna_start>end || start>mrna_end) return false;
519     bool hasovl=false;
520     int i=0; //index of first mexons with a merge
521     int j=0; //index current mrna exon
522     GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
523     while (i<mexons.Count() && j<mrna->exons.Count()) {
524     uint istart=mexons[i].start;
525     uint iend=mexons[i].end;
526     uint jstart=mrna->exons[j]->start;
527     uint jend=mrna->exons[j]->end;
528     if (iend<jstart) { i++; continue; }
529     if (jend<istart) { j++; continue; }
530     //exon overlap found if we're here:
531     ovlexons.Add(j);
532     hasovl=true;
533     //extend mexons[i] as needed
534     if (jstart<istart) mexons[i].start=jstart;
535     if (jend>iend) { //mexon stretch up
536     mexons[i].end=jend;
537     //now this could overlap the next mexon(s), so we have to merge them all
538     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
539     uint nextend=mexons[i+1].end;
540     mexons.Delete(i+1);
541     if (nextend>mexons[i].end) {
542     mexons[i].end=nextend;
543     break; //no need to check next mexons
544     }
545     } //while next mexons merge
546     } //possible mexons merge
547    
548     j++; //check the next mrna exon
549     }//all vs all exon check loop
550     if (hasovl) {
551     GSeg seg;
552     //add the rest of the non-overlapping exons,
553     // and also to uexons etc.
554     for (int i=0;i<mrna->exons.Count();i++) {
555     seg.start=mrna->exons[i]->start;
556     seg.end=mrna->exons[i]->end;
557     if (!ovlexons.Exists(i)) mexons.Add(seg);
558     uexons.Add(seg);
559     GISeg iseg;
560     if (i>0) {
561     iseg.start=mrna->exons[i-1]->end+1;
562     iseg.end=mrna->exons[i]->start-1;
563     iseg.t=mrna;
564     introns.Add(iseg);
565     }
566     }
567    
568     mrnas_add(mrna);
569     // add to mrnas
570     ((CTData*)mrna->uptr)->locus=this;
571     gseq_id=mrna->gseq_id;
572     if (mrna->exons.Count()>1) ichains++;
573     }
574     return hasovl;
575     }
576    
577     //simpler,basic adding of a mrna
578     void mrnas_add(GffObj* mrna) {
579     mrnas.Add(mrna);
580     // adjust start/end
581     if (start>mrna->start) start=mrna->start;
582     if (end<mrna->end) end=mrna->end;
583     if (mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
584     if (mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
585     }
586     };
587    
588     class GSuperLocus;
589     class GTrackLocus;
590    
591     class GSuperLocus : public GSeg {
592     public:
593     int qfidx; //index of query dataset/file for which this superlocus was created
594     GList<GLocus> qloci;
595     GList<GLocus> rloci;
596     GList<GffObj> qmrnas; //list of transcripts (isoforms) for this locus
597     GArray<GSeg> qmexons; //list of merged exons in this region
598     GArray<GSeg> quexons; //list of unique exons (covered segments) in this region
599     GIArray qintrons; //list of unique exons (covered segments) in this region
600     //same lists for reference:
601     GList<GffObj> rmrnas; //list of transcripts (isoforms) for this locus
602     GArray<GSeg> rmexons; //list of merged exons in this region
603     GArray<GSeg> ruexons; //list of unique exons (covered segments) in this region
604     GArray<GISeg> rintrons; //list of unique exons (covered segments) in this region
605     // store problematic introns for printing:
606     GIArray i_missed; //missed reference introns (not overlapped by any qry intron)
607     GIArray i_notp; //wrong ref introns (one or both ends not matching any qry intron)
608     //
609     GIArray i_qwrong; //totally wrong qry introns (not overlapped by any ref intron)
610     GIArray i_qnotp; //imperfect qry introns (may overlap but has no "perfect" match)
611    
612    
613     int qbases_all;
614     int rbases_all; //in fact, it's all ref bases overlapping any query loci
615     int in_rmrnas; //count of ALL ref mrnas and loci given for this region
616     int in_rloci; //not just those overlapping qry data
617     // this will keep track of total qry loci, mrnas and exons in an area
618     int total_superloci;
619     int total_qloci;
620     int total_qloci_alt; //total qloci with multiple transcripts
621    
622     int total_qmrnas;
623     int total_qichains; //multi exon mrnas
624     int total_qexons; //unique exons
625     int total_qmexons;
626     int total_qintrons; //unique introns
627     // these ref totals are in fact only limited to data from
628     // loci overlapping any of qry loci
629     int total_rmexons;
630     int total_richains; //multi exon mrnas
631     int total_rloci;
632     int total_rmrnas;
633     int total_rexons;
634     int total_rintrons; //unique introns
635    
636     //--- accuracy data after compared to ref loci:
637 gpertea 118 int locusQTP;
638     int locusTP;
639     int locusAQTP;
640 gpertea 20 int locusATP; // 1 if ichainATP + mrnaATP > 0
641     int locusFP;
642     int locusAFP;
643     int locusAFN;
644     int locusFN;
645     //---transcript level accuracy -- all exon coordinates should match (most stringent)
646     int mrnaTP; // number of qry mRNAs with perfect match with ref transcripts
647     int mrnaFP; // number of qry mRNAs with no perfect match with a ref transcript
648     int mrnaFN; // number of ref mRNAs in this region having no perfect match with a qry transcript
649     int mrnaATP;
650     int mrnaAFN;
651     int mrnaAFP;
652     //---intron level accuracy (comparing the ordered set of splice sites):
653     int ichainTP; // number of qry intron chains covering a reference intron chain
654     // (covering meaning that the ordered set of reference splice sites
655     // is the same with a ordered subset of the query splice sites)
656     int ichainFP; // number of qry intron chains not covering a reference intron chain
657     int ichainFN; // number of ref intron chains in this region not being covered by a reference intron chain
658     // same as above, but approximate -- allowing a 10bp distance error for splice sites
659     int ichainATP;
660     int ichainAFP;
661     int ichainAFN;
662     //---projected features ---
663     //---exon level accuracy:
664     int exonTP; //number of perfectly overlapping exons (true positives)
665     int exonFP; //number of exons of query with no perfect match with a reference exon
666     int exonFN; //number of exons of reference with no perfect match with a query exon
667     // same as the above but with acceptable approximation (10bp error window):
668     int exonATP;
669     int exonAFP;
670     int exonAFN;
671    
672     int intronTP; //number of perfectly overlapping introns (true positives)
673     int intronFP; //number of introns of query with no perfect match with a reference intron
674     int intronFN; //number of introns of reference with no perfect match with a query intron
675     // same as the above but with acceptable approximation (10bp error window):
676     int intronATP;
677     int intronAFP;
678     int intronAFN;
679    
680     //-- EGASP added these too:
681     int m_exons; //number of exons totally missed (not overlapped *at all* by any query exon)
682     int w_exons; //numer of totally wrong exons (query exons not overlapping *at all* any reference exon)
683     int m_introns; //number of introns totally missed (not overlapped *at all* by any query intron)
684     int w_introns; //numer of totally wrong introns (query introns not overlapping *at all* any reference intron)
685     int m_loci; //missed loci
686     int w_loci; //novel/wrong loci
687     //---base level accuracy
688     int baseTP; //number of overlapping bases
689     int baseFP; //number of qry bases not overlapping reference
690     int baseFN; //number of ref bases not overlapping qry
691     // sorted,free,unique sorted,unique
692     GSuperLocus(uint lstart=0,uint lend=0):qloci(true,false,false),rloci(true,false,false),
693     qmrnas(true,false,false), qmexons(true,false), quexons(true,false), qintrons(false),
694     rmrnas(true,false,false), rmexons(true,false), ruexons(true,false), rintrons(false),
695     i_missed(false),i_notp(false), i_qwrong(false), i_qnotp(false){
696     qfidx=-1;
697     start=lstart;
698     end=lend;
699     qbases_all=0;
700     rbases_all=0;
701     baseTP=0;baseFP=0;baseFN=0;
702 gpertea 118 locusTP=0;locusQTP=0; locusAQTP=0; locusATP=0;
703 gpertea 20 locusFP=0;locusAFP=0;locusAFN=0;
704     locusFN=0;
705     in_rmrnas=0;
706     in_rloci=0;
707     w_loci=0;
708     m_loci=0;
709     total_superloci=0;
710     mrnaTP=0;mrnaFP=0;mrnaFN=0;
711     mrnaATP=0;mrnaAFP=0;mrnaAFN=0;
712     ichainTP=0;ichainFP=0;ichainFN=0;
713     ichainATP=0;ichainAFP=0;ichainAFN=0;
714     exonTP=0;exonFP=0;exonFN=0;
715     exonATP=0;exonAFP=0;exonAFN=0;
716     intronTP=0;intronFP=0;intronFN=0;
717     intronATP=0;intronAFP=0;intronAFN=0;
718     total_rmexons=0;
719     total_qmexons=0;
720     total_qexons=0;total_qloci=0;total_qmrnas=0;
721     total_qloci_alt=0;
722     total_qintrons=0;total_qichains=0;
723     total_rexons=0;total_rloci=0;total_rmrnas=0;
724     total_rintrons=0;total_richains=0;
725     w_exons=0;
726     m_exons=0;
727     w_introns=0;
728     m_introns=0;
729     }
730     void addQlocus(GLocus& loc) {
731     if (start==0 || start>loc.start) start=loc.start;
732     if (end<loc.end) end=loc.end;
733     qloci.Add(&loc);
734     total_qloci++;
735     if (loc.ichains>0 && loc.mrnas.Count()>1)
736     total_qloci_alt++;
737     qmrnas.Add(loc.mrnas);
738     total_qmrnas+=loc.mrnas.Count();
739     total_qichains+=loc.ichains;
740     qmexons.Add(loc.mexons);
741     total_qmexons+=loc.mexons.Count();
742     quexons.Add(loc.uexons);
743     total_qexons+=loc.uexons.Count();
744     qintrons.Add(loc.introns);
745     total_qintrons+=loc.introns.Count();
746     }
747     void addRlocus(GLocus& loc) {
748     if (start==0 || start>loc.start) start=loc.start;
749     if (end<loc.end) end=loc.end;
750     rloci.Add(&loc);
751     total_rloci++;
752     rmrnas.Add(loc.mrnas);
753     total_rmrnas+=loc.mrnas.Count();
754     total_richains+=loc.ichains;
755     rmexons.Add(loc.mexons);
756     total_rmexons+=loc.mexons.Count();
757     ruexons.Add(loc.uexons);
758     total_rexons+=loc.uexons.Count();
759     rintrons.Add(loc.introns);
760     total_rintrons+=loc.introns.Count();
761     }
762    
763     void calcF() {
764     // base level
765     baseFP=qbases_all-baseTP;
766     baseFN=rbases_all-baseTP;
767     //exon level:
768     exonAFP=total_qexons-exonATP;
769     exonFP=total_qexons-exonTP;
770     exonAFN=total_rexons-exonATP;
771     exonFN=total_rexons-exonTP;
772     //intron stats
773     intronAFP=total_qintrons-intronATP;
774     intronFP=total_qintrons-intronTP;
775     intronAFN=total_rintrons-intronATP;
776     intronFN=total_rintrons-intronTP;
777    
778     // ichain and transcript levels:
779     ichainAFP=total_qichains-ichainATP;
780     ichainFP=total_qichains-ichainTP;
781     ichainAFN=total_richains-ichainATP;
782     ichainFN=total_richains-ichainTP;
783     mrnaFP=total_qmrnas-mrnaTP;
784     mrnaFN=total_rmrnas-mrnaTP;
785     mrnaAFP=total_qmrnas-mrnaATP;
786     mrnaAFN=total_rmrnas-mrnaATP;
787     // locus/gene level:
788 gpertea 118 locusAFP=total_qloci-locusAQTP;
789     locusFP=total_qloci-locusQTP;
790 gpertea 20 locusAFN=total_rloci-locusATP;
791     locusFN=total_rloci-locusTP;
792     }
793    
794     void addStats(GSuperLocus& s) {
795     in_rmrnas+=s.in_rmrnas;
796     in_rloci+=s.in_rloci;
797     baseTP+=s.baseTP;
798     exonTP+=s.exonTP;
799     exonATP+=s.exonATP;
800     intronTP+=s.intronTP;
801     intronATP+=s.intronATP;
802     ichainTP+=s.ichainTP;
803     ichainATP+=s.ichainATP;
804     mrnaTP+=s.mrnaTP;
805     mrnaATP+=s.mrnaATP;
806     locusTP+=s.locusTP;
807 gpertea 118 locusQTP+=s.locusQTP;
808 gpertea 20 locusATP+=s.locusATP;
809 gpertea 118 locusAQTP+=s.locusAQTP;
810 gpertea 20 m_exons+=s.m_exons;
811     w_exons+=s.w_exons;
812     m_introns+=s.m_introns;
813     w_introns+=s.w_introns;
814     if (s.total_superloci==0 && s.qloci.Count()>0) s.total_superloci=1;
815     total_superloci+=s.total_superloci;
816     qbases_all+=s.qbases_all;
817     rbases_all+=s.rbases_all;
818     m_loci+=s.m_loci;
819     w_loci+=s.w_loci;
820     total_qexons+=s.total_qexons;
821     total_qintrons+=s.total_qintrons;
822     total_qmexons+=s.total_qmexons;
823     total_rexons+=s.total_rexons;
824     total_rintrons+=s.total_rintrons;
825     total_rmexons+=s.total_rmexons;
826     total_qmrnas+=s.total_qmrnas;
827     total_qichains+=s.total_qichains;
828     total_rmrnas+=s.total_rmrnas;
829     total_richains+=s.total_richains;
830     total_qloci+=s.total_qloci;
831     total_qloci_alt+=s.total_qloci_alt;
832     total_rloci+=s.total_rloci;
833     }
834     };
835    
836     class GSeqData {
837     int gseq_id;
838     public:
839     const char* gseq_name;
840     GList<GffObj> refs_f; //forward strand mRNAs
841     GList<GffObj> refs_r; //reverse strand mRNAs
842     GList<GffObj> mrnas_f; //forward strand mRNAs
843     GList<GffObj> mrnas_r; //reverse strand mRNAs
844     GList<GLocus> loci_f; //forward strand loci
845     GList<GLocus> loci_r; //reverse strand loci
846     //--> the fields below are not used by reference data --
847     GList<GSuperLocus> gstats_f; //stats for forward strand superloci
848     GList<GSuperLocus> gstats_r; //stats for reverse strand superloci
849     GList<GLocus> nloci_f; //"novel" loci on forward strand
850     GList<GLocus> nloci_r; //"novel" loci on reverse strand
851     GList<GffObj> umrnas; //unknown orientation mrnas
852     GList<GLocus> nloci_u; //"novel" loci with no orientation found
853    
854     GList<CTData> tdata; //transcript data (uptr holder for all mrnas here)
855    
856     int get_gseqid() { return gseq_id; }
857    
858     //--<
859     GSeqData(int gid=-1):mrnas_f(true,true,false),mrnas_r(true,true,false),
860     loci_f(true,true,true),loci_r(true,true,true),
861     gstats_f(true,true,false),gstats_r(true,true,false),
862     nloci_f(true,false,true), nloci_r(true,false,true),
863     umrnas(true,true,false), nloci_u(true,true,true), tdata(false,true,false) {
864     gseq_id=gid;
865     if (gseq_id>=0)
866     gseq_name=GffObj::names->gseqs.getName(gseq_id);
867     }
868     bool operator==(GSeqData& d){
869     return (gseq_id==d.gseq_id);
870     }
871     bool operator>(GSeqData& d){
872     return (gseq_id>d.gseq_id);
873     }
874     bool operator<(GSeqData& d){
875     return (gseq_id<d.gseq_id);
876     }
877     };
878    
879    
880     // a group of qry loci and a transcript cluster for a single qry dataset
881     class GQCluster : public GList<GffObj> {
882     public:
883     GffObj* mrna_maxcov; //transcript with maximum coverage (for largest transcript)
884     GffObj* mrna_maxscore; //transcript with maximum gscore ( = major isoform for Cufflinks)
885     uint start;
886     uint end;
887     GList<GLocus> qloci;
888     //GCluster cl; //just a more compact way of keeping all transcripts in these loci
889     GQCluster(GList<GLocus>* loci=NULL):GList<GffObj>(true,false,false),
890     qloci(true,false,false) {
891     mrna_maxcov=NULL;
892     mrna_maxscore=NULL;
893     start=0;
894     end=0;
895     if (loci!=NULL) {
896     qloci.Add(*loci);
897     for (int i=0;i<loci->Count();i++) {
898     addLocus(loci->Get(i),false);
899     }
900     }
901     }
902     void addLocus(GLocus* loc, bool toLoci=true) {
903     //check so we don't add locus duplicates
904     if (toLoci) {
905     for (int i=0;i<qloci.Count();i++) {
906     if (loc==qloci[i]) return;
907     }
908     qloci.Add(loc);
909     }
910     for (int m=0;m<loc->mrnas.Count();m++) {
911     GffObj* mrna=loc->mrnas[m];
912     Add(mrna);
913     if (start==0 || start>mrna->start) start=mrna->start;
914     if (end<mrna->end) end=mrna->end;
915     if (mrna_maxcov==NULL || mrna_maxcov->covlen<mrna->covlen) mrna_maxcov=mrna;
916     if (mrna_maxscore==NULL || mrna_maxscore->gscore<mrna->gscore) mrna_maxscore=mrna;
917     }
918     }
919     };
920    
921     //track a set of clustered qloci across multiple qry datasets
922     // the qloci in qcls[] overlap but not necessarily at exon level
923     // (so there can be multiple genes here in fact)
924     class GTrackLocus:public GSeg {
925     public:
926     char strand;
927     bool hasQloci;
928     //GLocus* rloc; //corresponding reference locus, if available
929     GList<GLocus> rloci; //ref loci found overlapping this region
930     GQCluster* qcls[MAX_QFILES]; //all qloci for this superlocus, grouped by dataset
931     GTrackLocus(GLocus* qloc=NULL, int q=-1):GSeg(0,0),rloci(true,false,true) {
932     strand='.';
933     for (int i=0;i<MAX_QFILES;i++) qcls[i]=NULL;
934     if (qloc!=NULL) addQLocus(qloc,q);
935     }
936    
937     void addRLocus(GLocus* rl) {
938     if (rl==NULL) return;
939     if (rl->qfidx>=0)
940     GError("Error: GTrackLocus::addRLocus called with a query locus (set# %d)\n",
941     rl->qfidx+1);
942     if (strand=='.') strand=rl->mrna_maxcov->strand;
943     if (start==0 || start>rl->start) start=rl->start;
944     if (end==0 || end<rl->end) end=rl->end;
945     rl->t_ptr=this;
946     rloci.Add(rl);
947     }
948    
949     void addQLocus(GLocus* loc, int q=-1) { //adding qry locus
950     if (loc==NULL) return;
951     if (strand=='.' && loc->mrna_maxcov->strand!='.')
952     strand=loc->mrna_maxcov->strand;
953     if (loc->qfidx<0 && q<0)
954     GError("Error at GTrackLocus::addQLocus(): locus.qfidx not set and index not given!\n");
955     if (q>=0) loc->qfidx=q;
956     else q=loc->qfidx;
957     if (start==0 || start>loc->start) start=loc->start;
958     if (end==0 || end<loc->end) end=loc->end;
959     if (qcls[q]==NULL) qcls[q]=new GQCluster();
960     hasQloci=true;
961     loc->t_ptr = this;
962     qcls[q]->addLocus(loc);
963     }
964    
965     bool add_Locus(GLocus* loc) {
966     if (start==0 || overlap(*loc)) { //simple range overlap, not exon overlap
967     if (loc->qfidx<0) addRLocus(loc);
968     else addQLocus(loc);
969     return true;
970     }
971     return false;
972     }
973    
974    
975     void addQCl(int q, GQCluster* qcl, GLocus* lnkloc) {
976     for (int i=0;i<qcl->qloci.Count();i++) {
977     GLocus* loc=qcl->qloci[i];
978     if (loc==lnkloc || loc->t_ptr==this) continue;
979     hasQloci=true;
980     loc->t_ptr=this;
981     qcls[q]->addLocus(loc);
982     }
983     }
984    
985     void addMerge(GTrackLocus* loctrack, int qcount, GLocus* lnkloc) {
986     if (loctrack==NULL) return;
987     //merge qloci
988     for (int q=0; q < qcount; q++) {
989     if (qcls[q]==NULL) {
990     if (loctrack->qcls[q]!=NULL) {
991     qcls[q]=loctrack->qcls[q];
992     loctrack->qcls[q]=NULL; //just move pointer here
993     //set all t_ptr pointers for moved loci
994     for (int ql = 0; ql < qcls[q]->qloci.Count(); ql++) {
995     qcls[q]->qloci[ql]->t_ptr=this;
996     }
997     hasQloci=true;
998     }
999     }
1000     else //existing qloci at q
1001     if (loctrack->qcls[q]!=NULL) { //merge elements
1002     addQCl(q, loctrack->qcls[q], lnkloc);
1003     }
1004     }//for each qset
1005     //merge rloci, if any
1006     if (loctrack->rloci.Count()>0) {
1007     for (int l=0;l<loctrack->rloci.Count();l++) {
1008     if (loctrack->rloci[l]!=lnkloc && loctrack->rloci[l]->t_ptr!=this) {
1009     rloci.Add(loctrack->rloci[l]);
1010     loctrack->rloci[l]->t_ptr=this;
1011     }
1012     }
1013     }
1014     if (loctrack->start<start) start=loctrack->start;
1015     if (loctrack->end>end) end=loctrack->end;
1016     if (strand=='.' && loctrack->strand!='.') strand=loctrack->strand;
1017     }
1018    
1019     /*
1020     void add_QLoci(GList<GLocus>* loci, int q, GLocus& r) {
1021     // only add loci overlapping given refloc
1022     //rloc=&r;
1023     if (loci==NULL) return;
1024     for (int i=0;i<loci->Count();i++) {
1025     GLocus* loc=loci->Get(i);
1026     // if (!loc->exonOverlap(r)) continue; //do we really needed exon overlap?
1027     if (!loc->overlap(r)) continue;
1028     if (start==0 || start>loc->start) start=loc->start;
1029     if (end==0 || end<loc->end) end=loc->end;
1030     loc->t_ptr=this;
1031     loc->qfidx=q;
1032     if (qcls[q]==NULL) qcls[q]=new GQCluster();
1033     qcls[q]->addLocus(loc);
1034     }
1035     strand=r.mrnas[0]->strand;
1036     }
1037     */
1038     ~GTrackLocus() {
1039     for (int q=0;q<MAX_QFILES;q++)
1040     if (qcls[q]!=NULL) { delete qcls[q]; qcls[q]=NULL; }
1041     }
1042    
1043     GQCluster* operator[](int q) {
1044     if (q<0 || q>=MAX_QFILES)
1045     GError("Error: qfidx index out of bounds (%d) for GTrackLocus!\n",q);
1046     return qcls[q];
1047     }
1048     };
1049    
1050     class GXConsensus:public GSeg {
1051     public:
1052     static int count;
1053     int id; //XConsensus ID
1054     int tss_id; //group id for those xconsensi with shared first exon
1055     int p_id; //group id for those xconsensi with "similar" protein
1056     GffObj* tcons; //longest transcript to represent the combined "consensus" structure
1057     GffObj* ref; //overlapping reference transcript
1058     char refcode; // the code for ref relationship (like in the tracking file)
1059     char* aa;
1060     int aalen;
1061     GXConsensus* contained; //if contained into another GXConsensus
1062     //list of ichain-matching query (cufflinks) transcripts that contributed to this consensus
1063     GList<GffObj> qchain;
1064     GXConsensus(GffObj* c, CEqList* qlst, GffObj* r=NULL, char rcode=0)
1065     :qchain(false,false,false) {
1066     ref=r;
1067     refcode=rcode;
1068     tcons=c;
1069     if (qlst!=NULL) qchain.Add(*((GList<GffObj>*)qlst));
1070     else qchain.Add(c);
1071     count++;
1072     tss_id=0;
1073     p_id=0;
1074     aalen=0;
1075     id=count;
1076     aa=NULL;
1077     start=tcons->start;
1078     end=tcons->end;
1079     contained=NULL;
1080     }
1081     ~GXConsensus() {
1082     if (aa!=NULL) GFREE(aa);
1083     }
1084     };
1085    
1086     class GXLocus:public GSeg {
1087     public:
1088     int id;
1089     int num_mtcons; //number of multi-exon "consensus" transcripts in this locus
1090     char strand;
1091     GList<GLocus> rloci; //list of ref loci overlapping any of the mexons
1092     GList<GLocus> qloci; //loci from all qry datasets that have overlapping exons with this region
1093     GArray<GSeg> mexons; //list of merged exonic regions for this locus
1094     GList<GXConsensus> tcons;
1095     GXLocus(GLocus* lfirst=NULL):GSeg(0,0),
1096     rloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1097     qloci((GCompareProc*)cmpByPtr, (GFreeProc*)NULL, true),
1098     mexons(true,true), tcons(true,true,false) {
1099     strand='.';
1100     num_mtcons=0;
1101     if (lfirst!=NULL) {
1102     add_Locus(lfirst);
1103     }
1104     id=0;
1105     }
1106    
1107     bool add_Locus(GLocus* loc) {
1108     if (mexons.Count()>0 && (end<loc->start || start > loc->end))
1109     return false; //no chance for overlapping exons
1110     if (mexons.Count()==0) {
1111     mexons.Add(loc->mexons);
1112     start=loc->start;
1113     end=loc->end;
1114     if (loc->qfidx<0) rloci.Add(loc);
1115     else qloci.Add(loc);
1116     strand=loc->mrna_maxcov->strand;
1117     loc->xlocus=this;
1118     return true;
1119     }
1120     int f=0;
1121     if (loc->qfidx<0) {
1122     if (rloci.Found(loc,f)) return false;
1123     }
1124     else if (qloci.Found(loc,f)) return false;
1125    
1126     // -- merge mexons
1127     GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
1128     int i=0; //index of first mexons with a merge
1129     int j=0; //index current mrna exon
1130     while (i<mexons.Count() && j<loc->mexons.Count()) {
1131     uint istart=mexons[i].start;
1132     uint iend=mexons[i].end;
1133     uint jstart=loc->mexons[j].start;
1134     uint jend=loc->mexons[j].end;
1135     if (iend<jstart) { i++; continue; }
1136     if (jend<istart) { j++; continue; }
1137     //if (mexons[i].overlap(jstart, jend)) {
1138     //exon overlap was found :
1139     ovlexons.Add(j);
1140     //extend mexons[i] as needed
1141     if (jstart<istart) mexons[i].start=jstart;
1142     if (jend>iend) { //mexons[i] end extend
1143     mexons[i].end=jend;
1144     //now this could overlap the next mexon(s), so we have to merge them all
1145     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1146     uint nextend=mexons[i+1].end;
1147     mexons.Delete(i+1);
1148     if (nextend>mexons[i].end) {
1149     mexons[i].end=nextend;
1150     break; //no need to check next mexons
1151     }
1152     } //while next mexons merge
1153     } // mexons[i] end extend
1154     // } //exon overlap
1155     j++; //check the next locus.mexon
1156     }//while mexons
1157     if (ovlexons.Count()==0) return false;
1158     if (strand=='.' && loc->mrna_maxcov->strand!='.')
1159     strand=loc->mrna_maxcov->strand;
1160     //have exon overlap:
1161     //-- add the rest of the non-overlapping mexons:
1162     GSeg seg;
1163     for (int i=0;i<loc->mexons.Count();i++) {
1164     seg.start=loc->mexons[i].start;
1165     seg.end=loc->mexons[i].end;
1166     if (!ovlexons.Exists(i)) mexons.Add(seg);
1167     }
1168     // -- adjust start/end as needed
1169     if (start>loc->start) start=loc->start;
1170     if (end<loc->end) end=loc->end;
1171     loc->xlocus=this;
1172     if (loc->qfidx<0) rloci.Add(loc);
1173     else qloci.Add(loc);
1174     return true;
1175     }
1176    
1177     void addMerge(GXLocus& oxloc) {
1178     GArray<int> ovlexons(true,true); //list of oxloc.mexons indexes overlapping existing mexons
1179     int i=0; //index of first mexons with a merge
1180     int j=0; //index current mrna exon
1181     while (i<mexons.Count() && j<oxloc.mexons.Count()) {
1182     uint istart=mexons[i].start;
1183     uint iend=mexons[i].end;
1184     uint jstart=oxloc.mexons[j].start;
1185     uint jend=oxloc.mexons[j].end;
1186     if (iend<jstart) { i++; continue; }
1187     if (jend<istart) { j++; continue; }
1188     //if (mexons[i].overlap(jstart, jend)) {
1189     //exon overlap was found :
1190     ovlexons.Add(j);
1191     //extend mexons[i] as needed
1192     if (jstart<istart) mexons[i].start=jstart;
1193     if (jend>iend) { //mexons[i] end extend
1194     mexons[i].end=jend;
1195     //now this could overlap the next mexon(s), so we have to merge them all
1196     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
1197     uint nextend=mexons[i+1].end;
1198     mexons.Delete(i+1);
1199     if (nextend>mexons[i].end) {
1200     mexons[i].end=nextend;
1201     break; //no need to check next mexons
1202     }
1203     } //while next mexons merge
1204     } // mexons[i] end extend
1205     // } //exon overlap
1206     j++; //check the next oxloc.mexon
1207     }
1208     if (ovlexons.Count()==0) {
1209     GError("Error: attempt to merge GXLoci with non-overlapping exons!\n");
1210     }
1211     //-- add the rest of the non-overlapping mexons:
1212     GSeg seg;
1213     for (int i=0;i<oxloc.mexons.Count();i++) {
1214     seg.start=oxloc.mexons[i].start;
1215     seg.end=oxloc.mexons[i].end;
1216     if (!ovlexons.Exists(i)) mexons.Add(seg);
1217     }
1218     if (start>oxloc.start) start=oxloc.start;
1219     if (end<oxloc.end) end=oxloc.end;
1220     if (strand=='.') strand=oxloc.strand;
1221     //-- steal all qloci and rloci
1222     for (int i=0;i<oxloc.qloci.Count();i++) {
1223     if (oxloc.qloci[i]->xlocus==this) continue;
1224     qloci.Add(oxloc.qloci[i]);
1225     oxloc.qloci[i]->xlocus=this;
1226     }
1227     for (int i=0;i<oxloc.rloci.Count();i++) {
1228     if (oxloc.rloci[i]->xlocus==this) continue;
1229     rloci.Add(oxloc.rloci[i]);
1230     oxloc.rloci[i]->xlocus=this;
1231     }
1232     } //::addMerge()
1233    
1234    
1235     void checkContainment() {
1236     //checking containment
1237     for (int j=0;j<tcons.Count()-1;j++) {
1238     GXConsensus* t=tcons[j];
1239     for (int i=j+1;i<tcons.Count();i++) {
1240     if (tcons[i]->contained!=NULL && t->tcons->exons.Count()>1) continue; //will check the container later anyway
1241     int c_status=checkXConsContain(t->tcons, tcons[i]->tcons);
1242     if (c_status==0) continue; //no containment relationship between t and tcons[i]
1243     if (c_status>0) { //t is a container for tcons[i]
1244     tcons[i]->contained=t;
1245     }
1246     else { //contained into exising XCons
1247     t->contained=tcons[i];
1248     break;
1249     }
1250     }
1251     }
1252     }
1253    
1254     int checkXConsContain(GffObj* a, GffObj* b) {
1255     // returns 1 if a is the container of b
1256     // -1 if a is contained in b
1257     // 0 if no
1258     if (a->end<b->start || b->end<a->start) return 0;
1259     if (a->exons.Count()==b->exons.Count()) {
1260     if (a->exons.Count()>1) return 0; //same number of exons - no containment possible
1261     //because equivalence was already tested
1262     else { //single exon containment testing
1263     //this is fuzzy and messy (end result may vary depending on the testing order)
1264     int ovlen=a->exons[0]->overlapLen(b->exons[0]);
1265     int minlen=GMIN(a->covlen, b->covlen);
1266     if (ovlen>=minlen*0.8) { //if at least 80% of the shorter one is covered, it is contained
1267     return ((a->covlen>b->covlen) ? 1 : -1);
1268     }
1269     else return 0;
1270     //if (a->start<=b->start+10 && a->end+10>=b->end) return 1;
1271     // else { if (b->start<=a->start+10 && b->end+10>=a->end) return -1;
1272     // else return 0;
1273     //}
1274     }
1275     }
1276     //different number of exons:
1277     if (a->exons.Count()>b->exons.Count()) return t_contains(*a, *b) ? 1:0;
1278     else return t_contains(*b, *a) ? -1 : 0;
1279     }
1280    
1281     void addXCons(GXConsensus* t) {
1282     tcons.Add(t);
1283     }
1284    
1285     }; //GXLocus
1286    
1287    
1288    
1289     int parse_mRNAs(GfList& mrnas,
1290     GList<GSeqData>& glstdata,
1291     bool is_ref_set=true,
1292     bool check_for_dups=false,
1293     int qfidx=-1, bool only_multiexon=false);
1294    
1295     //reading a mRNAs from a gff file and grouping them into loci
1296     void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data=NULL,
1297     bool check_for_dups=false, int qfidx=-1, const char* fname=NULL,
1298     bool only_multiexon=false);
1299    
1300     void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs=true);
1301     void sort_GSeqs_byName(GList<GSeqData>& seqdata);
1302    
1303    
1304 gpertea 54 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl=false, bool contain_only=false);
1305 gpertea 20
1306     //use qsearch to "position" a given coordinate x within a list of transcripts sorted
1307     //by their start (lowest) coordinate; the returned int is the list index of the
1308     //closest GffObj starting just *ABOVE* coordinate x
1309     //Convention: returns -1 if there is no such GffObj (i.e. last GffObj start <= x)
1310     int qsearch_mrnas(uint x, GList<GffObj>& mrnas);
1311     int qsearch_loci(uint x, GList<GLocus>& segs); // same as above, but for GSeg lists
1312    
1313     GSeqData* getRefData(int gid, GList<GSeqData>& ref_data); //returns reference GSeqData for a specific genomic sequence
1314    
1315     #endif

Properties

Name Value
svn:executable *