ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/gtf_tracking.cpp
Revision: 54
Committed: Thu Sep 8 05:46:16 2011 UTC (9 years, 1 month ago) by gpertea
File size: 24079 byte(s)
Log Message:
debug RPS20P29 ENSG00000241828 not collapsing

Line User Rev File contents
1 gpertea 20 /*
2     * gtf_tracking.cpp
3     * cufflinks
4     *
5     * Created by Cole Trapnell on 9/5/09.
6     * Copyright 2009 Geo Pertea. All rights reserved.
7     *
8     */
9    
10     #include "gtf_tracking.h"
11    
12     bool gtf_tracking_verbose = false;
13     bool gtf_tracking_largeScale=false; //many input Cufflinks files processed at once by cuffcompare, discard exon attributes
14    
15     int GXConsensus::count=0;
16    
17     char* getGSeqName(int gseq_id) {
18     return GffObj::names->gseqs.getName(gseq_id);
19     }
20    
21     int cmpByPtr(const pointer p1, const pointer p2) {
22     return (p1>p2) ? 1: ((p1==p2)? 0 : -1);
23     }
24    
25     bool betterRef(GffObj* a, GffObj* b) {
26     if (a==NULL || b==NULL) return (a!=NULL);
27     if (a->exons.Count()!=b->exons.Count()) return (a->exons.Count()>b->exons.Count());
28     if (a->hasCDS() && !b->hasCDS())
29     return true;
30     else {
31     if (b->hasCDS() && !a->hasCDS()) return false;
32     return (a->covlen>b->covlen);
33     }
34     }
35    
36     GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) {
37     //mrnas MUST be sorted by start coordinate
38     int ovlen=0;
39     dupidx=-1;
40     if (mrnas.Count()==0) return NULL;
41     int nidx=qsearch_mrnas(m->end, mrnas);
42     if (nidx==0) return NULL;
43     if (nidx==-1) nidx=mrnas.Count();//all can overlap
44     for (int i=nidx-1;i>=0;i--) {
45     GffObj& omrna=*mrnas[i];
46     if (m->start>omrna.end) {
47     if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
48     continue;
49     }
50     if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
51     //locus overlap here:
52 gpertea 54 if (tMatch(*m, omrna, ovlen, false, true)) {
53 gpertea 20 dupidx=i;
54     return mrnas[i];
55     }
56     }
57     return NULL;
58     }
59    
60    
61     bool intronRedundant(GffObj& ti, GffObj& tj) {
62     //two transcripts are "intron redundant" iff one transcript's intron chain
63     // is a sub-chain of the other's
64     int imax=ti.exons.Count()-1;
65     int jmax=tj.exons.Count()-1;
66     if (imax==0 || jmax==0) return false; //don't deal with single-exon transcripts here
67     if (ti.exons[imax]->start<tj.exons[0]->end ||
68     tj.exons[jmax]->start<ti.exons[0]->end )
69     return false; //intron chains do not overlap at all
70    
71     uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
72     int i=1; //exon idx to the right of the current intron of ti
73     int j=1; //exon idx to the right of the current intron of tj
74     //find the first intron overlap:
75     while (i<=imax && j<=jmax) {
76     eistart=ti.exons[i-1]->end;
77     eiend=ti.exons[i]->start;
78     ejstart=tj.exons[j-1]->end;
79     ejend=tj.exons[j]->start;
80     if (ejend<eistart) { j++; continue; }
81     if (eiend<ejstart) { i++; continue; }
82     //we found an intron overlap
83     break;
84     }
85     if ((i>1 && j>1) || i>imax || j>jmax) {
86     return false; //either no intron overlaps found at all
87     //or it's not the first intron for at least one of the transcripts
88     }
89     if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match
90 gpertea 49 //we have the first matching intron on the left
91 gpertea 20 if (j>i) {
92     //i==1, ti's start must not conflict with the previous intron of tj
93     if (ti.start<tj.exons[j-1]->start) return false;
94     //so i's first intron starts AFTER j's first intron
95     // then j must contain i, so i's last intron must end with or before j's last intron
96     if (ti.exons[imax]->start>tj.exons[jmax]->start) return false;
97     //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
98     }
99     else if (i>j) {
100     //j==1, tj's start must not conflict with the previous intron of ti
101     if (tj.start<ti.exons[i-1]->start) return false;
102     //so j's intron chain starts AFTER i's
103     // then i must contain j, so j's last intron must end with or before j's last intron
104     if (tj.exons[jmax]->start>ti.exons[imax]->start) return false;
105     //comment out the line above for just "intronCompatible()" check
106     }
107     //now check if the rest of the introns overlap, in the same sequence
108     i++;
109     j++;
110     while (i<=imax && j<=jmax) {
111     if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
112     ti.exons[i]->start!=tj.exons[j]->start) return false;
113     i++;
114     j++;
115     }
116     i--;
117     j--;
118     if (i==imax && j<jmax) {
119     // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
120     if (ti.end>tj.exons[j]->end) return false;
121     }
122     else if (j==jmax && i<imax) {
123     if (tj.end>ti.exons[i]->end) return false;
124     }
125     return true;
126     }
127    
128     bool t_contains(GffObj& a, GffObj& b) {
129     //returns true if b's intron chain (or single exon) is included in a
130     if (b.exons.Count()>=a.exons.Count()) return false;
131     if (b.exons.Count()==1) {
132     //check if b is contained in any of a's exons:
133     for (int i=0;i<a.exons.Count();i++) {
134     if (b.start>=a.exons[i]->start && b.end<=a.exons[i]->end) return true;
135     }
136     return false;
137     }
138     if (intronRedundant(a,b)) {
139     //intronRedudant allows b's initial/terminal exons to extend beyond a's boundaries
140     //but we don't allow this kind of behavior here
141     return (b.start>=a.start && b.end<=a.end);
142     }
143     else return false;
144     }
145    
146     int is_Redundant(GffObj*m, GList<GffObj>* mrnas) {
147     //first locate the list index of the mrna starting just ABOVE
148     //the end of this mrna
149     if (mrnas->Count()==0) return -1;
150     int nidx=qsearch_mrnas(m->end, *mrnas);
151     if (nidx==0) return -1;
152     if (nidx==-1) nidx=mrnas->Count();//all can overlap
153     for (int i=nidx-1;i>=0;i--) {
154     GffObj& omrna=*mrnas->Get(i);
155     if (m->start>omrna.end) {
156     if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
157     continue;
158     }
159     if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
160    
161     if (intronRedundant(*m, omrna)) return i;
162     }
163     return -1;
164     }
165    
166     bool t_dominates(GffObj* a, GffObj* b) {
167     // for redundant / intron compatible transfrags:
168     // returns true if a "dominates" b, i.e. a has more exons or is longer
169     if (a->exons.Count()==b->exons.Count())
170     return (a->covlen>b->covlen);
171     else return (a->exons.Count()>b->exons.Count());
172     }
173    
174     bool betterDupRef(GffObj* a, GffObj* b) {
175 gpertea 54 if (a->exons.Count()!=b->exons.Count())
176     return (a->exons.Count()>b->exons.Count());
177     if (a->hasCDS()!=b->hasCDS())
178     return (a->hasCDS()>b->hasCDS());
179     //for annotation purposes, it's more important to keep the
180     //longer transcript, instead of the one that was loaded first
181     if (a->covlen != b->covlen)
182     return (a->covlen > b->covlen);
183     else return (a->track_id < b->track_id);
184 gpertea 20 }
185    
186     int parse_mRNAs(GfList& mrnas,
187     GList<GSeqData>& glstdata,
188     bool is_ref_set,
189     bool check_for_dups,
190     int qfidx, bool only_multiexon) {
191     int refdiscarded=0; //ref duplicates discarded
192     int tredundant=0; //cufflinks redundant transcripts discarded
193     for (int k=0;k<mrnas.Count();k++) {
194     GffObj* m=mrnas[k];
195     int i=-1;
196     GSeqData f(m->gseq_id);
197     GSeqData* gdata=NULL;
198     uint tlen=m->len();
199     if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
200     if (gtf_tracking_verbose)
201     GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
202     continue;
203     }
204     if (only_multiexon && m->exons.Count()<2) {
205     continue;
206     }
207     //GStr feature(m->getFeatureName());
208     //feature.lower();
209     //bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
210     //if (m->exons.Count()==0 && gene_or_locus) {
211     if (m->isDiscarded()) {
212     //discard generic "gene" or "locus" features with no other detailed subfeatures
213     //if (gtf_tracking_verbose)
214     // GMessage("Warning: discarding GFF generic gene/locus container %s\n",m->getID());
215     continue;
216     }
217     if (m->exons.Count()==0) {
218     //if (gtf_tracking_verbose)
219     // GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
220     m->addExon(m->start,m->end);
221     }
222     if (glstdata.Found(&f,i)) gdata=glstdata[i];
223     else {
224     gdata=new GSeqData(m->gseq_id);
225     glstdata.Add(gdata);
226     }
227    
228     double fpkm=0;
229     double cov=0;
230     double conf_hi=0;
231     double conf_lo=0;
232    
233     GList<GffObj>* target_mrnas=NULL;
234     if (is_ref_set) { //-- ref transcripts
235     if (m->strand=='.') {
236     //unknown strand - discard from reference set (!)
237     continue;
238     }
239     target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
240     if (check_for_dups) {
241     //check all gdata->mrnas_r (ref_data) for duplicate ref transcripts
242     int rpidx=-1;
243     GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
244     if (rp!=NULL) { //duplicate found
245 gpertea 54 //discard one of them
246 gpertea 20 //but let's keep the gene_name if present
247 gpertea 54 //DEBUG:
248     GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
249 gpertea 20 refdiscarded++;
250     if (betterDupRef(rp, m)) {
251     if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
252     rp->setGeneName(m->getGeneName());
253     }
254     continue;
255     }
256     else {
257     if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
258     m->setGeneName(rp->getGeneName());
259     }
260     ((CTData*)(rp->uptr))->mrna=NULL;
261     rp->isUsed(false);
262     target_mrnas->Forget(rpidx);
263     target_mrnas->Delete(rpidx);
264     }
265     }
266     } //check for duplicate ref transcripts
267     } //ref transcripts
268     else { //-- transfrags
269     if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
270     else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
271     else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
272     if (check_for_dups) { //check for redundancy
273     // check if there is a redundancy between this and another already loaded Cufflinks transcript
274     int cidx = is_Redundant(m, target_mrnas);
275     if (cidx>=0) {
276     //always discard the redundant transcript with the fewer exons OR shorter
277     if (t_dominates(target_mrnas->Get(cidx),m)) {
278     //new transcript is shorter, discard it
279     continue;
280     }
281     else {
282     //discard the older transfrag
283     ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
284     target_mrnas->Get(cidx)->isUsed(false);
285     target_mrnas->Forget(cidx);
286     target_mrnas->Delete(cidx);
287     //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
288     }
289     tredundant++;
290     }
291     }// redundant transfrag check
292     if (m->gscore==0.0)
293     m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
294     //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
295     const char* expr = m->getAttr("FPKM");
296     if (expr!=NULL) {
297     if (expr[0]=='"') expr++;
298     fpkm=strtod(expr, NULL);
299     } else { //backward compatibility: read RPKM if FPKM not found
300     //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
301     expr=m->getAttr("RPKM");
302     if (expr!=NULL) {
303     if (expr[0]=='"') expr++;
304     fpkm=strtod(expr, NULL);
305     }
306     }
307     //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
308     const char* scov=m->getAttr("cov");
309     if (scov!=NULL) {
310     if (scov[0]=='"') scov++;
311     cov=strtod(scov, NULL);
312     }
313     //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
314     const char* sconf_hi=m->getAttr("conf_hi");
315     if (sconf_hi!=NULL){
316     if (sconf_hi[0]=='"') sconf_hi++;
317     conf_hi=strtod(sconf_hi, NULL);
318     }
319     //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
320     const char* sconf_lo=m->getAttr("conf_lo");
321     if (sconf_lo!=NULL) {
322     if (sconf_lo[0]=='"') sconf_lo++;
323     conf_lo=strtod(sconf_lo, NULL);
324     }
325     } //Cufflinks transfrags
326     target_mrnas->Add(m);
327     m->isUsed(true);
328     CTData* mdata=new CTData(m);
329     mdata->qset=qfidx;
330     gdata->tdata.Add(mdata);
331     if (!is_ref_set) {
332     // Cufflinks - attributes parsing
333     mdata->FPKM=fpkm;
334     mdata->cov=cov;
335     mdata->conf_hi=conf_hi;
336     mdata->conf_lo=conf_lo;
337     }
338     }//for each mrna read
339     //if (mrna_deleted>0)
340     // mrnas.Pack();
341    
342     return (is_ref_set ? refdiscarded : tredundant);
343     }
344    
345 gpertea 54 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) {
346 gpertea 20 //strict intron chain match, or single-exon perfect match
347     int imax=a.exons.Count()-1;
348     int jmax=b.exons.Count()-1;
349     ovlen=0;
350     if (imax!=jmax) return false; //different number of introns
351     if (imax==0) { //single-exon mRNAs
352 gpertea 54 if (contain_only) {
353     return ((a.start>=b.start && a.end<=b.end) ||
354     (b.start>=a.start && b.end<=a.end));
355     }
356     if (fuzzunspl) {
357 gpertea 20 //fuzz match for single-exon transfrags:
358 gpertea 54 // it's a match if they overlap at least 80% of shortest one
359 gpertea 20 ovlen=a.exons[0]->overlapLen(b.exons[0]);
360     int maxlen=GMAX(a.covlen,b.covlen);
361     return (ovlen>=maxlen*0.8);
362     }
363     else {
364 gpertea 54 //only exact match, or strictly contained
365 gpertea 20 ovlen=a.covlen;
366     return (a.exons[0]->start==b.exons[0]->start &&
367     a.exons[0]->end==b.exons[0]->end);
368     }
369     }
370     if ( a.exons[imax]->start<b.exons[0]->end ||
371     b.exons[jmax]->start<a.exons[0]->end )
372     return false; //intron chains do not overlap at all
373     //check intron overlaps
374     ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
375     ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
376     for (int i=1;i<=imax;i++) {
377     if (i<imax) ovlen+=a.exons[i]->len();
378     if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
379     (a.exons[i]->start!=b.exons[i]->start)) {
380 gpertea 54 return false; //intron mismatch
381 gpertea 20 }
382     }
383 gpertea 54 if (contain_only)
384     return ((a.start>=b.start && a.end<=b.end) ||
385     (b.start>=a.start && b.end<=a.end));
386     else return true;
387 gpertea 20 }
388    
389 gpertea 54
390 gpertea 20 void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
391     //mrnas sorted by start coordinate
392     //and so are the loci
393     //int rdisc=0;
394     for (int t=0;t<mrnas.Count();t++) {
395     GArray<int> mrgloci(false);
396     GffObj* mrna=mrnas[t];
397     int lfound=0; //count of parent loci
398     /*for (int l=0;l<loci.Count();l++) {
399     if (loci[l]->end<mrna->exons.First()->start) continue;
400     if (loci[l]->start>mrna->exons.Last()->end) break; */
401     for (int l=loci.Count()-1;l>=0;l--) {
402     if (loci[l]->end<mrna->exons.First()->start) {
403     if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break;
404     continue;
405     }
406     if (loci[l]->start>mrna->exons.Last()->end) continue;
407     //here we have mrna overlapping loci[l]
408     if (loci[l]->add_mRNA(mrna)) {
409     //a parent locus was found
410     lfound++;
411     mrgloci.Add(l); //locus indices added here, in decreasing order
412     }
413     }//loci loop
414     //if (lfound<0) continue; //mrna was a ref duplicate, skip it
415     if (lfound==0) {
416     //create a locus with only this mRNA
417     loci.Add(new GLocus(mrna, qfidx));
418     }
419     else if (lfound>1) {
420     //more than one locus found parenting this mRNA, merge loci
421     lfound--;
422     for (int l=0;l<lfound;l++) {
423     int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
424     loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna);
425     loci.Delete(mlidx);
426     }
427     }
428     }//mrnas loop
429     //if (rdisc>0) mrnas.Pack();
430     //return rdisc;
431     }
432    
433     int fix_umrnas(GSeqData& seqdata, GSeqData* rdata, FILE* fdis=NULL) {
434     //attempt to find the strand for seqdata.umrnas
435     //based on a) overlaps with oriented reference mRNAs if present
436     // b) overlaps with oriented mRNAs from the same input set
437     if (rdata!=NULL) { //we have reference mrnas
438     for (int i=0;i<rdata->mrnas_f.Count();i++) {
439     for (int j=0;j<seqdata.umrnas.Count();j++) {
440     if (rdata->mrnas_f[i]->gseq_id!=seqdata.umrnas[j]->gseq_id) continue;
441     if (seqdata.umrnas[j]->strand!='.') continue;
442     uint ustart=seqdata.umrnas[j]->exons.First()->start;
443     uint uend=seqdata.umrnas[j]->exons.Last()->end;
444     uint rstart=rdata->mrnas_f[i]->exons.First()->start;
445     uint rend=rdata->mrnas_f[i]->exons.Last()->end;
446     if (ustart>rend) break;
447     if (rstart>uend) continue;
448     if (rdata->mrnas_f[i]->exonOverlap(ustart,uend)) {
449     seqdata.umrnas[j]->strand='+';
450     }
451     else { //within intron
452     //if (seqdata.umrnas[j]->ulink==NULL ||
453     // seqdata.umrnas[j]->ulink->covlen<rdata->mrnas_f[i]->covlen) {
454     CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
455     mdata->addOvl('i',rdata->mrnas_f[i]);
456     }
457     }
458     }
459     for (int i=0;i<rdata->mrnas_r.Count();i++) {
460     for (int j=0;j<seqdata.umrnas.Count();j++) {
461     if (seqdata.umrnas[j]->strand) continue;
462     uint ustart=seqdata.umrnas[j]->exons.First()->start;
463     uint uend=seqdata.umrnas[j]->exons.Last()->end;
464     uint rstart=rdata->mrnas_r[i]->exons.First()->start;
465     uint rend=rdata->mrnas_r[i]->exons.Last()->end;
466     if (ustart>rend) break;
467     if (rstart>uend) continue;
468     if (rdata->mrnas_r[i]->exonOverlap(ustart,uend)) {
469     seqdata.umrnas[j]->strand='-';
470     }
471     else { //within intron
472     CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
473     mdata->addOvl('i',rdata->mrnas_r[i]);
474     }
475    
476     }
477     }
478     }//we have reference transcripts
479     //---- now compare to other transcripts
480     for (int i=0;i<seqdata.mrnas_f.Count();i++) {
481     for (int j=0;j<seqdata.umrnas.Count();j++) {
482     if (seqdata.umrnas[j]->strand) continue;
483     uint ustart=seqdata.umrnas[j]->exons.First()->start;
484     uint uend=seqdata.umrnas[j]->exons.Last()->end;
485     uint rstart=seqdata.mrnas_f[i]->exons.First()->start;
486     uint rend=seqdata.mrnas_f[i]->exons.Last()->end;
487     if (ustart>rend) break;
488     if (rstart>uend) continue;
489     if (seqdata.mrnas_f[i]->exonOverlap(ustart,uend)) {
490     seqdata.umrnas[j]->strand='+';
491     }
492     }
493     }
494     for (int i=0;i<seqdata.mrnas_r.Count();i++) {
495     for (int j=0;j<seqdata.umrnas.Count();j++) {
496     if (seqdata.umrnas[j]->strand) continue;
497     uint ustart=seqdata.umrnas[j]->exons.First()->start;
498     uint uend=seqdata.umrnas[j]->exons.Last()->end;
499     uint rstart=seqdata.mrnas_r[i]->exons.First()->start;
500     uint rend=seqdata.mrnas_r[i]->exons.Last()->end;
501     if (ustart>rend) break;
502     if (rstart>uend) continue;
503     //overlap
504     if (seqdata.mrnas_r[i]->exonOverlap(ustart,uend)) {
505     seqdata.umrnas[j]->strand='-';
506     }
507     }
508     }
509     int fcount=0;
510     for (int i=0;i<seqdata.umrnas.Count();i++) {
511     if (seqdata.umrnas[i]->strand=='+') {
512     seqdata.mrnas_f.Add(seqdata.umrnas[i]);
513     seqdata.umrnas.Forget(i);
514     }
515     else if (seqdata.umrnas[i]->strand=='-') {
516     seqdata.mrnas_r.Add(seqdata.umrnas[i]);
517     seqdata.umrnas.Forget(i);
518     }
519     else { //discard mRNAs not settled
520     seqdata.umrnas[i]->strand='.';
521     if (fdis!=NULL) {
522     seqdata.umrnas[i]->printGtf(fdis);
523     }
524     fcount++;
525     }
526     }
527     seqdata.umrnas.Pack();
528     return fcount;
529     }
530    
531     //retrieve ref_data for a specific genomic sequence
532     GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) {
533     int ri=-1;
534     GSeqData f(gid);
535     GSeqData* r=NULL;
536     if (ref_data.Found(&f,ri))
537     r=ref_data[ri];
538     return r;
539     }
540    
541     void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs) {
542     rewind(f);
543     GffReader gffr(f, true); //loading only recognizable transcript features
544     gffr.showWarnings(gtf_tracking_verbose);
545    
546     // keepAttrs mergeCloseExons noExonAttrs
547     gffr.readAll(keepAttrs, true, true);
548    
549     // is_ref? check_for_dups,
550     parse_mRNAs(gffr.gflst, seqdata, false, false);
551     }
552    
553     int cmpGSeqByName(const pointer p1, const pointer p2) {
554     return strcmp(((GSeqData*)p1)->gseq_name, ((GSeqData*)p2)->gseq_name);
555     }
556    
557     void sort_GSeqs_byName(GList<GSeqData>& seqdata) {
558     seqdata.setSorted(&cmpGSeqByName);
559     }
560    
561     void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
562     bool check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
563     //>>>>> read all transcripts/features from a GTF/GFF3 file
564     //int imrna_counter=0;
565     int loci_counter=0;
566     if (ref_data==NULL) ref_data=&seqdata;
567     bool isRefData=(&seqdata==ref_data);
568     //(f, transcripts_only)
569     GffReader* gffr=new GffReader(f, true); //load only transcript annotations
570     gffr->showWarnings(gtf_tracking_verbose);
571     // keepAttrs mergeCloseExons noExonAttrs
572     gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale);
573     //so it will read exon attributes only for low number of Cufflinks files
574    
575     int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
576     if (gtf_tracking_verbose && d>0) {
577     if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
578     else GMessage(" %d redundant cufflinks transfrags discarded.\n",d);
579     }
580     //imrna_counter=gffr->mrnas.Count();
581     delete gffr; //free the extra memory and unused GffObjs
582    
583     //for each genomic sequence, cluster transcripts
584     int discarded=0;
585     GStr bname(fname);
586     GStr s;
587     if (!bname.is_empty()) {
588     int di=bname.rindex('.');
589     if (di>0) bname.cut(di);
590     int p=bname.rindex('/');
591     if (p<0) p=bname.rindex('\\');
592     if (p>=0) bname.remove(0,p);
593     }
594     FILE* fdis=NULL;
595     FILE* frloci=NULL;
596    
597     for (int g=0;g<seqdata.Count();g++) {
598     //find the corresponding refseqdata with the same gseq_id
599     int gseq_id=seqdata[g]->get_gseqid();
600     if (!isRefData) { //cufflinks data, find corresponding ref data
601     GSeqData* rdata=getRefData(gseq_id, *ref_data);
602     if (rdata!=NULL && seqdata[g]->umrnas.Count()>0) {
603     discarded+=fix_umrnas(*seqdata[g], rdata, fdis);
604     }
605     }
606     //>>>>> group mRNAs into locus-clusters (based on exon overlap)
607     cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
608     cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
609     if (!isRefData) {
610     cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
611     }
612     loci_counter+=seqdata[g]->loci_f.Count();
613     loci_counter+=seqdata[g]->loci_r.Count();
614     // if (refData) {
615     // if (frloci==NULL) {
616     // s=bname;
617     // s.append(".loci.lst");
618     // frloci=fopen(s.chars(), "w");
619     // }
620     // writeLoci(frloci, seqdata[g]->loci_f);
621     // writeLoci(frloci, seqdata[g]->loci_r);
622     // }//write ref loci
623     }//for each genomic sequence
624     if (fdis!=NULL) fclose(fdis);
625     if (frloci!=NULL) fclose(frloci);
626     if (discarded>0) {
627     if (gtf_tracking_verbose) GMessage("Found %d transcripts with undetermined strand.\n", discarded);
628     }
629     else { if (fdis!=NULL) remove(s.chars()); }
630     }
631    
632     int qsearch_mrnas(uint x, GList<GffObj>& mrnas) {
633     //binary search
634     //do the simplest tests first:
635     if (mrnas[0]->start>x) return 0;
636     if (mrnas.Last()->start<x) return -1;
637     uint istart=0;
638     int i=0;
639     int idx=-1;
640     int maxh=mrnas.Count()-1;
641     int l=0;
642     int h = maxh;
643     while (l <= h) {
644     i = (l+h)>>1;
645     istart=mrnas[i]->start;
646     if (istart < x) l = i + 1;
647     else {
648     if (istart == x) { //found matching coordinate here
649     idx=i;
650     while (idx<=maxh && mrnas[idx]->start==x) {
651     idx++;
652     }
653     return (idx>maxh) ? -1 : idx;
654     }
655     h = i - 1;
656     }
657     } //while
658     idx = l;
659     while (idx<=maxh && mrnas[idx]->start<=x) {
660     idx++;
661     }
662     return (idx>maxh) ? -1 : idx;
663     }
664    
665     int qsearch_loci(uint x, GList<GLocus>& loci) {
666     // same as above, but for GSeg lists
667     //binary search
668     //do the simplest tests first:
669     if (loci[0]->start>x) return 0;
670     if (loci.Last()->start<x) return -1;
671     uint istart=0;
672     int i=0;
673     int idx=-1;
674     int maxh=loci.Count()-1;
675     int l=0;
676     int h = maxh;
677     while (l <= h) {
678     i = (l + h) >> 1;
679     istart=loci[i]->start;
680     if (istart < x) l=i+1;
681     else {
682     if (istart == x) { //found matching coordinate here
683     idx=i;
684     while (idx<=maxh && loci[idx]->start==x) {
685     idx++;
686     }
687     return (idx>maxh) ? -1 : idx;
688     }
689     h=i-1;
690     }
691     } //while
692     idx = l;
693     while (idx<=maxh && loci[idx]->start<=x) {
694     idx++;
695     }
696     return (idx>maxh) ? -1 : idx;
697     }

Properties

Name Value
svn:executable *