1 |
/* |
2 |
* gtf_tracking.cpp |
3 |
* cufflinks |
4 |
* |
5 |
* Created by Cole Trapnell on 9/5/09. |
6 |
* Copyright 2009 Geo Pertea. All rights reserved. |
7 |
* |
8 |
*/ |
9 |
|
10 |
#include "gtf_tracking.h" |
11 |
|
12 |
bool gtf_tracking_verbose = false; |
13 |
bool gtf_tracking_largeScale=false; //many input Cufflinks files processed at once by cuffcompare, discard exon attributes |
14 |
|
15 |
int GXConsensus::count=0; |
16 |
|
17 |
char* getGSeqName(int gseq_id) { |
18 |
return GffObj::names->gseqs.getName(gseq_id); |
19 |
} |
20 |
|
21 |
int cmpByPtr(const pointer p1, const pointer p2) { |
22 |
return (p1>p2) ? 1: ((p1==p2)? 0 : -1); |
23 |
} |
24 |
|
25 |
bool betterRef(GffObj* a, GffObj* b) { |
26 |
if (a==NULL || b==NULL) return (a!=NULL); |
27 |
if (a->exons.Count()!=b->exons.Count()) return (a->exons.Count()>b->exons.Count()); |
28 |
if (a->hasCDS() && !b->hasCDS()) |
29 |
return true; |
30 |
else { |
31 |
if (b->hasCDS() && !a->hasCDS()) return false; |
32 |
return (a->covlen>b->covlen); |
33 |
} |
34 |
} |
35 |
|
36 |
GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) { |
37 |
//mrnas MUST be sorted by start coordinate |
38 |
int ovlen=0; |
39 |
dupidx=-1; |
40 |
if (mrnas.Count()==0) return NULL; |
41 |
int nidx=qsearch_mrnas(m->end, mrnas); |
42 |
if (nidx==0) return NULL; |
43 |
if (nidx==-1) nidx=mrnas.Count();//all can overlap |
44 |
for (int i=nidx-1;i>=0;i--) { |
45 |
GffObj& omrna=*mrnas[i]; |
46 |
if (m->start>omrna.end) { |
47 |
if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already |
48 |
continue; |
49 |
} |
50 |
if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly |
51 |
//locus overlap here: |
52 |
if (tMatch(*m, omrna, ovlen, false, true)) { |
53 |
dupidx=i; |
54 |
return mrnas[i]; |
55 |
} |
56 |
} |
57 |
return NULL; |
58 |
} |
59 |
|
60 |
|
61 |
bool intronRedundant(GffObj& ti, GffObj& tj) { |
62 |
//two transcripts are "intron redundant" iff one transcript's intron chain |
63 |
// is a sub-chain of the other's |
64 |
int imax=ti.exons.Count()-1; |
65 |
int jmax=tj.exons.Count()-1; |
66 |
if (imax==0 || jmax==0) return false; //don't deal with single-exon transcripts here |
67 |
if (ti.exons[imax]->start<tj.exons[0]->end || |
68 |
tj.exons[jmax]->start<ti.exons[0]->end ) |
69 |
return false; //intron chains do not overlap at all |
70 |
|
71 |
uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries |
72 |
int i=1; //exon idx to the right of the current intron of ti |
73 |
int j=1; //exon idx to the right of the current intron of tj |
74 |
//find the first intron overlap: |
75 |
while (i<=imax && j<=jmax) { |
76 |
eistart=ti.exons[i-1]->end; |
77 |
eiend=ti.exons[i]->start; |
78 |
ejstart=tj.exons[j-1]->end; |
79 |
ejend=tj.exons[j]->start; |
80 |
if (ejend<eistart) { j++; continue; } |
81 |
if (eiend<ejstart) { i++; continue; } |
82 |
//we found an intron overlap |
83 |
break; |
84 |
} |
85 |
if ((i>1 && j>1) || i>imax || j>jmax) { |
86 |
return false; //either no intron overlaps found at all |
87 |
//or it's not the first intron for at least one of the transcripts |
88 |
} |
89 |
if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match |
90 |
//we have the first matching intron on the left |
91 |
if (j>i) { |
92 |
//i==1, ti's start must not conflict with the previous intron of tj |
93 |
if (ti.start<tj.exons[j-1]->start) return false; |
94 |
//so i's first intron starts AFTER j's first intron |
95 |
// then j must contain i, so i's last intron must end with or before j's last intron |
96 |
if (ti.exons[imax]->start>tj.exons[jmax]->start) return false; |
97 |
//comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains ) |
98 |
} |
99 |
else if (i>j) { |
100 |
//j==1, tj's start must not conflict with the previous intron of ti |
101 |
if (tj.start<ti.exons[i-1]->start) return false; |
102 |
//so j's intron chain starts AFTER i's |
103 |
// then i must contain j, so j's last intron must end with or before j's last intron |
104 |
if (tj.exons[jmax]->start>ti.exons[imax]->start) return false; |
105 |
//comment out the line above for just "intronCompatible()" check |
106 |
} |
107 |
//now check if the rest of the introns overlap, in the same sequence |
108 |
i++; |
109 |
j++; |
110 |
while (i<=imax && j<=jmax) { |
111 |
if (ti.exons[i-1]->end!=tj.exons[j-1]->end || |
112 |
ti.exons[i]->start!=tj.exons[j]->start) return false; |
113 |
i++; |
114 |
j++; |
115 |
} |
116 |
i--; |
117 |
j--; |
118 |
if (i==imax && j<jmax) { |
119 |
// tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary |
120 |
if (ti.end>tj.exons[j]->end) return false; |
121 |
} |
122 |
else if (j==jmax && i<imax) { |
123 |
if (tj.end>ti.exons[i]->end) return false; |
124 |
} |
125 |
return true; |
126 |
} |
127 |
|
128 |
bool t_contains(GffObj& a, GffObj& b) { |
129 |
//returns true if b's intron chain (or single exon) is included in a |
130 |
if (b.exons.Count()>=a.exons.Count()) return false; |
131 |
if (b.exons.Count()==1) { |
132 |
//check if b is contained in any of a's exons: |
133 |
for (int i=0;i<a.exons.Count();i++) { |
134 |
if (b.start>=a.exons[i]->start && b.end<=a.exons[i]->end) return true; |
135 |
} |
136 |
return false; |
137 |
} |
138 |
if (intronRedundant(a,b)) { |
139 |
//intronRedudant allows b's initial/terminal exons to extend beyond a's boundaries |
140 |
//but we don't allow this kind of behavior here |
141 |
return (b.start>=a.start && b.end<=a.end); |
142 |
} |
143 |
else return false; |
144 |
} |
145 |
|
146 |
int is_Redundant(GffObj*m, GList<GffObj>* mrnas) { |
147 |
//first locate the list index of the mrna starting just ABOVE |
148 |
//the end of this mrna |
149 |
if (mrnas->Count()==0) return -1; |
150 |
int nidx=qsearch_mrnas(m->end, *mrnas); |
151 |
if (nidx==0) return -1; |
152 |
if (nidx==-1) nidx=mrnas->Count();//all can overlap |
153 |
for (int i=nidx-1;i>=0;i--) { |
154 |
GffObj& omrna=*mrnas->Get(i); |
155 |
if (m->start>omrna.end) { |
156 |
if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already |
157 |
continue; |
158 |
} |
159 |
if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly |
160 |
|
161 |
if (intronRedundant(*m, omrna)) return i; |
162 |
} |
163 |
return -1; |
164 |
} |
165 |
|
166 |
bool t_dominates(GffObj* a, GffObj* b) { |
167 |
// for redundant / intron compatible transfrags: |
168 |
// returns true if a "dominates" b, i.e. a has more exons or is longer |
169 |
if (a->exons.Count()==b->exons.Count()) |
170 |
return (a->covlen>b->covlen); |
171 |
else return (a->exons.Count()>b->exons.Count()); |
172 |
} |
173 |
|
174 |
bool betterDupRef(GffObj* a, GffObj* b) { |
175 |
if (a->exons.Count()!=b->exons.Count()) |
176 |
return (a->exons.Count()>b->exons.Count()); |
177 |
if (a->hasCDS()!=b->hasCDS()) |
178 |
return (a->hasCDS()>b->hasCDS()); |
179 |
//for annotation purposes, it's more important to keep the |
180 |
//longer transcript, instead of the one that was loaded first |
181 |
if (a->covlen != b->covlen) |
182 |
return (a->covlen > b->covlen); |
183 |
else return (a->track_id < b->track_id); |
184 |
} |
185 |
|
186 |
int parse_mRNAs(GfList& mrnas, |
187 |
GList<GSeqData>& glstdata, |
188 |
bool is_ref_set, |
189 |
bool check_for_dups, |
190 |
int qfidx, bool only_multiexon) { |
191 |
int refdiscarded=0; //ref duplicates discarded |
192 |
int tredundant=0; //cufflinks redundant transcripts discarded |
193 |
for (int k=0;k<mrnas.Count();k++) { |
194 |
GffObj* m=mrnas[k]; |
195 |
int i=-1; |
196 |
GSeqData f(m->gseq_id); |
197 |
GSeqData* gdata=NULL; |
198 |
uint tlen=m->len(); |
199 |
if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too.. |
200 |
if (gtf_tracking_verbose) |
201 |
GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen); |
202 |
continue; |
203 |
} |
204 |
if (only_multiexon && m->exons.Count()<2) { |
205 |
continue; |
206 |
} |
207 |
//GStr feature(m->getFeatureName()); |
208 |
//feature.lower(); |
209 |
//bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0); |
210 |
//if (m->exons.Count()==0 && gene_or_locus) { |
211 |
if (m->isDiscarded()) { |
212 |
//discard generic "gene" or "locus" features with no other detailed subfeatures |
213 |
//if (gtf_tracking_verbose) |
214 |
// GMessage("Warning: discarding GFF generic gene/locus container %s\n",m->getID()); |
215 |
continue; |
216 |
} |
217 |
if (m->exons.Count()==0) { |
218 |
//if (gtf_tracking_verbose) |
219 |
// GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID()); |
220 |
m->addExon(m->start,m->end); |
221 |
} |
222 |
if (glstdata.Found(&f,i)) gdata=glstdata[i]; |
223 |
else { |
224 |
gdata=new GSeqData(m->gseq_id); |
225 |
glstdata.Add(gdata); |
226 |
} |
227 |
|
228 |
double fpkm=0; |
229 |
double cov=0; |
230 |
double conf_hi=0; |
231 |
double conf_lo=0; |
232 |
|
233 |
GList<GffObj>* target_mrnas=NULL; |
234 |
if (is_ref_set) { //-- ref transcripts |
235 |
if (m->strand=='.') { |
236 |
//unknown strand - discard from reference set (!) |
237 |
continue; |
238 |
} |
239 |
target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r); |
240 |
if (check_for_dups) { |
241 |
//check all gdata->mrnas_r (ref_data) for duplicate ref transcripts |
242 |
int rpidx=-1; |
243 |
GffObj* rp= is_RefDup(m, *target_mrnas, rpidx); |
244 |
if (rp!=NULL) { //duplicate found |
245 |
//discard one of them |
246 |
//but let's keep the gene_name if present |
247 |
//DEBUG: |
248 |
//GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID()); |
249 |
refdiscarded++; |
250 |
if (betterDupRef(rp, m)) { |
251 |
if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) { |
252 |
rp->setGeneName(m->getGeneName()); |
253 |
} |
254 |
continue; |
255 |
} |
256 |
else { |
257 |
if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) { |
258 |
m->setGeneName(rp->getGeneName()); |
259 |
} |
260 |
((CTData*)(rp->uptr))->mrna=NULL; |
261 |
rp->isUsed(false); |
262 |
target_mrnas->Forget(rpidx); |
263 |
target_mrnas->Delete(rpidx); |
264 |
} |
265 |
} |
266 |
} //check for duplicate ref transcripts |
267 |
} //ref transcripts |
268 |
else { //-- transfrags |
269 |
if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); } |
270 |
else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); } |
271 |
else { m->strand='.'; target_mrnas=&(gdata->umrnas); } |
272 |
if (check_for_dups) { //check for redundancy |
273 |
// check if there is a redundancy between this and another already loaded Cufflinks transcript |
274 |
int cidx = is_Redundant(m, target_mrnas); |
275 |
if (cidx>=0) { |
276 |
//always discard the redundant transcript with the fewer exons OR shorter |
277 |
if (t_dominates(target_mrnas->Get(cidx),m)) { |
278 |
//new transcript is shorter, discard it |
279 |
continue; |
280 |
} |
281 |
else { |
282 |
//discard the older transfrag |
283 |
((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL; |
284 |
target_mrnas->Get(cidx)->isUsed(false); |
285 |
target_mrnas->Forget(cidx); |
286 |
target_mrnas->Delete(cidx); |
287 |
//the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually |
288 |
} |
289 |
tredundant++; |
290 |
} |
291 |
}// redundant transfrag check |
292 |
if (m->gscore==0.0) |
293 |
m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance |
294 |
//const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM"); |
295 |
const char* expr = m->getAttr("FPKM"); |
296 |
if (expr!=NULL) { |
297 |
if (expr[0]=='"') expr++; |
298 |
fpkm=strtod(expr, NULL); |
299 |
} else { //backward compatibility: read RPKM if FPKM not found |
300 |
//expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM"); |
301 |
expr=m->getAttr("RPKM"); |
302 |
if (expr!=NULL) { |
303 |
if (expr[0]=='"') expr++; |
304 |
fpkm=strtod(expr, NULL); |
305 |
} |
306 |
} |
307 |
//const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov"); |
308 |
const char* scov=m->getAttr("cov"); |
309 |
if (scov!=NULL) { |
310 |
if (scov[0]=='"') scov++; |
311 |
cov=strtod(scov, NULL); |
312 |
} |
313 |
//const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi"); |
314 |
const char* sconf_hi=m->getAttr("conf_hi"); |
315 |
if (sconf_hi!=NULL){ |
316 |
if (sconf_hi[0]=='"') sconf_hi++; |
317 |
conf_hi=strtod(sconf_hi, NULL); |
318 |
} |
319 |
//const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo"); |
320 |
const char* sconf_lo=m->getAttr("conf_lo"); |
321 |
if (sconf_lo!=NULL) { |
322 |
if (sconf_lo[0]=='"') sconf_lo++; |
323 |
conf_lo=strtod(sconf_lo, NULL); |
324 |
} |
325 |
} //Cufflinks transfrags |
326 |
target_mrnas->Add(m); |
327 |
m->isUsed(true); |
328 |
CTData* mdata=new CTData(m); |
329 |
mdata->qset=qfidx; |
330 |
gdata->tdata.Add(mdata); |
331 |
if (!is_ref_set) { |
332 |
// Cufflinks - attributes parsing |
333 |
mdata->FPKM=fpkm; |
334 |
mdata->cov=cov; |
335 |
mdata->conf_hi=conf_hi; |
336 |
mdata->conf_lo=conf_lo; |
337 |
} |
338 |
}//for each mrna read |
339 |
//if (mrna_deleted>0) |
340 |
// mrnas.Pack(); |
341 |
|
342 |
return (is_ref_set ? refdiscarded : tredundant); |
343 |
} |
344 |
|
345 |
bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) { |
346 |
//strict intron chain match, or single-exon perfect match |
347 |
int imax=a.exons.Count()-1; |
348 |
int jmax=b.exons.Count()-1; |
349 |
ovlen=0; |
350 |
if (imax!=jmax) return false; //different number of introns |
351 |
if (imax==0) { //single-exon mRNAs |
352 |
if (contain_only) { |
353 |
return ((a.start>=b.start && a.end<=b.end) || |
354 |
(b.start>=a.start && b.end<=a.end)); |
355 |
} |
356 |
if (fuzzunspl) { |
357 |
//fuzz match for single-exon transfrags: |
358 |
// it's a match if they overlap at least 80% of shortest one |
359 |
ovlen=a.exons[0]->overlapLen(b.exons[0]); |
360 |
int maxlen=GMAX(a.covlen,b.covlen); |
361 |
return (ovlen>=maxlen*0.8); |
362 |
} |
363 |
else { |
364 |
//only exact match, or strictly contained |
365 |
ovlen=a.covlen; |
366 |
return (a.exons[0]->start==b.exons[0]->start && |
367 |
a.exons[0]->end==b.exons[0]->end); |
368 |
} |
369 |
} |
370 |
if ( a.exons[imax]->start<b.exons[0]->end || |
371 |
b.exons[jmax]->start<a.exons[0]->end ) |
372 |
return false; //intron chains do not overlap at all |
373 |
//check intron overlaps |
374 |
ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; |
375 |
ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; |
376 |
for (int i=1;i<=imax;i++) { |
377 |
if (i<imax) ovlen+=a.exons[i]->len(); |
378 |
if ((a.exons[i-1]->end!=b.exons[i-1]->end) || |
379 |
(a.exons[i]->start!=b.exons[i]->start)) { |
380 |
return false; //intron mismatch |
381 |
} |
382 |
} |
383 |
if (contain_only) |
384 |
return ((a.start>=b.start && a.end<=b.end) || |
385 |
(b.start>=a.start && b.end<=a.end)); |
386 |
else return true; |
387 |
} |
388 |
|
389 |
|
390 |
void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) { |
391 |
//mrnas sorted by start coordinate |
392 |
//and so are the loci |
393 |
//int rdisc=0; |
394 |
for (int t=0;t<mrnas.Count();t++) { |
395 |
GArray<int> mrgloci(false); |
396 |
GffObj* mrna=mrnas[t]; |
397 |
int lfound=0; //count of parent loci |
398 |
/*for (int l=0;l<loci.Count();l++) { |
399 |
if (loci[l]->end<mrna->exons.First()->start) continue; |
400 |
if (loci[l]->start>mrna->exons.Last()->end) break; */ |
401 |
for (int l=loci.Count()-1;l>=0;l--) { |
402 |
if (loci[l]->end<mrna->exons.First()->start) { |
403 |
if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break; |
404 |
continue; |
405 |
} |
406 |
if (loci[l]->start>mrna->exons.Last()->end) continue; |
407 |
//here we have mrna overlapping loci[l] |
408 |
if (loci[l]->add_mRNA(mrna)) { |
409 |
//a parent locus was found |
410 |
lfound++; |
411 |
mrgloci.Add(l); //locus indices added here, in decreasing order |
412 |
} |
413 |
}//loci loop |
414 |
//if (lfound<0) continue; //mrna was a ref duplicate, skip it |
415 |
if (lfound==0) { |
416 |
//create a locus with only this mRNA |
417 |
loci.Add(new GLocus(mrna, qfidx)); |
418 |
} |
419 |
else if (lfound>1) { |
420 |
//more than one locus found parenting this mRNA, merge loci |
421 |
lfound--; |
422 |
for (int l=0;l<lfound;l++) { |
423 |
int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove |
424 |
loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna); |
425 |
loci.Delete(mlidx); |
426 |
} |
427 |
} |
428 |
}//mrnas loop |
429 |
//if (rdisc>0) mrnas.Pack(); |
430 |
//return rdisc; |
431 |
} |
432 |
|
433 |
int fix_umrnas(GSeqData& seqdata, GSeqData* rdata, FILE* fdis=NULL) { |
434 |
//attempt to find the strand for seqdata.umrnas |
435 |
//based on a) overlaps with oriented reference mRNAs if present |
436 |
// b) overlaps with oriented mRNAs from the same input set |
437 |
if (rdata!=NULL) { //we have reference mrnas |
438 |
for (int i=0;i<rdata->mrnas_f.Count();i++) { |
439 |
for (int j=0;j<seqdata.umrnas.Count();j++) { |
440 |
if (rdata->mrnas_f[i]->gseq_id!=seqdata.umrnas[j]->gseq_id) continue; |
441 |
if (seqdata.umrnas[j]->strand!='.') continue; |
442 |
uint ustart=seqdata.umrnas[j]->exons.First()->start; |
443 |
uint uend=seqdata.umrnas[j]->exons.Last()->end; |
444 |
uint rstart=rdata->mrnas_f[i]->exons.First()->start; |
445 |
uint rend=rdata->mrnas_f[i]->exons.Last()->end; |
446 |
if (ustart>rend) break; |
447 |
if (rstart>uend) continue; |
448 |
if (rdata->mrnas_f[i]->exonOverlap(ustart,uend)) { |
449 |
seqdata.umrnas[j]->strand='+'; |
450 |
} |
451 |
else { //within intron |
452 |
//if (seqdata.umrnas[j]->ulink==NULL || |
453 |
// seqdata.umrnas[j]->ulink->covlen<rdata->mrnas_f[i]->covlen) { |
454 |
CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr; |
455 |
mdata->addOvl('i',rdata->mrnas_f[i]); |
456 |
} |
457 |
} |
458 |
} |
459 |
for (int i=0;i<rdata->mrnas_r.Count();i++) { |
460 |
for (int j=0;j<seqdata.umrnas.Count();j++) { |
461 |
if (seqdata.umrnas[j]->strand) continue; |
462 |
uint ustart=seqdata.umrnas[j]->exons.First()->start; |
463 |
uint uend=seqdata.umrnas[j]->exons.Last()->end; |
464 |
uint rstart=rdata->mrnas_r[i]->exons.First()->start; |
465 |
uint rend=rdata->mrnas_r[i]->exons.Last()->end; |
466 |
if (ustart>rend) break; |
467 |
if (rstart>uend) continue; |
468 |
if (rdata->mrnas_r[i]->exonOverlap(ustart,uend)) { |
469 |
seqdata.umrnas[j]->strand='-'; |
470 |
} |
471 |
else { //within intron |
472 |
CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr; |
473 |
mdata->addOvl('i',rdata->mrnas_r[i]); |
474 |
} |
475 |
|
476 |
} |
477 |
} |
478 |
}//we have reference transcripts |
479 |
//---- now compare to other transcripts |
480 |
for (int i=0;i<seqdata.mrnas_f.Count();i++) { |
481 |
for (int j=0;j<seqdata.umrnas.Count();j++) { |
482 |
if (seqdata.umrnas[j]->strand) continue; |
483 |
uint ustart=seqdata.umrnas[j]->exons.First()->start; |
484 |
uint uend=seqdata.umrnas[j]->exons.Last()->end; |
485 |
uint rstart=seqdata.mrnas_f[i]->exons.First()->start; |
486 |
uint rend=seqdata.mrnas_f[i]->exons.Last()->end; |
487 |
if (ustart>rend) break; |
488 |
if (rstart>uend) continue; |
489 |
if (seqdata.mrnas_f[i]->exonOverlap(ustart,uend)) { |
490 |
seqdata.umrnas[j]->strand='+'; |
491 |
} |
492 |
} |
493 |
} |
494 |
for (int i=0;i<seqdata.mrnas_r.Count();i++) { |
495 |
for (int j=0;j<seqdata.umrnas.Count();j++) { |
496 |
if (seqdata.umrnas[j]->strand) continue; |
497 |
uint ustart=seqdata.umrnas[j]->exons.First()->start; |
498 |
uint uend=seqdata.umrnas[j]->exons.Last()->end; |
499 |
uint rstart=seqdata.mrnas_r[i]->exons.First()->start; |
500 |
uint rend=seqdata.mrnas_r[i]->exons.Last()->end; |
501 |
if (ustart>rend) break; |
502 |
if (rstart>uend) continue; |
503 |
//overlap |
504 |
if (seqdata.mrnas_r[i]->exonOverlap(ustart,uend)) { |
505 |
seqdata.umrnas[j]->strand='-'; |
506 |
} |
507 |
} |
508 |
} |
509 |
int fcount=0; |
510 |
for (int i=0;i<seqdata.umrnas.Count();i++) { |
511 |
if (seqdata.umrnas[i]->strand=='+') { |
512 |
seqdata.mrnas_f.Add(seqdata.umrnas[i]); |
513 |
seqdata.umrnas.Forget(i); |
514 |
} |
515 |
else if (seqdata.umrnas[i]->strand=='-') { |
516 |
seqdata.mrnas_r.Add(seqdata.umrnas[i]); |
517 |
seqdata.umrnas.Forget(i); |
518 |
} |
519 |
else { //discard mRNAs not settled |
520 |
seqdata.umrnas[i]->strand='.'; |
521 |
if (fdis!=NULL) { |
522 |
seqdata.umrnas[i]->printGtf(fdis); |
523 |
} |
524 |
fcount++; |
525 |
} |
526 |
} |
527 |
seqdata.umrnas.Pack(); |
528 |
return fcount; |
529 |
} |
530 |
|
531 |
//retrieve ref_data for a specific genomic sequence |
532 |
GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) { |
533 |
int ri=-1; |
534 |
GSeqData f(gid); |
535 |
GSeqData* r=NULL; |
536 |
if (ref_data.Found(&f,ri)) |
537 |
r=ref_data[ri]; |
538 |
return r; |
539 |
} |
540 |
|
541 |
void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs) { |
542 |
rewind(f); |
543 |
GffReader gffr(f, true); //loading only recognizable transcript features |
544 |
gffr.showWarnings(gtf_tracking_verbose); |
545 |
|
546 |
// keepAttrs mergeCloseExons noExonAttrs |
547 |
gffr.readAll(keepAttrs, true, true); |
548 |
|
549 |
// is_ref? check_for_dups, |
550 |
parse_mRNAs(gffr.gflst, seqdata, false, false); |
551 |
} |
552 |
|
553 |
int cmpGSeqByName(const pointer p1, const pointer p2) { |
554 |
return strcmp(((GSeqData*)p1)->gseq_name, ((GSeqData*)p2)->gseq_name); |
555 |
} |
556 |
|
557 |
void sort_GSeqs_byName(GList<GSeqData>& seqdata) { |
558 |
seqdata.setSorted(&cmpGSeqByName); |
559 |
} |
560 |
|
561 |
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data, |
562 |
bool check_for_dups, int qfidx, const char* fname, bool only_multiexon) { |
563 |
//>>>>> read all transcripts/features from a GTF/GFF3 file |
564 |
//int imrna_counter=0; |
565 |
int loci_counter=0; |
566 |
if (ref_data==NULL) ref_data=&seqdata; |
567 |
bool isRefData=(&seqdata==ref_data); |
568 |
//(f, transcripts_only) |
569 |
GffReader* gffr=new GffReader(f, true); //load only transcript annotations |
570 |
gffr->showWarnings(gtf_tracking_verbose); |
571 |
// keepAttrs mergeCloseExons noExonAttrs |
572 |
gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale); |
573 |
//so it will read exon attributes only for low number of Cufflinks files |
574 |
|
575 |
int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon); |
576 |
if (gtf_tracking_verbose && d>0) { |
577 |
if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d); |
578 |
else GMessage(" %d redundant cufflinks transfrags discarded.\n",d); |
579 |
} |
580 |
//imrna_counter=gffr->mrnas.Count(); |
581 |
delete gffr; //free the extra memory and unused GffObjs |
582 |
|
583 |
//for each genomic sequence, cluster transcripts |
584 |
int discarded=0; |
585 |
GStr bname(fname); |
586 |
GStr s; |
587 |
if (!bname.is_empty()) { |
588 |
int di=bname.rindex('.'); |
589 |
if (di>0) bname.cut(di); |
590 |
int p=bname.rindex('/'); |
591 |
if (p<0) p=bname.rindex('\\'); |
592 |
if (p>=0) bname.remove(0,p); |
593 |
} |
594 |
FILE* fdis=NULL; |
595 |
FILE* frloci=NULL; |
596 |
|
597 |
for (int g=0;g<seqdata.Count();g++) { |
598 |
//find the corresponding refseqdata with the same gseq_id |
599 |
int gseq_id=seqdata[g]->get_gseqid(); |
600 |
if (!isRefData) { //cufflinks data, find corresponding ref data |
601 |
GSeqData* rdata=getRefData(gseq_id, *ref_data); |
602 |
if (rdata!=NULL && seqdata[g]->umrnas.Count()>0) { |
603 |
discarded+=fix_umrnas(*seqdata[g], rdata, fdis); |
604 |
} |
605 |
} |
606 |
//>>>>> group mRNAs into locus-clusters (based on exon overlap) |
607 |
cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx); |
608 |
cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx); |
609 |
if (!isRefData) { |
610 |
cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx); |
611 |
} |
612 |
loci_counter+=seqdata[g]->loci_f.Count(); |
613 |
loci_counter+=seqdata[g]->loci_r.Count(); |
614 |
// if (refData) { |
615 |
// if (frloci==NULL) { |
616 |
// s=bname; |
617 |
// s.append(".loci.lst"); |
618 |
// frloci=fopen(s.chars(), "w"); |
619 |
// } |
620 |
// writeLoci(frloci, seqdata[g]->loci_f); |
621 |
// writeLoci(frloci, seqdata[g]->loci_r); |
622 |
// }//write ref loci |
623 |
}//for each genomic sequence |
624 |
if (fdis!=NULL) fclose(fdis); |
625 |
if (frloci!=NULL) fclose(frloci); |
626 |
if (discarded>0) { |
627 |
if (gtf_tracking_verbose) GMessage("Found %d transcripts with undetermined strand.\n", discarded); |
628 |
} |
629 |
else { if (fdis!=NULL) remove(s.chars()); } |
630 |
} |
631 |
|
632 |
int qsearch_mrnas(uint x, GList<GffObj>& mrnas) { |
633 |
//binary search |
634 |
//do the simplest tests first: |
635 |
if (mrnas[0]->start>x) return 0; |
636 |
if (mrnas.Last()->start<x) return -1; |
637 |
uint istart=0; |
638 |
int i=0; |
639 |
int idx=-1; |
640 |
int maxh=mrnas.Count()-1; |
641 |
int l=0; |
642 |
int h = maxh; |
643 |
while (l <= h) { |
644 |
i = (l+h)>>1; |
645 |
istart=mrnas[i]->start; |
646 |
if (istart < x) l = i + 1; |
647 |
else { |
648 |
if (istart == x) { //found matching coordinate here |
649 |
idx=i; |
650 |
while (idx<=maxh && mrnas[idx]->start==x) { |
651 |
idx++; |
652 |
} |
653 |
return (idx>maxh) ? -1 : idx; |
654 |
} |
655 |
h = i - 1; |
656 |
} |
657 |
} //while |
658 |
idx = l; |
659 |
while (idx<=maxh && mrnas[idx]->start<=x) { |
660 |
idx++; |
661 |
} |
662 |
return (idx>maxh) ? -1 : idx; |
663 |
} |
664 |
|
665 |
int qsearch_loci(uint x, GList<GLocus>& loci) { |
666 |
// same as above, but for GSeg lists |
667 |
//binary search |
668 |
//do the simplest tests first: |
669 |
if (loci[0]->start>x) return 0; |
670 |
if (loci.Last()->start<x) return -1; |
671 |
uint istart=0; |
672 |
int i=0; |
673 |
int idx=-1; |
674 |
int maxh=loci.Count()-1; |
675 |
int l=0; |
676 |
int h = maxh; |
677 |
while (l <= h) { |
678 |
i = (l + h) >> 1; |
679 |
istart=loci[i]->start; |
680 |
if (istart < x) l=i+1; |
681 |
else { |
682 |
if (istart == x) { //found matching coordinate here |
683 |
idx=i; |
684 |
while (idx<=maxh && loci[idx]->start==x) { |
685 |
idx++; |
686 |
} |
687 |
return (idx>maxh) ? -1 : idx; |
688 |
} |
689 |
h=i-1; |
690 |
} |
691 |
} //while |
692 |
idx = l; |
693 |
while (idx<=maxh && loci[idx]->start<=x) { |
694 |
idx++; |
695 |
} |
696 |
return (idx>maxh) ? -1 : idx; |
697 |
} |