ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/gtf_tracking.cpp
Revision: 54
Committed: Thu Sep 8 05:46:16 2011 UTC (8 years, 1 month ago) by gpertea
File size: 24079 byte(s)
Log Message:
debug RPS20P29 ENSG00000241828 not collapsing

Line File contents
1 /*
2 * gtf_tracking.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 9/5/09.
6 * Copyright 2009 Geo Pertea. All rights reserved.
7 *
8 */
9
10 #include "gtf_tracking.h"
11
12 bool gtf_tracking_verbose = false;
13 bool gtf_tracking_largeScale=false; //many input Cufflinks files processed at once by cuffcompare, discard exon attributes
14
15 int GXConsensus::count=0;
16
17 char* getGSeqName(int gseq_id) {
18 return GffObj::names->gseqs.getName(gseq_id);
19 }
20
21 int cmpByPtr(const pointer p1, const pointer p2) {
22 return (p1>p2) ? 1: ((p1==p2)? 0 : -1);
23 }
24
25 bool betterRef(GffObj* a, GffObj* b) {
26 if (a==NULL || b==NULL) return (a!=NULL);
27 if (a->exons.Count()!=b->exons.Count()) return (a->exons.Count()>b->exons.Count());
28 if (a->hasCDS() && !b->hasCDS())
29 return true;
30 else {
31 if (b->hasCDS() && !a->hasCDS()) return false;
32 return (a->covlen>b->covlen);
33 }
34 }
35
36 GffObj* is_RefDup(GffObj* m, GList<GffObj>& mrnas, int& dupidx) {
37 //mrnas MUST be sorted by start coordinate
38 int ovlen=0;
39 dupidx=-1;
40 if (mrnas.Count()==0) return NULL;
41 int nidx=qsearch_mrnas(m->end, mrnas);
42 if (nidx==0) return NULL;
43 if (nidx==-1) nidx=mrnas.Count();//all can overlap
44 for (int i=nidx-1;i>=0;i--) {
45 GffObj& omrna=*mrnas[i];
46 if (m->start>omrna.end) {
47 if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
48 continue;
49 }
50 if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
51 //locus overlap here:
52 if (tMatch(*m, omrna, ovlen, false, true)) {
53 dupidx=i;
54 return mrnas[i];
55 }
56 }
57 return NULL;
58 }
59
60
61 bool intronRedundant(GffObj& ti, GffObj& tj) {
62 //two transcripts are "intron redundant" iff one transcript's intron chain
63 // is a sub-chain of the other's
64 int imax=ti.exons.Count()-1;
65 int jmax=tj.exons.Count()-1;
66 if (imax==0 || jmax==0) return false; //don't deal with single-exon transcripts here
67 if (ti.exons[imax]->start<tj.exons[0]->end ||
68 tj.exons[jmax]->start<ti.exons[0]->end )
69 return false; //intron chains do not overlap at all
70
71 uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries
72 int i=1; //exon idx to the right of the current intron of ti
73 int j=1; //exon idx to the right of the current intron of tj
74 //find the first intron overlap:
75 while (i<=imax && j<=jmax) {
76 eistart=ti.exons[i-1]->end;
77 eiend=ti.exons[i]->start;
78 ejstart=tj.exons[j-1]->end;
79 ejend=tj.exons[j]->start;
80 if (ejend<eistart) { j++; continue; }
81 if (eiend<ejstart) { i++; continue; }
82 //we found an intron overlap
83 break;
84 }
85 if ((i>1 && j>1) || i>imax || j>jmax) {
86 return false; //either no intron overlaps found at all
87 //or it's not the first intron for at least one of the transcripts
88 }
89 if (eistart!=ejstart || eiend!=ejend) return false; //not an exact intron match
90 //we have the first matching intron on the left
91 if (j>i) {
92 //i==1, ti's start must not conflict with the previous intron of tj
93 if (ti.start<tj.exons[j-1]->start) return false;
94 //so i's first intron starts AFTER j's first intron
95 // then j must contain i, so i's last intron must end with or before j's last intron
96 if (ti.exons[imax]->start>tj.exons[jmax]->start) return false;
97 //comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains )
98 }
99 else if (i>j) {
100 //j==1, tj's start must not conflict with the previous intron of ti
101 if (tj.start<ti.exons[i-1]->start) return false;
102 //so j's intron chain starts AFTER i's
103 // then i must contain j, so j's last intron must end with or before j's last intron
104 if (tj.exons[jmax]->start>ti.exons[imax]->start) return false;
105 //comment out the line above for just "intronCompatible()" check
106 }
107 //now check if the rest of the introns overlap, in the same sequence
108 i++;
109 j++;
110 while (i<=imax && j<=jmax) {
111 if (ti.exons[i-1]->end!=tj.exons[j-1]->end ||
112 ti.exons[i]->start!=tj.exons[j]->start) return false;
113 i++;
114 j++;
115 }
116 i--;
117 j--;
118 if (i==imax && j<jmax) {
119 // tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary
120 if (ti.end>tj.exons[j]->end) return false;
121 }
122 else if (j==jmax && i<imax) {
123 if (tj.end>ti.exons[i]->end) return false;
124 }
125 return true;
126 }
127
128 bool t_contains(GffObj& a, GffObj& b) {
129 //returns true if b's intron chain (or single exon) is included in a
130 if (b.exons.Count()>=a.exons.Count()) return false;
131 if (b.exons.Count()==1) {
132 //check if b is contained in any of a's exons:
133 for (int i=0;i<a.exons.Count();i++) {
134 if (b.start>=a.exons[i]->start && b.end<=a.exons[i]->end) return true;
135 }
136 return false;
137 }
138 if (intronRedundant(a,b)) {
139 //intronRedudant allows b's initial/terminal exons to extend beyond a's boundaries
140 //but we don't allow this kind of behavior here
141 return (b.start>=a.start && b.end<=a.end);
142 }
143 else return false;
144 }
145
146 int is_Redundant(GffObj*m, GList<GffObj>* mrnas) {
147 //first locate the list index of the mrna starting just ABOVE
148 //the end of this mrna
149 if (mrnas->Count()==0) return -1;
150 int nidx=qsearch_mrnas(m->end, *mrnas);
151 if (nidx==0) return -1;
152 if (nidx==-1) nidx=mrnas->Count();//all can overlap
153 for (int i=nidx-1;i>=0;i--) {
154 GffObj& omrna=*mrnas->Get(i);
155 if (m->start>omrna.end) {
156 if (m->start-omrna.start>GFF_MAX_EXON) break; //give up already
157 continue;
158 }
159 if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
160
161 if (intronRedundant(*m, omrna)) return i;
162 }
163 return -1;
164 }
165
166 bool t_dominates(GffObj* a, GffObj* b) {
167 // for redundant / intron compatible transfrags:
168 // returns true if a "dominates" b, i.e. a has more exons or is longer
169 if (a->exons.Count()==b->exons.Count())
170 return (a->covlen>b->covlen);
171 else return (a->exons.Count()>b->exons.Count());
172 }
173
174 bool betterDupRef(GffObj* a, GffObj* b) {
175 if (a->exons.Count()!=b->exons.Count())
176 return (a->exons.Count()>b->exons.Count());
177 if (a->hasCDS()!=b->hasCDS())
178 return (a->hasCDS()>b->hasCDS());
179 //for annotation purposes, it's more important to keep the
180 //longer transcript, instead of the one that was loaded first
181 if (a->covlen != b->covlen)
182 return (a->covlen > b->covlen);
183 else return (a->track_id < b->track_id);
184 }
185
186 int parse_mRNAs(GfList& mrnas,
187 GList<GSeqData>& glstdata,
188 bool is_ref_set,
189 bool check_for_dups,
190 int qfidx, bool only_multiexon) {
191 int refdiscarded=0; //ref duplicates discarded
192 int tredundant=0; //cufflinks redundant transcripts discarded
193 for (int k=0;k<mrnas.Count();k++) {
194 GffObj* m=mrnas[k];
195 int i=-1;
196 GSeqData f(m->gseq_id);
197 GSeqData* gdata=NULL;
198 uint tlen=m->len();
199 if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
200 if (gtf_tracking_verbose)
201 GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
202 continue;
203 }
204 if (only_multiexon && m->exons.Count()<2) {
205 continue;
206 }
207 //GStr feature(m->getFeatureName());
208 //feature.lower();
209 //bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
210 //if (m->exons.Count()==0 && gene_or_locus) {
211 if (m->isDiscarded()) {
212 //discard generic "gene" or "locus" features with no other detailed subfeatures
213 //if (gtf_tracking_verbose)
214 // GMessage("Warning: discarding GFF generic gene/locus container %s\n",m->getID());
215 continue;
216 }
217 if (m->exons.Count()==0) {
218 //if (gtf_tracking_verbose)
219 // GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
220 m->addExon(m->start,m->end);
221 }
222 if (glstdata.Found(&f,i)) gdata=glstdata[i];
223 else {
224 gdata=new GSeqData(m->gseq_id);
225 glstdata.Add(gdata);
226 }
227
228 double fpkm=0;
229 double cov=0;
230 double conf_hi=0;
231 double conf_lo=0;
232
233 GList<GffObj>* target_mrnas=NULL;
234 if (is_ref_set) { //-- ref transcripts
235 if (m->strand=='.') {
236 //unknown strand - discard from reference set (!)
237 continue;
238 }
239 target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
240 if (check_for_dups) {
241 //check all gdata->mrnas_r (ref_data) for duplicate ref transcripts
242 int rpidx=-1;
243 GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
244 if (rp!=NULL) { //duplicate found
245 //discard one of them
246 //but let's keep the gene_name if present
247 //DEBUG:
248 GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
249 refdiscarded++;
250 if (betterDupRef(rp, m)) {
251 if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
252 rp->setGeneName(m->getGeneName());
253 }
254 continue;
255 }
256 else {
257 if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
258 m->setGeneName(rp->getGeneName());
259 }
260 ((CTData*)(rp->uptr))->mrna=NULL;
261 rp->isUsed(false);
262 target_mrnas->Forget(rpidx);
263 target_mrnas->Delete(rpidx);
264 }
265 }
266 } //check for duplicate ref transcripts
267 } //ref transcripts
268 else { //-- transfrags
269 if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
270 else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
271 else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
272 if (check_for_dups) { //check for redundancy
273 // check if there is a redundancy between this and another already loaded Cufflinks transcript
274 int cidx = is_Redundant(m, target_mrnas);
275 if (cidx>=0) {
276 //always discard the redundant transcript with the fewer exons OR shorter
277 if (t_dominates(target_mrnas->Get(cidx),m)) {
278 //new transcript is shorter, discard it
279 continue;
280 }
281 else {
282 //discard the older transfrag
283 ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
284 target_mrnas->Get(cidx)->isUsed(false);
285 target_mrnas->Forget(cidx);
286 target_mrnas->Delete(cidx);
287 //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
288 }
289 tredundant++;
290 }
291 }// redundant transfrag check
292 if (m->gscore==0.0)
293 m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
294 //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
295 const char* expr = m->getAttr("FPKM");
296 if (expr!=NULL) {
297 if (expr[0]=='"') expr++;
298 fpkm=strtod(expr, NULL);
299 } else { //backward compatibility: read RPKM if FPKM not found
300 //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
301 expr=m->getAttr("RPKM");
302 if (expr!=NULL) {
303 if (expr[0]=='"') expr++;
304 fpkm=strtod(expr, NULL);
305 }
306 }
307 //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
308 const char* scov=m->getAttr("cov");
309 if (scov!=NULL) {
310 if (scov[0]=='"') scov++;
311 cov=strtod(scov, NULL);
312 }
313 //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
314 const char* sconf_hi=m->getAttr("conf_hi");
315 if (sconf_hi!=NULL){
316 if (sconf_hi[0]=='"') sconf_hi++;
317 conf_hi=strtod(sconf_hi, NULL);
318 }
319 //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
320 const char* sconf_lo=m->getAttr("conf_lo");
321 if (sconf_lo!=NULL) {
322 if (sconf_lo[0]=='"') sconf_lo++;
323 conf_lo=strtod(sconf_lo, NULL);
324 }
325 } //Cufflinks transfrags
326 target_mrnas->Add(m);
327 m->isUsed(true);
328 CTData* mdata=new CTData(m);
329 mdata->qset=qfidx;
330 gdata->tdata.Add(mdata);
331 if (!is_ref_set) {
332 // Cufflinks - attributes parsing
333 mdata->FPKM=fpkm;
334 mdata->cov=cov;
335 mdata->conf_hi=conf_hi;
336 mdata->conf_lo=conf_lo;
337 }
338 }//for each mrna read
339 //if (mrna_deleted>0)
340 // mrnas.Pack();
341
342 return (is_ref_set ? refdiscarded : tredundant);
343 }
344
345 bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) {
346 //strict intron chain match, or single-exon perfect match
347 int imax=a.exons.Count()-1;
348 int jmax=b.exons.Count()-1;
349 ovlen=0;
350 if (imax!=jmax) return false; //different number of introns
351 if (imax==0) { //single-exon mRNAs
352 if (contain_only) {
353 return ((a.start>=b.start && a.end<=b.end) ||
354 (b.start>=a.start && b.end<=a.end));
355 }
356 if (fuzzunspl) {
357 //fuzz match for single-exon transfrags:
358 // it's a match if they overlap at least 80% of shortest one
359 ovlen=a.exons[0]->overlapLen(b.exons[0]);
360 int maxlen=GMAX(a.covlen,b.covlen);
361 return (ovlen>=maxlen*0.8);
362 }
363 else {
364 //only exact match, or strictly contained
365 ovlen=a.covlen;
366 return (a.exons[0]->start==b.exons[0]->start &&
367 a.exons[0]->end==b.exons[0]->end);
368 }
369 }
370 if ( a.exons[imax]->start<b.exons[0]->end ||
371 b.exons[jmax]->start<a.exons[0]->end )
372 return false; //intron chains do not overlap at all
373 //check intron overlaps
374 ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
375 ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
376 for (int i=1;i<=imax;i++) {
377 if (i<imax) ovlen+=a.exons[i]->len();
378 if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
379 (a.exons[i]->start!=b.exons[i]->start)) {
380 return false; //intron mismatch
381 }
382 }
383 if (contain_only)
384 return ((a.start>=b.start && a.end<=b.end) ||
385 (b.start>=a.start && b.end<=a.end));
386 else return true;
387 }
388
389
390 void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
391 //mrnas sorted by start coordinate
392 //and so are the loci
393 //int rdisc=0;
394 for (int t=0;t<mrnas.Count();t++) {
395 GArray<int> mrgloci(false);
396 GffObj* mrna=mrnas[t];
397 int lfound=0; //count of parent loci
398 /*for (int l=0;l<loci.Count();l++) {
399 if (loci[l]->end<mrna->exons.First()->start) continue;
400 if (loci[l]->start>mrna->exons.Last()->end) break; */
401 for (int l=loci.Count()-1;l>=0;l--) {
402 if (loci[l]->end<mrna->exons.First()->start) {
403 if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break;
404 continue;
405 }
406 if (loci[l]->start>mrna->exons.Last()->end) continue;
407 //here we have mrna overlapping loci[l]
408 if (loci[l]->add_mRNA(mrna)) {
409 //a parent locus was found
410 lfound++;
411 mrgloci.Add(l); //locus indices added here, in decreasing order
412 }
413 }//loci loop
414 //if (lfound<0) continue; //mrna was a ref duplicate, skip it
415 if (lfound==0) {
416 //create a locus with only this mRNA
417 loci.Add(new GLocus(mrna, qfidx));
418 }
419 else if (lfound>1) {
420 //more than one locus found parenting this mRNA, merge loci
421 lfound--;
422 for (int l=0;l<lfound;l++) {
423 int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove
424 loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna);
425 loci.Delete(mlidx);
426 }
427 }
428 }//mrnas loop
429 //if (rdisc>0) mrnas.Pack();
430 //return rdisc;
431 }
432
433 int fix_umrnas(GSeqData& seqdata, GSeqData* rdata, FILE* fdis=NULL) {
434 //attempt to find the strand for seqdata.umrnas
435 //based on a) overlaps with oriented reference mRNAs if present
436 // b) overlaps with oriented mRNAs from the same input set
437 if (rdata!=NULL) { //we have reference mrnas
438 for (int i=0;i<rdata->mrnas_f.Count();i++) {
439 for (int j=0;j<seqdata.umrnas.Count();j++) {
440 if (rdata->mrnas_f[i]->gseq_id!=seqdata.umrnas[j]->gseq_id) continue;
441 if (seqdata.umrnas[j]->strand!='.') continue;
442 uint ustart=seqdata.umrnas[j]->exons.First()->start;
443 uint uend=seqdata.umrnas[j]->exons.Last()->end;
444 uint rstart=rdata->mrnas_f[i]->exons.First()->start;
445 uint rend=rdata->mrnas_f[i]->exons.Last()->end;
446 if (ustart>rend) break;
447 if (rstart>uend) continue;
448 if (rdata->mrnas_f[i]->exonOverlap(ustart,uend)) {
449 seqdata.umrnas[j]->strand='+';
450 }
451 else { //within intron
452 //if (seqdata.umrnas[j]->ulink==NULL ||
453 // seqdata.umrnas[j]->ulink->covlen<rdata->mrnas_f[i]->covlen) {
454 CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
455 mdata->addOvl('i',rdata->mrnas_f[i]);
456 }
457 }
458 }
459 for (int i=0;i<rdata->mrnas_r.Count();i++) {
460 for (int j=0;j<seqdata.umrnas.Count();j++) {
461 if (seqdata.umrnas[j]->strand) continue;
462 uint ustart=seqdata.umrnas[j]->exons.First()->start;
463 uint uend=seqdata.umrnas[j]->exons.Last()->end;
464 uint rstart=rdata->mrnas_r[i]->exons.First()->start;
465 uint rend=rdata->mrnas_r[i]->exons.Last()->end;
466 if (ustart>rend) break;
467 if (rstart>uend) continue;
468 if (rdata->mrnas_r[i]->exonOverlap(ustart,uend)) {
469 seqdata.umrnas[j]->strand='-';
470 }
471 else { //within intron
472 CTData* mdata=(CTData*)seqdata.umrnas[j]->uptr;
473 mdata->addOvl('i',rdata->mrnas_r[i]);
474 }
475
476 }
477 }
478 }//we have reference transcripts
479 //---- now compare to other transcripts
480 for (int i=0;i<seqdata.mrnas_f.Count();i++) {
481 for (int j=0;j<seqdata.umrnas.Count();j++) {
482 if (seqdata.umrnas[j]->strand) continue;
483 uint ustart=seqdata.umrnas[j]->exons.First()->start;
484 uint uend=seqdata.umrnas[j]->exons.Last()->end;
485 uint rstart=seqdata.mrnas_f[i]->exons.First()->start;
486 uint rend=seqdata.mrnas_f[i]->exons.Last()->end;
487 if (ustart>rend) break;
488 if (rstart>uend) continue;
489 if (seqdata.mrnas_f[i]->exonOverlap(ustart,uend)) {
490 seqdata.umrnas[j]->strand='+';
491 }
492 }
493 }
494 for (int i=0;i<seqdata.mrnas_r.Count();i++) {
495 for (int j=0;j<seqdata.umrnas.Count();j++) {
496 if (seqdata.umrnas[j]->strand) continue;
497 uint ustart=seqdata.umrnas[j]->exons.First()->start;
498 uint uend=seqdata.umrnas[j]->exons.Last()->end;
499 uint rstart=seqdata.mrnas_r[i]->exons.First()->start;
500 uint rend=seqdata.mrnas_r[i]->exons.Last()->end;
501 if (ustart>rend) break;
502 if (rstart>uend) continue;
503 //overlap
504 if (seqdata.mrnas_r[i]->exonOverlap(ustart,uend)) {
505 seqdata.umrnas[j]->strand='-';
506 }
507 }
508 }
509 int fcount=0;
510 for (int i=0;i<seqdata.umrnas.Count();i++) {
511 if (seqdata.umrnas[i]->strand=='+') {
512 seqdata.mrnas_f.Add(seqdata.umrnas[i]);
513 seqdata.umrnas.Forget(i);
514 }
515 else if (seqdata.umrnas[i]->strand=='-') {
516 seqdata.mrnas_r.Add(seqdata.umrnas[i]);
517 seqdata.umrnas.Forget(i);
518 }
519 else { //discard mRNAs not settled
520 seqdata.umrnas[i]->strand='.';
521 if (fdis!=NULL) {
522 seqdata.umrnas[i]->printGtf(fdis);
523 }
524 fcount++;
525 }
526 }
527 seqdata.umrnas.Pack();
528 return fcount;
529 }
530
531 //retrieve ref_data for a specific genomic sequence
532 GSeqData* getRefData(int gid, GList<GSeqData>& ref_data) {
533 int ri=-1;
534 GSeqData f(gid);
535 GSeqData* r=NULL;
536 if (ref_data.Found(&f,ri))
537 r=ref_data[ri];
538 return r;
539 }
540
541 void read_transcripts(FILE* f, GList<GSeqData>& seqdata, bool keepAttrs) {
542 rewind(f);
543 GffReader gffr(f, true); //loading only recognizable transcript features
544 gffr.showWarnings(gtf_tracking_verbose);
545
546 // keepAttrs mergeCloseExons noExonAttrs
547 gffr.readAll(keepAttrs, true, true);
548
549 // is_ref? check_for_dups,
550 parse_mRNAs(gffr.gflst, seqdata, false, false);
551 }
552
553 int cmpGSeqByName(const pointer p1, const pointer p2) {
554 return strcmp(((GSeqData*)p1)->gseq_name, ((GSeqData*)p2)->gseq_name);
555 }
556
557 void sort_GSeqs_byName(GList<GSeqData>& seqdata) {
558 seqdata.setSorted(&cmpGSeqByName);
559 }
560
561 void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
562 bool check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
563 //>>>>> read all transcripts/features from a GTF/GFF3 file
564 //int imrna_counter=0;
565 int loci_counter=0;
566 if (ref_data==NULL) ref_data=&seqdata;
567 bool isRefData=(&seqdata==ref_data);
568 //(f, transcripts_only)
569 GffReader* gffr=new GffReader(f, true); //load only transcript annotations
570 gffr->showWarnings(gtf_tracking_verbose);
571 // keepAttrs mergeCloseExons noExonAttrs
572 gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale);
573 //so it will read exon attributes only for low number of Cufflinks files
574
575 int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
576 if (gtf_tracking_verbose && d>0) {
577 if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
578 else GMessage(" %d redundant cufflinks transfrags discarded.\n",d);
579 }
580 //imrna_counter=gffr->mrnas.Count();
581 delete gffr; //free the extra memory and unused GffObjs
582
583 //for each genomic sequence, cluster transcripts
584 int discarded=0;
585 GStr bname(fname);
586 GStr s;
587 if (!bname.is_empty()) {
588 int di=bname.rindex('.');
589 if (di>0) bname.cut(di);
590 int p=bname.rindex('/');
591 if (p<0) p=bname.rindex('\\');
592 if (p>=0) bname.remove(0,p);
593 }
594 FILE* fdis=NULL;
595 FILE* frloci=NULL;
596
597 for (int g=0;g<seqdata.Count();g++) {
598 //find the corresponding refseqdata with the same gseq_id
599 int gseq_id=seqdata[g]->get_gseqid();
600 if (!isRefData) { //cufflinks data, find corresponding ref data
601 GSeqData* rdata=getRefData(gseq_id, *ref_data);
602 if (rdata!=NULL && seqdata[g]->umrnas.Count()>0) {
603 discarded+=fix_umrnas(*seqdata[g], rdata, fdis);
604 }
605 }
606 //>>>>> group mRNAs into locus-clusters (based on exon overlap)
607 cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
608 cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
609 if (!isRefData) {
610 cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
611 }
612 loci_counter+=seqdata[g]->loci_f.Count();
613 loci_counter+=seqdata[g]->loci_r.Count();
614 // if (refData) {
615 // if (frloci==NULL) {
616 // s=bname;
617 // s.append(".loci.lst");
618 // frloci=fopen(s.chars(), "w");
619 // }
620 // writeLoci(frloci, seqdata[g]->loci_f);
621 // writeLoci(frloci, seqdata[g]->loci_r);
622 // }//write ref loci
623 }//for each genomic sequence
624 if (fdis!=NULL) fclose(fdis);
625 if (frloci!=NULL) fclose(frloci);
626 if (discarded>0) {
627 if (gtf_tracking_verbose) GMessage("Found %d transcripts with undetermined strand.\n", discarded);
628 }
629 else { if (fdis!=NULL) remove(s.chars()); }
630 }
631
632 int qsearch_mrnas(uint x, GList<GffObj>& mrnas) {
633 //binary search
634 //do the simplest tests first:
635 if (mrnas[0]->start>x) return 0;
636 if (mrnas.Last()->start<x) return -1;
637 uint istart=0;
638 int i=0;
639 int idx=-1;
640 int maxh=mrnas.Count()-1;
641 int l=0;
642 int h = maxh;
643 while (l <= h) {
644 i = (l+h)>>1;
645 istart=mrnas[i]->start;
646 if (istart < x) l = i + 1;
647 else {
648 if (istart == x) { //found matching coordinate here
649 idx=i;
650 while (idx<=maxh && mrnas[idx]->start==x) {
651 idx++;
652 }
653 return (idx>maxh) ? -1 : idx;
654 }
655 h = i - 1;
656 }
657 } //while
658 idx = l;
659 while (idx<=maxh && mrnas[idx]->start<=x) {
660 idx++;
661 }
662 return (idx>maxh) ? -1 : idx;
663 }
664
665 int qsearch_loci(uint x, GList<GLocus>& loci) {
666 // same as above, but for GSeg lists
667 //binary search
668 //do the simplest tests first:
669 if (loci[0]->start>x) return 0;
670 if (loci.Last()->start<x) return -1;
671 uint istart=0;
672 int i=0;
673 int idx=-1;
674 int maxh=loci.Count()-1;
675 int l=0;
676 int h = maxh;
677 while (l <= h) {
678 i = (l + h) >> 1;
679 istart=loci[i]->start;
680 if (istart < x) l=i+1;
681 else {
682 if (istart == x) { //found matching coordinate here
683 idx=i;
684 while (idx<=maxh && loci[idx]->start==x) {
685 idx++;
686 }
687 return (idx>maxh) ? -1 : idx;
688 }
689 h=i-1;
690 }
691 } //while
692 idx = l;
693 while (idx<=maxh && loci[idx]->start<=x) {
694 idx++;
695 }
696 return (idx>maxh) ? -1 : idx;
697 }

Properties

Name Value
svn:executable *