ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/gtf_tracking.cpp
(Generate patch)
# Line 49 | Line 49
49             }
50        if (omrna.start>m->end) continue; //this should never be the case if nidx was found correctly
51        //locus overlap here:
52 <      if (tMatch(*m, omrna, ovlen)) {
52 >      if (tMatch(*m, omrna, ovlen, false, true)) {
53               dupidx=i;
54               return mrnas[i];
55               }
# Line 172 | Line 172
172   }
173  
174   bool betterDupRef(GffObj* a, GffObj* b) {
175 < if (a->exons.Count()!=b->exons.Count())
176 <  return (a->exons.Count()>b->exons.Count());
177 < if (a->hasCDS()!=b->hasCDS())
178 <   return (a->hasCDS()>b->hasCDS());
179 < if (a->track_id != b->track_id)
180 <   return (a->track_id < b->track_id);
181 < return (a->covlen > b->covlen);
175 >  if (a->exons.Count()!=b->exons.Count())
176 >    return (a->exons.Count()>b->exons.Count());
177 >  if (a->hasCDS()!=b->hasCDS())
178 >     return (a->hasCDS()>b->hasCDS());
179 >   //for annotation purposes, it's more important to keep the
180 >   //longer transcript, instead of the one that was loaded first
181 >  if (a->covlen != b->covlen)
182 >         return (a->covlen > b->covlen);
183 >    else return (a->track_id < b->track_id);
184   }
185  
186   int parse_mRNAs(GfList& mrnas,
# Line 240 | Line 242
242                       int rpidx=-1;
243                       GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
244                       if (rp!=NULL) { //duplicate found
245 <                       //discard the one that was seen "later" (higher track_id)
245 >                       //discard one of them
246                         //but let's keep the gene_name if present
247 +                     //DEBUG:
248 +                     GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
249                       refdiscarded++;
250                       if (betterDupRef(rp, m)) {
251                             if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
# Line 338 | Line 342
342   return (is_ref_set ? refdiscarded : tredundant);
343   }
344  
345 < bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool equnspl) {
345 > bool tMatch(GffObj& a, GffObj& b, int& ovlen, bool fuzzunspl, bool contain_only) {
346          //strict intron chain match, or single-exon perfect match
347          int imax=a.exons.Count()-1;
348          int jmax=b.exons.Count()-1;
349          ovlen=0;
350          if (imax!=jmax) return false; //different number of introns
351          if (imax==0) { //single-exon mRNAs
352 <                if (equnspl) {
352 >                if (contain_only) {
353 >                   return ((a.start>=b.start && a.end<=b.end) ||
354 >                           (b.start>=a.start && b.end<=a.end));
355 >                }
356 >                if (fuzzunspl) {
357                          //fuzz match for single-exon transfrags:
358 <                        // it's a match if they overlap at least 80% of max len
358 >                        // it's a match if they overlap at least 80% of shortest one
359                          ovlen=a.exons[0]->overlapLen(b.exons[0]);
360                          int maxlen=GMAX(a.covlen,b.covlen);
361                          return (ovlen>=maxlen*0.8);
362                  }
363            else {
364 <                        //only exact match
364 >                        //only exact match, or strictly contained
365                          ovlen=a.covlen;
366                          return (a.exons[0]->start==b.exons[0]->start &&
367                                          a.exons[0]->end==b.exons[0]->end);
# Line 369 | Line 377
377                  if (i<imax) ovlen+=a.exons[i]->len();
378                  if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
379                          (a.exons[i]->start!=b.exons[i]->start)) {
380 <            return false; //intron mismatch
380 >                        return false; //intron mismatch
381                  }
382          }
383 <        return true;
383 >        if (contain_only)
384 >                     return ((a.start>=b.start && a.end<=b.end) ||
385 >                           (b.start>=a.start && b.end<=a.end));
386 >                else return true;
387   }
388  
389 +
390   void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
391          //mrnas sorted by start coordinate
392          //and so are the loci

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines