ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/gff.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9   const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 6000000;
10 > const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11   bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13   const int gff_fid_transcript=1;
# Line 159 | Line 159
159   ftype=t[2];
160   info=t[8];
161   char* p=t[3];
162 < if (!parseUInt(p,fstart))
163 <   GError("Error parsing start coordinate from GFF line:\n%s\n",l);
162 > if (!parseUInt(p,fstart)) {
163 >   //FIXME: chromosome_band entries in Flybase
164 >   GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
165 >   return;
166 >   }
167   p=t[4];
168 < if (!parseUInt(p,fend))
169 <   GError("Error parsing end coordinate from GFF line:\n%s\n",l);
168 > if (!parseUInt(p,fend)) {
169 >   GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
170 >   return;
171 >   }
172   if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
173   p=t[5];
174   if (p[0]=='.' && p[1]==0) {
# Line 188 | Line 193
193     is_exon=true;
194     is_t_data=true;
195     }
196 <  else if (strstr(fnamelc, "exon")!=NULL) {
196 >  else if (endsWith(fnamelc, "exon")) {
197     exontype=exgffExon;
198     is_exon=true;
199     is_t_data=true;
# Line 403 | Line 408
408   skip=false;
409   }
410  
411 +
412 + void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
413 +  if (cd_start>=this->start) {
414 +        this->CDstart=cd_start;
415 +        if (strand=='+') this->CDphase=phase;
416 +        }
417 +      else this->CDstart=this->start;
418 +  if (cd_end<=this->end) {
419 +      this->CDend=cd_end;
420 +      if (strand=='-') this->CDphase=phase;
421 +      }
422 +     else this->CDend=this->end;
423 +  isTranscript(true);
424 +  exon_ftype_id=gff_fid_exon;
425 +  if (monoFeature()) {
426 +     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
427 +            else exons[0]->exontype=exgffExon;
428 +     }
429 + }
430 +
431   int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
432    //this will make sure we have the right subftype_id!
433 <  int subf_id=-1;
434 <  //if (ftype_id==gff_fid_mRNA) { //for mRNAs only parse known subfeatures!
433 >  //int subf_id=-1;
434 >  if (!isTranscript() && gl->is_cds) {
435 >          isTranscript(true);
436 >          exon_ftype_id=gff_fid_exon;
437 >          if (exons.Count()==1) exons[0]->exontype=exgffExon;
438 >          }
439    if (isTranscript()) {
440       if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon;
441            if (gl->exontype>0) exon_ftype_id=gff_fid_exon;
# Line 422 | Line 451
451            }
452       }
453    else { //non-mRNA parent feature, check this subf type
454 <    subf_id=names->feats.addName(gl->ftype);
454 >    int subf_id=names->feats.addName(gl->ftype);
455      if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
456         exon_ftype_id=subf_id;
457       else {
458         if (exon_ftype_id!=subf_id) {
459 <         //if (subftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
459 >         //
460           if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) {
461              //the existing exon was just a dummy one created by default, discard it
462              exons.Clear();
# Line 504 | Line 533
533    if (qs || qe) {
534      if (qs>qe) swap(qs,qe);
535      if (qs==0) qs=1;
536 <    }
536 >        }
537 >  int ovlen=0;
538    if (exontype>0) { //check for overlaps between exon-type segments
509      int ovlen=0;
539        int oi=exonOverlapIdx(segstart, segend, &ovlen);
540        if (oi>=0) { //overlap existing segment
541           if (ovlen==0) {
542 <              //adjacent segments will be merged
543 <              if ((exons[oi]->exontype==exgffUTR && exontype==exgffCDS) ||
544 <                  (exons[oi]->exontype==exgffCDS && exontype==exgffUTR)) {
545 <                    expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
546 <                    return oi;
547 <                    }
542 >                          //adjacent segments will be merged
543 >                          //e.g. CDS to (UTR|exon)
544 >                          if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) ||
545 >                                  (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) {
546 >                                        expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe);
547 >                                        return oi;
548 >                                        }
549 >                          //CDS adjacent to stop_codon: UCSC does (did?) this
550 >                          if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) ||
551 >                                  (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) {
552 >                                        expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe);
553 >                                        return oi;
554 >                                        }
555               }
556 <         //only allow this for CDS within exon, stop_codon within exon, stop_codon within UTR,
557 <         //                   start_codon within CDS or stop_codon within CDS
556 >                 //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon),
557 >         //                   start_codon within (CDS|exon)
558          if (exons[oi]->exontype>exontype &&
559               exons[oi]->start<=segstart && exons[oi]->end>=segend &&
560               !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) {
# Line 529 | Line 565
565               segstart<=exons[oi]->start && segend>=exons[oi]->end &&
566               !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) {
567                 //smaller segment given first, so we have to enlarge it
568 <              expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
569 <                //this should also check for overlapping next exon (oi+1) ?
568 >                          expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
569 >                                //this should also check for overlapping next exon (oi+1) ?
570                return oi;
571                }
572          //there is also the special case of "ribosomal slippage exception" (programmed frameshift)
573          //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon
574 <        //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
575 <        // --> had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
576 <        if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
577 <           //important structural warning, will always print:
578 <           if (gff_show_warnings)
579 <               GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
580 <               segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
581 <           hasErrors(true);
582 <           return -1; //segment NOT added
583 <           }
584 <          // else add the segment if the overlap is small and between two CDS segments
585 <          //TODO: we might want to add an attribute here with the slippage coordinate and size?
586 <        }//overlap of existing segment
587 <       } //check for overlap
574 >                //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
575 >                // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments
576 >                /*
577 >                if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) {
578 >                   if (gff_show_warnings)
579 >                           GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n",
580 >                           segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
581 >                   hasErrors(true);
582 >                   return -1; //segment NOT added
583 >                   }
584 >                */
585 >
586 >                 if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) {
587 >                  if (gff_show_warnings)
588 >                         GMessage("GFF Warning: merging overlapping/adjacent feature segment (%d-%d) into (%d-%d) (%s) for GFF ID %s on %s\n",
589 >                                 segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName());
590 >                        expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe);
591 >                        return oi;
592 >                 }
593 >                // else add the segment if the overlap is small and between two CDS segments
594 >                //TODO: we might want to add an attribute here with the slippage coordinate and size?
595 >        covlen-=ovlen;
596 >                }//overlap or adjacent to existing segment
597 >           } //check for overlap
598     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
599     // create & add the new segment
600 +   /*
601 +   if (start>0 && exontype==exgffCDS && exons.Count()==0) {
602 +      //adding a CDS directly as the first subfeature of a declared parent
603 +      segstart=start;
604 +      segend=end;
605 +      }
606 +   */
607     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
608     int eidx=exons.Add(enew);
609     if (eidx<0) {
# Line 563 | Line 616
616       return -1;            
617       }
618     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
619 <   start=exons.First()->start;
620 <   end=exons.Last()->end;
619 >   //adjust parent feature coordinates to contain this exon
620 >   if (start==0 || start>exons.First()->start) {
621 >     start=exons.First()->start;
622 >     }
623 >   if (end<exons.Last()->end) end=exons.Last()->end;
624 >    
625     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
626         GSeqStat* gsd=(GSeqStat*)uptr;
627         if (start<gsd->mincoord) gsd->mincoord=start;
# Line 919 | Line 976
976   bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
977    bool r=true;
978    if (gffline->strand!=prevgfo->gffobj->strand) {
979 <     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
980 <        prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
981 <      r=false;
982 <     }
979 >  //TODO: add support for trans-splicing and even inter-chromosomal fusions
980 >     if (prevgfo->gffobj->strand=='.') {
981 >            prevgfo->gffobj->strand=gffline->strand;
982 >        }
983 >     else {
984 >       GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
985 >       prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
986 >       gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
987 >       //r=false;
988 >       return true; //FIXME: split trans-spliced mRNAs by strand
989 >       }
990 >   }
991    int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
992                        ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
993                           0 );

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines