ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gff.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M
9   const uint GFF_MAX_EXON  =   30000; //longest known exon in human is ~11K
10 < const uint GFF_MAX_INTRON= 6000000;
10 > const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron
11   bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings()
12   const int gff_fid_mRNA=0;
13   const int gff_fid_transcript=1;
# Line 159 | Line 159
159   ftype=t[2];
160   info=t[8];
161   char* p=t[3];
162 < if (!parseUInt(p,fstart))
163 <   GError("Error parsing start coordinate from GFF line:\n%s\n",l);
162 > if (!parseUInt(p,fstart)) {
163 >   //FIXME: chromosome_band entries in Flybase
164 >   GMessage("Warning: invalid start coordinate at line:\n%s\n",l);
165 >   return;
166 >   }
167   p=t[4];
168 < if (!parseUInt(p,fend))
169 <   GError("Error parsing end coordinate from GFF line:\n%s\n",l);
170 < if (fend<fstart) swap(fend,fstart); //make sure fstart>=fend, always
168 > if (!parseUInt(p,fend)) {
169 >   GMessage("Warning: invalid end coordinate at line:\n%s\n",l);
170 >   return;
171 >   }
172 > if (fend<fstart) Gswap(fend,fstart); //make sure fstart>=fend, always
173   p=t[5];
174   if (p[0]=='.' && p[1]==0) {
175    score=0;
# Line 188 | Line 193
193     is_exon=true;
194     is_t_data=true;
195     }
196 <  else if (strstr(fnamelc, "exon")!=NULL) {
196 >  else if (endsWith(fnamelc, "exon")) {
197     exontype=exgffExon;
198     is_exon=true;
199     is_t_data=true;
# Line 403 | Line 408
408   skip=false;
409   }
410  
411 +
412 + void GffObj::addCDS(uint cd_start, uint cd_end, char phase) {
413 +  if (cd_start>=this->start) {
414 +        this->CDstart=cd_start;
415 +        if (strand=='+') this->CDphase=phase;
416 +        }
417 +      else this->CDstart=this->start;
418 +  if (cd_end<=this->end) {
419 +      this->CDend=cd_end;
420 +      if (strand=='-') this->CDphase=phase;
421 +      }
422 +     else this->CDend=this->end;
423 +  isTranscript(true);
424 +  exon_ftype_id=gff_fid_exon;
425 +  if (monoFeature()) {
426 +     if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon);
427 +            else exons[0]->exontype=exgffExon;
428 +     }
429 + }
430 +
431   int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) {
432    //this will make sure we have the right subftype_id!
433 <  int subf_id=-1;
434 <  if (!isTranscript() && gl->is_cds && monoFeature()) {
433 >  //int subf_id=-1;
434 >  if (!isTranscript() && gl->is_cds) {
435            isTranscript(true);
436            exon_ftype_id=gff_fid_exon;
437            if (exons.Count()==1) exons[0]->exontype=exgffExon;
# Line 426 | Line 451
451            }
452       }
453    else { //non-mRNA parent feature, check this subf type
454 <    subf_id=names->feats.addName(gl->ftype);
454 >    int subf_id=names->feats.addName(gl->ftype);
455      if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. first exon being added)
456         exon_ftype_id=subf_id;
457       else {
# Line 506 | Line 531
531       isCDS=false;
532       }
533    if (qs || qe) {
534 <    if (qs>qe) swap(qs,qe);
534 >    if (qs>qe) Gswap(qs,qe);
535      if (qs==0) qs=1;
536          }
537    int ovlen=0;
# Line 572 | Line 597
597             } //check for overlap
598     // --- no overlap, or accepted micro-overlap (ribosomal slippage)
599     // create & add the new segment
600 +   /*
601 +   if (start>0 && exontype==exgffCDS && exons.Count()==0) {
602 +      //adding a CDS directly as the first subfeature of a declared parent
603 +      segstart=start;
604 +      segend=end;
605 +      }
606 +   */
607     GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype);
608     int eidx=exons.Add(enew);
609     if (eidx<0) {
# Line 584 | Line 616
616       return -1;            
617       }
618     covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1;
619 <   start=exons.First()->start;
620 <   end=exons.Last()->end;
619 >   //adjust parent feature coordinates to contain this exon
620 >   if (start==0 || start>exons.First()->start) {
621 >     start=exons.First()->start;
622 >     }
623 >   if (end<exons.Last()->end) end=exons.Last()->end;
624 >    
625     if (uptr!=NULL) { //collect stats about the underlying genomic sequence
626         GSeqStat* gsd=(GSeqStat*)uptr;
627         if (start<gsd->mincoord) gsd->mincoord=start;
# Line 940 | Line 976
976   bool GffReader::addExonFeature(GfoHolder* prevgfo, GffLine* gffline, GHash<CNonExon>& pex, bool noExonAttr) {
977    bool r=true;
978    if (gffline->strand!=prevgfo->gffobj->strand) {
979 <     GMessage("GFF Error: duplicate GFF ID '%s' (exons found on different strands of %s)\n",
980 <        prevgfo->gffobj->gffID, prevgfo->gffobj->getGSeqName());
981 <      r=false;
982 <     }
979 >  //TODO: add support for trans-splicing and even inter-chromosomal fusions
980 >     if (prevgfo->gffobj->strand=='.') {
981 >            prevgfo->gffobj->strand=gffline->strand;
982 >        }
983 >     else {
984 >       GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n",
985 >       prevgfo->gffobj->gffID, prevgfo->gffobj->strand,
986 >       gffline->fstart, gffline->fend, gffline->strand, prevgfo->gffobj->getGSeqName());
987 >       //r=false;
988 >       return true; //FIXME: split trans-spliced mRNAs by strand
989 >       }
990 >   }
991    int gdist=(gffline->fstart>prevgfo->gffobj->end) ? gffline->fstart-prevgfo->gffobj->end :
992                        ((gffline->fend<prevgfo->gffobj->start)? prevgfo->gffobj->start-gffline->fend :
993                           0 );

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines