ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gb2gff.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5689 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q{ Usage:
7 gb2gff.pl [-A] [-f <fasta_output>] [-o <out_gff_file>] genbank_file(s)
8
9 Options:
10 -A convert ALL features (default is: only convert genes/mRNAs
11 -f also write a fasta version of the base sequence(s) in
12 the file <fasta_output>
13 };
14 umask 0002;
15 getopts('Af:o:') || die($usage."\n");
16 my $allfeats=$Getopt::Std::opt_A;
17 my $outfile=$Getopt::Std::opt_o;
18 my $fastaout=$Getopt::Std::opt_f;
19 my %gnames;
20 if ($fastaout) {
21 open(FASTAOUT, '>'.$fastaout)|| die ("Error creating file $fastaout\n");
22 }
23 if ($outfile) {
24 open(GFFOUT, '>'.$outfile) || die ("Error creating file $outfile\n");
25 select(GFFOUT);
26 }
27 my ($bseq, $fseq, $bdef, $bGI, $bACC, $blen, $freading, $sreading);
28 while (<>) {
29 chomp;
30 NEXTLINE:
31 if (m/^LOCUS/) {
32 ($bseq, $blen)=(m/^LOCUS\s+(\S+)\s+(\d+)/);
33 die("Error parsing LOCUS tag info: $_\n") unless $blen>1;
34 }
35 elsif (m/^DEFINITION\s+(.+)/) {
36 $bdef=$1;
37 }
38 elsif (m/^ACCESSION\s+(\S+)/) {
39 $bACC=$1;
40 }
41 elsif (m/^VERSION/) {
42 ($bGI)=(m/\bGI:(\d+)/);
43 die("Error parsing GI# from VERSION tag ($_)") unless $bGI>1;
44 }
45 elsif (m/^FEATURES\s+/) {
46 $freading=1;
47 next;
48 }
49 elsif (m/^BASE COUNT/) {
50 $freading=0;
51 next;
52 }
53 elsif (m/^ORIGIN\s+/) {
54 $freading=0;
55 $sreading=1;
56 next;
57 }
58 elsif (m/^\/\/$/) {
59 $freading=0;
60 $sreading=0;
61 if ($fastaout) {
62 print FASTAOUT &fastafmt(\$fseq, $bseq.' gi|'.$bGI.' '.$bdef);
63 }
64 $fseq='';
65 $bdef='';
66 $bseq='';
67 }
68
69 #-- remaining lines parsing:
70
71 if ($freading) {
72
73 my ($featname, $frange)=(m/^\s+([\w\-]+)\s+(.+)/);
74 die("Error parsing feature at: $_\n") unless $featname && $frange;
75 if ($frange=~m/\(/ && $frange!~m/\)$/) { #wrapped composite intervals line
76 do {
77 $_=<>;
78 last unless $_;
79 chomp;
80 $frange.=$_;
81 } until (m/\)$/);
82 } #wrapped composite intervals;
83 my %attrs;
84 my $endattr;
85 #print STDERR "..reading attributes for $featname ($frange)\n";
86 while (<>) { #read all attributes for this feature
87 unless (m/^\s+\/(\S+)=(.+)/) {
88 $endattr=1;
89 last;
90 }
91 chomp;
92 my ($fattr, $fattr_dta)=&parseAttr($1,$2); #this will read the next line(s) if necessary
93 #$fattr_dta =~ tr/"//d; #"
94 $attrs{$fattr}=$fattr_dta if ($fattr ne 'translation');
95 } #attribute parsing loop
96 if (!$allfeats && $featname ne 'mRNA' && $featname ne 'CDS' && $featname ne 'gene') {
97 goto NEXTLINE if $endattr;
98 next;
99 }
100 &writeFeature($featname, $frange, \%attrs);
101 goto NEXTLINE if $endattr;
102 next;
103 }
104
105 if ($fastaout && $sreading && m/^\s+\d+\s+(.+)/) {
106 my $s=$1;
107 $s=~tr/ \t\n//d;
108 $fseq.=$s;
109 }
110
111 }#while input lines
112
113 close(FASTAOUT) if $fastaout;
114
115 sub parseAttr {
116 my ($a, $adta)=@_;
117 #print STDERR "[[ parseAttr($a, $adta) ]]\n";
118
119 $adta=~s/^\s+//;$adta=~s/\s+$//; #trim
120 #die("Invalid attribute string for $fn\:$a ($adta)\n") unless $adta=~m/^"/;
121 if ($adta=~m/^"/) {
122 while ($adta!~m/\"$/) {
123 $_=<>;
124 chomp;
125 tr/ \t/ /s; #remove extra spaces
126 $adta.=$_;
127 }
128 }
129 $adta=~s/^\"//;$adta=~s/\"$//; #remove quotes
130 return ($a, $adta);
131 }
132
133 sub writeFeature {
134 my ($fname, $frange, $attrs)=@_;
135 #the rest of @_ are attributes
136 my $comment='';
137 my $gene=$$attrs{'gene'};
138 my $id=$gene;
139 my $product=$$attrs{'product'};
140 if ($fname eq 'mRNA') {
141 $id=$$attrs{'transcript_id'} || $gene;
142 }
143 elsif ($fname eq 'CDS') {
144 $id=$$attrs{'protein_id'} || $gene;
145 }
146 my $strand=($frange=~s/complement\(//) ? '-' : '+';
147 $frange=~s/join\(//;
148 $frange=~s/\)+$//;
149 my @ranges=split(/\,/,$frange);
150 my @ex;
151 foreach my $r (@ranges) {
152 my ($start, $end)=($r=~m/(\d+)\.\.(\d+)/);
153 die("Error parsing ranges for $fname ($frange)\n") unless $start>0 && $end>$start;
154 push(@ex, [$start, $end]);
155 }
156 @ex= sort { $main::a->[0] <=> $main::b->[0] } @ex;
157 my ($min, $max)=($ex[0]->[0], $ex[-1]->[1]);
158 # if ($fname eq 'mRNA' || $fname eq 'CDS') { #isoform testing
159 # my $p= $$attrs{'product'}; #identify isoforms by product.. :(
160 # my $isoform;
161 # if ($p=~m/isoform (\S+)/i) {
162 # $isoform=$1;
163 # $gene.='-'.$isoform;
164 # }
165 # }
166 if ($fname eq 'gene' || $fname eq 'mRNA' || $fname eq 'CDS') {
167 $comment="ID=$id;Name=$gene";
168 my $numexons = @ex;
169 $comment.=";ExonCount=$numexons;" if $numexons>0;
170 $comment.=";Product=\"$product\"" if $product;
171 my $gffeat=$fname;
172 $gffeat='mRNA' if $fname eq 'CDS';
173
174 print join("\t",$bseq, 'gb2gff', $gffeat, $min, $max, '.',$strand,'.',$comment)."\n";
175 return if $fname eq 'gene';
176 $fname='exon' if $fname eq 'mRNA';
177 }
178 if ($fname eq 'exon' || $fname eq 'CDS') {
179 my $n=1;
180 #print "## CDS $gene info: ".join('; ',@_)."\n" if ($fname eq 'CDS');
181 my $frame=0; ## actually, this should be: codon_start - 1
182 foreach my $e (@ex) {
183 my ($start, $stop)=@$e;
184 print join("\t",$bseq, 'gb2gff', $fname, $start, $stop, '.',$strand, $frame,
185 "Parent=$id")."\n";
186 $frame = ($fname eq 'CDS') ? ($frame + ($stop - $start + 1)) % 3 : '.';
187 $n++;
188 }
189 }
190 else { # other non-mRNA features
191 $comment=join(';',@_);
192 foreach my $e (@ex) {
193 print join("\t",$bseq, 'gb2gff', $fname, $$e[0], $$e[1], '.',$strand,'.',
194 "Parent=$id")."\n";
195 }
196 }
197 }
198
199 sub fastafmt { # (seqref, defline, linelen)
200 my $seqr=shift(@_);
201 my $defline=shift(@_);
202 my $linelen=shift(@_) || 60;;
203 my $rec;
204 $rec=">$defline\n" if $defline;
205 my $seqlen=length($$seqr);
206 my $pos=0;
207 while ($pos<$seqlen) {
208 $rec.= substr($$seqr,$pos,$linelen)."\n";
209 $pos+=$linelen;
210 }
211 return $rec;
212 }

Properties

Name Value
svn:executable *