ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gbk2gff.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 6847 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q{ Usage:
7 gb2gff.pl [-A] [-f <fasta_output>] [-o <out_gff_file>] genbank_file(s)
8
9 Options:
10 -A convert ALL features (default is: only convert genes/mRNAs
11 -f also write a fasta version of the base sequence(s) in
12 the file <fasta_output>
13 };
14 umask 0002;
15 getopts('Af:o:') || die($usage."\n");
16 my $allfeats=$Getopt::Std::opt_A;
17 my $outfile=$Getopt::Std::opt_o;
18 my $fastaout=$Getopt::Std::opt_f;
19 my %gnames;
20 my $fakeGI=121000;
21 my $vegasrc=0;
22 if ($fastaout) {
23 open(FASTAOUT, '>'.$fastaout)|| die ("Error creating file $fastaout\n");
24 }
25 if ($outfile) {
26 open(GFFOUT, '>'.$outfile) || die ("Error creating file $outfile\n");
27 select(GFFOUT);
28 }
29 my ($bseq, $fseq, $bdef, $bGI, $bACC, $blen, $freading, $sreading);
30 my ($gname, $gdescr, $mrna);
31 while (<>) {
32 chomp;
33 NEXTLINE:
34 if (m/^LOCUS/) {
35 ($bseq, $blen)=(m/^LOCUS\s+(\S+)\s+(\d+)/);
36 die("Error parsing LOCUS tag info: $_\n") unless $blen>1;
37 }
38 elsif (m/^DEFINITION\s+(.+)/) {
39 $bdef=$1;
40 }
41 elsif (m/^ACCESSION\s+(\S+)/) {
42 $bACC=$1;
43 }
44 elsif (m/^VERSION/) {
45 if (m/chromosome:VEGA/) {
46 $bGI = $fakeGI++;
47 $vegasrc=1;
48 }
49 else {
50 ($bGI)=(m/\bGI:(\d+)/);
51 die("Error parsing GI# from VERSION tag ($_)") unless $bGI>1;
52 }
53 }
54 elsif (m/^FEATURES\s+/) {
55 $freading=1;
56 ($gname, $gdescr, $mrna)=('','','');
57 next;
58 }
59 elsif (m/^BASE COUNT/) {
60 $freading=0;
61 next;
62 }
63 elsif (m/^ORIGIN\s+/) {
64 $freading=0;
65 $sreading=1;
66 next;
67 }
68 elsif (m/^\/\/$/) {
69 $freading=0;
70 $sreading=0;
71 if ($fastaout) {
72 print FASTAOUT &fastafmt(\$fseq, $bseq.' gi|'.$bGI.' '.$bdef);
73 }
74 $fseq='';
75 $bdef='';
76 $bseq='';
77 }
78
79 #-- remaining lines parsing:
80 if ($freading) {
81
82 my ($featname, $frange)=(m/^\s+([\w\-]+)\s+(.+)/);
83 die("Error parsing feature at: $_\n") unless $featname && $frange;
84 ($gname, $gdescr)='' if $featname eq 'gene';
85
86 if ($frange=~m/\(/ && $frange!~m/\)$/) { #wrapped composite intervals line
87 do {
88 $_=<>;
89 last unless $_;
90 chomp;
91 $frange.=$_;
92 } until (m/\)$/);
93 } #wrapped composite intervals;
94 my %attrs;
95 my $endattr;
96 #print STDERR "..reading attributes for $featname ($frange)\n";
97 while (<>) { #read all attributes for this feature
98 unless (m/^\s+\/([^=]+)=(.+)/) {
99 $endattr=1;
100 last;
101 }
102 chomp;
103 my ($fattr, $fattr_dta)=&parseAttr($1,$2); #this will read the next line(s) if necessary
104 #$fattr_dta =~ tr/"//d; #"
105 $attrs{$fattr}=$fattr_dta if ($fattr ne 'translation');
106 } #attribute parsing loop
107 if ($vegasrc) {
108 #print STDERR "############# FEATNAME = $featname \n";
109 if ($featname eq 'gene') {
110 $gname=$attrs{'locus_tag'};
111 $gdescr=$attrs{'note'};
112 $mrna='';
113 }
114 if ($featname eq 'mRNA') {
115 my $note=$attrs{'note'};
116 ($mrna)=($note=~m/transcript_id=([\-\.\w]+)/);
117 #print STDERR "##############>>> MRNA === > $mrna ($note)\n";
118 }
119 }
120 if (!$allfeats && $featname ne 'mRNA' && $featname ne 'CDS' && $featname ne 'gene') {
121 goto NEXTLINE if $endattr;
122 next;
123 }
124 &writeFeature($featname, $frange, \%attrs, $gname, $gdescr, $mrna);
125 goto NEXTLINE if $endattr;
126 next;
127 }
128
129 if ($fastaout && $sreading && m/^\s+\d+\s+(.+)/) {
130 my $s=$1;
131 $s=~tr/ \t\n//d;
132 $fseq.=$s;
133 }
134
135 }#while input lines
136
137 close(FASTAOUT) if $fastaout;
138
139 sub parseAttr {
140 my ($a, $adta)=@_;
141 #print STDERR "[[ parseAttr($a, $adta) ]]\n";
142
143 $adta=~s/^\s+//;$adta=~s/\s+$//; #trim
144 #die("Invalid attribute string for $fn\:$a ($adta)\n") unless $adta=~m/^"/;
145 if ($adta=~m/^"/) {
146 while ($adta!~m/\"$/) {
147 $_=<>;
148 chomp;
149 tr/ \t/ /s; #remove extra spaces
150 $adta.=$_;
151 }
152 }
153 $adta=~s/^\"//;$adta=~s/\"$//; #remove quotes
154 return ($a, $adta);
155 }
156
157 sub writeFeature {
158 my ($fname, $frange, $attrs, $gname, $gdescr, $mrna)=@_;
159 #the rest of @_ are attributes
160 my $comment='';
161 my $gene=$$attrs{'gene'};
162 my $id=$gene;
163 my $product=$$attrs{'product'};
164 if ($fname eq 'mRNA') {
165 $id=$$attrs{'transcript_id'} || $gene;
166 }
167 elsif ($fname eq 'CDS') {
168 $id=$$attrs{'protein_id'} || $gene;
169 }
170 my $strand=($frange=~s/complement\(//) ? '-' : '+';
171 $frange=~s/join\(//;
172 $frange=~s/\)+$//;
173 my @ranges=split(/\,/,$frange);
174 my @ex;
175 foreach my $r (@ranges) {
176 my ($start, $end)=($r=~m/(\d+)\.\.(\d+)/);
177 die("Error parsing ranges for $fname ($frange)\n") unless $start>0 && $end>$start;
178 push(@ex, [$start, $end, 0]); #start, end, frame;
179 }
180 @ex= sort { $main::a->[0] <=> $main::b->[0] } @ex;
181 my ($min, $max)=($ex[0]->[0], $ex[-1]->[1]);
182 # if ($fname eq 'mRNA' || $fname eq 'CDS') { #isoform testing
183 # my $p= $$attrs{'product'}; #identify isoforms by product.. :(
184 # my $isoform;
185 # if ($p=~m/isoform (\S+)/i) {
186 # $isoform=$1;
187 # $gene.='-'.$isoform;
188 # }
189 # }
190 if ($fname eq 'gene' || $fname eq 'mRNA' || $fname eq 'CDS') {
191 my $gene_name=$gname || $gene;
192 $id=$mrna if $mrna;
193 $comment="ID=$id;Name=$gene_name";
194 $comment.=";geneId=$gname" if $gname;
195 $comment.=";descr=\"$gdescr\"" if $gdescr;
196 my $numexons = @ex;
197 $comment.=";exonCount=$numexons;" if $numexons>0;
198 $comment.=";product=\"$product\"" if $product;
199 my $gffeat=$fname;
200 $gffeat='mRNA' if $fname eq 'CDS';
201
202 print join("\t",$bseq, 'gb2gff', $gffeat, $min, $max, '.',$strand,'.',$comment)."\n"
203 unless $fname eq 'CDS';
204 return if $fname eq 'gene';
205 $fname='exon' if $fname eq 'mRNA';
206 }
207 if ($fname eq 'exon' || $fname eq 'CDS') {
208 my $n=1;
209 #print "## CDS $gene info: ".join('; ',@_)."\n" if ($fname eq 'CDS');
210 my $frame=0; ## actually, this should be: codon_start - 1
211 if ($fname eq 'CDS') { # compute frame
212 if ($strand eq '-') { @ex=reverse(@ex); }
213 foreach my $e (@ex) {
214 $$e[2]=$frame;
215 $frame = ($frame + ($$e[1] - $$e[0] + 1)) % 3;
216 }
217 if ($strand eq '-') { @ex=reverse(@ex); }
218 }
219 foreach my $e (@ex) {
220 my ($start, $stop, $fr)=@$e;
221 print join("\t",$bseq, 'gb2gff', $fname, $start, $stop, '.', $strand, $fr,
222 "Parent=$id")."\n";
223 #$frame = ($fname eq 'CDS') ? ($frame + ($stop - $start + 1)) % 3 : '.';
224 $n++;
225 }
226 }
227 else { # other non-mRNA features
228 $comment=join(';',@_);
229 foreach my $e (@ex) {
230 print join("\t",$bseq, 'gb2gff', $fname, $$e[0], $$e[1], '.',$strand,'.',
231 "Parent=$id")."\n";
232 }
233 }
234 }
235
236 sub fastafmt { # (seqref, defline, linelen)
237 my $seqr=shift(@_);
238 my $defline=shift(@_);
239 my $linelen=shift(@_) || 60;;
240 my $rec;
241 $rec=">$defline\n" if $defline;
242 my $seqlen=length($$seqr);
243 my $pos=0;
244 while ($pos<$seqlen) {
245 $rec.= substr($$seqr,$pos,$linelen)."\n";
246 $pos+=$linelen;
247 }
248 return $rec;
249 }

Properties

Name Value
svn:executable *