ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/genpept2fa.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 7609 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 my $usage = q{
6 Script for parsing GenBank protein records GenPept File Format.
7 Usage:
8 genpept2fa.pl [-P][-R] [-f <dbxref.lst>] genbank_format_stream
9
10 Use -P option in order to exclude the PREDICTED mRNAs and gene models.
11 Use -R option to only write the REVIEWED and VALIDATED entries
12 Use -f option to provide a list of identifiers and output
13 only those records matching any of the provided dbxref IDs
14 };
15
16 umask 0002;
17 getopts('PRf:') || die($usage."\n");
18 my %refseqtype=(
19 'INFERRED'=> 'inf', # usually not yet supported by experimental evidence
20 'MODEL' => 'mod', #usually predicted by GNOMON
21 'PREDICTED' => 'p', # only CDS is predicted in most cases, the mRNA is experimentally confirmed
22 # (it becomes 'px' then)
23 'PROVISIONAL' => 'pv', #usually based on experimental evidence, but not "reviewed" by NCBI
24 'VALIDATED' => 'v', # preliminary reviewed, but did not go through "final review"
25 'REVIEWED' => 'r'
26 );
27
28 #my $outfile=$Getopt::Std::opt_g;
29 #my $fastaout=$Getopt::Std::opt_F;
30 my %typeflt=('mod'=>1,'p'=>1,'inf'=>1,'px'=>1,'pv'=>1, 'v'=>1, 'r'=>1, 'gb'=>1);
31
32 my $nopred=$Getopt::Std::opt_P;
33 my $onlyrev=$Getopt::Std::opt_R;
34
35 if ($nopred || $onlyrev) {
36 @typeflt{'mod', 'inf', 'p'}=(0,0,0);
37 }
38
39 if ($onlyrev) {
40 @typeflt{'pv','px'}=(0,0);
41 }
42
43 my %gnames;
44 #if ($fastaout) {
45 # open(FASTAOUT, '>'.$fastaout)|| die ("Error creating file $fastaout\n");
46 # }
47 #if ($outfile) {
48 # open(GFFOUT, '>'.$outfile) || die ("Error creating file $outfile\n");
49 # select(GFFOUT);
50 # }
51 my ($bseq, $fseq, $descr, $bGI, $bACC, $blen, $org, $comment, $cds, $cdsprod,
52 $genesym, $geneID, $freading, $seqreading, $has_dbxref);
53
54 my $fxref=$Getopt::Std::opt_f;
55 my %xr;
56 if ($fxref) {
57 open(XREF, $fxref) || die ("Error opening file $fxref\n");
58 while (<XREF>) {
59 chomp;
60 next unless length($_)>0;
61 my ($v)=(m/(\S+)/);
62 $xr{$v}=1;
63 }
64 close(XREF);
65 }
66
67 while (<>) {
68 chomp;
69 NEXTLINE:
70 if (m/^LOCUS/) {
71 ($bseq, $blen)=(m/^LOCUS\s+(\S+)\s+(\d+)/);
72 die("Error parsing LOCUS tag info: $_\n") unless $blen>1;
73 }
74 elsif (m/^DEFINITION\s+(.+)/) {
75 $descr=$1;
76 while (<>) {
77 chomp;
78 if (m/^[A-Z]+\s+/) {
79 $descr=~tr/\t \n\r/ /s;
80 goto NEXTLINE;
81 }
82 $descr.=' '.$_;
83 }
84 $descr=~tr/\t \n\r/ /s;
85 next;
86 }
87 elsif (m/^ACCESSION\s+(\S+)/) {
88 $has_dbxref=$fxref ? 0 : 1;
89 $bACC=$1;
90 next;
91 }
92 elsif (m/^VERSION/) {
93 ($bACC, $bGI)=(m/([\w\.]+)\s+GI:(\d+)/);
94 die("Error parsing GI# from VERSION tag ($_)") unless $bGI>1;
95 next;
96 }
97 elsif (m/^SOURCE\s+(.+)/) {
98 $org=$1;
99 next;
100 }
101 elsif (m/^COMMENT\s+(.+)/) {
102 $comment=$1;
103 while (<>) {
104 chomp;
105 if (m/^\s*[A-Z]+\s+/) {
106 goto NEXTLINE;
107 }
108 $comment.=' '.$_;
109 }
110 next;
111 }
112 elsif (m/^FEATURES\s+/) {
113 $freading=1;
114 next;
115 }
116 # elsif (m/^BASE COUNT/) {
117 # $freading=0;
118 # next;
119 # }
120 elsif (m/^CONTIG\s+/) {
121 $freading=0;
122 $seqreading=0;
123 }
124 elsif (m/^ORIGIN\s+/) {
125 $freading=0;
126 $seqreading=1;
127 next;
128 }
129 elsif (m/^\/\/$/) {
130 $freading=0;
131 $seqreading=0;
132 my $defline=$bACC;
133 my ($ctype)=($comment=~m/^([A-Z]+)\s+/);
134 my $rtype;
135 if ($ctype && ($rtype=$refseqtype{$ctype})) {
136 $rtype.='x' if $comment=~m/is supported by experimental/;
137 #$defline=$rtype.'|'.$bACC;
138 }
139 else {
140 $rtype='gb';
141 }
142 $defline=$rtype.'|'.$bACC;
143 # non-refseq entries will have rtype set to
144 # so they will NOT be shown
145 my $noskip=(length($fseq)>9 && $typeflt{$rtype}==1 && $has_dbxref);
146 if ($noskip) {
147 $descr=~s/\,\s+mRNA\.$//;
148 $descr=~s/ \[[\w\,\. ]+\]\.$//;
149 $descr=~s/\.$//;
150 $descr=~s/\s*\(\Q$genesym\E\)// if $genesym;
151 $descr=~s/^$org\s+//;
152 if ($cdsprod) {
153 $cdsprod=~tr/\t \n\r/ /s;
154 if (!$descr || $descr=~m/hypothetical/i) {
155 $descr=$cdsprod;
156 }
157 else {
158 #print STDERR "{$descr}\n{$cdsprod}\n";
159 $descr.=' product:'.$cdsprod
160 unless (index(lc($descr), lc($cdsprod))>=0 || $cdsprod=~m/hypothetical protein/i);
161 }
162 }
163 $defline.=' gid:'.$genesym if $genesym;
164 $defline.=' '.$geneID if $geneID;
165 $defline.=' '.$cds if $cds;
166 $defline.=' '.$descr;
167 $defline.=' {'.$org.'}';
168 $defline =~ tr/ \t/ /s;
169 print &fastafmt(\$fseq, $defline);
170 }
171 $fseq='';
172 $descr='';
173 $bseq='';
174 $genesym='';
175 $geneID='';
176 $has_dbxref=$fxref ? 0 : 1;
177 $comment='';
178 $cds='';
179 $cdsprod='';
180 $comment='';
181 next;
182 }
183
184 #-- remaining lines parsing:
185
186 if ($freading) {
187 my ($featname, $frange)=(m/^\s+([\w\'\-]+)\s+(.+)/);
188 die("Error parsing $bACC feature at: $_\n") unless $featname && $frange;
189 if ($frange=~m/\(/ && $frange!~m/\)$/) { #wrapped composite intervals line
190 do {
191 $_=<>;
192 last unless $_;
193 chomp;
194 $frange.=$_;
195 } until (m/\)$/);
196 } #unwrapped the composite intervals (multi-exons)
197 #if ($featname eq 'CDS') {
198 # $cds.='|' if $cds;
199 # $frange=~s/\.\./-/g;
200 # $cds.='CDS:'.$frange;
201 # }
202
203 my @attrs;
204 my $endattr;
205 #print STDERR "..reading attributes for $featname ($frange)\n";
206 while (<>) { #read all attributes for this feature
207 unless (m/^\s+\/\w+/) {
208 $endattr=1;
209 last;
210 }
211 chomp;
212 if (m/^\s+\/([\w\-]+)=(.+)/) { # /attr=value
213 my ($a, $av)=($1, $2);
214 my ($fattr, $fattr_dta)=&parseAttr($featname, $a,$av); #this will read the next line(s) if necessary
215 if ($fattr eq 'gene' && !$genesym) {
216 $genesym=$fattr_dta;
217 #$gname =~ tr/"//d; #"
218 next;
219 }
220 if ($featname eq 'Protein' && $fattr eq 'product') {
221 $cdsprod=$fattr_dta;
222 # if (!$descr || $descr=~m/hypothetical/i) {
223 # $descr=$fattr_dta;
224 # }
225 # else { $cdsprod=' product:'.$fattr_dta; }
226 next;
227 }
228 if ($has_dbxref==0 && $fattr eq 'db_xref') {
229 if ($has_dbxref=exists($xr{$fattr_dta})) {
230 $geneID=$fattr_dta;
231 }
232 next;
233 }
234 if ($fattr eq 'organism') {
235 $org=$fattr_dta;
236 #$org=~tr/"//d; #"
237 }
238 } # /attr=attrvalue parsing
239 # store all attributes and their values -- for the current feature only
240 #elsif ($fattr ne 'translation') {
241 # push(@attrs, $fattr.'='.$fattr_dta);
242 # }
243 } #attribute parsing loop
244 #&writeFeature($featname, $frange, $gname, @attrs);
245 goto NEXTLINE if $endattr;
246 next;
247 }# FEATURES section reading
248
249 if ($seqreading && m/^\s+\d+\s+(.+)/) {
250 my $s=$1;
251 $s=~tr/ \t\n//d;
252 $fseq.=uc($s);
253 }
254
255 }#while input lines
256
257 #close(FASTAOUT) if $fastaout;
258
259
260 sub parseAttr {
261 my ($fn, $a, $adta)=@_;
262 #print STDERR "[[ parseAttr($fn, $a, $adta) ]]\n";
263 $adta=~s/^\s+//;$adta=~s/\s+$//; #trim
264 #die("Invalid attribute string for $fn\:$a ($adta)\n") unless $adta=~m/^"/;
265 if ($adta=~m/^"/) {
266 while ($adta!~m/\"$/) {
267 $_=<>;
268 chomp;
269 tr/ \t/ /s; #remove extra spaces
270 $adta.=$_;
271 }
272 }
273 $adta=~s/^\"//;$adta=~s/\"$//; #remove quotes
274 $adta=~tr/\t \n\r/ /s;
275 return ($a, $adta);
276 }
277
278 sub fastafmt { # (seqref, defline, linelen)
279 my $seqr=shift(@_);
280 my $defline=shift(@_);
281 my $linelen=shift(@_) || 60;;
282 my $rec;
283 $rec=">$defline\n" if $defline;
284 my $seqlen=length($$seqr);
285 my $pos=0;
286 while ($pos<$seqlen) {
287 $rec.= substr($$seqr,$pos,$linelen)."\n";
288 $pos+=$linelen;
289 }
290 return $rec;
291 }

Properties

Name Value
svn:executable *