ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gbrna2fa.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 7054 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 my $usage = q{
6 Script for parsing RefSeq mRNAs in GenBank File Format.
7 Usage:
8 gbrna2fa.pl [-P][-R] genbank_format_stream
9
10 Use -P option in order to exclude the PREDICTED mRNAs and gene models.
11 Use -R option to only write the REVIEWED and VALIDATED entries
12 };
13
14 umask 0002;
15 getopts('PR') || die($usage."\n");
16 my %refseqtype=(
17 'INFERRED'=> 'inf', # usually not yet supported by experimental evidence
18 'MODEL' => 'mod', #usually predicted by GNOMON
19 'PREDICTED' => 'p', # only CDS is predicted in most cases, the mRNA is experimentally confirmed
20 # (it becomes 'px' then)
21 'PROVISIONAL' => 'pv', #usually based on experimental evidence, but not "reviewed" by NCBI
22 'VALIDATED' => 'v', # preliminary reviewed, but did not go through "final review"
23 'REVIEWED' => 'r'
24 );
25
26 #my $outfile=$Getopt::Std::opt_g;
27 #my $fastaout=$Getopt::Std::opt_F;
28 my %typeflt=('mod'=>1,'p'=>1,'inf'=>1,'px'=>1,'pv'=>1, 'v'=>1, 'r'=>1, 'gb'=>1);
29
30 my $nopred=$Getopt::Std::opt_P;
31 my $onlyrev=$Getopt::Std::opt_R;
32
33 if ($nopred || $onlyrev) {
34 @typeflt{'mod', 'inf', 'p'}=(0,0,0);
35 }
36
37 if ($onlyrev) {
38 @typeflt{'pv','px'}=(0,0);
39 }
40
41 my %gnames;
42 #if ($fastaout) {
43 # open(FASTAOUT, '>'.$fastaout)|| die ("Error creating file $fastaout\n");
44 # }
45 #if ($outfile) {
46 # open(GFFOUT, '>'.$outfile) || die ("Error creating file $outfile\n");
47 # select(GFFOUT);
48 # }
49 my ($bseq, $fseq, $descr, $bGI, $bACC, $blen, $org, $comment, $cds, $cdsprod,
50 $geneid, $freading, $seqreading);
51 my $skipLocus=0;
52 while (<>) {
53 chomp;
54 NEXTLINE:
55 if (m/^LOCUS/) {
56 ($bseq, $blen)=(m/^LOCUS\s+(\S+)\s+(\d+)/);
57 die("Error parsing LOCUS tag info: $_\n") unless $blen;
58 if ($blen<20 || $blen>300000 || m/\s+DNA\s+linear\s+/) {
59 while(<>) {
60 chomp;
61 goto NEXTLINE if m/^LOCUS/;
62 }
63 }
64 }
65 elsif (m/^DEFINITION\s+(.+)/) {
66 $descr=$1;
67 while (<>) {
68 chomp;
69 if (m/^[A-Z]+\s+/) {
70 $descr=~tr/\t \n\r/ /s;
71 goto NEXTLINE;
72 }
73 $descr.=' '.$_;
74 }
75 $descr=~tr/\t \n\r/ /s;
76 next;
77 }
78 elsif (m/^ACCESSION\s+(\S+)/) {
79 $bACC=$1;
80 next;
81 }
82 elsif (m/^VERSION/) {
83 ($bACC, $bGI)=(m/([\w\.]+)\s+GI:(\d+)/);
84 die("Error parsing GI# from VERSION tag ($_)") unless $bGI>1;
85 next;
86 }
87 elsif (m/^SOURCE\s+(.+)/) {
88 $org=$1;
89 next;
90 }
91 elsif (m/^COMMENT\s+(.+)/) {
92 $comment=$1;
93 while (<>) {
94 chomp;
95 if (m/^\s*[A-Z]+\s+/) {
96 goto NEXTLINE;
97 }
98 $comment.=' '.$_;
99 }
100 next;
101 }
102 elsif (m/^FEATURES\s+/) {
103 $freading=1;
104 next;
105 }
106 # elsif (m/^BASE COUNT/) {
107 # $freading=0;
108 # next;
109 # }
110 elsif (m/^CONTIG\s+/) {
111 $freading=0;
112 $seqreading=0;
113 }
114 elsif (m/^ORIGIN\s+/) {
115 $freading=0;
116 $seqreading=1;
117 next;
118 }
119 elsif (m/^\/\/$/) {
120 $freading=0;
121 $seqreading=0;
122 my $defline=$bACC;
123 my ($ctype)=($comment=~m/^([A-Z]+)\s+/);
124 my $rtype;
125 if ($ctype && ($rtype=$refseqtype{$ctype})) {
126 $rtype.='x' if $comment=~m/is supported by experimental/;
127 #$defline=$rtype.'|'.$bACC;
128 }
129 else {
130 $rtype='gb';
131 }
132 $defline=$rtype.'|'.$bACC;
133 # non refseq entries will not have rtype set
134 if (length($fseq)>20 && $typeflt{$rtype}==1) {
135 $descr=~s/\,\s+mRNA\.$//;
136 $descr=~s/\.$//;
137 $descr=~s/\s*\(\Q$geneid\E\)// if $geneid;
138 $descr=~s/^$org\s+//;
139 if ($cdsprod) {
140 $cdsprod=~tr/\t \n\r/ /s;
141 if (!$descr || $descr=~m/hypothetical/i) {
142 $descr=$cdsprod;
143 }
144 else {
145 #print STDERR "{$descr}\n{$cdsprod}\n";
146 $descr.=' product:'.$cdsprod
147 unless (index(lc($descr), lc($cdsprod))>=0 || $cdsprod=~m/hypothetical protein/i);
148 }
149 }
150 $defline.=' gid:'.$geneid if $geneid;
151 $defline.=' '.$cds if $cds;
152 $defline.=' '.$descr;
153 $defline.=' {'.$org.'}';
154 $defline =~ tr/ \t/ /s;
155 print &fastafmt(\$fseq, $defline);
156 }
157 $fseq='';
158 $descr='';
159 $bseq='';
160 $geneid='';
161 $comment='';
162 $cds='';
163 $cdsprod='';
164 $comment='';
165 next;
166 }
167
168 #-- remaining lines parsing:
169
170 if ($freading) {
171 my ($featname, $frange)=(m/^\s+([\w\'\-]+)\s+(.+)/);
172 die("Error parsing $bACC feature at: $_\n") unless $featname && $frange;
173 if ($frange=~m/\(/ && $frange!~m/\)$/) { #wrapped composite intervals line
174 do {
175 $_=<>;
176 last unless $_;
177 chomp;
178 $frange.=$_;
179 } until (m/\)$/);
180 } #unwrapped the composite intervals (multi-exons)
181 if ($featname eq 'CDS') {
182 $cds.='|' if $cds;
183 $frange=~s/\.\./-/g;
184 $frange=~tr/ //d;
185 # for join() ranges or partial < >
186 $frange=~ s/^[^\d]*(\d+).+?(\d+)\)?$/$1-$2/;
187 $cds.='CDS:'.$frange;
188 }
189
190 my @attrs;
191 my $endattr;
192 #print STDERR "..reading attributes for $featname ($frange)\n";
193 while (<>) { #read all attributes for this feature
194 unless (m/^\s+\/\w+/) {
195 $endattr=1;
196 last;
197 }
198 chomp;
199 if (m/^\s+\/([\w\-]+)=(.+)/) {
200 my ($a, $av)=($1, $2);
201 my ($fattr, $fattr_dta)=&parseAttr($featname, $a,$av); #this will read the next line(s) if necessary
202 if ($fattr eq 'gene' && !$geneid) {
203 $geneid=$fattr_dta;
204 #$gname =~ tr/"//d; #"
205 next;
206 }
207 if ($featname eq 'CDS' && $fattr eq 'product') {
208 $cdsprod=$fattr_dta;
209 # if (!$descr || $descr=~m/hypothetical/i) {
210 # $descr=$fattr_dta;
211 # }
212 # else { $cdsprod=' product:'.$fattr_dta; }
213 next;
214 }
215
216 if ($fattr eq 'organism') {
217 $org=$fattr_dta;
218 #$org=~tr/"//d; #"
219 }
220 } # /attr=attrvalue parsing
221 # store all attributes and their values -- for the current feature only
222 #elsif ($fattr ne 'translation') {
223 # push(@attrs, $fattr.'='.$fattr_dta);
224 # }
225 } #attribute parsing loop
226 #&writeFeature($featname, $frange, $gname, @attrs);
227 goto NEXTLINE if $endattr;
228 next;
229 }# FEATURES section reading
230
231 if ($seqreading && m/^\s+\d+\s+(.+)/) {
232 my $s=$1;
233 $s=~tr/ \t\n//d;
234 $fseq.=uc($s);
235 }
236
237 }#while input lines
238
239 #close(FASTAOUT) if $fastaout;
240
241 sub parseAttr {
242 my ($fn, $a, $adta)=@_;
243 #print STDERR "[[ parseAttr($fn, $a, $adta) ]]\n";
244 $adta=~s/^\s+//;$adta=~s/\s+$//; #trim
245 #die("Invalid attribute string for $fn\:$a ($adta)\n") unless $adta=~m/^"/;
246 if ($adta=~m/^"/) {
247 while ($adta!~m/\"$/) {
248 $_=<>;
249 chomp;
250 tr/ \t/ /s; #remove extra spaces
251 $adta.=$_;
252 }
253 }
254 $adta=~s/^\"//;$adta=~s/\"$//; #remove quotes
255 $adta=~tr/\t \n\r/ /s;
256 return ($a, $adta);
257 }
258
259 sub fastafmt { # (seqref, defline, linelen)
260 my $seqr=shift(@_);
261 my $defline=shift(@_);
262 my $linelen=shift(@_) || 60;;
263 my $rec;
264 $rec=">$defline\n" if $defline;
265 my $seqlen=length($$seqr);
266 my $pos=0;
267 while ($pos<$seqlen) {
268 $rec.= substr($$seqr,$pos,$linelen)."\n";
269 $pos+=$linelen;
270 }
271 return $rec;
272 }

Properties

Name Value
svn:executable *