ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gbrna2fa.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 7054 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5     my $usage = q{
6     Script for parsing RefSeq mRNAs in GenBank File Format.
7     Usage:
8     gbrna2fa.pl [-P][-R] genbank_format_stream
9    
10     Use -P option in order to exclude the PREDICTED mRNAs and gene models.
11     Use -R option to only write the REVIEWED and VALIDATED entries
12     };
13    
14     umask 0002;
15     getopts('PR') || die($usage."\n");
16     my %refseqtype=(
17     'INFERRED'=> 'inf', # usually not yet supported by experimental evidence
18     'MODEL' => 'mod', #usually predicted by GNOMON
19     'PREDICTED' => 'p', # only CDS is predicted in most cases, the mRNA is experimentally confirmed
20     # (it becomes 'px' then)
21     'PROVISIONAL' => 'pv', #usually based on experimental evidence, but not "reviewed" by NCBI
22     'VALIDATED' => 'v', # preliminary reviewed, but did not go through "final review"
23     'REVIEWED' => 'r'
24     );
25    
26     #my $outfile=$Getopt::Std::opt_g;
27     #my $fastaout=$Getopt::Std::opt_F;
28     my %typeflt=('mod'=>1,'p'=>1,'inf'=>1,'px'=>1,'pv'=>1, 'v'=>1, 'r'=>1, 'gb'=>1);
29    
30     my $nopred=$Getopt::Std::opt_P;
31     my $onlyrev=$Getopt::Std::opt_R;
32    
33     if ($nopred || $onlyrev) {
34     @typeflt{'mod', 'inf', 'p'}=(0,0,0);
35     }
36    
37     if ($onlyrev) {
38     @typeflt{'pv','px'}=(0,0);
39     }
40    
41     my %gnames;
42     #if ($fastaout) {
43     # open(FASTAOUT, '>'.$fastaout)|| die ("Error creating file $fastaout\n");
44     # }
45     #if ($outfile) {
46     # open(GFFOUT, '>'.$outfile) || die ("Error creating file $outfile\n");
47     # select(GFFOUT);
48     # }
49     my ($bseq, $fseq, $descr, $bGI, $bACC, $blen, $org, $comment, $cds, $cdsprod,
50     $geneid, $freading, $seqreading);
51     my $skipLocus=0;
52     while (<>) {
53     chomp;
54     NEXTLINE:
55     if (m/^LOCUS/) {
56     ($bseq, $blen)=(m/^LOCUS\s+(\S+)\s+(\d+)/);
57     die("Error parsing LOCUS tag info: $_\n") unless $blen;
58     if ($blen<20 || $blen>300000 || m/\s+DNA\s+linear\s+/) {
59     while(<>) {
60     chomp;
61     goto NEXTLINE if m/^LOCUS/;
62     }
63     }
64     }
65     elsif (m/^DEFINITION\s+(.+)/) {
66     $descr=$1;
67     while (<>) {
68     chomp;
69     if (m/^[A-Z]+\s+/) {
70     $descr=~tr/\t \n\r/ /s;
71     goto NEXTLINE;
72     }
73     $descr.=' '.$_;
74     }
75     $descr=~tr/\t \n\r/ /s;
76     next;
77     }
78     elsif (m/^ACCESSION\s+(\S+)/) {
79     $bACC=$1;
80     next;
81     }
82     elsif (m/^VERSION/) {
83     ($bACC, $bGI)=(m/([\w\.]+)\s+GI:(\d+)/);
84     die("Error parsing GI# from VERSION tag ($_)") unless $bGI>1;
85     next;
86     }
87     elsif (m/^SOURCE\s+(.+)/) {
88     $org=$1;
89     next;
90     }
91     elsif (m/^COMMENT\s+(.+)/) {
92     $comment=$1;
93     while (<>) {
94     chomp;
95     if (m/^\s*[A-Z]+\s+/) {
96     goto NEXTLINE;
97     }
98     $comment.=' '.$_;
99     }
100     next;
101     }
102     elsif (m/^FEATURES\s+/) {
103     $freading=1;
104     next;
105     }
106     # elsif (m/^BASE COUNT/) {
107     # $freading=0;
108     # next;
109     # }
110     elsif (m/^CONTIG\s+/) {
111     $freading=0;
112     $seqreading=0;
113     }
114     elsif (m/^ORIGIN\s+/) {
115     $freading=0;
116     $seqreading=1;
117     next;
118     }
119     elsif (m/^\/\/$/) {
120     $freading=0;
121     $seqreading=0;
122     my $defline=$bACC;
123     my ($ctype)=($comment=~m/^([A-Z]+)\s+/);
124     my $rtype;
125     if ($ctype && ($rtype=$refseqtype{$ctype})) {
126     $rtype.='x' if $comment=~m/is supported by experimental/;
127     #$defline=$rtype.'|'.$bACC;
128     }
129     else {
130     $rtype='gb';
131     }
132     $defline=$rtype.'|'.$bACC;
133     # non refseq entries will not have rtype set
134     if (length($fseq)>20 && $typeflt{$rtype}==1) {
135     $descr=~s/\,\s+mRNA\.$//;
136     $descr=~s/\.$//;
137     $descr=~s/\s*\(\Q$geneid\E\)// if $geneid;
138     $descr=~s/^$org\s+//;
139     if ($cdsprod) {
140     $cdsprod=~tr/\t \n\r/ /s;
141     if (!$descr || $descr=~m/hypothetical/i) {
142     $descr=$cdsprod;
143     }
144     else {
145     #print STDERR "{$descr}\n{$cdsprod}\n";
146     $descr.=' product:'.$cdsprod
147     unless (index(lc($descr), lc($cdsprod))>=0 || $cdsprod=~m/hypothetical protein/i);
148     }
149     }
150     $defline.=' gid:'.$geneid if $geneid;
151     $defline.=' '.$cds if $cds;
152     $defline.=' '.$descr;
153     $defline.=' {'.$org.'}';
154     $defline =~ tr/ \t/ /s;
155     print &fastafmt(\$fseq, $defline);
156     }
157     $fseq='';
158     $descr='';
159     $bseq='';
160     $geneid='';
161     $comment='';
162     $cds='';
163     $cdsprod='';
164     $comment='';
165     next;
166     }
167    
168     #-- remaining lines parsing:
169    
170     if ($freading) {
171     my ($featname, $frange)=(m/^\s+([\w\'\-]+)\s+(.+)/);
172     die("Error parsing $bACC feature at: $_\n") unless $featname && $frange;
173     if ($frange=~m/\(/ && $frange!~m/\)$/) { #wrapped composite intervals line
174     do {
175     $_=<>;
176     last unless $_;
177     chomp;
178     $frange.=$_;
179     } until (m/\)$/);
180     } #unwrapped the composite intervals (multi-exons)
181     if ($featname eq 'CDS') {
182     $cds.='|' if $cds;
183     $frange=~s/\.\./-/g;
184     $frange=~tr/ //d;
185     # for join() ranges or partial < >
186     $frange=~ s/^[^\d]*(\d+).+?(\d+)\)?$/$1-$2/;
187     $cds.='CDS:'.$frange;
188     }
189    
190     my @attrs;
191     my $endattr;
192     #print STDERR "..reading attributes for $featname ($frange)\n";
193     while (<>) { #read all attributes for this feature
194     unless (m/^\s+\/\w+/) {
195     $endattr=1;
196     last;
197     }
198     chomp;
199     if (m/^\s+\/([\w\-]+)=(.+)/) {
200     my ($a, $av)=($1, $2);
201     my ($fattr, $fattr_dta)=&parseAttr($featname, $a,$av); #this will read the next line(s) if necessary
202     if ($fattr eq 'gene' && !$geneid) {
203     $geneid=$fattr_dta;
204     #$gname =~ tr/"//d; #"
205     next;
206     }
207     if ($featname eq 'CDS' && $fattr eq 'product') {
208     $cdsprod=$fattr_dta;
209     # if (!$descr || $descr=~m/hypothetical/i) {
210     # $descr=$fattr_dta;
211     # }
212     # else { $cdsprod=' product:'.$fattr_dta; }
213     next;
214     }
215    
216     if ($fattr eq 'organism') {
217     $org=$fattr_dta;
218     #$org=~tr/"//d; #"
219     }
220     } # /attr=attrvalue parsing
221     # store all attributes and their values -- for the current feature only
222     #elsif ($fattr ne 'translation') {
223     # push(@attrs, $fattr.'='.$fattr_dta);
224     # }
225     } #attribute parsing loop
226     #&writeFeature($featname, $frange, $gname, @attrs);
227     goto NEXTLINE if $endattr;
228     next;
229     }# FEATURES section reading
230    
231     if ($seqreading && m/^\s+\d+\s+(.+)/) {
232     my $s=$1;
233     $s=~tr/ \t\n//d;
234     $fseq.=uc($s);
235     }
236    
237     }#while input lines
238    
239     #close(FASTAOUT) if $fastaout;
240    
241     sub parseAttr {
242     my ($fn, $a, $adta)=@_;
243     #print STDERR "[[ parseAttr($fn, $a, $adta) ]]\n";
244     $adta=~s/^\s+//;$adta=~s/\s+$//; #trim
245     #die("Invalid attribute string for $fn\:$a ($adta)\n") unless $adta=~m/^"/;
246     if ($adta=~m/^"/) {
247     while ($adta!~m/\"$/) {
248     $_=<>;
249     chomp;
250     tr/ \t/ /s; #remove extra spaces
251     $adta.=$_;
252     }
253     }
254     $adta=~s/^\"//;$adta=~s/\"$//; #remove quotes
255     $adta=~tr/\t \n\r/ /s;
256     return ($a, $adta);
257     }
258    
259     sub fastafmt { # (seqref, defline, linelen)
260     my $seqr=shift(@_);
261     my $defline=shift(@_);
262     my $linelen=shift(@_) || 60;;
263     my $rec;
264     $rec=">$defline\n" if $defline;
265     my $seqlen=length($$seqr);
266     my $pos=0;
267     while ($pos<$seqlen) {
268     $rec.= substr($$seqr,$pos,$linelen)."\n";
269     $pos+=$linelen;
270     }
271     return $rec;
272     }

Properties

Name Value
svn:executable *