ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/uniprot2bcp.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 7236 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 #use LWP::Simple;
5 use FindBin;use lib $FindBin::Bin;
6 use dbSession;
7 my $usage = qq/Usage:
8 uniprot2bcp.pl [-t <taxons.tab>] [-F] uniref100_clusters.pfa [<uniprot_data_file_or_stream>]
9
10 Parses the UNIPROT data into bcp files for loading into geanno db.
11
12 Also creates a uniprot fasta file (uniprot.fa) with all the
13 uniprot entries found in the input data\/stream
14
15 If no database connection is possible, use -t option to provide a file
16 with taxon information; <taxons.tab> can be built with
17 pullsql -b common\@CBCBMYSQL -o taxons.tab
18 .. with the query:
19 select tax_id, name, c_name from taxon
20
21 If -F option is given, ONLY the fasta file will be created.
22 /;
23 umask 0002;
24 getopts('t:F') || die($usage."\n");
25 my $fastaOnly=$Getopt::Std::opt_F;
26 my $ftaxon=$Getopt::Std::opt_t;
27 my @tax;
28 if ($ftaxon) {
29 open(TAX, $ftaxon)||die("Error opening $ftaxon\n");
30 while (<TAX>) {
31 chomp;
32 my @t=split(/\t/);
33 $tax[int($t[0])]=[$t[1],$t[2]];
34 }
35 close(TAX);
36 }
37
38 #create the bcp files for loading
39
40 my $funiref=shift(@ARGV) || die("$usage\n");
41
42 my $ds;
43 unless ($ftaxon) {
44 $ds=dbSession->new('common@CBCBMYSQL');
45 }
46
47 my %uniref; # uniprot_id => uniref_cluster_id
48 open(UNIREF, $funiref) || die("Error opening $funiref file!\n");
49 while (<UNIREF>) {
50 chomp;
51 s/^>//;
52 my @ids=split;
53 unless (@ids>1) {
54 print STDERR "WARNING: cluster: '$_' has no members!\n";
55 next;
56 }
57 my $clname=shift(@ids);
58 foreach my $upid (@ids) {
59 $uniref{$upid}=$clname;
60 }
61 }
62 close(UNIREF);
63 print STDERR "UniRef clusters loaded, now parsing the main uniprot KB data..\n";
64
65 open(UPFA, '>uniprot.fa') || die ("Error creating file uniprot.fa!\n");
66 unless ($fastaOnly) {
67 open(UNIPROT, '>uniprot.bcp') || die ("Error creating file uniprot.bcp!\n");
68 open(UNAMES, '>uniprot_names.bcp') || die ("Error creating file uniprot_names.bcp!\n");
69 open(UXREF, '>uniprot_xref.bcp') || die ("Error creating file uniprot_xref.bcp!\n");
70 }
71
72 #open(FGO, $file) || die ("Error opening $file\n");
73 my $sth;
74 unless ($ftaxon) {
75 $sth=$ds->prep('select name, c_name from taxon where tax_id=?');
76 }
77 my $c_id; #current uniprot ID
78 my $c_taxon;
79 my $c_rev=0;
80 my $c_len;
81 my $c_priacc;
82 my $c_descr;
83 my $c_gene;
84 my $c_uniref;
85 my $c_htaxon;
86 my $seq;
87 my $inSeq=0;
88 my $xcode=0;
89 my ($c_orgname, $c_comname);
90 while (<>) {
91 if (m{^//}) {
92 writeUP() if $c_id;
93 next;
94 }
95 if ($inSeq) {
96 #within SQ record
97 chomp;
98 tr/\t //d;
99 $seq.=$_;
100 next;
101 }
102 if (m/^SQ\s+SEQUENCE\s+(\d+)/) {
103 my $seqlen=$1;
104 die("Error: invalid length for $c_id ($c_len vs $seqlen)!\n") if ($seqlen != $c_len);
105 $inSeq=1;
106 next;
107 }
108 if (m/^ID\s+(\S+)\s+(\S+)\s+(\d+)/) {
109 my ($id, $rev, $len)=($1,$2,$3);
110 writeUP() if $c_id;
111 ($c_id, $c_len)=($id, $len);
112 $c_rev=($rev=~/^Reviewed/)?'1':'0';
113 next;
114 }# ID
115 if (m/^AC\s+(\S.+)/) {
116 my $a=$1;
117 chomp;
118 my @accs=split(/;\s*/, $a);
119 $c_priacc=$accs[0] unless $c_priacc;
120 unless ($fastaOnly) {
121 foreach my $acc (@accs) {
122 next unless $acc;
123 print UXREF join("\t",$c_id, 'UP_ACC', '', $acc, '')."\n";
124 }
125 }
126 next;
127 }# AC
128 if (m/^DE\s+(\S.+)/) {
129 my $d=$1;
130 $c_descr.=' '.$d;
131 next;
132 }
133 if (m/^PE\s+(\d+)\:/) {
134 $xcode=$1;
135 next;
136 }
137 if (m/^GN\s+(\S.+)/) {
138 my $d=$1;
139 my ($gname)=($d=~m/\bName\s*=\s*([\w\-\.]+)/);
140 if ($gname) {
141 $c_gene=$gname unless $c_gene;
142 unless ($fastaOnly) {
143 print UXREF join("\t", $c_id, 'UP_GENE', '', $gname, '')."\n";
144 my ($gsyns)=($d=~m/\bSynonyms\s*=\s*(\S[\w\-\,\.\s]+)\;/);
145 if ($gsyns) {
146 my @gsyn=split(/\,\s*/,$gsyns);
147 foreach my $gs (@gsyn) {
148 print UNAMES join("\t",$gname, 'GSYN', $gs)."\n";
149 }
150 } #synonyms found
151 }
152 }#a gname found
153 next;
154 }#GN
155 if (s/^OS\s+//) {
156 ($c_orgname)=(m/^([\w\. \-]+)/);
157 $c_orgname=~s/\s+$//;
158 $c_orgname=~s/\.$//;
159 ($c_comname)=(m/\(([\w\. \-]+)/);
160 next;
161 }
162 if (m/^OX\s+NCBI_TaxID\s*=\s*(\d+)/) {
163 my $t=$1;
164 $c_taxon=$t unless $c_taxon;
165 print UXREF join("\t", $c_id, 'UP_Taxon', $t, '', '')."\n"
166 unless $fastaOnly; #taxon
167 next;
168 }#OX (Taxon)
169
170 if (m/^OH\s+NCBI_TaxID\s*=\s*(\d+)/) {
171 my $t=$1;
172 $c_htaxon=$t unless $c_htaxon;
173 print UXREF join("\t", $c_id, 'UP_Host', $t, '', '')."\n"
174 unless $fastaOnly; #host taxon
175 next;
176 } #OH (Host taxon)
177
178 if (m/^DR\s+(\S.+)/) { # db xref
179 my $d=$1;
180 $d=~s/\-?\.\s*$//;
181 $d=~s/\;\s*$//;
182 my @xd=split(/\;\s*/,$d);
183 @xd=grep { !/^\-/ } @xd;
184 my $xdb=shift(@xd);
185 if ($xdb eq 'GO') {
186 my $goid=shift(@xd);
187 my ($gonum)=($goid=~/GO:(\d+)/);
188 $gonum=int($gonum);
189 die("Error parsing GO ID from DR line: $_\n") unless $gonum>0;
190 my $gobranch=shift(@xd);
191 my $go_ann_type=shift(@xd);
192 print UXREF join("\t", $c_id, $xdb, $gonum, $goid, $go_ann_type.' '.$gobranch)."\n"
193 unless $fastaOnly;
194 }
195 else {
196 my $acc=shift(@xd);
197 my $data=join(' ', @xd);
198 print UXREF join("\t", $c_id, $xdb, '', $acc, $data)."\n" unless $fastaOnly;
199 }
200 next;
201 } #DR
202 } #while
203
204 writeUP() if $c_id;
205 unless ($fastaOnly) {
206 close(UNIPROT);
207 close(UNAMES);
208 close(UXREF);
209 }
210 close(UPFA);
211 unless ($ftaxon) {
212 $ds->logout();
213 }
214 #-- now run bcpin
215
216 #my $bcpcmd='bcpin -TI -b geanno@NEOSYBASE';
217 #system($bcpcmd. ' uniprot.bcp uniprot_names.bcp uniprot_xref')
218 # && die("Error at bcpin!\n");
219
220 sub writeUP {
221 # c_descr now can contain strange RecName: Full=...; and AltName: Full=...;
222 # keep only the RecName Full, remove anything else
223 if ($c_descr=~m/Name:\s+Full=\s*([^\;]+)/) { # RecName or SubName
224 $c_descr=$1;
225 }
226 # elsif ($c_descr=~m/SubName:\s+Full=\s*([^\;]+)/) {
227 # $c_descr=$1;
228 # }
229
230 prepFields($c_descr);
231 # -- c_descr can probably be insanely long.. truncate it here to the max field length
232 $c_uniref=$uniref{$c_id} || $uniref{$c_priacc};
233 unless ($fastaOnly) {
234 print UNIPROT join("\t",
235 $c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)."\n";
236 }
237 my $upre=($c_rev==1)?'UPr|':'UP|';
238 #$upre.='e' if $xcode<3;
239 my $defline='';
240 $defline.='|gid|'.$c_gene if $c_gene;
241 $defline.=' '.$c_uniref if $c_uniref;
242 $defline.=' '.$c_descr;
243 my ($oname, $coname);
244 if ($ftaxon) {
245 my $td=$tax[$c_taxon];
246 if ($td) {
247 ($oname, $coname)=@$td;
248 }
249 #else {
250 # print STDERR "Warning: no data for taxon $c_taxon".
251 # " was loaded ($c_id|acc|$c_priacc $defline)!\n";
252 # }
253 }
254 else {
255 $ds->exec($sth, $c_taxon);
256 while (my $rr=$ds->fetch($sth)) { ($oname, $coname)=@$rr; };
257 }
258 unless ($oname) {
259 print STDERR "Warning: no data for taxon $c_taxon".
260 " ($c_id|acc|$c_priacc $defline) - using '$c_orgname' instead.\n";
261 $oname=$c_orgname;
262 }
263 $defline.=' {'.$oname.'}';
264 $defline=~tr/;"\t/.' /; #"
265 print UPFA '>'.$upre.$c_id.'|acc|'.$c_priacc.$defline."\n";
266 print UPFA fastafmt(\$seq);
267 $inSeq=0;
268 ($c_id, $c_taxon, $c_rev, $c_len, $c_priacc, $c_descr, $c_gene, $xcode, $c_uniref, $c_htaxon)=
269 (undef, undef, 0, undef, undef, undef, undef, 0, undef, undef );
270 ($c_orgname, $c_comname)=(undef,undef);
271 $seq='';
272 }
273
274 sub fastafmt {
275 my $s=shift;
276 my $slen=length($$s);
277 my @lines=unpack("A60" x (int(($slen-1)/60)+1),$$s);
278 return join("\n",@lines)."\n";
279 }
280
281 sub prepFields {
282 foreach (@_) {
283 s/^\s+//;
284 s/\s+$//;
285 s/\.$//;
286 tr/\t \n/ /s;
287 }
288 }

Properties

Name Value
svn:executable *