#!/bin/sh
# Script to update a local uniprot copy with sequence files and generate
# blast dbs for them (formatdb).
#
# Usage: <script> <base local dir>
#   <base local dir>  existing directory under which the download working
#                     directory, the per-release directories and the
#                     "current" symlink are kept.

# The base directory argument is mandatory.
if [ -z "$1" ]; then
  echo "Usage: $0 <base local dir>" >&2
  exit 1
fi

# Fail early with a clear message instead of letting every later step
# (mkdir, curl redirections, mv, ...) fail one after the other.
if [ ! -d "$1" ]; then
  echo "Base local dir '$1' does not exist or is not a directory" >&2
  exit 1
fi

# Base directory given on the command line. It is turned into an absolute
# path because the script changes its working directory later on; a relative
# path would then point to the wrong place (log file, mv, symlink).
case $1 in
  /*) LOCALDIR=$1 ;;
  *)  LOCALDIR=$(pwd)/$1 ;;
esac
# Symlink pointing to the release currently in use.
CURRENT="$LOCALDIR/current"
# Working directory where a new release is assembled.
DOWNLOAD="$LOCALDIR/download"

#SITE="ftp://ftp.uniprot.org/pub" # US main ftp
SITE="ftp://ftp.ebi.ac.uk/pub" # UK mirror
# the swiss mirror doesn't seem to update properly, not using it anymore
#SITE="ftp://ftp.expasy.org" # swiss mirror

# Directory (relative to SITE) of the complete knowledgebase files.
COMPLETEKBDIR="databases/uniprot/current_release/knowledgebase/complete"

# Full URL of the SIFTS PDB chain to UniProt mapping file.
SIFTSPDB2UNIPROTFTP="ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/text/pdb_chain_uniprot.lst"

# Local file names.
SPROT="uniprot_sprot.fasta"
SPROTGZ="${SPROT}.gz"
TREMBL="uniprot_trembl.fasta"
TREMBLGZ="${TREMBL}.gz"
ALL="uniprot_all.fasta"
RELDATEFILE="reldate.txt"
SIFTSPDB2UNIPROT="pdb_chain_uniprot.lst"

# Remote URLs of the files to fetch.
sproturl="$SITE/$COMPLETEKBDIR/$SPROTGZ"
tremblurl="$SITE/$COMPLETEKBDIR/$TREMBLGZ"
reldateurl="$SITE/$COMPLETEKBDIR/$RELDATEFILE"

# Remove any download directory left over from a previous run.
# ${DOWNLOAD:?} aborts the script if the variable is empty or unset, so that
# rm -rf can never be run with an empty path.
rm -rf "${DOWNLOAD:?}"

# Create a fresh download dir; nothing below can work without it.
if ! mkdir "$DOWNLOAD"; then
  echo "Could not create download directory $DOWNLOAD. Exiting." >&2
  exit 1
fi

# Get the release date file, but only if the remote one is newer than the
# copy in the current release (curl -z time condition).
release=""
if ! curl -z "$CURRENT/$RELDATEFILE" "$reldateurl" > "$DOWNLOAD/$RELDATEFILE"; then
  # A failed transfer also leaves an empty file; report it as an error
  # instead of mistaking it for "no new release".
  echo "Could not fetch $reldateurl. Exiting." >&2
  rm -rf "${DOWNLOAD:?}"
  exit 1
fi

if [ -s "$DOWNLOAD/$RELDATEFILE" ]; then
  # The first line is expected to look like
  # "UniProt Knowledgebase Release YYYY_MM ..."; sed -n .../p prints the
  # release token only when that pattern actually matches.
  release=$(head -n 1 "$DOWNLOAD/$RELDATEFILE" \
    | sed -n 's/UniProt Knowledgebase Release \(...._..\).*/\1/p')

  # The release string becomes part of a directory name later on, so refuse
  # anything that is empty or contains unexpected characters.
  case $release in
    '' | *[!A-Za-z0-9_]*)
      echo "Could not parse the release name from $DOWNLOAD/$RELDATEFILE. Exiting." >&2
      exit 1
      ;;
  esac
  echo "New uniprot release $release available. Downloading files."
else
  # An empty file means the remote file is not newer than the local one.
  echo "No new uniprot release available. Exiting"
  rm -rf "${DOWNLOAD:?}"
  exit 0
fi

# fetch_if_newer NAME URL LOCALCOPY DEST
# Downloads URL into DEST if the remote file is newer than LOCALCOPY
# (curl -z time condition). Exits the script with status 1 when the transfer
# fails or when nothing was downloaded: at this point a new release has been
# announced, so an unchanged remote file means something is wrong.
fetch_if_newer() {
  if ! curl -z "$3" "$2" > "$4"; then
    echo "Download of $2 failed. Exiting." >&2
    exit 1
  fi

  if [ -s "$4" ]; then
    echo "New $1 version downloaded"
  else
    echo "Remote $1 file not newer than local one. Something wrong. Exiting." >&2
    exit 1
  fi
}

# download if newer available
fetch_if_newer trembl "$tremblurl" "$CURRENT/$TREMBL" "$DOWNLOAD/${TREMBL}.gz"
fetch_if_newer sprot "$sproturl" "$CURRENT/$SPROT" "$DOWNLOAD/${SPROT}.gz"

# Get the SIFTS PDB to UNIPROT mapping file (always downloaded, no time
# condition). Abort if the transfer fails or yields an empty file, so that a
# release with a missing mapping is never put in place.
if ! curl "$SIFTSPDB2UNIPROTFTP" > "$DOWNLOAD/$SIFTSPDB2UNIPROT" \
  || [ ! -s "$DOWNLOAD/$SIFTSPDB2UNIPROT" ]; then
  echo "Download of $SIFTSPDB2UNIPROTFTP failed. Exiting." >&2
  exit 1
fi

# Uncompress both downloads in place (-f overwrites an existing target).
# A corrupt archive must stop the update before blast dbs are built from it.
for gz in "$DOWNLOAD/${SPROT}.gz" "$DOWNLOAD/${TREMBL}.gz"; do
  if ! gzip -df "$gz"; then
    echo "Could not uncompress $gz. Exiting." >&2
    exit 1
  fi
done

# Create the "all" file: trembl followed by sprot.
if ! cat "$DOWNLOAD/$TREMBL" "$DOWNLOAD/$SPROT" > "$DOWNLOAD/$ALL"; then
  echo "Could not create $DOWNLOAD/$ALL. Exiting." >&2
  exit 1
fi

# run formatdb
# formatdb appends the path used to run it to the .pal index file, so an
# absolute path would effectively be hard-coded there and make the directory
# not movable. That's why formatdb is run from inside the DOWNLOAD dir with
# bare file names: no hard-coded paths end up in the .pal file.

echo "Running formatdb..."

# formatdb log file; the name is relative because formatdb is started from
# inside the DOWNLOAD dir, so the log ends up in $DOWNLOAD/formatdb.log.
logfile="formatdb.log"

# Run in a subshell so the directory change stays local to this step, and
# stop at the first failure so a half-built database is never published.
(
  cd "$DOWNLOAD" || exit 1
  for fasta in "$SPROT" "$TREMBL" "$ALL"; do
    formatdb -p T -o T -l "$logfile" -i "$fasta" || exit 1
  done
) || {
  echo "formatdb failed, see $DOWNLOAD/$logfile. Exiting." >&2
  exit 1
}

# Rename the DOWNLOAD dir to the uniprot version and update the "current"
# symlink to point at it.
echo "Creating new symlink..."

# Without a release name the target would be a bare "uniprot_" directory.
if [ -z "$release" ]; then
  echo "Release name is empty, not publishing $DOWNLOAD. Exiting." >&2
  exit 1
fi

newdir="$LOCALDIR/uniprot_$release"

# mv onto an existing directory would move DOWNLOAD *inside* it instead of
# renaming it, so refuse to continue in that case.
if [ -e "$newdir" ]; then
  echo "$newdir already exists, not overwriting it. Exiting." >&2
  exit 1
fi

# Only touch the current symlink once the new release directory is in place;
# otherwise a failed mv would leave "current" missing or dangling.
if ! mv "$DOWNLOAD" "$newdir"; then
  echo "Could not rename $DOWNLOAD to $newdir. Exiting." >&2
  exit 1
fi

rm -f "$CURRENT"

# The link target is relative, so the link is created from inside LOCALDIR.
if ! cd "$LOCALDIR"; then
  echo "Could not change to $LOCALDIR. Exiting." >&2
  exit 1
fi
if ! ln -s "uniprot_$release" current; then
  echo "Could not create the current symlink in $LOCALDIR. Exiting." >&2
  exit 1
fi

echo "Done"