ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/scripts/updateUniprot.sh
Revision: 1485
Committed: Tue Nov 15 10:24:40 2011 UTC (10 years ago) by jmduarteg
File size: 3525 byte(s)
Log Message:
Adding downloading of uniref100
Line User Rev File contents
#!/bin/sh
# Script to update a local uniprot copy with sequence files and generate
# blast dbs for them (formatdb).
#
# Usage: updateUniprot.sh <base local dir>
#
# Layout maintained under <base local dir>:
#   download/           scratch directory, recreated on every run
#   uniprot_<release>/  one directory per installed release (the renamed
#                       download directory)
#   current             symlink to the most recently installed release
#
# Steps:
#   1. fetch the release date file, only if it is newer than the one in
#      "current"; stop with status 0 when there is nothing new
#   2. download trembl, sprot and uniref100 plus the SIFTS mapping file
#   3. uncompress, concatenate trembl + sprot into the "all" file
#   4. run formatdb on the "all" and uniref100 files
#   5. rename the download directory and repoint the "current" symlink
#
# Environment:
#   FORMATDB  path to the formatdb binary (default: /usr/bin/formatdb)
#
# Exit status: 0 on success or when no new release is available,
#              1 on any error.

if [ -z "$1" ]
then
  echo "Usage: $0 <base local dir>"
  exit 1
fi

# Print an error message on stderr and abort the run.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Resolve the base dir to an absolute path: the script changes directory
# later on, which would invalidate every path derived from a relative one.
LOCALDIR=$(CDPATH= cd -- "$1" && pwd) || die "Cannot access base local dir '$1'."
CURRENT="$LOCALDIR/current"
DOWNLOAD="$LOCALDIR/download"

FORMATDB=${FORMATDB:-/usr/bin/formatdb}

#SITE="ftp://ftp.uniprot.org/pub" # US main ftp
SITE="ftp://ftp.ebi.ac.uk/pub" # UK mirror
# the swiss mirror doesn't seem to update properly, not using it anymore
#SITE="ftp://ftp.expasy.org" # swiss mirror

COMPLETEKBDIR="databases/uniprot/current_release/knowledgebase/complete"
UNIREFDIR="databases/uniprot/uniref/uniref100"

SIFTSPDB2UNIPROTFTP="ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/text/pdb_chain_uniprot.lst"

SPROT="uniprot_sprot.fasta"
SPROTGZ="${SPROT}.gz"
TREMBL="uniprot_trembl.fasta"
TREMBLGZ="${TREMBL}.gz"
UNIREF100="uniref100.fasta"
UNIREF100GZ="${UNIREF100}.gz"
ALL="uniprot_all.fasta"
RELDATEFILE="reldate.txt"
SIFTSPDB2UNIPROT="pdb_chain_uniprot.lst"

sproturl="$SITE/$COMPLETEKBDIR/$SPROTGZ"
tremblurl="$SITE/$COMPLETEKBDIR/$TREMBLGZ"
uref100url="$SITE/$UNIREFDIR/$UNIREF100GZ"
reldateurl="$SITE/$COMPLETEKBDIR/$RELDATEFILE"

# Download a remote file only if it is newer than a local reference file.
#   $1 - local reference file whose modification time is the threshold
#   $2 - remote url
#   $3 - destination file (left empty when the remote file is not newer)
# Returns curl's exit status.
fetch_if_newer() {
  if [ -e "$1" ]
  then
    curl -f -z "$1" "$2" > "$3"
  else
    # No local copy yet (e.g. first run): there is nothing to compare
    # against, so download unconditionally.
    curl -f "$2" > "$3"
  fi
}

# Download a file that must be newer than the local copy once a new
# release has been detected; abort the run otherwise.
#   $1 - name used in the success message
#   $2 - local reference file
#   $3 - remote url
#   $4 - destination file
#   $5 - message printed when the remote file is not newer
download_required() {
  if ! fetch_if_newer "$2" "$3" "$4"
  then
    die "Download of $3 failed. Exiting."
  fi
  if [ -s "$4" ]
  then
    echo "New $1 version downloaded"
  else
    echo "$5"
    exit 1
  fi
}

# Fail before downloading anything if the databases could not be built.
[ -x "$FORMATDB" ] || die "formatdb executable not found at '$FORMATDB'."

# remove existing download directory if there was one
rm -rf "$DOWNLOAD"
# create the download dir
mkdir "$DOWNLOAD" || die "Cannot create download dir '$DOWNLOAD'."

# getting the release date file if newer available
release=""
if ! fetch_if_newer "$CURRENT/$RELDATEFILE" "$reldateurl" "$DOWNLOAD/$RELDATEFILE"
then
  rm -rf "$DOWNLOAD"
  die "Download of $reldateurl failed. Exiting."
fi
if [ -s "$DOWNLOAD/$RELDATEFILE" ]
then
  release=$(head -n 1 "$DOWNLOAD/$RELDATEFILE" | sed "s/UniProt Knowledgebase Release \(...._..\).*/\1/")
  # When the sed pattern does not match, the whole line comes through
  # unchanged; refuse to use that as a directory name.
  case "$release" in
    ????_??) ;;
    *) die "Could not parse release from $RELDATEFILE (got '$release'). Exiting." ;;
  esac
  echo "New uniprot release $release available. Downloading files."
else
  echo "No new uniprot release available. Exiting"
  rm -rf "$DOWNLOAD"
  exit 0
fi

# download if newer available
download_required "trembl" "$CURRENT/$TREMBL" "$tremblurl" "$DOWNLOAD/$TREMBLGZ" \
  "Remote trembl file not newer than local one. Something wrong. Exiting."
download_required "sprot" "$CURRENT/$SPROT" "$sproturl" "$DOWNLOAD/$SPROTGZ" \
  "Remote sprot file not newer than local one. Something wrong. Exiting."
download_required "Uniref100" "$CURRENT/$UNIREF100" "$uref100url" "$DOWNLOAD/$UNIREF100GZ" \
  "Remote Uniref100 file not newer than local one. Something wrong. Exiting"

# getting the SIFTS PDB to UNIPROT mapping file
# (kept best-effort: a failure here is reported but does not stop the update)
if ! curl -f "$SIFTSPDB2UNIPROTFTP" > "$DOWNLOAD/$SIFTSPDB2UNIPROT"
then
  echo "Warning: download of $SIFTSPDB2UNIPROTFTP failed." >&2
fi

# uncompressing
gzip -df "$DOWNLOAD/$SPROTGZ" || die "Could not uncompress $SPROTGZ. Exiting."
gzip -df "$DOWNLOAD/$TREMBLGZ" || die "Could not uncompress $TREMBLGZ. Exiting."
gzip -df "$DOWNLOAD/$UNIREF100GZ" || die "Could not uncompress $UNIREF100GZ. Exiting."
# creating the "all" file
cat "$DOWNLOAD/$TREMBL" "$DOWNLOAD/$SPROT" > "$DOWNLOAD/$ALL" \
  || die "Could not create $ALL. Exiting."

# run formatdb
# formatdb appends the path used to run it to the .pal index file,
# thus if the path used is an absolute path it's effectively hard coding
# them making the directory not movable. That's why we have to cd to the
# DOWNLOAD dir first, so that there's no hard-coded paths in the .pal file

echo "Running formatdb..."

#formatdb log file
logfile="$DOWNLOAD/formatdb.log"

cd "$DOWNLOAD" || die "Cannot change to '$DOWNLOAD'. Exiting."
#$FORMATDB -p T -o T -l $logfile -i $SPROT
#$FORMATDB -p T -o T -l $logfile -i $TREMBL
"$FORMATDB" -p T -o T -l "$logfile" -i "$ALL" \
  || die "formatdb failed for $ALL, see $logfile. Exiting."
"$FORMATDB" -p T -o T -l "$logfile" -i "$UNIREF100" \
  || die "formatdb failed for $UNIREF100, see $logfile. Exiting."

#renaming DOWNLOAD dir to uniprot version and updating current symlink
echo "Creating new symlink..."
# mv onto an existing directory would nest the download dir inside it
# instead of replacing it, so refuse to continue in that case.
if [ -e "$LOCALDIR/uniprot_$release" ]
then
  die "Release dir '$LOCALDIR/uniprot_$release' already exists. Exiting."
fi
# Leave the download dir before renaming it; the symlink is created from
# the base dir so that its target stays relative.
cd "$LOCALDIR" || die "Cannot change to '$LOCALDIR'. Exiting."
mv "$DOWNLOAD" "$LOCALDIR/uniprot_$release" \
  || die "Could not rename '$DOWNLOAD' to 'uniprot_$release'. Exiting."
rm -f "$CURRENT"
ln -s "uniprot_$release" current || die "Could not create 'current' symlink. Exiting."

echo "Done"

Properties

Name Value
svn:executable *