ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/scripts/updateUniprot.sh
Revision: 1053
Committed: Wed Apr 28 10:58:18 2010 UTC (11 years, 7 months ago) by jmduarteg
File size: 2929 byte(s)
Log Message:
New script to maintain a local copy of blastable uniprot sequences db.
Line User Rev File contents
#!/bin/sh
# Update a local uniprot copy with sequence files and generate blast dbs
# for them (formatdb).
#
# Usage: <this script> <base local dir>

# The base directory is the only argument and it is mandatory.
if [ -z "$1" ]; then
  # Diagnostics go to stderr so they never mix with regular output.
  echo "Usage: $0 <base local dir>" >&2
  exit 1
fi
9    
10    
# Base directory given as first command-line argument.
LOCALDIR=$1
# Path of the "current" entry inside the base directory.
CURRENT="$LOCALDIR/current"
# Directory the new files are downloaded into.
DOWNLOAD="$LOCALDIR/download"

# Server to download from; exactly one SITE line should be active.
#SITE="ftp://ftp.uniprot.org/pub" # US main ftp
#SITE="ftp://ftp.ebi.ac.uk/pub" # UK mirror
SITE="ftp://ftp.expasy.org" # swiss mirror

# Directory (relative to SITE) holding the complete knowledgebase files.
COMPLETEKBDIR="databases/uniprot/current_release/knowledgebase/complete"

# Full URL of the SIFTS PDB chain to UniProt mapping file.
SIFTSPDB2UNIPROTFTP="ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/text/pdb_chain_uniprot.lst"

# Local/remote file names.
SPROT="uniprot_sprot.fasta"
SPROTGZ="${SPROT}.gz"
TREMBL="uniprot_trembl.fasta"
TREMBLGZ="${TREMBL}.gz"
ALL="uniprot_all.fasta"
RELDATEFILE="reldate.txt"
SIFTSPDB2UNIPROT="pdb_chain_uniprot.lst"

# Remote URLs derived from the settings above.
sproturl="$SITE/$COMPLETEKBDIR/$SPROTGZ"
tremblurl="$SITE/$COMPLETEKBDIR/$TREMBLGZ"
reldateurl="$SITE/$COMPLETEKBDIR/$RELDATEFILE"
36    
# Start from an empty download directory: remove an existing one if there
# was one, then create it again.
# ${DOWNLOAD:?} makes the shell abort if the variable is unset or empty, so
# this can never degrade into an "rm -rf" of an unintended path.
rm -rf -- "${DOWNLOAD:?}"
if ! mkdir -- "$DOWNLOAD"; then
  echo "Could not create download directory $DOWNLOAD. Exiting." >&2
  exit 1
fi
41    
# Get the release date file, but only if the remote one is newer than the
# local copy under $CURRENT (curl -z takes the reference time from that file).
# When nothing newer exists the redirected output file stays empty.
release=""
if ! curl -z "$CURRENT/$RELDATEFILE" "$reldateurl" > "$DOWNLOAD/$RELDATEFILE"; then
  # A failed transfer also leaves an empty file; report it as an error
  # instead of mistaking it for "no new release".
  echo "Could not retrieve $reldateurl. Exiting." >&2
  rm -rf -- "${DOWNLOAD:?}"
  exit 1
fi

if [ -s "$DOWNLOAD/$RELDATEFILE" ]; then
  # Extract the release token that follows "UniProt Knowledgebase Release "
  # on the first line. With -n/p sed prints nothing when the line does not
  # match, so an unexpected file format yields an empty value rather than
  # the whole line.
  release=$(head -n 1 "$DOWNLOAD/$RELDATEFILE" \
    | sed -n 's/UniProt Knowledgebase Release \(...._..\).*/\1/p')
  if [ -z "$release" ]; then
    echo "Could not parse release from $DOWNLOAD/$RELDATEFILE. Exiting." >&2
    exit 1
  fi
  echo "New uniprot release $release available. Downloading files."
else
  echo "No new uniprot release available. Exiting"
  rm -rf -- "${DOWNLOAD:?}"
  exit 0
fi
54    
55    
# fetch_if_newer <label> <url> <local reference file> <destination>
#
# Downloads <url> into <destination>, but only if the remote file is newer
# than <local reference file> (curl -z). Exits the script with status 1 if
# the transfer fails or if nothing was downloaded: at this point a new
# release has already been detected, so a missing file means something is
# wrong. <label> is only used in the messages.
fetch_if_newer() {
  if ! curl -z "$3" "$2" > "$4"; then
    echo "Download of $2 failed. Exiting." >&2
    exit 1
  fi
  if [ -s "$4" ]; then
    echo "New $1 version downloaded"
  else
    echo "Remote $1 file not newer than local one. Something wrong. Exiting." >&2
    exit 1
  fi
}

# download if newer available
fetch_if_newer trembl "$tremblurl" "$CURRENT/$TREMBL" "$DOWNLOAD/$TREMBLGZ"
fetch_if_newer sprot "$sproturl" "$CURRENT/$SPROT" "$DOWNLOAD/$SPROTGZ"
74    
75    
# Get the SIFTS PDB to UNIPROT mapping file.
# A failed download does not stop the update (the script has always carried
# on here), but it is now reported instead of passing silently.
if ! curl "$SIFTSPDB2UNIPROTFTP" > "$DOWNLOAD/$SIFTSPDB2UNIPROT"; then
  echo "Warning: could not download $SIFTSPDB2UNIPROTFTP" >&2
fi
78    
79    
# Uncompress the downloaded archives in place (-d decompress, -f overwrite).
# Stop on the first failure so a truncated or corrupt download never reaches
# the following steps.
for archive in "$DOWNLOAD/${SPROT}.gz" "$DOWNLOAD/${TREMBL}.gz"; do
  if ! gzip -df -- "$archive"; then
    echo "Could not uncompress $archive. Exiting." >&2
    exit 1
  fi
done

# Create the "all" file: trembl followed by sprot.
if ! cat -- "$DOWNLOAD/$TREMBL" "$DOWNLOAD/$SPROT" > "$DOWNLOAD/$ALL"; then
  echo "Could not create $DOWNLOAD/$ALL. Exiting." >&2
  exit 1
fi
85    
# run formatdb
# formatdb appends the path used to run it to the .pal index file, so an
# absolute path would effectively be hard-coded there and make the directory
# not movable. That's why formatdb is run from inside the DOWNLOAD dir with
# bare file names.

echo "Running formatdb..."

#formatdb log file
logfile="$DOWNLOAD/formatdb.log"

# The directory change happens in a subshell so that the caller's working
# directory is untouched; paths derived from a relative base directory keep
# resolving correctly afterwards. For the same reason the log file is passed
# by its bare name: we are already inside $DOWNLOAD.
if ! (
  cd "$DOWNLOAD" || exit 1
  for fasta in "$SPROT" "$TREMBL" "$ALL"; do
    formatdb -p T -o T -l "${logfile##*/}" -i "$fasta" || exit 1
  done
); then
  echo "formatdb failed, see $logfile. Exiting." >&2
  exit 1
fi
102    
# Rename the DOWNLOAD dir to the uniprot version and update the current
# symlink. Every step is checked so that the existing symlink is only
# replaced once the new directory is really in place.
echo "Creating new symlink..."

if [ -z "$release" ]; then
  echo "Release name is empty, not touching $CURRENT. Exiting." >&2
  exit 1
fi

newdir="$LOCALDIR/uniprot_$release"

# mv into an existing directory would nest the download inside it.
if [ -e "$newdir" ]; then
  echo "$newdir already exists, not touching $CURRENT. Exiting." >&2
  exit 1
fi

# rm -f cannot remove a real directory, and ln -s would then create the
# link inside it instead of replacing it.
if [ -d "$CURRENT" ] && [ ! -L "$CURRENT" ]; then
  echo "$CURRENT is a directory, not a symlink. Exiting." >&2
  exit 1
fi

if ! mv -- "$DOWNLOAD" "$newdir"; then
  echo "Could not rename $DOWNLOAD to $newdir. Exiting." >&2
  exit 1
fi

rm -f -- "$CURRENT"
# The link target is kept relative (just the directory name) so the base
# directory stays movable; no cd is needed to create it.
if ! ln -s "uniprot_$release" "$CURRENT"; then
  echo "Could not create symlink $CURRENT. Exiting." >&2
  exit 1
fi

echo "Done"

Properties

Name Value
svn:executable *