ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/pdbwiki/trunk/scripts/create_pdbentries.py
Revision: 66
Committed: Fri Sep 2 10:23:01 2011 UTC (8 years, 10 months ago) by jmduarteg
File size: 14691 byte(s)
Log Message:
Fixed (this time hopefully for good) the beta character problem of entry 1ubp by hard-coding a fix in the python page generation script
Line File contents
1 #!/usr/bin/env python
2 """
3 create_pdbentries.py -l <listfile> -o <outfile> [-u]
4
5 Creates a file with pdb entry pages which can be uploaded using pagefromfile.py from pywikpedia.
6 The data is taken from local installations of pdbase, uniprot and unzipped mmCIF files.
7
8 Parameters:
9 -l A file that contains a list of pdb codes
10 -o The output file
11 -u If used the entry will be added to the Updated category
12
13 """
14
15 import sys
16 import os.path
17 import re
18 import getopt
19 import time
20 import gzip
21
22 #import SOAPpy
23
24 #import wikipedia
25
26 import MySQLdb
27
28 AAS=["ALA", "ARG", "ASN", "ASP",\
29 "CYS", "GLN", "GLU", "GLY",\
30 "HIS", "ILE", "LEU", "LYS",\
31 "MET", "PHE", "PRO", "SER",\
32 "THR", "TRP", "TYR","VAL"]
33
34 THREELETTER2ONELETTER = \
35 {'CYS': 'C', \
36 'ASP': 'D', \
37 'SER': 'S', \
38 'GLN': 'Q', \
39 'LYS': 'K', \
40 'ILE': 'I', \
41 'PRO': 'P', \
42 'THR': 'T', \
43 'PHE': 'F', \
44 'ALA': 'A', \
45 'GLY': 'G', \
46 'HIS': 'H', \
47 'GLU': 'E', \
48 'LEU': 'L', \
49 'ARG': 'R', \
50 'TRP': 'W', \
51 'VAL': 'V', \
52 'ASN': 'N', \
53 'TYR': 'Y', \
54 'MET': 'M'}
55
56 PDBASEDB="pdbase"
57
58
59 BLASTURL="http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Put&PROGRAM=%s&DATABASE=nr&FILTER=L&QUERY="
60
61 UNIPROTURL="http://www.uniprot.org/uniprot"
62 PMURL="http://www.ncbi.nlm.nih.gov/sites/entrez?WebEnv=&db=pubmed&orig_db=pubmed&term="
63
64 CIFFILESDIR="/groupsroot/pdbmirror/data/structures/all/mmCIF"
65
66
67 def getPdbaseData(conn,pdbCode):
68 cursor= conn.cursor ()
69 # entry key and title
70 entrykey=0
71 title=""
72 sql="""
73 SELECT entry_key,title FROM STRUCT WHERE entry_id='%s'
74 """%pdbCode.upper()
75 cursor.execute (sql)
76 resultset = cursor.fetchall ()
77 for result in resultset:
78 entrykey=int(result[0])
79 title=str(result[1])
80 # release date
81 releaseDate=""
82 depositionDate=""
83 lastRevisionDate=""
84 lastRevision=0
85 sql="""
86 SELECT min(date_original),min(date2), max(date2), max(num) FROM DATABASE_PDB_REV WHERE entry_key=%s
87 """%entrykey
88 cursor.execute (sql)
89 resultset = cursor.fetchall ()
90 for result in resultset:
91 depositionDate=str(result[0])
92 releaseDate=str(result[1])
93 lastRevisionDate=str(result[2])
94 lastRevision=result[3]
95 # experimental method
96 expmethod=""
97 sql="""
98 SELECT substring_index(method,",",1) FROM EXPTL WHERE entry_key=%s
99 """%entrykey
100 cursor.execute (sql)
101 resultset = cursor.fetchall ()
102 for result in resultset:
103 expmethod=str(result[0])
104 # resolution
105 resol=0.0
106 sql="""
107 SELECT ls_d_res_high FROM REFINE WHERE entry_key=%s
108 """%entrykey
109 cursor.execute (sql)
110 resultset = cursor.fetchall ()
111 for result in resultset:
112 resol=float(result[0])
113 # primary citation
114 primcit=""
115 sql="""
116 SELECT title FROM CITATION WHERE entry_key=%s AND id='primary'
117 """%entrykey
118 cursor.execute (sql)
119 resultset = cursor.fetchall ()
120 for result in resultset:
121 primcit=str(result[0])
122 # authors
123 primauthors=""
124 sql="""
125 SELECT name FROM CITATION_AUTHOR WHERE entry_key=%s AND citation_id='primary'
126 ORDER BY ordinal
127 """%entrykey
128 cursor.execute (sql)
129 resultset = cursor.fetchall ()
130 for result in resultset:
131 primauthors+=str(result[0])+" "
132 # classification
133 classif=""
134 sql="""
135 SELECT pdbx_keywords FROM STRUCT_KEYWORDS WHERE entry_key=%s
136 """%entrykey
137 cursor.execute (sql)
138 resultset = cursor.fetchall ()
139 for result in resultset:
140 classif=str(result[0])
141 # chains and cifchains
142 cifchains2chains={}
143 sql="""
144 SELECT DISTINCT pdb_strand_id,asym_id FROM PDBX_POLY_SEQ_SCHEME WHERE entry_key=%s
145 """%entrykey
146 cursor.execute (sql)
147 resultset = cursor.fetchall ()
148 for result in resultset:
149 cifchains2chains[str(result[1])]=str(result[0])
150 # sequences
151 sequences = {}
152 for chainCode in cifchains2chains:
153 pdbChainCode=cifchains2chains[chainCode]
154 sequences[pdbChainCode]=getSeqFromPdbase(conn, entrykey, pdbChainCode, chainCode)
155 # getting all identity keys
156 entitykeys={}
157 for chainCode in cifchains2chains:
158 entitykeys[chainCode]=getEntityKey(conn,entrykey,chainCode)
159 # and unique chains from the entitykeys (using pdbChainCode as chain identifiers)
160 uniquechains={} # entity keys to list of pdbChainCodes
161 for chainCode in cifchains2chains:
162 pdbChainCode=cifchains2chains[chainCode]
163 if (not uniquechains.has_key(entitykeys[chainCode])):
164 uniquechains[entitykeys[chainCode]]=[]
165 uniquechains[entitykeys[chainCode]].append(pdbChainCode)
166 # source organism
167 source=""
168 sql="""
169 SELECT pdbx_gene_src_scientific_name FROM ENTITY_SRC_GEN WHERE entry_key=%s
170 """%entrykey
171 cursor.execute (sql)
172 resultset = cursor.fetchall ()
173 for result in resultset:
174 source=str(result[0]).upper()
175 # uniprot/swissprot
176 unps = {}
177 for chainCode in cifchains2chains:
178 pdbChainCode=cifchains2chains[chainCode]
179 unps[pdbChainCode]=getUniprotFromPdbase(conn,entrykey,chainCode,entitykeys)
180 # ec numbers
181 ecs = {}
182 for chainCode in cifchains2chains:
183 pdbChainCode=cifchains2chains[chainCode]
184 ecs[pdbChainCode]=getECsFromPdbase(conn,entrykey,chainCode,entitykeys)
185
186 # escaping wiki special characters and stripping new lines
187 title=cleanUpString(title)
188 primcit=cleanUpString(primcit)
189 primauthors=cleanUpString(primauthors)
190
191 return (title,depositionDate,releaseDate,lastRevisionDate,lastRevision,expmethod,resol,primcit,primauthors,classif,sequences,source,unps,ecs,uniquechains)
192
193 def getSeqFromPdbase(conn, entrykey, pdbChainCode, chainCode):
194 cursor = conn.cursor()
195 sequence = ""
196 # we use seq_id+0 (implicitly converts to int) in ORDER BY because seq_id is varchar!!
197 sql="""
198 SELECT mon_id FROM PDBX_POLY_SEQ_SCHEME WHERE entry_key=%s AND asym_id='%s' AND pdb_strand_id='%s' ORDER BY seq_id+0
199 """%(entrykey,chainCode,pdbChainCode)
200 cursor.execute (sql)
201 resultset = cursor.fetchall ()
202 for result in resultset:
203 res_type=str(result[0])
204 aalist=AAS # to get the list of 20 standard aas
205 if (aalist.count(res_type)>0): # we check that the residue type is in the list of 20 standard aas
206 sequence+=THREELETTER2ONELETTER[res_type]
207 elif (res_type=="DA" or res_type=="A"):
208 sequence+="A"
209 elif (res_type=="DC" or res_type=="C"):
210 sequence+="C"
211 elif (res_type=="DT" or res_type=="T"):
212 sequence+="T"
213 elif (res_type=="DG" or res_type=="G"):
214 sequence+="G"
215 elif (res_type=="DU" or res_type=="U"):
216 sequence+="U"
217 else:
218 sequence+="X"
219 return sequence
220
221
222 def getEntityKey(conn,entrykey,chainCode):
223 cursor = conn.cursor()
224 entitykey=0
225 sql="""
226 SELECT entity_key FROM STRUCT_ASYM WHERE entry_key=%s AND id='%s'
227 """%(entrykey,chainCode)
228 cursor.execute (sql)
229 resultset = cursor.fetchall ()
230 for result in resultset:
231 entitykey=int(result[0])
232 return entitykey
233
234 def getUniprotFromPdbase(conn,entrykey,chainCode,entitykeys):
235 entitykey=entitykeys[chainCode]
236 cursor = conn.cursor()
237 unp=""
238 sql="""
239 SELECT pdbx_db_accession FROM STRUCT_REF WHERE entry_key=%s AND db_name='UNP' AND entity_key=%s AND pdbx_db_accession IS NOT NULL
240 """%(entrykey,entitykey)
241 cursor.execute (sql)
242 resultset = cursor.fetchall ()
243 for result in resultset:
244 unp=str(result[0])
245 return unp
246
247 def getECsFromPdbase(conn,entrykey,chainCode,entitykeys):
248 entitykey=entitykeys[chainCode]
249 cursor = conn.cursor()
250 ecs = []
251 sql="""
252 SELECT pdbx_ec FROM ENTITY WHERE entity_key=%s AND pdbx_ec IS NOT NULL AND pdbx_ec!='?'
253 """%(entitykey)
254 cursor.execute (sql)
255 resultset = cursor.fetchall ()
256 for result in resultset:
257 tokens = re.split(",|/|;",result[0]) # possible separators , / ;
258 for token in tokens:
259 token=token.strip()
260 # in pdbx_ec most times ECs are in the format x.x.x.x, but there are a few cases with leading "EC" or "E.C."
261 # There are also a few cases of x.x.x or x.x, that's why we use {1,3} 1 to 3 repeats
262 m = re.search("(?:E\.?C\.?\s*)?((\d+(?:\.(?:\d|-)+){1,3}))",token)
263 if (m):
264 ec=m.group(1)
265 # finally we take care of cases x.x.-.- or x.x.x.-, we simply strip out everythin from the first ".-"
266 if (ec.find(".-")!=-1):
267 ec=ec[0:ec.find(".-")]
268 ecs.append(ec)
269 else:
270 sys.stderr.write("Unmatched EC number for entity_key %s: %s\n"%(entitykey,token))
271 return ecs
272
273 def getSequenceStr(pdbCode,sequences,ecs,unps,uniquechains):
274 sequenceStr=""
275 for entitykey in uniquechains.keys():
276 chains = uniquechains[entitykey]
277 chains.sort()
278 firstpdbChainCode=min(chains)
279 seq = sequences[firstpdbChainCode]
280 blastType="blastn" # we set default to DNA/RNA
281 if (re.search("[^ACTGUX]",seq)):
282 blastType="blastp" #looks like a protein
283 baseblasturl="[%s%s"%(BLASTURL%blastType,pdbCode)
284 sep=", "
285 ecStr=""
286 if (ecs.has_key(firstpdbChainCode) and ecs[firstpdbChainCode]!=[]):
287 ecStr="EC %s"%sep.join(ecs[firstpdbChainCode])
288 unpStr=""
289 if (unps.has_key(firstpdbChainCode) and unps[firstpdbChainCode]!=""):
290 unpStr="[%s/%s Uniprot]"%(UNIPROTURL,unps[firstpdbChainCode])
291 fullblasturl=baseblasturl+firstpdbChainCode+" Blast]"
292
293 length=len(seq)
294 chainsInThisEntity=""
295 for pdbChainCode in chains:
296 chainsInThisEntity+=pdbChainCode+" "
297 sequenceStr+="Chain(s) %s(%s residues): %s %s %s <pre>\n"%(chainsInThisEntity,length,fullblasturl,unpStr,ecStr)
298 for i in range(0,len(seq),80):
299 sequenceStr+="%s\n"%(seq[i:i+80])
300 sequenceStr+="</pre>\n"
301 return sequenceStr
302
303 def getPmidFromCifFile(pdbCode):
304 pmid=0
305 ciffile=os.path.join(CIFFILESDIR,"%s.cif.gz"%pdbCode)
306 fcif = gzip.open(ciffile,"rb")
307 lines = fcif.readlines()
308 for line in lines:
309 m=re.search("^_citation.pdbx_database_id_PubMed\s+(\d+)",line)
310 if (m):
311 pmid=int(m.group(1))
312 fcif.close()
313 return pmid
314
315 def cleanUpString(string):
316 """
317 Escapes [[ and {{ from given string using <nowiki> tags and strips new lines from given string
318 """
319 # taking care of "[[" or "{{" that have to be escaped with <nowiki> </nowiki>
320 if (string.find("[[")>0 or string.find("{{")>0):
321 string="<nowiki>%s</nowiki>"%string
322 # strip new lines
323 return re.sub("\n","",string)
324
325
326 # ---------------- main ---------------------
327 if __name__ == '__main__':
328 try:
329 opts, args = getopt.getopt(sys.argv[1:], "l:o:u", ["listfile=","outfile=","update="])
330 except getopt.GetoptError:
331 sys.stderr.write("Arguments not correctly specified\n")
332 # print help information and exit:
333 sys.stderr.write(__doc__)
334 sys.exit(2)
335 listfile=""
336 outfile=""
337 update = False
338 for o, a in opts:
339 if o in ("-l","--listfile"):
340 listfile = a
341 if o in ("-o","--outfile"):
342 outfile = a
343 if o in ("-u","--update"):
344 update = True
345
346 if (listfile=="" or outfile==""):
347 print "Must provide a list file with -l and an outfile with -o"
348 sys.exit(1)
349
350
351
352 ## connecting to our database to get pdb data
353 conn= MySQLdb.connect (db = PDBASEDB, read_default_file="~/.my.cnf")
354 ## connect to pdb web services
355 #server = SOAPpy.SOAPProxy("http://www.pdb.org/pdb/services/pdbws")
356
357 #site = wikipedia.getSite()
358
359 out = open(outfile,"w")
360
361 lfile=open(listfile,"r")
362 lines=lfile.readlines()
363 count=0
364 for line in lines:
365 count=count+1
366 pdbCode=line.strip()
367
368 #print pdbCode
369
370 # getting data
371 (title,depositionDate,releaseDate,lastRevisionDate,lastRevision,expmethod,resol,primcit,primauthors,classif,sequences,source,unps,ecs,uniquechains) = getPdbaseData(conn,pdbCode)
372
373 pmid = getPmidFromCifFile(pdbCode)
374 pmurl=""
375 if (pmid!=0):
376 pmurl = "[%s%s PubMed]"%(PMURL,pmid)
377
378 # creating template string
379
380 # hard-coded fix to entry 1ubp whose citation title contains a beta character (code 0xdf i.e. 223) that breaks pywikipedia's upload script
381 # 1ubp is the one and only such case found in pdbase
382 primcit=re.sub(chr(223),"beta",primcit)
383
384 # 1st we create the sequence string
385 sequenceStr = getSequenceStr(pdbCode,sequences,ecs,unps,uniquechains)
386
387 # and we get a unique list of ec numbers for this pdbCode (for the category)
388 ecSet = {}
389 for pdbChainCode in ecs:
390 for ec in ecs[pdbChainCode]:
391 ecSet[ec]=""
392
393 pdbidx = pdbCode[1:3].upper()
394
395 template = """
396 {{PDB entry
397 | PDBID=%s
398 | PDBIDX=%s
399 | TITLE=%s
400 | DEPOSITION_DATE=%s
401 | RELEASE_DATE=%s
402 | LAST_REVISION_DATE=%s
403 | LAST_REVISION=%s
404 | AUTHORS=%s
405 | CITATION=%s
406 | PMID=%s
407 | PMURL=%s
408 | EXP_METHOD=%s
409 | RESOLUTION=%s
410 | CLASSIFICATION=%s
411 | SEQUENCE=%s
412 | SOURCE=%s
413 }}
414 """%(pdbCode,pdbidx,title,depositionDate,releaseDate,lastRevisionDate,lastRevision,primauthors,primcit,pmid,pmurl,expmethod,resol,classif,sequenceStr,source)
415
416
417 # start tag
418 out.write("<page>\n")
419
420 # title (now using -notitle argument in pagefromfile.py)
421 out.write("'''%s'''"%pdbCode)
422
423 # main template
424 out.write("%s\n"%template)
425
426 # categories (only EC because it's an array and arrays can't be passed in the template,
427 # all other categories are done through the template)
428 for ec in ecSet:
429 out.write("[[Category:EC %s|%s]]\n"%(ec,pdbidx))
430
431
432 if (update):
433 time.localtime(time.time())
434 timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))
435 out.write("{{PDB entry updated| TIMESTAMP=%s|PDBIDX=%s }}\n"%(timestamp,pdbidx))
436
437 # end tag
438 out.write("</page>\n")
439
440 ## wiki uploading
441 #page = wikipedia.Page(site, pdbCode)
442 # to get the page would be:
443 #text = page.get()
444 #print(text)
445
446 #page.put(template)
447 print("Done generating entry pages: %s pages generated"%count)
448 out.close()

Properties

Name Value
svn:executable