ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/pdbwiki/trunk/scripts/create_pdbentries.py
Revision: 66
Committed: Fri Sep 2 10:23:01 2011 UTC (8 years, 10 months ago) by jmduarteg
File size: 14691 byte(s)
Log Message:
Fixed (this time hopefully for good) the beta character problem of entry 1ubp by hard-coding a fix in the python page generation script
Line User Rev File contents
1 jmduarteg 1 #!/usr/bin/env python
2     """
3     create_pdbentries.py -l <listfile> -o <outfile> [-u]
4    
5     Creates a file with pdb entry pages which can be uploaded using pagefromfile.py from pywikpedia.
6     The data is taken from local installations of pdbase, uniprot and unzipped mmCIF files.
7    
8     Parameters:
9     -l A file that contains a list of pdb codes
10     -o The output file
11     -u If used the entry will be added to the Updated category
12    
13     """
14    
15     import sys
16     import os.path
17     import re
18     import getopt
19     import time
20 jmduarteg 63 import gzip
21 jmduarteg 1
22 jmduarteg 41 #import SOAPpy
23 jmduarteg 1
24     #import wikipedia
25    
26     import MySQLdb
27    
28 jmduarteg 35 AAS=["ALA", "ARG", "ASN", "ASP",\
29     "CYS", "GLN", "GLU", "GLY",\
30     "HIS", "ILE", "LEU", "LYS",\
31     "MET", "PHE", "PRO", "SER",\
32     "THR", "TRP", "TYR","VAL"]
33 jmduarteg 1
34 jmduarteg 35 THREELETTER2ONELETTER = \
35     {'CYS': 'C', \
36     'ASP': 'D', \
37     'SER': 'S', \
38     'GLN': 'Q', \
39     'LYS': 'K', \
40     'ILE': 'I', \
41     'PRO': 'P', \
42     'THR': 'T', \
43     'PHE': 'F', \
44     'ALA': 'A', \
45     'GLY': 'G', \
46     'HIS': 'H', \
47     'GLU': 'E', \
48     'LEU': 'L', \
49     'ARG': 'R', \
50     'TRP': 'W', \
51     'VAL': 'V', \
52     'ASN': 'N', \
53     'TYR': 'Y', \
54     'MET': 'M'}
55    
56 jmduarteg 1 PDBASEDB="pdbase"
57    
58    
59     BLASTURL="http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Put&PROGRAM=%s&DATABASE=nr&FILTER=L&QUERY="
60    
61     UNIPROTURL="http://www.uniprot.org/uniprot"
62     PMURL="http://www.ncbi.nlm.nih.gov/sites/entrez?WebEnv=&db=pubmed&orig_db=pubmed&term="
63    
64 jmduarteg 64 CIFFILESDIR="/groupsroot/pdbmirror/data/structures/all/mmCIF"
65 jmduarteg 1
66    
67     def getPdbaseData(conn,pdbCode):
68     cursor= conn.cursor ()
69     # entry key and title
70     entrykey=0
71     title=""
72     sql="""
73 jmduarteg 39 SELECT entry_key,title FROM STRUCT WHERE entry_id='%s'
74 jmduarteg 1 """%pdbCode.upper()
75     cursor.execute (sql)
76     resultset = cursor.fetchall ()
77     for result in resultset:
78     entrykey=int(result[0])
79     title=str(result[1])
80     # release date
81     releaseDate=""
82     depositionDate=""
83     lastRevisionDate=""
84     lastRevision=0
85     sql="""
86 jmduarteg 39 SELECT min(date_original),min(date2), max(date2), max(num) FROM DATABASE_PDB_REV WHERE entry_key=%s
87 jmduarteg 1 """%entrykey
88     cursor.execute (sql)
89     resultset = cursor.fetchall ()
90     for result in resultset:
91     depositionDate=str(result[0])
92     releaseDate=str(result[1])
93     lastRevisionDate=str(result[2])
94     lastRevision=result[3]
95     # experimental method
96     expmethod=""
97     sql="""
98 jmduarteg 39 SELECT substring_index(method,",",1) FROM EXPTL WHERE entry_key=%s
99 jmduarteg 1 """%entrykey
100     cursor.execute (sql)
101     resultset = cursor.fetchall ()
102     for result in resultset:
103     expmethod=str(result[0])
104     # resolution
105     resol=0.0
106     sql="""
107 jmduarteg 39 SELECT ls_d_res_high FROM REFINE WHERE entry_key=%s
108 jmduarteg 1 """%entrykey
109     cursor.execute (sql)
110     resultset = cursor.fetchall ()
111     for result in resultset:
112     resol=float(result[0])
113     # primary citation
114     primcit=""
115     sql="""
116 jmduarteg 39 SELECT title FROM CITATION WHERE entry_key=%s AND id='primary'
117 jmduarteg 1 """%entrykey
118     cursor.execute (sql)
119     resultset = cursor.fetchall ()
120     for result in resultset:
121     primcit=str(result[0])
122     # authors
123     primauthors=""
124     sql="""
125 jmduarteg 39 SELECT name FROM CITATION_AUTHOR WHERE entry_key=%s AND citation_id='primary'
126 jmduarteg 1 ORDER BY ordinal
127     """%entrykey
128     cursor.execute (sql)
129     resultset = cursor.fetchall ()
130     for result in resultset:
131     primauthors+=str(result[0])+" "
132     # classification
133     classif=""
134     sql="""
135 jmduarteg 39 SELECT pdbx_keywords FROM STRUCT_KEYWORDS WHERE entry_key=%s
136 jmduarteg 1 """%entrykey
137     cursor.execute (sql)
138     resultset = cursor.fetchall ()
139     for result in resultset:
140     classif=str(result[0])
141     # chains and cifchains
142     cifchains2chains={}
143     sql="""
144 jmduarteg 39 SELECT DISTINCT pdb_strand_id,asym_id FROM PDBX_POLY_SEQ_SCHEME WHERE entry_key=%s
145 jmduarteg 1 """%entrykey
146     cursor.execute (sql)
147     resultset = cursor.fetchall ()
148     for result in resultset:
149     cifchains2chains[str(result[1])]=str(result[0])
150     # sequences
151     sequences = {}
152     for chainCode in cifchains2chains:
153     pdbChainCode=cifchains2chains[chainCode]
154     sequences[pdbChainCode]=getSeqFromPdbase(conn, entrykey, pdbChainCode, chainCode)
155     # getting all identity keys
156     entitykeys={}
157     for chainCode in cifchains2chains:
158     entitykeys[chainCode]=getEntityKey(conn,entrykey,chainCode)
159     # and unique chains from the entitykeys (using pdbChainCode as chain identifiers)
160     uniquechains={} # entity keys to list of pdbChainCodes
161     for chainCode in cifchains2chains:
162     pdbChainCode=cifchains2chains[chainCode]
163     if (not uniquechains.has_key(entitykeys[chainCode])):
164     uniquechains[entitykeys[chainCode]]=[]
165     uniquechains[entitykeys[chainCode]].append(pdbChainCode)
166     # source organism
167     source=""
168     sql="""
169 jmduarteg 39 SELECT pdbx_gene_src_scientific_name FROM ENTITY_SRC_GEN WHERE entry_key=%s
170 jmduarteg 1 """%entrykey
171     cursor.execute (sql)
172     resultset = cursor.fetchall ()
173     for result in resultset:
174     source=str(result[0]).upper()
175     # uniprot/swissprot
176     unps = {}
177     for chainCode in cifchains2chains:
178     pdbChainCode=cifchains2chains[chainCode]
179     unps[pdbChainCode]=getUniprotFromPdbase(conn,entrykey,chainCode,entitykeys)
180     # ec numbers
181     ecs = {}
182     for chainCode in cifchains2chains:
183     pdbChainCode=cifchains2chains[chainCode]
184     ecs[pdbChainCode]=getECsFromPdbase(conn,entrykey,chainCode,entitykeys)
185    
186     # escaping wiki special characters and stripping new lines
187     title=cleanUpString(title)
188     primcit=cleanUpString(primcit)
189     primauthors=cleanUpString(primauthors)
190    
191     return (title,depositionDate,releaseDate,lastRevisionDate,lastRevision,expmethod,resol,primcit,primauthors,classif,sequences,source,unps,ecs,uniquechains)
192    
193     def getSeqFromPdbase(conn, entrykey, pdbChainCode, chainCode):
194     cursor = conn.cursor()
195     sequence = ""
196     # we use seq_id+0 (implicitly converts to int) in ORDER BY because seq_id is varchar!!
197     sql="""
198 jmduarteg 39 SELECT mon_id FROM PDBX_POLY_SEQ_SCHEME WHERE entry_key=%s AND asym_id='%s' AND pdb_strand_id='%s' ORDER BY seq_id+0
199 jmduarteg 1 """%(entrykey,chainCode,pdbChainCode)
200     cursor.execute (sql)
201     resultset = cursor.fetchall ()
202     for result in resultset:
203     res_type=str(result[0])
204     aalist=AAS # to get the list of 20 standard aas
205     if (aalist.count(res_type)>0): # we check that the residue type is in the list of 20 standard aas
206     sequence+=THREELETTER2ONELETTER[res_type]
207     elif (res_type=="DA" or res_type=="A"):
208     sequence+="A"
209     elif (res_type=="DC" or res_type=="C"):
210     sequence+="C"
211     elif (res_type=="DT" or res_type=="T"):
212     sequence+="T"
213     elif (res_type=="DG" or res_type=="G"):
214     sequence+="G"
215     elif (res_type=="DU" or res_type=="U"):
216     sequence+="U"
217     else:
218     sequence+="X"
219     return sequence
220    
221    
222     def getEntityKey(conn,entrykey,chainCode):
223     cursor = conn.cursor()
224     entitykey=0
225     sql="""
226 jmduarteg 39 SELECT entity_key FROM STRUCT_ASYM WHERE entry_key=%s AND id='%s'
227 jmduarteg 1 """%(entrykey,chainCode)
228     cursor.execute (sql)
229     resultset = cursor.fetchall ()
230     for result in resultset:
231     entitykey=int(result[0])
232     return entitykey
233    
234     def getUniprotFromPdbase(conn,entrykey,chainCode,entitykeys):
235     entitykey=entitykeys[chainCode]
236     cursor = conn.cursor()
237     unp=""
238     sql="""
239 jmduarteg 39 SELECT pdbx_db_accession FROM STRUCT_REF WHERE entry_key=%s AND db_name='UNP' AND entity_key=%s AND pdbx_db_accession IS NOT NULL
240 jmduarteg 1 """%(entrykey,entitykey)
241     cursor.execute (sql)
242     resultset = cursor.fetchall ()
243     for result in resultset:
244     unp=str(result[0])
245     return unp
246    
247     def getECsFromPdbase(conn,entrykey,chainCode,entitykeys):
248     entitykey=entitykeys[chainCode]
249     cursor = conn.cursor()
250     ecs = []
251     sql="""
252 jmduarteg 39 SELECT pdbx_ec FROM ENTITY WHERE entity_key=%s AND pdbx_ec IS NOT NULL AND pdbx_ec!='?'
253 jmduarteg 1 """%(entitykey)
254     cursor.execute (sql)
255     resultset = cursor.fetchall ()
256     for result in resultset:
257     tokens = re.split(",|/|;",result[0]) # possible separators , / ;
258     for token in tokens:
259     token=token.strip()
260     # in pdbx_ec most times ECs are in the format x.x.x.x, but there are a few cases with leading "EC" or "E.C."
261     # There are also a few cases of x.x.x or x.x, that's why we use {1,3} 1 to 3 repeats
262     m = re.search("(?:E\.?C\.?\s*)?((\d+(?:\.(?:\d|-)+){1,3}))",token)
263     if (m):
264     ec=m.group(1)
265     # finally we take care of cases x.x.-.- or x.x.x.-, we simply strip out everythin from the first ".-"
266     if (ec.find(".-")!=-1):
267     ec=ec[0:ec.find(".-")]
268     ecs.append(ec)
269     else:
270     sys.stderr.write("Unmatched EC number for entity_key %s: %s\n"%(entitykey,token))
271     return ecs
272    
273     def getSequenceStr(pdbCode,sequences,ecs,unps,uniquechains):
274     sequenceStr=""
275     for entitykey in uniquechains.keys():
276     chains = uniquechains[entitykey]
277     chains.sort()
278     firstpdbChainCode=min(chains)
279     seq = sequences[firstpdbChainCode]
280     blastType="blastn" # we set default to DNA/RNA
281     if (re.search("[^ACTGUX]",seq)):
282     blastType="blastp" #looks like a protein
283     baseblasturl="[%s%s"%(BLASTURL%blastType,pdbCode)
284     sep=", "
285     ecStr=""
286     if (ecs.has_key(firstpdbChainCode) and ecs[firstpdbChainCode]!=[]):
287     ecStr="EC %s"%sep.join(ecs[firstpdbChainCode])
288     unpStr=""
289     if (unps.has_key(firstpdbChainCode) and unps[firstpdbChainCode]!=""):
290     unpStr="[%s/%s Uniprot]"%(UNIPROTURL,unps[firstpdbChainCode])
291     fullblasturl=baseblasturl+firstpdbChainCode+" Blast]"
292    
293     length=len(seq)
294     chainsInThisEntity=""
295     for pdbChainCode in chains:
296     chainsInThisEntity+=pdbChainCode+" "
297     sequenceStr+="Chain(s) %s(%s residues): %s %s %s <pre>\n"%(chainsInThisEntity,length,fullblasturl,unpStr,ecStr)
298     for i in range(0,len(seq),80):
299     sequenceStr+="%s\n"%(seq[i:i+80])
300     sequenceStr+="</pre>\n"
301     return sequenceStr
302    
303     def getPmidFromCifFile(pdbCode):
304     pmid=0
305 jmduarteg 63 ciffile=os.path.join(CIFFILESDIR,"%s.cif.gz"%pdbCode)
306     fcif = gzip.open(ciffile,"rb")
307 jmduarteg 1 lines = fcif.readlines()
308     for line in lines:
309     m=re.search("^_citation.pdbx_database_id_PubMed\s+(\d+)",line)
310     if (m):
311     pmid=int(m.group(1))
312 jmduarteg 63 fcif.close()
313 jmduarteg 1 return pmid
314    
315     def cleanUpString(string):
316     """
317     Escapes [[ and {{ from given string using <nowiki> tags and strips new lines from given string
318     """
319     # taking care of "[[" or "{{" that have to be escaped with <nowiki> </nowiki>
320     if (string.find("[[")>0 or string.find("{{")>0):
321     string="<nowiki>%s</nowiki>"%string
322     # strip new lines
323     return re.sub("\n","",string)
324    
325    
326     # ---------------- main ---------------------
327     if __name__ == '__main__':
328     try:
329     opts, args = getopt.getopt(sys.argv[1:], "l:o:u", ["listfile=","outfile=","update="])
330     except getopt.GetoptError:
331     sys.stderr.write("Arguments not correctly specified\n")
332     # print help information and exit:
333     sys.stderr.write(__doc__)
334     sys.exit(2)
335     listfile=""
336     outfile=""
337     update = False
338     for o, a in opts:
339     if o in ("-l","--listfile"):
340     listfile = a
341     if o in ("-o","--outfile"):
342     outfile = a
343     if o in ("-u","--update"):
344     update = True
345    
346     if (listfile=="" or outfile==""):
347     print "Must provide a list file with -l and an outfile with -o"
348     sys.exit(1)
349    
350    
351    
352     ## connecting to our database to get pdb data
353     conn= MySQLdb.connect (db = PDBASEDB, read_default_file="~/.my.cnf")
354     ## connect to pdb web services
355     #server = SOAPpy.SOAPProxy("http://www.pdb.org/pdb/services/pdbws")
356    
357     #site = wikipedia.getSite()
358    
359     out = open(outfile,"w")
360    
361     lfile=open(listfile,"r")
362     lines=lfile.readlines()
363     count=0
364     for line in lines:
365     count=count+1
366     pdbCode=line.strip()
367    
368     #print pdbCode
369    
370     # getting data
371     (title,depositionDate,releaseDate,lastRevisionDate,lastRevision,expmethod,resol,primcit,primauthors,classif,sequences,source,unps,ecs,uniquechains) = getPdbaseData(conn,pdbCode)
372    
373     pmid = getPmidFromCifFile(pdbCode)
374     pmurl=""
375     if (pmid!=0):
376     pmurl = "[%s%s PubMed]"%(PMURL,pmid)
377    
378     # creating template string
379    
380 jmduarteg 66 # hard-coded fix to entry 1ubp whose citation title contains a beta character (code 0xdf i.e. 223) that breaks pywikipedia's upload script
381     # 1ubp is the one and only such case found in pdbase
382     primcit=re.sub(chr(223),"beta",primcit)
383    
384 jmduarteg 1 # 1st we create the sequence string
385     sequenceStr = getSequenceStr(pdbCode,sequences,ecs,unps,uniquechains)
386    
387     # and we get a unique list of ec numbers for this pdbCode (for the category)
388     ecSet = {}
389     for pdbChainCode in ecs:
390     for ec in ecs[pdbChainCode]:
391     ecSet[ec]=""
392    
393     pdbidx = pdbCode[1:3].upper()
394    
395     template = """
396     {{PDB entry
397     | PDBID=%s
398     | PDBIDX=%s
399     | TITLE=%s
400     | DEPOSITION_DATE=%s
401     | RELEASE_DATE=%s
402     | LAST_REVISION_DATE=%s
403     | LAST_REVISION=%s
404     | AUTHORS=%s
405     | CITATION=%s
406     | PMID=%s
407     | PMURL=%s
408     | EXP_METHOD=%s
409     | RESOLUTION=%s
410     | CLASSIFICATION=%s
411     | SEQUENCE=%s
412     | SOURCE=%s
413     }}
414     """%(pdbCode,pdbidx,title,depositionDate,releaseDate,lastRevisionDate,lastRevision,primauthors,primcit,pmid,pmurl,expmethod,resol,classif,sequenceStr,source)
415    
416    
417     # start tag
418     out.write("<page>\n")
419    
420     # title (now using -notitle argument in pagefromfile.py)
421     out.write("'''%s'''"%pdbCode)
422    
423     # main template
424     out.write("%s\n"%template)
425    
426     # categories (only EC because it's an array and arrays can't be passed in the template,
427     # all other categories are done through the template)
428     for ec in ecSet:
429     out.write("[[Category:EC %s|%s]]\n"%(ec,pdbidx))
430    
431    
432     if (update):
433     time.localtime(time.time())
434     timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))
435     out.write("{{PDB entry updated| TIMESTAMP=%s|PDBIDX=%s }}\n"%(timestamp,pdbidx))
436    
437     # end tag
438     out.write("</page>\n")
439    
440     ## wiki uploading
441     #page = wikipedia.Page(site, pdbCode)
442     # to get the page would be:
443     #text = page.get()
444     #print(text)
445    
446     #page.put(template)
447     print("Done generating entry pages: %s pages generated"%count)
448     out.close()

Properties

Name Value
svn:executable