ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/pymsxml/pymsxml.py
Revision: 1.7
Committed: Wed Jul 25 14:14:16 2007 UTC (15 years, 2 months ago) by nje01
Branch: MAIN
Changes since 1.6: +103 -81 lines
Log Message:
*** empty log message ***

Line User Rev File contents
1 nje01 1.1 #!/usr/bin/env python
2    
3     import sys,os,os.path
4     import re
5     import sha
6     import math
7     import tempfile
8     import csv
9     import gzip
10 nje01 1.3 import zlib
11 nje01 1.1 from array import array
12     from base64 import b64encode,b64decode
13     from urllib import quote,unquote
14     from xml.sax import *
15    
16     class NameVersion:
17     def __init__(self):
18     self.name_ = "PyMsXML"
19     self.majorVer = 0
20 nje01 1.3 self.minorVer = 5
21 nje01 1.7 self.revVer = 3
22 nje01 1.1 def version(self):
23     return "%d.%d.%d"%(self.majorVer,self.minorVer,self.revVer)
24     def name(self):
25     return self.name_
26    
27     def fileSHA(filename):
28     s = sha.new()
29     h = open(filename,'rb')
30     while True:
31     buffer = h.read(1024)
32     if buffer == '':
33     break
34     s.update(buffer)
35     h.close()
36     return s.hexdigest().lower()
37    
38     class ToMzXML:
39 nje01 1.3 def __init__(self,reader,filename,cmpfmt=None,filt=None,peaksCompress=False,version="3.0"):
40 nje01 1.1 self.filename = filename
41     self.compressfmt = cmpfmt
42     self.reader = reader
43     self.initSHA()
44     self.writeByteCounter = 0
45     self.actualScanCount = 0
46     self.scanIndices = {}
47     self.filter = filt
48 nje01 1.3 if not version in ('2.1','2.2','3.0'):
49     raise "Bad mzXML version \"%s\""%(version,)
50     self.version = version
51     if peaksCompress and float(self.version) < 3.0:
52     raise "Cannot compress peaks until mzXML version 3.0"
53     self.peakcompress = peaksCompress
54 nje01 1.1
55     def __del__(self):
56     if hasattr(self,'tmpFileName') and self.tmpFileName and self.tmpFileName != '':
57     os.unlink(self.tmpFileName)
58    
59     def getFilename(self):
60     return self.filename
61    
62     def initSHA(self):
63     self.sha1 = sha.new()
64    
65     def updateSHA(self,s):
66     self.sha1.update(s)
67    
68     def getSHA(self):
69     return self.sha1.hexdigest().lower()
70    
71     def writeString(self,fh,data):
72     self.writeByteCounter += len(data)
73     self.sha1.update(data)
74     fh.write(data)
75    
76     def write(self,debug=False):
77    
78     # Make a temporary file for the scan data to count the number of spectra.
79     (tmpFileFD,self.tmpFileName) = tempfile.mkstemp(dir='.',prefix='.pymsxml')
80     tmpFile = os.fdopen(tmpFileFD,'wb')
81    
82     # Has a number of side effects, it fills in scanIndices, and sets self.actualScanCount.
83     self.write_scans(tmpFile,debug)
84    
85     tmpFile.close()
86    
87     # Reset!
88    
89     self.initSHA()
90     self.writeByteCounter = 0
91    
92     if self.compressfmt == None and not self.filename.endswith('.gz'):
93     xmlFile = open(self.filename,'wb')
94     elif self.compressfmt == 'gz' or self.filename.endswith('.gz'):
95     if self.filename.endswith('.gz'):
96     xmlFile = gzip.open(self.filename,'wb')
97     else:
98     xmlFile = gzip.open(self.filename+'.gz','wb')
99     else:
100     print >>sys.stderr, "Bad compressfmt specification"
101     sys.exit(1)
102    
103     self.writeString (xmlFile,'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
104    
105     outStr = "<mzXML "
106 nje01 1.3 outStr += "xmlns=\"http://sashimi.sourceforge.net/schema_revision/mzXML_%s\" "%(self.version,)
107 nje01 1.1 outStr += "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
108 nje01 1.3 outStr += "xsi:schemaLocation=\"http://sashimi.sourceforge.net/schema_revision/mzXML_%s http://sashimi.sourceforge.net/schema_revision/mzXML_%s/mzXML_idx_%s.xsd\""%(self.version,self.version,self.version)
109 nje01 1.1 outStr += ">\n"
110    
111     self.writeString (xmlFile,outStr)
112    
113     outStr = "<msRun"
114     outStr += " scanCount=\"%d\"" % (self.actualScanCount,)
115 nje01 1.3 md = self.reader.getMsRunMetaData()
116     for (k,v) in md.items():
117 nje01 1.1 k,f = k.split(':')
118 nje01 1.3 if self.version not in ('2.2',) or (k != 'startTime'and k != 'endTime'):
119     outStr += (" %%s=\"%s\""%(f,))%(k,v)
120 nje01 1.1 outStr += ">\n"
121    
122     self.writeString (xmlFile,outStr)
123    
124     filehashes = {}
125    
126     for f in self.reader.getFilenames():
127     if len(f) == 2:
128     outStr = "<parentFile "
129     outStr += "fileName=\"%s\" " % (quote(f[0]),)
130     outStr += "fileType=\"RAWData\" "
131     outStr += "fileSha1=\"%s\"" % (f[1],)
132     outStr += "/>\n"
133     else:
134     outStr = "<parentFile "
135     outStr += "fileName=\"%s\" " % (f[0],)
136     outStr += "fileType=\"%s\" " % (f[1],)
137     outStr += "fileSha1=\"%s\"" % (f[2],)
138     outStr += "/>\n"
139     self.writeString (xmlFile,outStr)
140 nje01 1.3
141     if float(self.version) >= 3.0:
142     outStr = "<msInstrument msInstrumentID=\"%s\">\n" % (self.reader.msInstrumentID(),)
143     else:
144     outStr = "<msInstrument>\n"
145    
146 nje01 1.1 outStr += "<msManufacturer category=\"msManufacturer\" value=\"%s\"/>\n" % (self.reader.msManufacturer(),)
147     outStr += "<msModel category=\"msModel\" value=\"%s\"/>\n" % (self.reader.msModel(),)
148     if self.reader.msIonisation():
149     outStr += "<msIonisation category=\"msIonisation\" value=\"%s\"/>\n" % (self.reader.msIonisation(),)
150     if self.reader.msMassAnalyzer():
151     outStr += "<msMassAnalyzer category=\"msMassAnalyzer\" value=\"%s\"/>\n" % (self.reader.msMassAnalyzer(),)
152     if self.reader.msDetector():
153     outStr += "<msDetector category=\"msDetector\" value=\"%s\"/>\n" % (self.reader.msDetector(),)
154    
155     self.writeString (xmlFile,outStr)
156    
157     outStr = "<software "
158     outStr += "type=\"acquisition\" "
159     outStr += "name=\"%s\" "%(self.reader.acquisitionSoftware(),)
160     outStr += "version=\"%s\""%(self.reader.acquisitionSoftwareVersion(),)
161     outStr += "/>\n"
162    
163     self.writeString (xmlFile,outStr)
164    
165     outStr = "</msInstrument>\n"
166    
167     self.writeString (xmlFile,outStr)
168    
169     outStr = "<dataProcessing>\n"
170    
171     self.writeString (xmlFile,outStr)
172    
173     nv = NameVersion()
174    
175     outStr = "<software "
176     outStr += "type=\"conversion\" "
177     outStr += "name=\"%s\" "%(nv.name(),)
178     outStr += "version=\"%s\"" % (nv.version(),)
179     outStr += "/>\n"
180    
181     self.writeString (xmlFile,outStr)
182    
183     if (self.reader.peakDetect(1)):
184     outStr = "<processingOperation "
185     outStr += "name=\"peak_detection_level_1\" "
186     outStr += "value=\"true\""
187     outStr += "/>\n"
188     outStr += "<processingOperation "
189     outStr += "name=\"peak_detection_level_1_threshold\" "
190     outStr += "value=\"1%\""
191     outStr += "/>\n"
192     outStr += "<processingOperation "
193     outStr += "name=\"peak_detection_level_1_software\" "
194 nje01 1.3 outStr += "value=\"%s\""%(self.reader.acquisitionSoftware(),)
195 nje01 1.1 outStr += "/>\n"
196     self.writeString (xmlFile,outStr)
197     if (self.reader.peakDetect(2)):
198     outStr = "<processingOperation "
199     outStr += "name=\"peak_detection_level_2\" "
200     outStr += "value=\"true\""
201     outStr += "/>\n"
202     outStr += "<processingOperation "
203     outStr += "name=\"peak_detection_level_2_threshold\" "
204     outStr += "value=\"1%\""
205     outStr += "/>\n"
206     outStr += "<processingOperation "
207     outStr += "name=\"peak_detection_level_2_software\" "
208 nje01 1.3 outStr += "value=\"%s\""%(self.reader.acquisitionSoftware(),)
209 nje01 1.1 outStr += "/>\n"
210     self.writeString (xmlFile,outStr)
211    
212     outStr = "<processingOperation "
213     outStr += "name=\"min_peaks_per_spectra\" "
214     outStr += "value=\"1\""
215     outStr += "/>\n"
216    
217     self.writeString (xmlFile,outStr)
218    
219     outStr = "</dataProcessing>\n"
220    
221     self.writeString (xmlFile,outStr)
222    
223     if self.reader.maldi():
224     outStr = '<spotting>\n'
225     for d in self.reader.plateData():
226     outStr += '<plate plateID="%s" spotXCount="%s" spotYCount="%s">\n' % \
227     (d['plateID'],d['spotXCount'],d['spotYCount'])
228     outStr += '<plateManufacturer category="plateManufacturer" value="%s"/>\n' % \
229     (d['plateManufacturer'],)
230     outStr += '<plateModel category="plateModel" value="%s"/>\n' % \
231     (d['plateModel'],)
232     for s in self.reader.spotData(d['plateID']):
233     outStr += '<spot spotID="%s" spotXPosition="%s" spotYPosition="%s">\n' % \
234     (s['spotID'],s['spotXPosition'],s['spotYPosition'])
235     outStr += '<maldiMatrix category="maldiMatrix" value="%s"/>\n' % \
236     (s['maldiMatrix'],)
237     outStr += '</spot>\n'
238     outStr += '</plate>\n'
239     outStr += '</spotting>\n'
240     self.writeString(xmlFile,outStr)
241    
242     if self.reader.lc():
243     pass
244    
245     scanOffset = self.writeByteCounter
246    
247     tmpFile = open(self.tmpFileName,'rb')
248     while True:
249     tmpStr = tmpFile.read(1024)
250     if tmpStr == '':
251     break
252     self.writeString(xmlFile,tmpStr)
253     tmpFile.close()
254     os.unlink(self.tmpFileName)
255     self.tmpFileName = ''
256    
257     outStr = "</msRun>\n"
258     self.writeString (xmlFile,outStr)
259    
260     indexOffset = self.writeByteCounter
261    
262     outStr = "<index "
263     outStr += "name=\"scan\" "
264     outStr += ">\n"
265    
266     self.writeString (xmlFile,outStr)
267    
268     for i in xrange(1,self.actualScanCount+1):
269     outStr = "<offset id=\"%d\">" % (i,)
270     outStr += "%d</offset>\n" % (self.scanIndices[i] + scanOffset,)
271     self.writeString(xmlFile,outStr)
272    
273     outStr = "</index>\n"
274     self.writeString (xmlFile,outStr)
275    
276     outStr = "<indexOffset>%d</indexOffset>\n" % (indexOffset,)
277    
278     self.writeString (xmlFile,outStr)
279    
280     self.writeString (xmlFile,"<sha1>")
281    
282     outStr = self.getSHA()
283    
284     self.writeString (xmlFile,outStr)
285    
286     self.writeString (xmlFile,"</sha1>\n")
287    
288     outStr = "</mzXML>\n"
289     self.writeString (xmlFile,outStr)
290    
291     xmlFile.close()
292    
293     def write_scans(self,xmlFile,debug=False):
294    
295     msLevel = 0
296     scanNumber = 0
297     ancestors = []
298     self.writeByteCounter = 0
299    
300     for (s,d) in self.reader.spectra():
301    
302     if self.filter != None and not self.filter.test(d):
303     continue
304    
305     if debug and scanNumber >= 10:
306     break
307    
308     if not d.has_key('msLevel'):
309     print >>sys.stderr, "Required scan attributes missing."
310     sys.exit(1)
311    
312     prevLevel = msLevel
313     msLevel = d['msLevel']
314    
315     if prevLevel < msLevel and prevLevel > 0:
316     # We went "in" a level, push scan number of parent
317     ancestors.append((scanNumber,prevSpec))
318     elif prevLevel > msLevel and msLevel > 0:
319     if len(ancestors) == 0:
320     pass #print >>sys.stderr, "No ancestor for scan %s at level %s"%(scanNumber,msLevel)
321     else:
322     ancestors.pop()
323    
324     outStr = ''
325     if prevLevel > msLevel:
326     for m in xrange(0,prevLevel-msLevel+1):
327     outStr += "</scan>\n"
328     else:
329     if prevLevel > 0 and prevLevel == msLevel:
330     outStr += "</scan>\n"
331    
332     self.writeString(xmlFile,outStr)
333    
334     scanNumber = scanNumber + 1
335    
336     self.scanIndices[scanNumber] = self.writeByteCounter
337    
338     totIonCurrent = 0
339     maxmz = None
340     minmz = None
341     basePeakMz = None
342     basePeakIntensity = None
343     peaksCount = 0
344     for i in xrange(0,len(s),2):
345     x = s[i]; y = s[i+1]
346     if minmz is None or minmz > x:
347     minmz = x
348     if maxmz is None or maxmz < x:
349     maxmz = x
350     totIonCurrent += y
351     if basePeakIntensity is None or basePeakIntensity < y:
352     basePeakIntensity = y
353     basePeakMz = x
354     peaksCount += 1
355    
356     outStr = "<scan num=\"%d\" msLevel=\"%d\"" % (scanNumber,msLevel)
357     outStr += " peaksCount=\"%d\"" % (peaksCount,)
358     outStr += " lowMz=\"%f\"" % (minmz,)
359     outStr += " highMz=\"%f\"" % (maxmz,)
360     outStr += " totIonCurrent=\"%f\"" % (totIonCurrent,)
361     outStr += " basePeakMz=\"%f\"" % (basePeakMz,)
362     outStr += " basePeakIntensity=\"%f\"" % (basePeakIntensity,)
363    
364     for (k,v) in d.items():
365     if k.startswith('scan.'):
366     k,f = k.split(':')
367     k = k[5:]
368     outStr += (" %%s=\"%s\""%(f,))%(k,v)
369     outStr += ">\n"
370     self.writeString(xmlFile,outStr)
371    
372 nje01 1.3 if self.version in ('2.2',):
373     any = False
374     for (k,v) in d.items():
375     if k.startswith('scanOrigin.'):
376     if not any:
377     any = True
378     outStr = "<scanOrigin"
379     k,f = k.split(':')
380     k = k[11:]
381     outStr += (" %%s=\"%s\""%(f,))%(k,v)
382     if any:
383     outStr += "/>\n"
384     self.writeString(xmlFile,outStr)
385 nje01 1.2
386 nje01 1.1 if msLevel > 1:
387    
388     if not d.has_key('precursorMz') :
389     print >>sys.stderr, "Required precursorMz attribute missing."
390     sys.exit(1)
391    
392     # We scan the parent spectrum in the region of the
393     # precursorMz to establish the intensity. Optionally,
394     # we tune the precursorMz itself, on the basis of this
395    
396     pMz = d['precursorMz']
397     tol = self.reader.precursorPrecision(pMz)
398     pMzLB = pMz-tol
399     pMzUB = pMz+tol
400     pMzIntensity = 0
401    
402     if len(ancestors) > 0:
403     for i in xrange(0,len(ancestors[-1][1]),2):
404     x = ancestors[-1][1][i]; y = ancestors[-1][1][i+1];
405     if x < pMzLB:
406     continue
407     if x > pMzUB:
408     break
409     if pMzIntensity < y:
410     pMzIntensity = y
411     pMzTune = x
412    
413     if self.reader.precursorTune():
414     pMz = pMzTune
415    
416     outStr = "<precursorMz"
417     for (k,v) in d.items():
418     if k.startswith('precursorMz.'):
419     k,f = k.split(':')
420     k = k[12:]
421     outStr += (" %%s=\"%s\""%(f,))%(k,v)
422     if len(ancestors)>0:
423     outStr += " precursorScanNum=\"%d\""%(ancestors[-1][0],)
424     outStr += " precursorIntensity=\"%f\">"%(pMzIntensity,)
425     else:
426     outStr += ">"
427     outStr += "%f</precursorMz>\n" % (pMz,)
428     self.writeString(xmlFile,outStr)
429    
430 nje01 1.3 if self.reader.maldi() and self.version in ('2.2',):
431 nje01 1.1 outStr = '<maldi'
432     for (k,v) in d.items():
433     if k.startswith('maldi.'):
434     k,f = k.split(':')
435     k = k[6:]
436     outStr += (" %%s=\"%s\""%(f,))%(k,v)
437     outStr += " plateID=\"%s\""%(d['plateID'],)
438     outStr += " spotID=\"%s\""%(d['spotID'],)
439     outStr += "/>\n"
440     self.writeString(xmlFile,outStr)
441    
442 nje01 1.3 if sys.byteorder != 'big':
443     s.byteswap()
444    
445     if debug:
446     s = s[:20]
447    
448     specstr = s.tostring()
449     if self.peakcompress:
450     specstr = zlib.compress(specstr,9)
451     outStr = "<peaks precision=\"32\" byteOrder=\"network\" contentType=\"m/z-int\" compressionType=\"zlib\" compressedLen=\"%d\">"%(len(specstr),)
452     else:
453     if float(self.version) >= 3.0:
454     outStr = "<peaks precision=\"32\" byteOrder=\"network\" contentType=\"m/z-int\" compressionType=\"none\" compressedLen=\"%d\">"%(len(specstr),)
455     else:
456     outStr = "<peaks precision=\"32\" byteOrder=\"network\" pairOrder=\"m/z-int\">"
457    
458 nje01 1.1 self.writeString(xmlFile,outStr)
459    
460     # Spec says the byte order shall be
461     # network, which is the same as big endian.
462    
463     if sys.byteorder != 'big':
464     s.byteswap()
465    
466 nje01 1.3 outStr = b64encode(specstr)
467     self.writeString(xmlFile,outStr)
468 nje01 1.1
469     outStr = "</peaks>\n"
470     self.writeString(xmlFile,outStr)
471    
472 nje01 1.3 if self.reader.maldi() and self.version not in ('2.2',):
473 nje01 1.1 outStr = ''
474     for (k,v) in d.items():
475     if k.startswith('maldi.'):
476     k,f = k.split(':')
477     k = k[6:]
478     outStr += '<nameValue'
479 nje01 1.3 outStr += ' name="maldi.%s"'%(k,)
480     outStr += (' value="%s"'%(f,))%(v,)
481     outStr += '/>\n'
482     outStr += '<nameValue'
483     outStr += ' name="maldi.plateID"'
484     outStr += ' value="%s"'%(d['plateID'],)
485     outStr += '/>\n'
486     outStr += '<nameValue'
487     outStr += ' name="maldi.spotID"'
488     outStr += ' value="%s"'%(d['spotID'],)
489     outStr += '/>\n'
490     self.writeString(xmlFile,outStr)
491    
492     if self.version not in ('2.2',):
493     outStr = ''
494     for (k,v) in d.items():
495     if k.startswith('scanOrigin.'):
496     k,f = k.split(':')
497     k = k[11:]
498     outStr += '<nameValue'
499     outStr += ' name="scanOrigin.%s"'%(k,)
500     outStr += (' value="%s"'%(f,))%(v,)
501 nje01 1.1 outStr += '/>\n'
502 nje01 1.3 self.writeString(xmlFile,outStr)
503    
504     outStr = ''
505     for (k,v) in d.items():
506     if k.startswith('nameValue.'):
507     k,f = k.split(':')
508     k = k[10:]
509     outStr += '<nameValue'
510     outStr += ' name="%s"'%(k,)
511     outStr += (' value="%s"'%(f,))%(v,)
512     outStr += '/>\n'
513     if len(outStr) > 0:
514 nje01 1.1 self.writeString(xmlFile,outStr)
515    
516     xmlFile.flush()
517    
518     prevSpec = s
519    
520     outStr = ""
521 nje01 1.4 for m in xrange(0,len(ancestors)+1):
522 nje01 1.1 outStr += "</scan>\n"
523    
524     self.writeString(xmlFile,outStr)
525     self.actualScanCount = scanNumber
526    
527     class ToMzData:
528     def __init__(self,reader,filename,cmpfmt=None,filt=None):
529     self.filename = filename
530     self.compressfmt = cmpfmt
531     self.reader = reader
532     self.actualScanCount = 0
533     self.initSHA()
534     self.filter = filt
535    
536     def writeString(self,fh,data):
537     # self.writeByteCounter += len(data)
538     self.sha1.update(data)
539     fh.write(data)
540    
541     def initSHA(self):
542     self.sha1 = sha.new()
543    
544     def updateSHA(self,s):
545     self.sha1.update(s)
546    
547     def getSHA(self):
548     return self.sha1.hexdigest().lower()
549    
550     def write(self,debug=False):
551    
552     # Make a temporary file for the scan data to count the number of spectra.
553     (tmpFileFD,self.tmpFileName) = tempfile.mkstemp(dir='.',prefix='.pymsxml')
554     tmpFile = os.fdopen(tmpFileFD,'wb')
555    
556     # Has a number of side effects, it fills in scanIndices, and sets self.actualScanCount.
557     self.write_scans(tmpFile,debug)
558    
559     tmpFile.close()
560    
561     scanhash = self.getSHA()
562    
563     self.initSHA()
564    
565     if self.compressfmt == None and not self.filename.endswith('.gz'):
566     xmlFile = open(self.filename,'wb')
567     elif self.compressfmt == 'gz' or self.filename.endswith('.gz'):
568     if self.filename.endswith('.gz'):
569     xmlFile = gzip.open(self.filename,'wb')
570     else:
571     xmlFile = gzip.open(self.filename+'.gz','wb')
572     else:
573     print >>sys.stderr, "Bad compressfmt specification"
574     sys.exit(1)
575    
576     self.writeString (xmlFile,'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
577    
578     outStr = "<mzData "
579     outStr += "version=\"1.05\" "
580     outStr += "accessionNumber=\"pymsxml:%s\" "%(scanhash,)
581     outStr += "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
582     # outStr += "xmlns=\"http://psidev.sourceforge.net/ms/xml/mzdata\" "
583     outStr += "xsi:noNamespaceSchemaLocation=\"http://psidev.sourceforge.net/ms/xml/mzdata/mzdata.xsd\""
584     outStr += ">\n"
585    
586     self.writeString (xmlFile,outStr)
587    
588     outStr = "<description>\n"
589     outStr += "<admin>\n"
590     outStr += "<sampleName></sampleName>\n"
591     outStr += "<contact>\n"
592     outStr += "<name></name>\n"
593     outStr += "<institution></institution>\n"
594     outStr += "</contact>\n"
595     outStr += "</admin>\n"
596     outStr += "<instrument>\n"
597     outStr += "<instrumentName></instrumentName>\n"
598     outStr += "<source>\n"
599     outStr += "</source>\n"
600     outStr += "<analyzerList count=\"1\">\n"
601     outStr += "<analyzer>\n"
602     outStr += "</analyzer>\n"
603     outStr += "</analyzerList>\n"
604     outStr += "<detector>\n"
605     outStr += "</detector>\n"
606     outStr += "</instrument>\n"
607     outStr += "<dataProcessing>\n"
608     outStr += "<software>\n"
609     outStr += "<name>%s</name>\n"%(self.reader.acquisitionSoftware(),)
610     outStr += "<version>%s</version>\n"%(self.reader.acquisitionSoftwareVersion(),)
611     outStr += "</software>\n"
612     outStr += "<software>\n"
613     nv = NameVersion()
614     outStr += "<name>%s</name>\n"%(nv.name(),)
615     outStr += "<version>%s</version>\n"%(nv.version(),)
616     outStr += "</software>\n"
617     outStr += "</dataProcessing>\n"
618     outStr += "</description>\n"
619    
620     self.writeString (xmlFile,outStr)
621    
622     outStr = "<spectrumList"
623     outStr += " count=\"%d\">\n"%(self.actualScanCount,)
624    
625     self.writeString (xmlFile,outStr)
626    
627     tmpFile = open(self.tmpFileName,'rb')
628     while True:
629     tmpStr = tmpFile.read(1024)
630     if tmpStr == '':
631     break
632     self.writeString(xmlFile,tmpStr)
633     tmpFile.close()
634     os.unlink(self.tmpFileName)
635     self.tmpFileName = ''
636    
637     outStr = "</spectrumList>\n</mzData>\n"
638    
639     self.writeString (xmlFile,outStr)
640    
641     def write_scans(self,xmlFile,debug=False):
642    
643     msLevel = 0
644     scanNumber = 0
645     ancestors = []
646    
647     self.initSHA()
648     # self.writeByteCounter = 0
649    
650     for (s,d) in self.reader.spectra():
651    
652     if self.filter != None and not self.filter.test(d):
653     continue
654    
655     if debug and scanNumber >= 10:
656     break
657    
658     if not d.has_key('msLevel'):
659     print >>sys.stderr, "Required scan attributes missing."
660     sys.exit(1)
661    
662     prevLevel = msLevel
663     msLevel = d['msLevel']
664    
665     if prevLevel < msLevel and prevLevel > 0:
666     # We went "in" a level, push scan number of parent
667     ancestors.append((scanNumber,prevSpec))
668     elif prevLevel > msLevel and msLevel > 0:
669     ancestors.pop()
670    
671     scanNumber = scanNumber + 1
672    
673     totIonCurrent = 0
674     maxmz = None
675     minmz = None
676     basePeakMz = None
677     basePeakIntensity = None
678     peaksCount = 0
679     for i in xrange(0,len(s),2):
680     x = s[i]; y = s[i+1]
681     if minmz is None or minmz > x:
682     minmz = x
683     if maxmz is None or maxmz < x:
684     maxmz = x
685     totIonCurrent += y
686     if basePeakIntensity is None or basePeakIntensity < y:
687     basePeakIntensity = y
688     basePeakMz = x
689     peaksCount += 1
690    
691     outStr = "<spectrum id=\"%d\">\n" % (scanNumber,)
692     outStr += "<spectrumDesc>\n"
693     outStr += "<spectrumSettings>\n"
694    
695     self.writeString (xmlFile,outStr)
696    
697     # outStr += "<acqSpecification>\n"
698     # outStr += "</acqSpecification>\n"
699    
700     outStr = "<spectrumInstrument"
701     outStr += " msLevel=\"%d\""%(msLevel,)
702     if d.has_key('scan.startMz') and \
703     d.has_key('scan.endMz'):
704     outStr += " mzRangeStart=\"%f\""%(d['scan.startMz'],)
705     outStr += " mzRangeStop=\"%f\""%(d['scan.endMz'],)
706     outStr += ">\n"
707     for (k,v) in d.items():
708     if k.startswith('scan.'):
709     k,f = k.split(':')
710     k = k[5:]
711    
712     if k == "polarity":
713     if v == '+':
714     outStr += "<cvParam cvLabel=\"\" accession=\"\" name=\"Polarity\" value=\"%s\"/>\n"%("Positive",)
715     elif v == '-':
716     outStr += "<cvParam cvLabel=\"\" accession=\"\" name=\"Polarity\" value=\"%s\"/>\n"%("Negative",)
717     elif k == 'retentionTime':
718     outStr += "<cvParam cvLabel=\"\" accession=\"\" name=\"TimeInSeconds\" value=\"%f\"/>\n"%(v,)
719     else:
720     outStr += ("<userParam name=\"%%s\" value=\"%s\"/>\n"%(f,))%(k,v)
721     outStr += "<userParam name=\"lowMz\" value=\"%f\"/>\n"%(minmz,)
722     outStr += "<userParam name=\"highMz\" value=\"%f\"/>\n"%(maxmz,)
723     outStr += "<userParam name=\"totIonCurrent\" value=\"%f\"/>\n"%(totIonCurrent,)
724     outStr += "<userParam name=\"basePeakMz\" value=\"%f\"/>\n"%(basePeakMz,)
725     outStr += "<userParam name=\"basePeakIntensity\" value=\"%f\"/>\n"%(basePeakIntensity,)
726     outStr += "</spectrumInstrument>\n"
727     outStr += "</spectrumSettings>\n"
728    
729     self.writeString (xmlFile,outStr)
730    
731     if msLevel > 1:
732    
733     if not d.has_key('precursorMz') :
734     print >>sys.stderr, "Required precursorMz attribute missing."
735     sys.exit(1)
736    
737     # We scan the parent spectrum in the region of the
738     # precursorMz to establish the intensity. Optionally,
739     # we tune the precursorMz itself, on the basis of this
740    
741     pMz = d['precursorMz']
742     tol = self.reader.precursorPrecision(pMz)
743     pMzLB = pMz-tol
744     pMzUB = pMz+tol
745     pMzIntensity = 0
746    
747     for i in xrange(0,len(ancestors[-1][1]),2):
748     x = ancestors[-1][1][i]; y = ancestors[-1][1][i+1];
749     if x < pMzLB:
750     continue
751     if x > pMzUB:
752     break
753     if pMzIntensity < y:
754     pMzIntensity = y
755     pMzTune = x
756    
757     if self.reader.precursorTune():
758     pMz = pMzTune
759    
760     outStr = "<precursorList count=\"1\">\n"
761    
762     outStr += "<precursor"
763     outStr += " msLevel=\"%d\""%(msLevel,)
764     outStr += " spectrumRef=\"%d\""%(ancestors[-1][0],)
765     outStr += ">\n"
766    
767     self.writeString(xmlFile,outStr)
768    
769     outStr = "<ionSelection>\n"
770     outStr += "<cvParam cvLabel=\"\" accession=\"\" name=\"MassToChargeRatio\" value=\"%f\"/>\n"%(pMz,)
771     outStr += "<cvParam cvLabel=\"\" accession=\"\" name=\"Intensity\" value=\"%f\"/>\n"%(pMzIntensity,)
772     for (k,v) in d.items():
773     if k.startswith('precursorMz.'):
774     k,f = k.split(':')
775     k = k[12:]
776     outStr += ("<userParam name=\"%%s\" value=\"%s\"/>\n"%(f,))%(k,v)
777     outStr += "</ionSelection>\n"
778    
779     self.writeString(xmlFile,outStr)
780    
781     outStr = "<activation>\n"
782     outStr += "</activation>\n"
783     outStr += "</precursor>\n"
784     outStr += "</precursorList>\n"
785    
786     self.writeString(xmlFile,outStr)
787    
788     if self.reader.maldi():
789     outStr = '<maldi'
790     for (k,v) in d.items():
791     if k.startswith('maldi.'):
792     k,f = k.split(':')
793     k = k[6:]
794     outStr += (" %%s=\"%s\""%(f,))%(k,v)
795     outStr += " plateID=\"%s\""%(d['plateID'],)
796     outStr += " spotID=\"%s\""%(d['spotID'],)
797     outStr += "/>\n"
798     self.writeString(xmlFile,outStr)
799    
800     outStr = "</spectrumDesc>\n"
801     self.writeString(xmlFile,outStr)
802    
803     outStr = "<mzArrayBinary>\n"
804     outStr += "<data"
805     outStr += " precision=\"32\""
806     outStr += " endian=\"%s\""%(sys.byteorder,)
807     outStr += " length=\"%d\""%(peaksCount,)
808     outStr += ">"
809     self.writeString(xmlFile,outStr)
810    
811     outStr = b64encode(s[0:peaksCount*2:2].tostring())
812     if not debug:
813     self.writeString(xmlFile,outStr)
814     else:
815     self.writeString(xmlFile,outStr[0:100])
816    
817     outStr = "</data>\n</mzArrayBinary>\n"
818     self.writeString(xmlFile,outStr)
819    
820     outStr = "<intenArrayBinary>\n"
821     outStr += "<data"
822     outStr += " precision=\"32\""
823     outStr += " endian=\"%s\""%(sys.byteorder,)
824     outStr += " length=\"%d\""%(peaksCount,)
825     outStr += ">"
826     self.writeString(xmlFile,outStr)
827    
828     outStr = b64encode(s[1:peaksCount*2:2].tostring())
829     if not debug:
830     self.writeString(xmlFile,outStr)
831     else:
832     self.writeString(xmlFile,outStr[0:100])
833    
834     outStr = "</data>\n</intenArrayBinary>\n"
835     self.writeString(xmlFile,outStr)
836    
837     if self.reader.maldi():
838     outStr = ''
839     for (k,v) in d.items():
840     if k.startswith('maldi.'):
841     k,f = k.split(':')
842     k = k[6:]
843     outStr += '<nameValue'
844     outStr += ' name="%s"'%(k,)
845     outStr += (' value="%f"'%(f,))%(v,)
846     outStr += '/>\n'
847     ## outStr += '<nameValue'
848     ## outStr += ' name="plateID"'
849     ## outStr += ' value="%s"'%(d['plateID'],)
850     ## outStr += '/>\n'
851     ## outStr += '<nameValue'
852     ## outStr += ' name="spotID"'
853     ## outStr += ' value="%s"'%(d['spotID'],)
854     ## outStr += '/>\n'
855     self.writeString(xmlFile,outStr)
856    
857     outStr = "</spectrum>\n"
858     self.writeString(xmlFile,outStr)
859     xmlFile.flush()
860    
861     prevSpec = s
862    
863     self.actualScanCount = scanNumber
864    
865     class QStarWiff:
866     def __init__(self,filename,peaks=None):
867     self.filename = filename
868     self.filehash = fileSHA(filename)
869     self.theTIC = None
870     self.theWF = None
871     self.theFMANSpecData = None
872     self.startTime = None
873     self.stopTime = None
874 nje01 1.2 self.applyPeakDetect = []
875 nje01 1.1 if peaks:
876     self.applyPeakDetect = map(int,peaks.split(','))
877    
878     if not os.path.exists(self.filename):
879     print >>sys.stderr, "Filename %s does not exist."%(self.filename,)
880     sys.exit(1)
881    
882     self.filename = os.path.abspath(self.filename)
883    
884     def __del__(self):
885     # Probably unnecessary
886     self.close()
887    
888     def peakDetect(self,msLevel):
889     return ((msLevel in self.applyPeakDetect) and (msLevel != 1))
890    
891     def open(self):
892    
893     if self.theTIC is None:
894    
895     from win32com.client import Dispatch
896    
897     self.theFMANSpecData = Dispatch('Analyst.FMANSpecData')
898     self.theTIC = Dispatch('Analyst.FMANChromData')
899     self.theSPL = Dispatch('Analyst.SpectralPeakList')
900    
901     self.theFMANSpecData.WiffFileName = self.filename
902     self.theTIC.WiffFileName = self.filename
903     self.theWF = self.theFMANSpecData.GetWiffFileObject()
904    
905     self.theTIC.SetToTIC(1,0,0)
906    
907     self.startTime = self.theTIC.GetDataPointXValue(1)
908     self.stopTime = self.theTIC.GetDataPointXValue(self.theTIC.GetNumberOfDataPoints())
909    
910     def close(self):
911    
912     if not self.theTIC is None:
913    
914     if not self.theWF is None:
915     self.theWF.CloseWiffFile()
916     self.theTIC = None
917     self.theFMANSpecData = None
918     self.theWF = None
919    
920     def spectra(self):
921     self.open()
922     spectrumCount=0;
923    
924     numberOfSamples = self.theWF.GetActualNumberOfSamples()
925     for i in xrange(1,numberOfSamples+1):
926    
927     # print >>sys.stderr, "Sample %d"%i
928     # print >>sys.stderr, "Sample Name: %s"%(self.theWF.GetSampleName(i),)
929    
930     sampleName = self.theWF.GetSampleName(i)
931    
932     numberOfPeriods = self.theWF.GetActualNumberOfPeriods(i)
933     for j in xrange(0,numberOfPeriods):
934    
935     numberOfExperiments = self.theWF.GetNumberOfExperiments(i,j)
936     self.theTIC.SetToTIC(i, j, 0)
937    
938     numberOfTICDataPoints = self.theTIC.GetNumberOfDataPoints()
939     for l in xrange(1,numberOfTICDataPoints+1):
940    
941     for k in xrange(0,numberOfExperiments):
942    
943     self.theExperiment = self.theWF.GetExperimentObject(i, j, k)
944     self.theTIC.SetToTIC(i, j, k)
945    
946     # Convert to seconds?
947     xValue = self.theTIC.GetDataPointXValue(l) * 60
948    
949     spectrumCount += 1;
950    
951     # If any Ion current at time "l"
952     if self.theTIC.GetDataPointYValue(l) > 0:
953    
954     self.theFMANSpecData.SetSpectrum(i, j, k, xValue, xValue)
955    
956     if self.theExperiment.ScanType == 9:
957     msLevel = 2
958     m = re.search(r'[(]([0-9.]+)[)]',self.theFMANSpecData.DataTitle)
959     fixedMass = float(m.group(1))
960     else:
961     msLevel = 1
962    
963     startMass = self.theFMANSpecData.GetStartMass()
964     stopMass = self.theFMANSpecData.GetStopMass()
965     dataArr = array('f');
966     peakDetect = 'False'
967     if self.peakDetect(msLevel):
968     peakDetect = 'True'
969     minY,maxY = self.theFMANSpecData.GetYValueRange()
970 nje01 1.3 # print minY,maxY
971 nje01 1.1 self.theFMANSpecData.Threshold(maxY*0.01)
972     self.theSPL.FindPeaksInDataObject(self.theFMANSpecData,50.0)
973     numPeaks = self.theSPL.GetNumberOfPeaks()
974 nje01 1.3 # print numPeaks
975 nje01 1.1 for m in xrange(1,numPeaks+1):
976     (x,w,y,yp) = self.theSPL.GetPeak(m)
977     if y <= 0:
978     continue
979     dataArr.append(x)
980     dataArr.append(y)
981     else:
982     numPoints = self.theFMANSpecData.GetNumberOfDataPoints()
983     for m in xrange(1,numPoints+1):
984     if self.theFMANSpecData.GetDataPointYValue(m) <= 0:
985     continue
986     x = self.theFMANSpecData.GetDataPointXValue(m)
987     dataArr.append(x)
988     y = self.theFMANSpecData.GetDataPointYValue(m)
989     dataArr.append(y)
990     polarity = self.theExperiment.Polarity
991    
992     if len(dataArr) == 0:
993     continue
994    
995     if polarity == 0:
996     polarity_val = "+"
997     else:
998     polarity_val = "-"
999    
1000     rt = "PT%fS"%(xValue,)
1001    
1002     # Scan level attributes
1003     # Required by mzXML (no format spec. needed)
1004 nje01 1.3 d = {'msLevel':msLevel}
1005 nje01 1.1 # Optional ('scan.' and prefixformat spec. needed)
1006     d.update({
1007     'scan.retentionTime:PT%fS':xValue,
1008     'scan.polarity:%s':polarity_val,
1009     'scan.startMz:%f':startMass,
1010     'scan.endMz:%f':stopMass,
1011     })
1012 nje01 1.2 d.update({'scanOrigin.parentFileID:%s': self.filehash,
1013 nje01 1.1 'scanOrigin.num:%d': spectrumCount})
1014 nje01 1.3 d.update({'nameValue.sample:%d':i,
1015     'nameValue.period:%d':j,
1016     'nameValue.experiment:%d':k,
1017     'nameValue.sampleName:%s':sampleName})
1018    
1019 nje01 1.1 if msLevel == 1:
1020     yield (dataArr,d)
1021     else:
1022     # tandem MS scans add the following
1023     # Required by mzXML
1024     d.update({'precursorMz':fixedMass})
1025     # Optional
1026     # d.update({'precursorMz.precursorMzIntensity:f':0.0})
1027     yield (dataArr,d)
1028    
1029     def getMsRunMetaData(self):
1030     self.open()
1031     d = {'startTime:PT%fS':self.startTime,
1032     'endTime:PT%fS':self.stopTime,
1033     }
1034     return d
1035    
1036     def getFilenames(self):
1037     return [ (self.filename,self.filehash) ]
1038    
1039 nje01 1.3 def msInstrumentID(self):
1040     return "1"
1041    
1042 nje01 1.1 def msManufacturer(self):
1043     return "ABI / SCIEX"
1044    
1045     def msModel(self):
1046     return "QSTAR"
1047    
1048     def msIonisation(self):
1049     return "ESI"
1050    
1051     def msMassAnalyzer(self):
1052     return "Quadrupole"
1053    
1054     def msDetector(self):
1055 nje01 1.7 return None
1056 nje01 1.1
1057     def acquisitionSoftware(self):
1058     return "Analyst"
1059    
1060     def acquisitionSoftwareVersion(self):
1061     return "1.0"
1062    
1063     def precursorPrecision(self,mz):
1064     return .1
1065    
1066     def precursorTune(self):
1067     return True
1068    
1069     def lc(self):
1070     return True
1071    
1072     def maldi(self):
1073     return not self.lc()
1074    
1075     class mzXML_md_sax(handler.ContentHandler):
1076    
1077     def __init__(self):
1078     self.md = {}
1079     self.context = ''
1080    
1081     def startElement(self, name, attrs):
1082     self.context += ':%s'%(name,)
1083     # print >>sys.stderr, ">>",self.context
1084     # sys.stderr.flush()
1085     if self.context.endswith(':msRun'):
1086 nje01 1.4 if attrs.has_key('startTime'):
1087     self.md['startTime'] = float(attrs['startTime'][2:-1])
1088     if attrs.has_key('endTime'):
1089     self.md['endTime'] = float(attrs['endTime'][2:-1])
1090 nje01 1.1 elif self.context.endswith(':parentFile'):
1091 nje01 1.4 if not self.md.has_key('parentFile'):
1092     self.md['parentFile'] = []
1093     self.md['parentFile'].append((unquote(attrs['fileName']),attrs['fileType'],attrs['fileSha1']))
1094 nje01 1.1 elif self.context.endswith(':instrument'):
1095     self.md['msManufacturer'] = attrs['manufacturer']
1096     self.md['msModel'] = attrs['model']
1097     self.md['msIonisation'] = attrs['ionisation']
1098     self.md['msMassAnalyzer'] = attrs['msType']
1099     elif self.context.endswith(':msInstrument:msManufacturer'):
1100     self.md['msManufacturer'] = attrs['value']
1101     elif self.context.endswith(':msInstrument:msModel'):
1102     self.md['msModel'] = attrs['value']
1103     elif self.context.endswith(':msInstrument:msIonisation'):
1104     self.md['msIonisation'] = attrs['value']
1105     elif self.context.endswith(':msInstrument:msMassAnalyzer'):
1106     self.md['msMassAnalyzer'] = attrs['value']
1107     elif self.context.endswith(':msInstrument:msDetector'):
1108     self.md['msDetector'] = attrs['value']
1109     elif self.context.endswith(':msInstrument:software'):
1110     if attrs['type'] == 'acquisition':
1111     self.md['acquisitionSoftware'] = attrs['name']
1112     self.md['acquisitionSoftwareVersion'] = attrs['version']
1113     elif self.context.endswith(':instrument:software'):
1114     if attrs['type'] == 'acquisition':
1115     self.md['acquisitionSoftware'] = attrs['name']
1116     self.md['acquisitionSoftwareVersion'] = attrs['version']
1117     elif self.context.endswith(':scan'):
1118     raise SAXException("Early termination")
1119    
1120     def endElement(self, name):
1121     # print >>sys.stderr, "<<",self.context
1122     # sys.stderr.flush()
1123     self.context = self.context[0:-(len(name)+1)]
1124    
1125    
1126     class mzXML_spec_sax(handler.ContentHandler):
1127    
1128     def __init__(self):
1129     self.context = ''
1130     self.content = ''
1131     self.spectra = []
1132     self.done = False
1133     self.scancount = 0
1134     self.scanmax = 1000
1135     self.scanstart = 0
1136    
1137     def startElement(self, name, attrs):
1138     self.context += ':%s'%(name,)
1139     # print >>sys.stderr, ">>",self.context
1140     # sys.stderr.flush()
1141     self.content = ''
1142     if name == 'scan':
1143     self.scanmd = {}
1144     self.scanlevel = int(attrs['msLevel'])
1145     self.scannum = int(attrs['num'])
1146     if attrs.has_key('retentionTime'):
1147     self.rt = float(attrs['retentionTime'][2:-1])
1148 nje01 1.4 if attrs.has_key('endMz'):
1149     self.endMz = float(attrs['endMz'])
1150     if attrs.has_key('startMz'):
1151     self.startMz = float(attrs['startMz'])
1152 nje01 1.1 self.polarity = attrs.get('polarity',None)
1153     for (k,v) in attrs.items():
1154     self.scanmd[k] = v
1155     self.precursorMz = None
1156     self.scancount += 1
1157     elif name == 'msRun':
1158     self.scancount = 0
1159     self.spectra = []
1160     self.context = ':msRun'
1161     elif name == 'scanOrigin':
1162     self.scanmd['scanOrigin'] = attrs
1163 nje01 1.5 elif self.context.endswith(':scan:nameValue'):
1164     self.scanmd['nameValue.%s:%%s'%attrs['name']] = attrs['value']
1165 nje01 1.1
1166     def characters(self, content):
1167     if self.scancount >= self.scanstart:
1168     if self.context.endswith(':peaks') or \
1169     self.context.endswith(':precursorMz'):
1170     self.content += content
1171    
1172     def endElement(self, name):
1173     if self.scancount >= self.scanstart:
1174     if self.context.endswith('scan:peaks'):
1175    
1176 nje01 1.5 self.spec = array('f')
1177 nje01 1.1 # print >>sys.stderr, len(self.content), len(b64decode(self.content)), self.content
1178 nje01 1.5 self.spec.fromstring(b64decode(self.content))
1179 nje01 1.1 if sys.byteorder != 'big':
1180 nje01 1.5 self.spec.byteswap()
1181     elif self.context.endswith(':scan'):
1182 nje01 1.1 d = {'msLevel':self.scanlevel}
1183     # Optional ('scan.' and prefixformat spec. needed)
1184 nje01 1.4 if hasattr(self,'rt'):
1185     d.update({'scan.retentionTime:PT%fS':self.rt})
1186     if hasattr(self,'startMz'):
1187     d.update({'scan.startMz:%f':self.startMz})
1188     if hasattr(self,'endMz'):
1189     d.update({'scan.endMz:%f':self.endMz})
1190 nje01 1.1 if self.polarity != None:
1191     d.update({'scan.polarity:%s':self.polarity})
1192     if self.scanlevel == 2:
1193     d.update({'precursorMz':self.precursorMz})
1194     if self.scanmd.has_key('scanOrigin'):
1195 nje01 1.2 d.update({'scanOrigin.parentFileID:%s': self.scanmd['scanOrigin']['parentFileID'],
1196 nje01 1.5 'scanOrigin.num:%d': int(self.scanmd['scanOrigin']['num'])})
1197 nje01 1.1 for (k,v) in self.scanmd.items():
1198     if not d.has_key(k):
1199     d[k] = v
1200 nje01 1.5 self.spectra.append((self.spec,d))
1201 nje01 1.1 if len(self.spectra) >= self.scanmax:
1202     self.scanstart = self.scancount+1
1203     raise SAXException("Early termination")
1204     elif self.context.endswith('scan:precursorMz'):
1205     self.precursorMz = float(self.content)
1206     if self.context.endswith('msRun'):
1207     self.done = True
1208     # print >>sys.stderr, "<<",self.context
1209     # sys.stderr.flush()
1210     self.context = self.context[0:-(len(name)+1)]
1211    
1212     class FromMzXML:
1213     def __init__(self,filename):
1214     self.filename = filename
1215     self.parser = None
1216    
1217     if not os.path.exists(self.filename):
1218     print >>sys.stderr, "Filename %s does not exist."%(self.filename,)
1219     sys.exit(1)
1220    
1221     if self.filename.endswith('.gz'):
1222     self.compressfmt = 'gz'
1223     else:
1224     self.compressfmt = None
1225    
1226     def __del__(self):
1227     self.close()
1228    
1229     def open(self):
1230    
1231     if self.parser == None:
1232     # print >>sys.stderr, "Parsing out meta-data..."
1233     # sys.stderr.flush()
1234     self.parser = make_parser()
1235     self.handler = mzXML_md_sax()
1236     self.parser.setContentHandler(self.handler)
1237     try:
1238     if self.compressfmt == None:
1239     self.parser.parse(self.filename)
1240     elif self.compressfmt == 'gz':
1241     self.parser.parse(gzip.open(self.filename,'r'))
1242     else:
1243     print >>sys.stderr, "Bad compressfmt specification: %s"%(self.compressfmt,)
1244     sys.exit(1)
1245     except SAXException:
1246     pass
1247    
1248     self.md = self.handler.md
1249    
1250     # print >>sys.stderr, self.md
1251     # sys.stderr.flush()
1252    
1253     # print >>sys.stderr, "Set up spectrum parser..."
1254     # sys.stderr.flush()
1255     self.parser = make_parser()
1256     self.handler = mzXML_spec_sax()
1257     self.parser.setContentHandler(self.handler)
1258    
1259     def close(self):
1260     pass
1261    
1262     def spectra(self):
1263     self.open()
1264     while True:
1265     # print >>sys.stderr, "Spectrum parser to read spectra from",self.handler.scanstart
1266     # sys.stderr.flush()
1267     try:
1268     if self.compressfmt == None:
1269     self.parser.parse(self.filename)
1270     elif self.compressfmt == 'gz':
1271     self.parser.parse(gzip.open(self.filename,'r'))
1272     else:
1273     print >>sys.stderr, "Bad compressfmt specification: %s"%(self.compressfmt,)
1274     sys.exit(1)
1275     except SAXException:
1276     pass
1277     for (s,d) in self.handler.spectra:
1278     # print >>sys.stderr, "Yeild spectrum number:",d['num']
1279     yield (s,d)
1280     if self.handler.done:
1281     break
1282    
1283     def getMsRunMetaData(self):
1284     self.open()
1285 nje01 1.4 d = {}
1286     if self.md.has_key('startTime'):
1287     d.update({'startTime:PT%fS':self.md['startTime']})
1288     if self.md.has_key('endTime'):
1289     d.update({'endTime:PT%fS':self.md['endTime']})
1290 nje01 1.1 return d
1291    
1292     def getFilenames(self):
1293 nje01 1.4 return self.md['parentFile']
1294    
1295     def msInstrumentID(self):
1296     return "1"
1297    
1298     def peakDetect(self,level):
1299     return False
1300 nje01 1.1
1301     def msManufacturer(self):
1302     return self.md.get('msManufacturer','')
1303    
1304     def msModel(self):
1305     return self.md.get('msModel','')
1306    
1307     def msIonisation(self):
1308     return self.md.get('msIonisation','')
1309    
1310     def msMassAnalyzer(self):
1311     return self.md.get('msMassAnalyzer','')
1312    
1313     def msDetector(self):
1314     return self.md.get('msDetector','')
1315    
1316     def acquisitionSoftware(self):
1317     return self.md.get('acquisitionSoftware','')
1318    
1319     def acquisitionSoftwareVersion(self):
1320     return self.md.get('acquisitionSoftwareVersion','')
1321    
1322     def precursorPrecision(self,mz):
1323     return 0
1324    
1325     def precursorTune(self):
1326     return False
1327    
1328     def lc(self):
1329     return True
1330    
1331     def maldi(self):
1332     return not self.lc()
1333    
1334     class AB4700T2D:
1335     def __init__(self,filename,peaks=None):
1336     self.filename = filename
1337     self.datafiles = None
1338     self.deapp = None
1339     self.metadata = {}
1340     self.platespots = {}
1341     self.applyPeakDetect = []
1342     if peaks:
1343     self.applyPeakDetect = map(int,peaks.split(','))
1344    
1345     if not os.path.exists(self.filename):
1346     print >>sys.stderr, "Filename %s does not exist."%(self.filename,)
1347     sys.exit(1)
1348    
1349     self.filename = os.path.abspath(self.filename)
1350    
1351     self.dir = os.path.split(self.filename)[0]
1352    
1353     def __del__(self):
1354     # Probably unnecessary
1355     self.close()
1356    
1357 nje01 1.7 def read_metadata(self):
1358    
1359     h = open(self.filename,'rb')
1360     r = csv.reader(h,'excel-tab')
1361 nje01 1.1
1362 nje01 1.7 self.metadata['PLATEDEF'] = []
1363     self.metadata['PLATE'] = []
1364     self.metadata['SPOT'] = []
1365     self.metadata['SCAN'] = []
1366 nje01 1.1
1367 nje01 1.7 for l in r:
1368     if len(l) == 0:
1369     continue
1370     kw = l.pop(0).upper()
1371     if kw in ('PLATEDEF','PLATE','SPOT','SCAN'):
1372     self.metadata[kw].append({})
1373     while len(l)>=2:
1374     k = l.pop(0);v = l.pop(0);
1375     if (kw == 'PLATE' and \
1376     (k in ('plateID','spotXCount','spotYCount','plateManufacturer','plateModel'))) or \
1377     (kw == 'PLATEDEF' and \
1378     (k in ('plateID','plateManufacturer','plateModel','spotNaming','maldiMatrix'))) or \
1379     (kw == 'SPOT' and \
1380     (k in ('plateID','spotID','spotXPosition','spotYPosition','maldiMatrix'))) or \
1381     (kw == 'SCAN' and \
1382     (k in ('plateID','spotID','filename','index'))):
1383     self.metadata[kw][-1][k] = v
1384     elif k == '':
1385     break
1386 nje01 1.1 else:
1387 nje01 1.7 print >>sys.stderr,"Bad key %s for %s row"%(k,kw)
1388 nje01 1.1 sys.exit(1)
1389 nje01 1.7 h.close()
1390    
1391     for pd in self.metadata['PLATEDEF']:
1392     # insert all the gory details needed for a particular plate model etc...
1393     if pd['plateManufacturer'] == 'ABI / SCIEX' and pd['plateModel'] == '01-192+06-BB':
1394     if not pd.has_key('spotNaming') or not pd.has_key('maldiMatrix') or \
1395     not pd['spotNaming'] in ('alpha','parallel','antiparallel'):
1396     print >>sys.stderr,"Bad plate definition, missing maldiMatrix or spotNaming, or bad spotNaming value."
1397     sys.exit(1)
1398     self.metadata['PLATE'].append({
1399     'plateID':pd['plateID'],
1400     'plateManufacturer':pd['plateManufacturer'],
1401     'plateModel':pd['plateModel'],
1402     'spotXCount':24,
1403     'spotYCount':8,
1404     })
1405     if pd['spotNaming'] == 'alpha':
1406     for y in xrange(0,8):
1407     for x in xrange(0,24):
1408     self.metadata['SPOT'].append({
1409     'plateID':pd['plateID'],
1410     'spotID':"%c%d"%(y+ord('A'),x+1),
1411     'spotXPosition':x,
1412     'spotYPosition':2*y+(x%2),
1413     'maldiMatrix':pd['maldiMatrix']
1414     })
1415     else:
1416     print >>sys.stderr,"Valid spotNaming value, not yet implemented."
1417     sys.exit(1)
1418 nje01 1.1
1419 nje01 1.7 for pl in self.metadata['PLATE']:
1420     self.platespots[pl['plateID']] = []
1421 nje01 1.1
1422 nje01 1.7 for sp in self.metadata['SPOT']:
1423     self.platespots[sp['plateID']].append(sp)
1424 nje01 1.1
1425 nje01 1.7 self.datafiles = [ (os.path.join(self.dir,md['filename']),int(md['index'])) for md in self.metadata['SCAN'] ]
1426     self.distinct_datafiles = dict([(os.path.join(self.dir,md['filename']),
1427     fileSHA(os.path.join(self.dir,md['filename']))) for md in self.metadata['SCAN']])
1428    
1429     self.maptoscan = {}
1430     index = 0
1431     for md in self.metadata['SCAN']:
1432     self.maptoscan[(os.path.join(self.dir,md['filename']),int(md['index']))] = index
1433     index += 1
1434    
1435     def make_metadata(self):
1436     self.deapp.Documents.Open(self.filename)
1437     doc = self.deapp.Documents.Item(0)
1438     sv = doc.SpecView
1439     nspectra = sv.TotalSpectrum;
1440     self.deapp.Documents.Close()
1441     self.remsettingsfile(self.filename)
1442    
1443     self.datafiles = [ (self.filename,i) for i in range(1,nspectra+1) ]
1444     self.distinct_datafiles = dict([ (self.filename, fileSHA(self.filename)) ])
1445     self.maptoscan = {}
1446     for (f,i) in self.datafiles:
1447     self.maptoscan[(f,i)] = i
1448    
1449     def open(self):
1450    
1451     if self.deapp is None:
1452 nje01 1.1
1453     from win32com.client import Dispatch, gencache
1454     self.deapp = Dispatch('DataExplorer.Application',
1455     resultCLSID='{3FED40F1-D409-11D1-8B56-0060971CB54B}')
1456     self.delib = gencache.EnsureModule('{06972F50-13F6-11D3-A5CB-0060971CB54B}',0,4,2)
1457 nje01 1.6 try:
1458     self.deapp.AutomatedProcessing = 1
1459     except AttributeError:
1460     pass
1461     try:
1462     self.deapp.Visible = 0
1463     except AttributeError:
1464     pass
1465    
1466 nje01 1.7 try:
1467     self.read_metadata()
1468     except:
1469     self.make_metadata()
1470 nje01 1.1
1471     def close(self):
1472    
1473     if not self.deapp is None:
1474     self.deapp.Quit()
1475     self.deapp = None
1476    
1477     def peakDetect(self,msLevel):
1478     return msLevel in self.applyPeakDetect
1479    
1480     # remove .set and .cts files to prevent DataExplorer crashes - DT 11-1-2006
1481     def remsettingsfile(self,fn):
1482 nje01 1.2 # print "rmsettingsfile: %s"%fn
1483     if ((len(fn) > 3) and (fn[-4:].lower() == '.t2d')):
1484     path2del = fn[:-4] + '.cts'
1485     if os.path.exists(path2del):
1486     os.unlink(path2del)
1487     path2del = fn[:-4] + '.set'
1488     if os.path.exists(path2del):
1489     os.unlink(path2del)
1490 nje01 1.1
1491     def spectra(self):
1492     self.open()
1493    
1494     prevfile = ''
1495     for (f,i) in self.datafiles:
1496    
1497     if f != prevfile:
1498     self.deapp.Documents.Close()
1499 nje01 1.2 self.remsettingsfile(prevfile)
1500 nje01 1.1
1501     if not os.path.exists(f):
1502     print >>sys.stderr, "Filename %s does not exist."%(f,)
1503     sys.exit(1)
1504    
1505     self.remsettingsfile(f)
1506 nje01 1.2 self.deapp.Documents.Open(f)
1507     # print "open %s"%f
1508 nje01 1.1
1509    
1510     doc = self.deapp.Documents.Item(0)
1511     sv = doc.SpecView
1512    
1513 nje01 1.6 if i > sv.TotalSpectrum:
1514     prevfile = f
1515     continue
1516    
1517 nje01 1.1 sv.SetSpectrumAt(i-1)
1518    
1519     (tf,fixedMass) = doc.InstrumentSettings.GetSetting(self.delib.constants.dePreCursorIon,i-1,None)
1520    
1521     if fixedMass > 0:
1522     msLevel = 2
1523     else:
1524     msLevel = 1
1525    
1526     dataArr = array('f');
1527     peakDetect="False"
1528     if self.peakDetect(msLevel):
1529     peakDetect="True"
1530     sv.CorrectBaseline()
1531     doc.SpecSetup.NoiseReductionStdev = 2.0
1532     sv.FilterNoise(self.delib.constants.deNoiseRemoval)
1533     sv.PeakSettings.PeakThresholdPercent = 1
1534     st,sp = sv.GetPeakData(self.delib.constants.deSpecPeakAll,
1535     self.delib.constants.deSpecPeakSortMass, 0, 0)
1536     for j in xrange(len(sp)):
1537     x = sp[j][self.delib.constants.deSpecPeakCentroidMass]
1538     y = sp[j][self.delib.constants.deSpecPeakArea]
1539     if y <= 0:
1540     continue
1541     dataArr.append(x)
1542     dataArr.append(y)
1543     else:
1544     n,s = sv.GetRawData(0,0)
1545     for j in xrange(n):
1546     dataArr.append(s[j][0])
1547     dataArr.append(s[j][1])
1548    
1549     if len(dataArr) == 0:
1550     continue
1551    
1552     polarity_val = "+"
1553    
1554     # Scan level attributes
1555     # Required by mzXML (no format spec. needed)
1556 nje01 1.3 d = {'msLevel':msLevel}
1557 nje01 1.1 # Optional ('scan.' and prefixformat spec. needed)
1558     d.update({
1559     'scan.polarity:%s':polarity_val,
1560     })
1561    
1562     scanindex = self.maptoscan[(f,i)]
1563 nje01 1.7 if len(self.metadata['SCAN']) > 0:
1564     d.update({
1565     'plateID':self.metadata['SCAN'][scanindex]['plateID'],
1566     'spotID':self.metadata['SCAN'][scanindex]['spotID'],
1567     })
1568 nje01 1.2 d.update({'scanOrigin.parentFileID:%s': self.distinct_datafiles[f],
1569 nje01 1.1 'scanOrigin.num:%d': i})
1570     if msLevel == 1:
1571     yield (dataArr,d)
1572     else:
1573     # tandem MS scans add the following
1574     # Required by mzXML
1575     d.update({'precursorMz':fixedMass})
1576     # Optional
1577     d.update({'precursorMz.precursorCharge:%d':1})
1578     yield (dataArr,d)
1579    
1580     prevfile = f
1581    
1582     self.deapp.Documents.Close()
1583     self.remsettingsfile(prevfile)
1584 nje01 1.2 self.remsettingsfile(f)
1585 nje01 1.1
1586     def getMsRunMetaData(self):
1587     self.open()
1588     return {}
1589    
1590     def getFilenames(self):
1591     return [ t for t in self.distinct_datafiles.items() ]
1592    
1593 nje01 1.3 def msInstrumentID(self):
1594     return "1"
1595    
1596 nje01 1.1 def msManufacturer(self):
1597     return "ABI / SCIEX"
1598    
1599     def msModel(self):
1600     return "4700 Proteomic Analyzer"
1601    
1602     def msIonisation(self):
1603     return "MALDI"
1604    
1605     def msMassAnalyzer(self):
1606     return "TOF"
1607    
1608     def msDetector(self):
1609 nje01 1.7 return None
1610 nje01 1.1
1611     def acquisitionSoftware(self):
1612     return "DataExplorer"
1613    
1614     def acquisitionSoftwareVersion(self):
1615     return "1.0"
1616    
1617     def precursorPrecision(self,mz):
1618     return .1
1619    
1620     def precursorTune(self):
1621     return False
1622    
1623     def lc(self):
1624 nje01 1.7 return len(self.metadata['PLATE']) == 0
1625 nje01 1.1
1626     def maldi(self):
1627     return not self.lc()
1628    
1629     def plateData(self):
1630     self.open()
1631     for d in self.metadata['PLATE']:
1632     yield d
1633    
1634     def spotData(self,plate):
1635     self.open()
1636     for s in self.platespots[plate]:
1637     yield s
1638    
1639     class scanfilter:
1640     def __init__(self,str):
1641     self.rtre = re.compile('^PT([0-9]+(\.[0-9]+)?)S$')
1642     self.intre = re.compile('^[1-9][0-9]*$')
1643     self.floatre = re.compile('^[0-9]+(\.[0-9]+)?$')
1644     self.cl = []
1645     clause = str.split(',')
1646     for cl in clause:
1647     scl = cl.split('.')
1648     m = self.rtre.match(scl[2].strip())
1649     if m != None:
1650     v = float(m.group(1))
1651     elif self.floatre.match(scl[2].strip()):
1652     v = float(scl[2])
1653     elif self.intre.match(scl[2].strip()):
1654     v = int(scl[2])
1655     else:
1656     v = scl[2]
1657     self.cl.append((scl[0],scl[1],v))
1658    
1659     def test(self,d):
1660     a = {}
1661     for (k,v) in d.iteritems():
1662     k0 = k.split(':')[0].split('.')[-1]
1663     a[k0] = v
1664     for cl in self.cl:
1665     if type(a[cl[0]]) == type(u'') or \
1666     type(a[cl[0]]) == type('') :
1667     m = self.rtre.match(a[cl[0]].strip())
1668     if m != None:
1669     v1 = float(m.group(1))
1670     elif self.floatre.match(a[cl[0]].strip()):
1671     v1 = float(a[cl[0]])
1672     elif self.intre.match(a[cl[0]].strip()):
1673     v1 = int(a[cl[0]])
1674     else:
1675     v1 = a[cl[0]].strip()
1676     else:
1677     v1 = a[cl[0]]
1678     v2 = cl[2]
1679     # print >>sys.stderr, "Spectrum test:",v1,type(v1),cl[1],v2,type(v2)
1680     # sys.stderr.flush()
1681     if cl[1] == 'eq' and v1 != v2:
1682     # print >>sys.stderr, "FALSE"
1683     # sys.stderr.flush()
1684     return False
1685     elif cl[1] == 'ne' and not v1 != v2:
1686     return False
1687     elif cl[1] == 'lt' and not v1 < v2:
1688     return False
1689     elif cl[1] == 'le' and not v1 <= v2:
1690     return False
1691     elif cl[1] == 'gt' and not v1 > v2:
1692     return False
1693     elif cl[1] == 'ge' and not v1 >= v2:
1694     return False
1695     # print >>sys.stderr, "TRUE"
1696     # sys.stderr.flush()
1697     return True
1698    
1699     if __name__ == '__main__':
1700    
1701     from optparse import OptionParser
1702     import sys
1703     import glob
1704    
1705     parser = OptionParser()
1706     parser.add_option("-R", "--rawdata", type="string", dest="rawdata", default="",\
1707     help="Format of raw data. Optional if raw spectra file ends in .wiff or .t2m.")
1708     parser.add_option("-X", "--xmlformat", type="string", dest="xmlformat", default="",\
1709     help="XML format to output. Optional if output file ends in .mzxml or .mzdata.")
1710     parser.add_option("-o", "--output", type="string", dest="output", default="",\
1711     help="Output file.")
1712     parser.add_option("-p", "--peaks", type="string", dest="peaks", default="2",\
1713     help="Level(s) of spectra to apply peak detection to (comma separated). QStar,4700 only.")
1714     parser.add_option("-f", "--filter", type="string", dest="filter", default=None,\
1715     help="Filter on mzxml scan meta-data: field.op.value[,field.op.value]. Default: No filter.")
1716 nje01 1.3 parser.add_option("-V", "--version", type='string', dest="version", default="3.0",
1717     help="XML version. mzXML only. Valid options '2.1','2.2','3.0'. Default: '3.0'.")
1718     parser.add_option("-z", "--compress_peaks", action="store_true", dest="compress_peaks", default=None,\
1719     help="Compress mzXML peaks data using zlib. Default: False")
1720     parser.add_option("-Z", "--compress", type="string", dest="compress", default=None,\
1721     help="Compress output file. Valid options 'gz'. Default: None.")
1722 nje01 1.1 parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,\
1723     help="Debug. First 10 spectra only, and truncated peaks data element. Default: False.")
1724    
1725     ( opts, args ) = parser.parse_args()
1726    
1727     if opts.output == "" and opts.xmlformat == "":
1728     parser.error("Either -o or -X must be suppied.")
1729     sys.exit(1)
1730    
1731     if opts.filter != None:
1732     filt=scanfilter(opts.filter)
1733     else:
1734     filt=None
1735    
1736     argsglob = []
1737     for a in args:
1738     argsglob.extend(glob.glob(a))
1739    
1740     if opts.output != "" and len(argsglob) > 1:
1741     parser.error("At most one raw spectrum data file is permitted, if -o option is used.")
1742    
1743     for a in argsglob:
1744    
1745     if opts.output != "":
1746     o = opts.output
1747     else:
1748     p = a.rfind('.')
1749     o = a[0:p]+'.'+opts.xmlformat
1750    
1751     if o == a:
1752     print >>sys.stderr, "Input file %s and output file %s are the same!"%(a,o)
1753     sys.exit(1)
1754    
1755     print >>sys.stderr, "Processing",a,"to",o
1756    
1757     if opts.rawdata.lower() in ('wiff','qstar'):
1758     r = QStarWiff(a,opts.peaks)
1759     elif opts.rawdata.lower() in ('t2d','dat','ab4700','ab4800','mariner','voyager'):
1760     r = AB4700T2D(a,opts.peaks)
1761     elif opts.rawdata.lower() in ('mzxml',):
1762     r = FromMzXML(a)
1763     elif a[-5:].lower() in ('.wiff',):
1764     r = QStarWiff(a,opts.peaks)
1765     elif a[-4:].lower() in ('.t2m',):
1766     r = AB4700T2D(a,opts.peaks)
1767     elif a[-6:].lower() in ('.mzxml',):
1768     r = FromMzXML(a)
1769     else:
1770     parser.error("Bad rawdata format specification.")
1771     sys.exit(1)
1772    
1773     if opts.xmlformat.lower() == 'mzxml':
1774 nje01 1.3 x = ToMzXML(r,o,opts.compress,filt,opts.compress_peaks,opts.version)
1775 nje01 1.1 elif opts.xmlformat.lower() == 'mzdata':
1776     x = ToMzData(r,o,opts.compress,filt)
1777 nje01 1.7 elif re.search(r'\.mzxml(\.gz)?$',opts.output,re.IGNORECASE):
1778 nje01 1.3 x = ToMzXML(r,opts.output,opts.compress,filt,opts.compress_peaks,opts.version)
1779 nje01 1.7 elif re.search(r'\.mzdata(\.gz)?$',opts.output,re.IGNORECASE):
1780 nje01 1.1 x = ToMzData(r,opts.output,opts.compress,filt)
1781     else:
1782     parser.error("Bad xml format specification.")
1783     sys.exit(1)
1784    
1785     x.write(debug=opts.debug)