1 |
"""
|
2 |
Converter: Perform the datafile reading, pre-processing and writing.
|
3 |
Yuhang Wan, Christopher Lausted
|
4 |
Last modified on 100427 (yymmdd) by YW
|
5 |
|
6 |
Typical Pipeline:
|
7 |
>import converter as cv
|
8 |
>dat = cv.datafile_read("example.txt")
|
9 |
>dat = cv.outlier_remove(dat)
|
10 |
>dat = cv.calibrate(dat)
|
11 |
>dat = cv.background_subtract(dat)
|
12 |
>dat = cv.curve_split(dat, 100, 800, 200)
|
13 |
>dat = cv.data_output_clamp(dat, "/dir", 1)
|
14 |
>dat = cv.data_output_biosensor(dat, "/dir", 1,2,3,4)
|
15 |
>dat = cv.obj_save(dat, "saved_dataobj")
|
16 |
"""
|
17 |
__version__ = "100427"
|
18 |
|
19 |
## Import libraries.
|
20 |
import numpy as np
|
21 |
import pylab as plt
|
22 |
##import sys
|
23 |
import xlrd
|
24 |
import time
|
25 |
import os
|
26 |
import copy
|
27 |
import pickle
|
28 |
import SPRdataclass as spr
|
29 |
|
## Global constants and variables.
## Input-file format codes: returned by check_format_input() and used by
## datafile_read() to dispatch to the matching reader function.
FMT_UNKNOWN = 0
FMT_SPRIT = 1 ## Old Plexera SPRit format.
FMT_ICMTXT = 2 ## Plexera Instrument Control Module text.
FMT_CLAMP34 = 3 ## Morton & Myszka CLAMP Version 3.4+
FMT_DAMXLS = 4 ## Plexera Data Analysis Module Excel export.
FMT_DATAOBJ = 5 ## Packed spr Data Object
37 |
|
38 |
def check_format_input(fname):
|
39 |
"""Examine the format of an input .txt data file."""
|
40 |
## Modified 100407 CGL.
|
41 |
|
42 |
## Open text file. Exit if file not found.
|
43 |
try:
|
44 |
fp = open(fname, "r")
|
45 |
except IOError:
|
46 |
print "Error: Unable to open file %s for reading." % fname
|
47 |
return
|
48 |
|
49 |
## Examine first line of text file to guess format.
|
50 |
fmt = FMT_UNKNOWN
|
51 |
sprit_line1="Elapsed Time (Seconds)\tAverage Intensity (Pixel Intensity)"
|
52 |
clamp_line1="Vers 3.4"
|
53 |
line1 = fp.readline()
|
54 |
if (sprit_line1 in line1):
|
55 |
fmt = FMT_SPRIT
|
56 |
elif (line1[2]=="/" and line1[5]=="/"):
|
57 |
## This format has date mm/dd/yyy in first column.
|
58 |
fmt=FMT_ICMTXT
|
59 |
elif (clamp_line1 in line1):
|
60 |
fmt = FMT_CLAMP34
|
61 |
|
62 |
fp.close()
|
63 |
return fmt, line1
|
64 |
|
65 |
|
66 |
def sprit_format(fname):
|
67 |
"""Read the SPRit formatted data and reshape it.
|
68 |
Return the data in the form of list.
|
69 |
"""
|
70 |
# open a file, if file doesn't exit, then exit
|
71 |
try:
|
72 |
fp = open(fname, "r")
|
73 |
except IOError:
|
74 |
print 'Cannot open file %s for reading' % fname
|
75 |
return
|
76 |
status = {'DatainputType':'SPRit data'}
|
77 |
# skip the first line of the input file
|
78 |
Tmpstr = fp.readline()
|
79 |
# check if the second line is BEGIN
|
80 |
TmpStr = fp.readline()
|
81 |
if ("BEGIN" not in TmpStr):
|
82 |
print "Warning: Second line of data file is not 'BEGIN'"
|
83 |
##sys.exit(0)
|
84 |
# count the lines of each data spot
|
85 |
TmpStr = fp.readline() # skip first "0.000000e+000"
|
86 |
num_line = 1 # so we start to count from 1
|
87 |
# count until read the "0.000000e+000"
|
88 |
while True:
|
89 |
TmpStr = fp.readline()
|
90 |
if TmpStr.split('\t')[0] == "0.000000e+000":
|
91 |
break
|
92 |
else:
|
93 |
num_line = num_line + 1
|
94 |
# count the spots in each file
|
95 |
num_spot = 0
|
96 |
# skip the first two lines
|
97 |
fp.seek(0)
|
98 |
for i in range(0,2):
|
99 |
TmpStr = fp.readline()
|
100 |
text = fp.readlines()
|
101 |
for i in text:
|
102 |
num_spot += i.count('0.000000e+000')
|
103 |
fp.close()
|
104 |
#---------------------- Reshaping the data --------------------
|
105 |
# reshape the data in the string form num_line*num_spot
|
106 |
Shaped_str = np.reshape(text,(num_spot,-1))
|
107 |
tmpshape=np.shape(Shaped_str)
|
108 |
# reshape the data in the number form: num_line*(num_spot+1)
|
109 |
# only the first column is time: time spotl spot2 ... spot n
|
110 |
time = np.zeros(num_line)
|
111 |
Tmpdata = Shaped_str[0,:]
|
112 |
for n,i in enumerate(Tmpdata):
|
113 |
time[n]=float(i.split('\t')[0])
|
114 |
Shaped_data_1 = np.zeros((tmpshape[0]+1,tmpshape[1]))
|
115 |
Shaped_data_1[0,:]=time
|
116 |
for n1,i in enumerate(Shaped_str):
|
117 |
for n2,j in enumerate(i):
|
118 |
Shaped_data_1[n1+1][n2]=float(Shaped_str[n1][n2].split('\t')[1])
|
119 |
# reshape the data in the number form:
|
120 |
# time1 spotl time2 spot2 ... timen spot n
|
121 |
Shaped_data_2 = np.zeros((tmpshape[0]*2,tmpshape[1]))
|
122 |
for i in range(tmpshape[0]):
|
123 |
Shaped_data_2[i*2]=Shaped_data_1[0]
|
124 |
Shaped_data_2[i*2+1] = Shaped_data_1[i+1]
|
125 |
|
126 |
## status = {'Datainput Type':'SPRit data'}
|
127 |
dataformat = 0
|
128 |
ROInum = num_spot
|
129 |
dataobj=spr.DataPass(Shaped_data_1, ROInum, dataformat)
|
130 |
dataobj.status = {}
|
131 |
dataobj.updateStatus(**status)
|
132 |
print "There are %d ROIs, and %d lines of data" % (ROInum, num_line)
|
133 |
|
134 |
return dataobj
|
135 |
|
136 |
|
def plexera_icm_format(fname):
    """Read the txt data exported from Plexera ICM software and reshape it.

    Input: fname -- path of the ICM-exported text file; each row starts
           with a date/time field followed by one intensity per spot.
    Return: an spr.DataPass object (row 0 = elapsed seconds re-zeroed to
            the first sample, rows 1..N = one trace per ROI).
    """
    fp=file(fname,'r')
    Tmpdata,tmptime=[],[]
    status = {'DatainputType':'Plexera ICM data'}
    for eachline in fp:
        Tmpdata.append(eachline.split('\t'))
    # rows become columns after the transpose below, so num_spot counts
    # columns here (time column + one column per spot)
    num_line,num_spot=np.shape(Tmpdata)
    # initialize the Shaped_data: one row for time plus one per spot
    Shaped_data = np.zeros((num_spot,num_line))
    num_spot=num_spot-1 # the real number of spot
    Tmpdata=np.transpose(Tmpdata)
    # delete the last '\r\n' of the data in last column of the datafile
    for n,i in enumerate(Tmpdata[-1]):
        Tmpdata[-1][n]=i.rstrip()
    # time converting: the first column holds date and time separated by
    # a space; keep the time part ("hh:mm:ss") and convert to seconds
    for i in Tmpdata[0]:
        tmptime_str=i.split(' ')[1]
        timelist=tmptime_str.split(':')
        now=float(timelist[0])*60*60+float(timelist[1])*60+float(timelist[2])
        tmptime.append(now)
    Shaped_data[0]=tmptime
    # re-zero the time axis at the first sample
    Shaped_data[0]=Shaped_data[0]-Shaped_data[0][0]
    # assign the signal data
    for i in range(num_spot):
        Shaped_data[i+1]=Tmpdata[i+1]

    # dataformat 0 marks unsplit raw data (time in row 0)
    dataformat = 0
    ROInum = num_spot
    dataobj=spr.DataPass(Shaped_data, ROInum, dataformat)
    dataobj.status = {}
    dataobj.updateStatus(**status)

    print "There are %d ROIs, and %d lines of data" % (ROInum, num_line)
    return dataobj
174 |
|
175 |
|
176 |
|
177 |
def dam_single_sheet(sh,DataList,ROIinfo,start,end, colsize):
|
178 |
"""Retrieve and shape data form one single sheet from the Spreadsheet.
|
179 |
Return the data, ROI infomation in the form of list.
|
180 |
"""
|
181 |
## and return the minimal spot Id (start_spot)
|
182 |
## retrieve the head of the Spreadsheet, SpotList is used to store the spot Id of each sheet
|
183 |
|
184 |
Head = sh.row_values(1)
|
185 |
SpotList = []
|
186 |
num_spot = sh.row_values(0).count('Relative Time')
|
187 |
|
188 |
for i in range(num_spot):
|
189 |
TmpHead=unicode.encode(Head[colsize*i])
|
190 |
# now there are 5 columns for each ROI for Raw Intensity
|
191 |
# and 4 columns for each ROI for Satellite subtracted Intensity
|
192 |
DataList.append(sh.col_values(2+colsize*i,start,end))
|
193 |
|
194 |
key, val = [], []
|
195 |
tmpinfo = TmpHead.split('\n')
|
196 |
for j in tmpinfo:
|
197 |
key.append(j.split(': ')[0])
|
198 |
val.append(j.split(': ')[1])
|
199 |
tmpdic = dict(zip(key,val))
|
200 |
try: # spot id
|
201 |
tmpdic['ID'] = int(tmpdic['ID']) # either in form of "spot1" or "1"
|
202 |
except ValueError:
|
203 |
if str.upper(tmpdic['ID']).startswith('SPOT'):
|
204 |
tmpdic['ID'] = int(tmpdic['ID'][4:])
|
205 |
else:
|
206 |
print "Warning: Illegal galfile format."
|
207 |
print "The ID is neither 'spot1' form nor '1' form. \n", "'ID:'", tmp
|
208 |
|
209 |
tmpdic['Position'] = (int(tmpdic['Block'])+1, int(tmpdic['Row'])+1, int(tmpdic['Column'])+1)
|
210 |
ROIinfo.append(tmpdic)
|
211 |
|
212 |
return DataList,ROIinfo
|
213 |
|
def dam_spreadsheet_format(book):
    """Read the spreadsheet exported from Plexera DAM software and reshape it.

    Input: book -- an xlrd workbook.
    Return: an spr.DataPass object (row 0 = time, rows 1..N = one trace
            per ROI, reordered so the row index matches the spot ID).
    """
    ROIinfo, newROIinfo, n_sheet = [], [], 0
    DataList=[]
    status = {'DatainputType':'Plexera DAM data'}
    sh0=book.sheet_by_index(0)
    # decide the per-ROI column stride from the first sheet's header row
    if sh0.row_values(0).count('Raw Intensity') == 0 and sh0.row_values(0).count('Subtracted Intensity') > 0:
        print 'This is the satellite subtracted data.'
        colsize = 4
        status['DAM spreadsheet'] = 'Satellite subtracted Intensity'
    elif sh0.row_values(0).count('Raw Intensity') > 0 and sh0.row_values(0).count('Subtracted Intensity') == 0:
        print 'This is the raw intensity data.'
        colsize = 5
        status['DAM spreadsheet'] = 'Raw Intensity'
    # NOTE(review): if the header matches neither case (or both), colsize
    # stays undefined and the dam_single_sheet() call below raises a
    # NameError -- confirm the intended fallback for such files.

    # find the first row holding real data: column 2 sums to zero until then
    for i in range(2,sh0.nrows):
        if sum(sh0.col_values(2,1,i))>0:
            break
    start,end = i-1,sh0.nrows
    num_line = end - start
    # the time axis initialization and assignment
    Time=sh0.col_values(1,start,end)
    # in case we lose the time axis during the DAM processing
    if sum(Time)==0: Time=np.arange(num_line)
    DataList.append(Time)

    # list DataList to store the Shaped_data of each ROI in each sheet,
    # list ROIinfo to store the Spot id and the ligand name of each ROI in each sheet
    for i in range(book.nsheets):
        sh=book.sheet_by_index(i)
        if sh.ncols!=0:
            DataList,ROIinfo = dam_single_sheet(sh,DataList,ROIinfo,start,end, colsize)
            n_sheet=n_sheet+1
        else:
            # sheets are filled left to right; the first empty one ends the scan
            break
    # calculate the dim of the Shaped_data, and initialize the matrix
    num_spot=len(DataList)-1
    num_line=end-start
    Shaped_data=np.zeros((num_spot+1,num_line))
    Shaped_data[0]=DataList[0]

    # in case that the ROIs doesn't count from spot1: shift every ID down
    # so the smallest spot ID lands on row 1
    newROIinfo = copy.deepcopy(ROIinfo)
    StartspotList=np.array([i['ID'] for i in ROIinfo])
    start_spot=min(StartspotList)

    for i in range(num_spot):
        tmp = ROIinfo[i]['ID']
        j=tmp-(start_spot-1)
        Shaped_data[j]=DataList[i+1]
        newROIinfo[j-1]=ROIinfo[i]

    # pack the data and relative information into SPRdata obj;
    # dataformat 0 marks unsplit raw data (time in row 0)
    dataformat = 0
    ROInum = num_spot
    dataobj=spr.DataPass(Shaped_data, ROInum, dataformat, newROIinfo)
    dataobj.status = {}
    dataobj.updateStatus(**status)
    print "There are %d ROIs, and %d lines of data" % (ROInum, num_line)

    return dataobj
278 |
|
279 |
|
def read_clamp_data(fname):
    """Read the Clamp format data.

    Input: fname -- path of a CLAMP 3.4x text file.
    Return: an spr.DataPass object holding a single ROI (dataformat 4),
            with sample and injection info parsed from the file header.
    """
    fp=open(fname,'r')
    Tmpstr=fp.readline()
    # examine the file head
    # e.g. 'Vers 3.41 cTBA-1 Conc:250 nM\n'
    ROInum, dataformat, Conc, ROIdic = 1, 4, [], {}
    try:
        name = Tmpstr.split(' ')[2].strip()
        ROIdic['ROI name'] = name
        stmp = Tmpstr.split(' ')[3]
        rest = Tmpstr[Tmpstr.find(stmp):].strip()
        # NOTE(review): this stores the whole split list (e.g.
        # ['Conc', '250 nM']) rather than just the value -- confirm
        # downstream consumers expect a list here.
        ROIdic['concentration'] = rest.split(':')
    except IndexError:
        # header carries no ROI name / concentration; leave ROIdic sparse
        pass

    # scan the header lines (Conc1/Start1/Stop1) until the first row
    # whose leading field parses as a float -- that row begins the data
    for Tmpstr in fp:
        try:
            float(Tmpstr.split('\t')[0])
            n_conc=(Tmpstr.count('\t')+1)/2
            break
        except ValueError:
            if Tmpstr.startswith('Conc1'):
                n_conc=Tmpstr.count('\t')
                Conc=Tmpstr.strip().split('\t')[1:]
            elif Tmpstr.startswith('Start1'):
                Start=Tmpstr.strip().split('\t')[1:]
                n_conc=Tmpstr.count('\t')
            elif Tmpstr.startswith('Stop1'):
                Stop=Tmpstr.strip().split('\t')[1:]
                n_conc=Tmpstr.count('\t')


    # write the data into matrix
    Tmpdata=[]
    for eachline in fp:
        a=eachline.strip().split('\t')
        Tmpdata.append(np.array(a,dtype='f'))
    Data=np.transpose(Tmpdata)

    # write the infomation
    ROIinfo = [ROIdic]
    status = {'DatainputType':'Clamp data'}
    sampleinfo, injinfo = [], []
    # NOTE(review): if the header lacked Start1/Stop1 lines, Start/Stop
    # are unbound here and this raises a NameError -- confirm every Clamp
    # export carries those lines.  injpara itself is never used below.
    injpara = [map(float,i) for i in [Start,Stop]]

    if len(Conc) >= 1:
        # one sample/injection record per concentration column
        for i,c in enumerate(Conc):
            sampledic = {}
            name = 'Sample '+str(i+1)
            sampledic = dict(zip(['Name','Concentration'],[name,c]))
            sampleinfo.append(sampledic)
            injdic = dict(zip(['ton','toff'],[float(Start[i]),float(Stop[i])]))
            injinfo.append(injdic)
    dataobj=spr.DataPass(Data, ROInum, dataformat, ROIinfo, sampleinfo, injinfo)
    dataobj.status = {}
    dataobj.updateStatus(**status)

    return dataobj
344 |
|
345 |
|
346 |
|
347 |
def keyfile_read(fkey):
|
348 |
# The Key File contains
|
349 |
"""Function: Read the key file for old SPR instrument.
|
350 |
Input: the filename.
|
351 |
Return: the ROI information.
|
352 |
"""
|
353 |
try:
|
354 |
fp = open(fkey, "r")
|
355 |
except IOError:
|
356 |
print "Cannot open file %s for reading" % fname
|
357 |
return
|
358 |
ROIinfo = []
|
359 |
firstline=fp.readline() # skip the first line
|
360 |
# Table.append([firstline.split('\t')[0],firstline.split('\t')[1],firstline.split('\t')[2],firstline.split('\t')[3]])
|
361 |
for eachline in fp:
|
362 |
ROIdic = {}
|
363 |
tmplist = eachline.strip().split('\t')
|
364 |
|
365 |
try: # spot id
|
366 |
ROIdic['ID'] = int(tmplist[0]) # either in form of "spot1" or "1"
|
367 |
except ValueError:
|
368 |
if str.upper(tmplist[0]).startswith('SPOT'):
|
369 |
ROIdic['ID'] = int(tmplist[0][4:])
|
370 |
else:
|
371 |
print "Error: Unable to read Spot IDs"
|
372 |
|
373 |
ROIdic['Name'] = tmplist[1] # name of the protein immobilized
|
374 |
try:
|
375 |
ROIdic['Conc'] = tmplist[2]
|
376 |
ROIdic['Background Spot'] = map(int, tmplist[3].split(';')) # in case more than one spot set as background with ";" as SEPARATOR
|
377 |
except IndexError:
|
378 |
pass
|
379 |
ROIinfo.append(ROIdic)
|
380 |
|
381 |
fp.close()
|
382 |
print 'There are %d ROIs in the Key file.' % len(ROIinfo)
|
383 |
return ROIinfo
|
384 |
|
def keyfile_read_fake(obj):
    """Fabricate placeholder ROI information when no key file is given.

    Input: obj -- a data object exposing the attribute ROInum.
    Return: a list of dicts, one per ROI, with sequential 'ID' values
            starting at 1 and generic names "Ligand1", "Ligand2", ...
    """
    return [{'ID': idx + 1, 'Name': "Ligand" + str(idx + 1)}
            for idx in range(obj.ROInum)]
394 |
|
395 |
|
def galfile_read(fname):
    """Function: Read the GAL file.
    Input: the filename.
    Return: the ROI information -- a list of dicts with keys 'Position',
            'IDcal', 'Name', 'ID' and optionally 'Conc'/'Family'/
            'Background Spot'.
    """
    fp=file(fname,'r')
    ROIinfo = []
    Tmpstr = fp.readline() # skip the first line
    Tmpstr = fp.readline()
    # the second line's first field gives the number of header lines to scan
    Tmpn = int(Tmpstr.split('\t')[0])
    for i in range(Tmpn):
        labelline = fp.readline()

        # the '"Block1' header line carries the array geometry
        if labelline.startswith('"Block1'):
            a = labelline.strip().split(',')
            columnsize, rowsize = int(a[3]), int(a[5])
    # map a (block, row, column) position to a linear spot id
    # (closes over columnsize/rowsize parsed from the '"Block1' line)
    def IDcal(p):
        id = (p[0]-1)*columnsize*rowsize + (p[1]-1)*columnsize + p[2]
        return id

    for eachline in fp:
        ROIdic = {}
        tmplist = eachline.strip().split('\t')
        pos = tuple(map(int,tmplist[:3]))
        ROIdic['Position'] = pos
        ROIdic['IDcal'] =IDcal(pos)

        ROIdic['Name'] = tmplist[3]
        try: # spot id
            ROIdic['ID'] = int(tmplist[4]) # either in form of "spot1" or "1"
        except ValueError:
            if str.upper(tmplist[4]).startswith('SPOT'):
                ROIdic['ID'] = int(tmplist[4][4:])
            else:
                print "Error: Unable to read Spot IDs"

        try:
            ROIdic['Conc'] = tmplist[5]
            ROIdic['Family'] = tmplist[6]
            blist = tmplist[7][1:-1].split(';') # always something like '"1,1,2; 1,2,2"'
            Backn = len(blist)
            Bid = []

            # convert each background position triple to a linear spot id
            for i in blist:
                if '' in i.split(','):
                    # empty background field: stop collecting ids
                    break
                Bpos = tuple(map(int, i.split(',')))
                Bid.append(IDcal(Bpos))
            ROIdic['Background Spot'] = Bid
        except IndexError:
            # the optional trailing columns are absent for this spot
            pass

        if ROIdic['ID'] != ROIdic['IDcal']:
            # inconsistent gal file: stop reading further spots
            print 'The spot ID should be consistent with the position. Please check the ID.'
            break
        else:
            ROIdic['ID'] = ROIdic['IDcal']
        ROIinfo.append(ROIdic)
    fp.close()
    print 'There are %d ROIs in the Gal file.' % len(ROIinfo)

    return ROIinfo
463 |
|
464 |
|
465 |
|
466 |
def method_read(fname):
|
467 |
"""Function: Read the analyte table.
|
468 |
Input: the filename
|
469 |
Return: the sample information
|
470 |
"""
|
471 |
# create a concentration dictionary for the sample injected into the flowcell
|
472 |
# export a table containing the concentration of the sample injected, and the duration
|
473 |
fp=file(fname,'r')
|
474 |
sampleinfo = []
|
475 |
##TmpTable,TmpTable2={},{}
|
476 |
labelline=fp.readline()
|
477 |
for n,eachline in enumerate(fp):
|
478 |
sampledic = {}
|
479 |
sampledic['Location'] = eachline.split('\t')[0]
|
480 |
sampledic['Name'] = eachline.split('\t')[1]
|
481 |
sampledic['Concentration'] = float(eachline.split('\t')[2])
|
482 |
sampledic['Duration'] = float(eachline.split('\t')[4])
|
483 |
sampledic['Flow Rate'] = float(eachline.split('\t')[3])
|
484 |
sampledic['Analyte Series'] = eachline.split('\t')[6]
|
485 |
sampledic['Buffer Blank Series'] = eachline.split('\t')[7]
|
486 |
sampleinfo.append(sampledic)
|
487 |
|
488 |
print 'Name','\t','Concentration'
|
489 |
for i in sampleinfo:
|
490 |
print i['Name'],'\t', i['Concentration']
|
491 |
|
492 |
return sampleinfo
|
493 |
|
def method_read_fake():
    """Fabricate a one-entry placeholder sample table.

    Return: a list holding a single dict whose fields mirror those
            produced by method_read(), all set to 0.
    """
    placeholder = dict.fromkeys(
        ['Location', 'Name', 'Concentration', 'Duration',
         'Flow Rate', 'Analyte Series', 'Buffer Blank Series'], 0)
    return [placeholder]
498 |
|
499 |
|
def datafile_read(fname):
    """Function: Read the raw data from the Plexera instrument,
    and pack the raw data (Shaped_data) into dataobject.
    Therefore, the initial dataformat = 0
    Input: the filename; dispatch is by extension (.txt/.xls/.obj) and,
           for .txt files, by the file's first line.
    Return: the packed SPR.DataPass type data object, or None for an
            unreadable .txt file.
    """
    #===================check the format of the input files-------
    if fname.split('.')[-1] == 'txt':

        Format_input, Firstline = check_format_input(fname)
        if Format_input == FMT_SPRIT: # for SPRit file format
            dataobj = sprit_format(fname)
            print '-'*10,' This is a SPRit data file. ','-'*10

        elif Format_input == FMT_ICMTXT: # for Plexera ICM file format
            dataobj = plexera_icm_format(fname)
            print '-'*10,' This is a Plexera ICM exported data file. ','-'*10

        elif Format_input == FMT_CLAMP34: # for Clamp datafile format
            dataobj = read_clamp_data(fname)
            print '-'*10,' This is a Clamp file. ','-'*10

        else:
            print '-'*10,' unreadable file input! ','-'*10
            return

        # SPRit/ICM files carry no ROI annotation, so offer a key file;
        # declining installs fabricated placeholder ROI info instead
        if Format_input == FMT_SPRIT or Format_input == FMT_ICMTXT:
            flag = str.upper(raw_input('Load the key file? (y/n): '))
            if flag == 'Y':
                fkey = raw_input('Input the path of the key file: ')
                ROIinfo = keyfile_read(fkey)
                dataobj.updateROIinfo(ROIinfo)
            else:
                dataobj.updateROIinfo(keyfile_read_fake(dataobj))

    elif fname.split('.')[-1] == 'xls':
        # for the DAM Spreadsheet file format
        print '-'*10,' This is a Plexera DAM exported data file. ','-'*10
        Format_input = FMT_DAMXLS
        book = xlrd.open_workbook(fname)
        # shape the data in the Spreadsheet, whether single sheet or multiple sheets
        dataobj=dam_spreadsheet_format(book)
        flag = str.upper(raw_input('Load the gal file? (y/n): '))
        if flag == 'Y':
            fgal = raw_input('Input the path of the gal file: ')
            ROIinfo = galfile_read(fgal)
            dataobj.updateROIinfo(ROIinfo)

    elif fname.split('.')[-1] == 'obj':
        # for the saved data object file format
        print '-'*10,' This is a saved data object file. ','-'*10
        Format_input = FMT_DATAOBJ
        fp = file(fname, 'r+')
        dataobj = pickle.load(fp)
        fp.close()

    # NOTE(review): any other extension falls through with Format_input
    # and dataobj unbound, so the code below raises a NameError -- confirm
    # whether an explicit error message and return is wanted instead.
    flag = str.upper(raw_input('Load the experimental analyte file? (y/n): '))
    if flag == 'Y':
        fprotocol = raw_input('Input the path of the analyte table: ')
        sampleinfo = method_read(fprotocol)
        dataobj.updateSampleinfo(sampleinfo)
    elif Format_input != FMT_CLAMP34 and Format_input != FMT_DATAOBJ:
        # clamp data and obj data may contain the experimental protocol already
        dataobj.updateSampleinfo(method_read_fake())

    return dataobj
567 |
|
568 |
|
def outlier_remove(obj):
    """Function: Remove unwanted data points from noisy periods.
    Prompts the user for time ranges (e.g. "200:240, 500:510") and
    deletes the samples whose time falls strictly inside each range
    from every row of the data.
    Return: the data object with clean data (a deep copy; the input
            object is left untouched).
    """
    dataobj = copy.deepcopy(obj)
    print ('Select the time area that you want to remove.')
    tmpstr = raw_input('in the format of "200:240, 500:510" :')
    areatodel = tmpstr.split(',')
    data = dataobj.data
    for i in areatodel:
        t1, t2 = float(i.split(':')[0]), float(i.split(':')[1])
        t = data[0] # for dataformat=0 Shaped_data
        tmpind = plt.find((t>t1) & (t<t2)) # find the outliar indice
        # NOTE(review): if a range matches no samples, min()/max() below
        # raise ValueError on the empty index array -- confirm ranges are
        # always inside the recorded time span.
        tmplist = data.tolist()
        tmplist2 = []
        for j in tmplist:
            # drop [min, max) of the matched indices from every row
            del j[min(tmpind):max(tmpind)]
            tmplist2.append(j)
        data = np.array(tmplist2)
    dataobj.data = data
    dataobj.updateStatus(outlier_remove=True)
    return dataobj
596 |
|
def calibrate(obj):
    """Function: calibrate the Intensity response.
    Prompts for the times and refractive indices of the TOP and BOTTOM
    calibration responses, then maps every trace linearly from raw
    intensity to refractive index.
    Return: the data object with calibrated data (a deep copy), or None
            if the data is not raw (dataformat != 0).
    Note: at present, this function is valid only when
    the whole procedure includes a calibration procedure.
    """
    dataobj = copy.deepcopy(obj)
    data = dataobj.data
    caldata = copy.deepcopy(data)
    t = data[0]
    if dataobj.dataformat != 0:
        print 'We only calibrate the raw data.'
        return
    else:
        t1 = input('Enter the time position of the TOP response during calibration:')
        s1 = input('The refractive index of the TOP response is: ')
        t2 = input('Enter the time position of the BOTTOM response during calibration:')
        s2 = input('The refractive index of the BOTTOM response is: ')
        # indices of the samples closest in time to t1 and t2
        ind1 = plt.find(abs(t-t1) == min(abs(t-t1)))[0]
        ind2 = plt.find(abs(t-t2) == min(abs(t-t2)))[0]

        # two-point linear calibration applied to each trace
        for i,y in enumerate(data[1:]):
            y1, y2 = y[ind1], y[ind2]
            slope = (s1-s2)/(y1-y2)
            offset = s1- slope*y1
            caldata[i+1] = slope*y+offset
        dataobj.data = caldata
        dataobj.updateStatus(calibrate=True)
        return dataobj
626 |
|
def get_background_value(obj,bgids):
    """Average the traces of the given background spots.

    Input:
        obj   -- data object with .data (row 0 = time, rows 1..N = one
                 trace per ROI) and .ROIinfo (list of dicts with 'ID').
        bgids -- iterable of background spot ID numbers.
    Return: a single trace, the mean of the selected background traces.
    """
    total = obj.data[0]*0          # zero trace of the right length
    for spot_id in bgids:
        # fast path: the ROI list is ordered so that row index == ID
        if spot_id == int(obj.ROIinfo[spot_id-1]['ID']):
            total = total + obj.data[spot_id]
        else:
            # otherwise scan ROIinfo for the row carrying this ID
            # (the ID in ROIinfo may be a string, hence int())
            for row, roi in enumerate(obj.ROIinfo):
                if spot_id == int(roi['ID']):
                    total = total + obj.data[row+1]
    return total/len(bgids)
641 |
|
642 |
|
643 |
def background_subtract(obj, *bgspot):
|
644 |
"""Function: Perform the Background subtraction for the UNSPLIT curve.
|
645 |
Input besides "obj" is the id of the spot taken as background.
|
646 |
The following inputs are acceptable:
|
647 |
1. background_subtract(obj): the default background in Galfile
|
648 |
will be subtracted.
|
649 |
2. background_subtract(obj, 1, 6): the average value of spot1
|
650 |
and spot6 will be subtracted.
|
651 |
"""
|
652 |
dataobj = copy.deepcopy(obj)
|
653 |
ROIinfo = obj.ROIinfo
|
654 |
data = dataobj.data
|
655 |
|
656 |
if dataobj.dataformat == 0:
|
657 |
newdata = np.zeros(np.shape(data))
|
658 |
newdata[0] = data[0] # the time scale
|
659 |
if bgspot == ():
|
660 |
# The average of the default background spots listed in the galfile
|
661 |
#are to be subtracted.
|
662 |
for i in range(1,dataobj.ROInum+1):
|
663 |
bgids = ROIinfo[i-1]['Background Spot']
|
664 |
bgsignal = get_background_value(dataobj,bgids)
|
665 |
newdata[i] = data[i]-bgsignal
|
666 |
# update the status of the data object.
|
667 |
dataobj.updateStatus(BackgroundType='Default in Gal')
|
668 |
print "The background spot embodied in gal is used."
|
669 |
|
670 |
else:
|
671 |
# The average of the manually input background spots
|
672 |
#are to be subtracted.
|
673 |
for i in range(1,dataobj.ROInum+1):
|
674 |
bgsignal = get_background_value(dataobj,bgspot)
|
675 |
newdata[i] = data[i]-bgsignal
|
676 |
dataobj.updateStatus(BackgroundType='Manually chosen')
|
677 |
print "The background spot is manually chosen."
|
678 |
|
679 |
dataobj.data = newdata
|
680 |
dataobj.updateStatus(background_subtraction=True)
|
681 |
return dataobj
|
682 |
|
683 |
else:
|
684 |
print 'The Background Subtraction should be run at the beginning, with the UNsplit curve.'
|
685 |
return
|
686 |
|
def curve_split(obj, t_before, t_after, *t_inj):
    """Function: Split the whole curve that contains several injections
    into pieces.
    The sample information will be checked during the split,
    if the injection number in sampleinfo is not consistent
    with the input injection parameter, it will help you to
    update the sample information.
    Input besides obj:
        t_before: time duration before the injection time point
        t_after: time duration after the injection time point
        t_inj: the exact start time for each injection
    Return: the data object with splitted data (dataformat 2), and
            updated sample information; None if splitting fails.
    """
    dataobj = copy.deepcopy(obj)
    ROInum = dataobj.ROInum
    t=dataobj.data[0]
    ind=[]
    # find how many lines of data each injection window contains
    for i in range(len(t_inj)):
        ind.append(np.shape(plt.find((t>(t_inj[i]-t_before))&(t<(t_inj[i]+t_after))))[0])
    # m = widest window; j = spread between widest and narrowest window,
    # used below to trim all windows to a common length
    m,j = max(ind),max(ind)-min(ind)
    # the new dataset must be a m*((ROInum+1)*num_inj) matrix:
    # T1,S_ROI1_c1,S_ROI2_c1,S_ROI3_c1...T2,S_ROI1_c2,S_ROI2_c2...
    tmpdata = np.zeros(((ROInum+1)*len(t_inj),m))

    for i in range(len(t_inj)):
        TmpInd = plt.find((t>(t_inj[i]-t_before))&(t<(t_inj[i]+t_after)))
        Tmp1,Tmp2 = i*(ROInum+1),(i+1)*(ROInum+1)
        tmpdata[Tmp1:Tmp2,0:len(TmpInd)] = dataobj.data[:,TmpInd]
    if j!=0:
        Split_data = tmpdata[:,:-j]
    else:
        Split_data = tmpdata[:]
    # zero and align the split data: every row starts at zero
    for i in range(np.shape(Split_data)[0]):
        Split_data[i]=Split_data[i]-Split_data[i][0]

    # NOTE(review): check_sample_info() returns a bare None on mismatch,
    # which this tuple unpacking turns into a TypeError before the
    # "not successful" branch below can run -- confirm intended handling.
    newsampleinfo, injectinfo = check_sample_info(dataobj,t_before, t_after, t_inj)
    if newsampleinfo != None :
        dataobj.data = Split_data
        dataobj.updateStatus(DataSplit=True)
        dataobj.updateSampleinfo(newsampleinfo)
        dataobj.dataformat = 2
        dataobj.injectinfo = injectinfo
        return dataobj
    else:
        dataobj.updateStatus(DataSplit=False)
        print 'Curve Splitting not successful!'

        return
738 |
|
739 |
|
def check_sample_info(dataobj, t_before, t_after, t_inj):
    """Check if the sample information is consistent with the injection
    parameter input. If not, help the user to update sample information
    for the splitted curve.
    Input:
        dataobj  -- data object whose sampleinfo is checked.
        t_before, t_after -- window durations around each injection.
        t_inj    -- sequence of injection start times.
    Return: (sampleinfo, injectinfo) on success, where injectinfo holds
            one [t_on, t_off, t_end] triple per injection; None on
            mismatch, refusal or illegal input.
    """
    injectinfo = []
    if len(dataobj.sampleinfo) == len(t_inj):
        # counts match: keep the existing sampleinfo unchanged
        for i,j in enumerate(t_inj):
            injectinfo.append([t_before, t_before+dataobj.sampleinfo[i]['Duration'], t_before+t_after])
        return dataobj.sampleinfo, injectinfo

    elif len(dataobj.sampleinfo) > len(t_inj):
        # fewer injections requested than recorded: let the user pick which
        print """ The number of injection you just input does not match that in sample infomation.
If you just want to split part of the curve, please follow the following direction to update the sampleinfo in the new object."""
        choose = str.upper(raw_input('Continue? (y/n)'))
        if choose == 'Y':
            newsampleinfo, injectinfo = [], []
            print ' Your input indicates %s injections out of %s times in experiment. ' % (len(t_inj), len(dataobj.sampleinfo))
            for n,i in enumerate(dataobj.sampleinfo):
                print '--',n,'.',i['Name'],':',i['Concentration']
            print ' Enter the ID number of the injections you just select, in the form of "2-10" or "4,5,6",'
            tmpstr = raw_input(':')
            if ',' in tmpstr:
                injID = map(int, tmpstr.split(','))
            elif '-' in tmpstr:
                injID = range(int(tmpstr.split('-')[0]), int(tmpstr.split('-')[1])+1)
            else:
                print ' Illegal input!'
                return

            if len(injID) != len(t_inj):
                print ' Please input %s injection ID numbers instead of %s You just input.' %(len(t_inj),len(injID))
                return

            # keep only the chosen injections, in the order given
            for i in injID:
                newsampleinfo.append(dataobj.sampleinfo[i])
                injectinfo.append([t_before, t_before+dataobj.sampleinfo[i]['Duration'], t_before+t_after])
            return newsampleinfo, injectinfo
        else:
            return

    elif len(dataobj.sampleinfo) < len(t_inj):
        # more injections requested than recorded: nothing sensible to do
        print ' The number of injection you just input does not match that in sample infomation.'
        print ' Your input indicates %s injections, where there is only %s times in experiment. ' % (len(t_inj), len(dataobj.sampleinfo))
        print ' Please check.'

        return
787 |
|
788 |
|
789 |
def write_data(Head,data_out,ROI_out,DataDir,output_format):
|
790 |
"""Write the data into a txt file.
|
791 |
"""
|
792 |
# Modified 100405 CGL
|
793 |
# create a newfolder with time stamp to save the data
|
794 |
if output_format=='B':
|
795 |
ftitle='biosensor'
|
796 |
Tmpstr = str(ROI_out).replace(', ','_')
|
797 |
Tmpstr = Tmpstr.replace('(','-')
|
798 |
foutname = ftitle+Tmpstr[:-1]+'.txt'
|
799 |
elif output_format=='C':
|
800 |
foutname='clamp-'+str(ROI_out)+'.txt'
|
801 |
else:
|
802 |
ftitle=''
|
803 |
|
804 |
# Change directories, if requested.
|
805 |
if (DataDir != ""):
|
806 |
os.chdir('..')
|
807 |
try:
|
808 |
os.makedirs(DataDir)
|
809 |
##except WindowsError:
|
810 |
except OSError:
|
811 |
pass
|
812 |
os.chdir(DataDir)
|
813 |
|
814 |
np.savetxt('temp.txt',np.transpose(data_out),fmt='%f')
|
815 |
fdt=file('temp.txt','r')
|
816 |
|
817 |
## foutname=ftitle+Tmpstr[:-1]+'.txt' ##raw_input('Enter the file name of the output: ')
|
818 |
fout=file(foutname,'w+')
|
819 |
# write the head of the data first
|
820 |
fout.write(Head+'\n')
|
821 |
count = len(fdt.readlines())
|
822 |
fdt.seek(0)
|
823 |
for i in range(count):
|
824 |
temp=fdt.readline()
|
825 |
fout.write(temp.replace(' ','\t'))
|
826 |
fdt.close()
|
827 |
fout.close()
|
828 |
print '-------The path of the exported data is "', os.getcwd(),'"-------'
|
829 |
return
|
830 |
|
831 |
|
832 |
def data_output_biosensor(obj, DataDir, *ROI_out):
|
833 |
"""Export the data into a biosensor format txt file,
|
834 |
which can be operated in Scrubber.
|
835 |
Input: the data object, the output path, the ROIs selected to export
|
836 |
"""
|
837 |
# initialize the data output matrix, which will contain 'len(spot_out)*2)*n_inj' columns
|
838 |
dataobj = copy.deepcopy(obj)
|
839 |
data = dataobj.data
|
840 |
n_inj = len(dataobj.sampleinfo)
|
841 |
ROIinfo = dataobj.ROIinfo
|
842 |
ROInum = dataobj.ROInum
|
843 |
|
844 |
data_out = np.zeros(((len(ROI_out)*2)*n_inj,np.shape(data)[1]))
|
845 |
head1 = ""
|
846 |
new_ROIinfo = []
|
847 |
for n,i in enumerate(ROI_out):
|
848 |
new_ROIinfo.append(ROIinfo[i-1]) ##the new ROI info
|
849 |
if i != ROIinfo[i-1]['IDcal']:
|
850 |
# check if the ID number matches position
|
851 |
print n,i, ROIinfo[i-1]
|
852 |
return
|
853 |
else:
|
854 |
headcomp = ROIinfo[i-1]['ID']+': '+ROIinfo[i-1]['Name']
|
855 |
try:
|
856 |
headcomp = headcomp + ', '+ROIinfo[i-1]['Conc']
|
857 |
except KeyError:
|
858 |
pass
|
859 |
headcomp = headcomp+' Fc='+str(n+1)+' - '
|
860 |
for j in range(n_inj):
|
861 |
headtmp = headcomp+str(j+1)+'_'
|
862 |
head1 = head1+headtmp+'X\t'+headtmp+'Y\t'
|
863 |
data_out[2*n*n_inj+2*j]=data[j*(ROInum+1)]
|
864 |
data_out[2*n*n_inj+2*j+1]=data[j*(ROInum+1)+i]
|
865 |
head1=head1[:-1]
|
866 |
|
867 |
dataobj.data = data_out
|
868 |
dataobj.dataformat = 3 # biosensor dataformat
|
869 |
dataobj.updateStatus(DataType='Biosensor data')
|
870 |
dataobj.updateStatus(Head=head1)
|
871 |
dataobj.updateROIinfo(new_ROIinfo)
|
872 |
|
873 |
write_data(head1,data_out,ROI_out,DataDir,'B')
|
874 |
|
875 |
return dataobj
|
876 |
|
877 |
def data_output_clamp(obj, DataDir, ROI_out):
|
878 |
"""Export the data into a Clamp format txt file,
|
879 |
which can be operated in Clamp.
|
880 |
Input: the data object, the output path, the selected ROI
|
881 |
"""
|
882 |
# initialize the data output matrix, which will contain '2*n_inj' columns
|
883 |
dataobj = copy.deepcopy(obj)
|
884 |
data = dataobj.data
|
885 |
n_inj = len(dataobj.sampleinfo)
|
886 |
n_inj = max(n_inj, 1) # Must have at least one injection.
|
887 |
ROIinfo = dataobj.ROIinfo
|
888 |
ROInum = dataobj.ROInum
|
889 |
sampleinfo = dataobj.sampleinfo
|
890 |
|
891 |
data_out = np.zeros((2*n_inj,np.shape(data)[1]))
|
892 |
## if ROI_out != ROIinfo[ROI_out-1]['IDcal']:
|
893 |
if (ROI_out != int(ROIinfo[ROI_out-1]['ID'])):
|
894 |
print 'Please check the ROI information.'
|
895 |
print ROI_out, ROIinfo[ROI_out-1]
|
896 |
return
|
897 |
else:
|
898 |
tmp = ROIinfo[ROI_out-1]['ID']+': '+ROIinfo[ROI_out-1]['Name']
|
899 |
try:
|
900 |
tmp = tmp +', Conc:'+ROIinfo[ROI_out-1]['Conc']
|
901 |
except KeyError:
|
902 |
pass
|
903 |
Clamphead = 'Vers 3.41 '+tmp
|
904 |
conc=''
|
905 |
t_start,t_stop='',''
|
906 |
head1='Time\tData\t'*n_inj
|
907 |
head1=head1[:-1]
|
908 |
|
909 |
for i in range(n_inj):
|
910 |
data_out[2*i]=data[i*(ROInum+1)]
|
911 |
data_out[2*i+1]=data[i*(ROInum+1)+ROI_out]
|
912 |
name = sampleinfo[i]['Name']
|
913 |
conc=conc+'\t'+str(sampleinfo[i]['Concentration'])
|
914 |
t_start=t_start+'\t'+str(dataobj.injectinfo[i][0])
|
915 |
t_stop=t_stop+'\t'+str(dataobj.injectinfo[i][1])
|
916 |
|
917 |
hconc,h_start,h_stop='\nConc1'+conc,'\nStart1'+t_start,'\nStop1'+t_stop
|
918 |
for i in (hconc,h_start,h_stop):
|
919 |
if i[-1]!='':
|
920 |
Clamphead=Clamphead+i
|
921 |
|
922 |
Clamphead=Clamphead+'\n'+head1
|
923 |
|
924 |
dataobj.data = data_out
|
925 |
dataobj.dataformat = 4 # clamp dataformat
|
926 |
dataobj.updateStatus(DataType='Clamp data')
|
927 |
dataobj.updateStatus(Head=Clamphead)
|
928 |
dataobj.updateROIinfo(ROIinfo[ROI_out-1])
|
929 |
|
930 |
write_data(Clamphead,data_out,ROI_out,DataDir,'C')
|
931 |
|
932 |
return dataobj
|
933 |
## return data_out,Clamphead
|
934 |
|
935 |
def obj_save(obj, foutname):
|
936 |
foutname = foutname + '.obj' # the file extension of the saved data object is .obj
|
937 |
fout = file(foutname, 'w+')
|
938 |
pickle.dump(obj, fout)
|
939 |
fout.close()
|
940 |
print '-------The path of the exported obj is "', os.getcwd(),'"-------'
|
941 |
return
|
942 |
|
if (__name__ == '__main__'):
    # Print a short message if this module is run like an app;
    # the module is otherwise meant to be imported (see module docstring).
    print "This is the OSPRAI file type conversion tool."