# torus_ephys/NixFrame.py
# (repository snapshot metadata: 2019-10-23 15:53:39 +02:00, 447 lines, 18 KiB, Python)
import nixio as nix
import numpy as np
import os
import pandas as pd
import pickle
from copy import deepcopy
'''
This file converts .nix files into a more readable format (pandas.DataFrame).
NixToFrame(<folder>) searches all subfolders of <folder> for .nix files, converts
each one with DataFrame(<nixfile>), and saves the result as a pickle file.
df = load_data(<filename>) loads such a pickle file again.
'''
def DataFrame(nixfile, before=0.001, after=0.001, savefile=False, saveto='./', mindepth=0, delimiter=';'):
    '''
    Open a nix file, extract the data and convert it to a pandas.DataFrame.

    :param nixfile (string): path and name of .nix file
    :param before (float): time before sweep onset that is saved, too (in the time unit of the file, usually [s])
    :param after (float): time after sweep offset that is saved, too (in the time unit of the file, usually [s])
    :param savefile (string, bool): if a string, the dataframe will be saved as <savefile>_dataframe.pickle;
                                    if True the name is derived from the nix file (nixfile.split('.nix')[0])
    :param saveto (string): directory the pickle file is written to (prepended to the file name)
    :param mindepth (int): minimum number of nested levels in the dataframe columns
    :param delimiter (string): internally used to create the nested column index; use a delimiter
                               that does not naturally occur in your protocol names
    :return dataframe (pandas.DataFrame): pandas.DataFrame with available nix data, or None on any failure
    '''
    # only the first block of the nix file is converted
    try:
        block = nix.File.open(nixfile,'r').blocks[0]
    except:
        print('cannot open ' + nixfile.split('/')[-1] + ', skip it')
        return None
    data_arrays = block.data_arrays
    names = [data_arrays[i].name for i in range(len(data_arrays))]
    shapes = [x.shape for x in data_arrays]
    # keep only arrays that are (nearly) as long as the first one -- presumably the
    # continuously sampled traces; NOTE(review): assumes data_arrays[0] is a full-length trace
    data_names = np.array([[x, i] for i, x in enumerate(names) if (shapes[i][0] >= 0.999 * shapes[0][0])])
    header_names = [mt.name for mt in block.multi_tags]
    try:
        data_traces = np.array([data_arrays[name][:] for name, idx in data_names])
    except:
        print(nixfile.split('/')[-1] + ' has corrputed data_arrays, no file will be saved')
        return None
    # time axis and sampling interval taken from the second data array
    # NOTE(review): assumes data_arrays[1] is regularly sampled -- confirm
    time = data_arrays[1].dimensions[0].axis(data_arrays[1].shape[0])
    dt = time[1]-time[0]
    block_metadata = {}
    block_metadata[block.id] = GetMetadataDict(block.metadata)
    tag = block.tags
    tag_metadata = {}
    tag_id_times = {}
    # one entry per repro run: tag id, name and start position
    protocols = [[]] * len(tag)
    for i,t in enumerate(tag):
        protocols[i] = {}
        protocols[i]['id'] = t.id
        protocols[i]['name'] = t.name
        protocols[i]['position'] = t.position[0]
    protocols = pd.DataFrame(protocols)
    for i in range(len(tag)):
        try:
            meta = tag[i].metadata
            tag_metadata[meta.id] = GetMetadataDict(meta)
            tag_id_times[meta.id] = [tag[i].position[0], tag[i].position[0]+tag[i].extent[0]]
        except:
            print(nixfile.split('/')[-1] + ' has no tags, no file will be saved')
            return None
    # normalize old-style array names ('... onset times' / '... durations')
    # to the underscore form used below
    protocol_idcs_old = np.where([(' onset times' in name) for name in names])[0]
    durations_idcs_old = np.where([(' durations' in name) for name in names])[0]
    for idx in protocol_idcs_old:
        names[idx] = names[idx].split(' onset times')[0] + '_onset_times'
    for idx in durations_idcs_old:
        names[idx] = names[idx].split(' durations')[0] + '_durations'
    protocol_idcs = np.where([('_onset_times' in name) for name in names])[0]
    if len(protocol_idcs) == 0:
        print(nixfile.split('/')[-1] + ' is empty, no file will be saved')
        return None
    # collect the per-sweep scalar features of every multi_tag into data_df
    data_df = []
    for hn in header_names:
        # all data arrays whose name starts with this multi_tag name
        idcs = np.where(np.array([hn in name[:len(hn)] for name in names], dtype=int))[0]
        for j,idx in enumerate(idcs):
            data = data_arrays[int(idx)][:]
            if j == 0:
                # first array of this multi_tag: create one dict per sweep
                i0 = len(data_df)
                protname = hn.split('-')[0]
                typename = hn.split('-')[1]
                if protname == 'VC=':
                    continue
                data_df.extend([[]]*len(data))
                for jj in range(len(data)):
                    data_df[i0 + jj] = {}
                    data_df[i0 + jj]['type'] = typename
                    # data_df[i0 + jj]['row_id'] = uuid.uuid4()
                    # normalize historical spellings of the type names
                    if typename == 'PNSubatraction':
                        data_df[i0 + jj]['type'] = 'PNSubtraction'
                    elif typename == 'Qualitycontrol':
                        data_df[i0 + jj]['type'] = 'QualityControl'
            dataname = names[idx].split(hn)[1][1:]
            if protname == 'VC=':
                continue
            if dataname == 'Current-1':
                continue
            for jj in range(len(data)):
                data_df[i0 + jj][dataname] = data[jj]
    # collect all keys that occur in any sweep dict
    keys = np.unique([list(d.keys()) for d in data_df])
    k = []
    for key in keys:
        k.extend(key)
    keys = np.unique(k)
    if 'repro_tag_id' not in keys:
        # assign each sweep to the last repro tag that started before its onset
        for i in range(len(data_df)):
            data_df[i]['repro_tag_id'] = protocols[~(protocols.position >= data_df[i]['onset_times'])]['id'].iloc[-1]
    # cut the raw traces (sampled data and event times) around every sweep
    traces_idx = np.where([("data.sampled" in d.type) or ("data.events" in d.type) for d in data_arrays])[0]
    traces_df = []
    for i,idx in enumerate(traces_idx):
        for j in range(len(data_df)):
            if i == 0:
                traces_df.append({})
            if "data.sampled" in data_arrays[names[int(idx)]].type:
                # sampled trace: slice by sample index
                idx0 = int((data_df[j]['onset_times'] - before) / dt)
                idx1 = int((data_df[j]['onset_times'] + data_df[j]['durations'] + after) / dt)
                if idx0>=idx1:
                    traces_df[j][names[idx]] = np.array([np.nan])
                else:
                    traces_df[j][names[idx]] = data_arrays[names[int(idx)]][idx0:idx1]
            elif "data.events" in data_arrays[names[int(idx)]].type:
                # event trace (e.g. spike times): keep events inside the time window,
                # stored relative to the sweep onset
                idx0 = int((data_df[j]['onset_times'] - before) / dt)
                idx1 = int((data_df[j]['onset_times'] + data_df[j]['durations'] + after) / dt)
                t0 = data_df[j]['onset_times'] - before
                t1 = data_df[j]['onset_times'] + data_df[j]['durations'] + after
                if t0>=t1:
                    traces_df[j][names[idx]] = np.array([np.nan])
                else:
                    arr = data_arrays[names[int(idx)]][:]
                    traces_df[j][names[idx]] = arr[(arr>=t0) & (arr<=t1)] - data_df[j]['onset_times']
            if i == 0:
                # per-sweep bookkeeping, added once
                traces_df[j]['time'] = time[idx0:idx1] - data_df[j]['onset_times']
                traces_df[j]['time_before_stimulus'] = before
                traces_df[j]['time_after_stimulus'] = after
                traces_df[j]['samplingrate'] = 1 / dt
                traces_df[j]['meta_id'] = list(block_metadata.keys())[0]
                if type(data_df[j]['repro_tag_id']) == bytes:
                    data_df[j]['repro_tag_id'] = data_df[j]['repro_tag_id'].decode("utf-8")
                # protocol names are assumed to look like '<protocol>_<number>' -- TODO confirm
                traces_df[j]['protocol'] = np.array(protocols[protocols.id == data_df[j]['repro_tag_id']].name)[0].split('_')[0]
                traces_df[j]['protocol_number'] = np.array(protocols[protocols.id == str(data_df[j]['repro_tag_id'])].name)[0].split('_')[1]
                traces_df[j]['id'] = data_df[j]['repro_tag_id']
    # flatten tag and block metadata into delimiter-joined keys, per repro tag
    metadic = {}
    for i,key in enumerate(protocols.id):
        d = GetMetadataDict(tag[key].metadata)
        if (len(d.keys()) == 1) and ('RePro-Info' in list(d.keys())[0]):
            d = d['RePro-Info']
        metadic[key] = DicCrawl(d, key='tag_meta', delimiter=delimiter)
        metadic[key].update(DicCrawl(block_metadata[list(block_metadata.keys())[0]], key='block_meta', delimiter=delimiter))
    meta_df = [[]]*len(data_df)
    for i in range(len(data_df)):
        meta_df[i] = metadic[str(data_df[i]['repro_tag_id'])]
    # pad all keys with delimiters so every key has the same nesting depth;
    # this makes the later column MultiIndex rectangular
    dics = [meta_df, data_df, traces_df]
    old_maxcount = mindepth
    for k in range(len(dics)):
        for j in range(len(dics[k])):
            keys = list(dics[k][j].keys())
            counts = np.array([key.count(delimiter) for key in keys])
            maxcount = np.max(counts)
            if maxcount < mindepth:
                maxcount = mindepth
            if maxcount < old_maxcount:
                maxcount = old_maxcount
            # append delimiters to the keys until each key contains the same number of keys
            for i, key in enumerate(keys):
                add = maxcount - counts[i]
                newkey=key
                for ii in range(add):
                    newkey += delimiter
                dics[k][j][newkey] = dics[k][j].pop(key)
            old_maxcount = maxcount
    data_df = pd.DataFrame(data_df)
    traces_df = pd.DataFrame(traces_df)
    meta_df = pd.DataFrame(meta_df)
    data = pd.concat([meta_df, data_df, traces_df], axis=1)
    # split the delimiter-joined keys into a hierarchical column index
    data.columns = data.columns.str.split(';', expand=True)
    if savefile != False:
        if savefile == True:
            savefile = nixfile.split('.nix')[0]
        if saveto != None:
            savefile = savefile.split('/')[-1]
            savefile = saveto+savefile
        with open(savefile + '_dataframe.pickle', 'wb') as f:
            pickle.dump(data, f, -1) # create pickle-files, using the highest pickle-protocol
    return data
def NixToFrame(folder, before=0.0, after=0.0, skipold=True, saveto=None, mindepth=0):
    '''
    Walk the subfolders of *folder* and convert every .nix file found there into
    a pickled pandas DataFrame via DataFrame().

    :param folder: path to a folder whose subfolders (year-month-day-aa style) contain .nix files
    :param before (float): passed to DataFrame(): time kept before each sweep
    :param after (float): passed to DataFrame(): time kept after each sweep
    :param skipold (bool): skip conversion if a dataframe pickle already exists in the subfolder
    :param saveto (string): directory the pickles are written to; if None they are
        saved next to the .nix file
    :param mindepth (int): passed to DataFrame(): minimum nesting depth of the columns
    '''
    if not folder.endswith('/'):
        folder += '/'
    for entry in os.listdir(folder):
        subdir = folder + entry
        if not os.path.isdir(subdir):
            continue
        for fname in os.listdir(subdir):
            if '.nix' not in fname:
                continue
            # skip folders that already contain a converted dataframe
            if skipold == True and any('_dataframe.pickle' in x for x in os.listdir(subdir)):
                print('skip ' + fname)
                continue
            print(fname)
            DataFrame(subdir + '/' + fname, before, after, True, saveto, mindepth=mindepth)
def load_data(filename):
    '''Load a pickled DataFrame previously written by DataFrame()/NixToFrame().'''
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
def GetMetadataDict(metadata):
    '''
    Recursively convert a nix metadata section into a plain nested dict.

    Property values become dict entries keyed by the property name; each
    subsection becomes a nested dict keyed by the subsection name.
    '''
    def _section_to_dict(section):
        # one entry per property of this section
        result = {prop.name: section[prop.name] for prop in section.props}
        # recurse into subsections, if any
        if hasattr(section, 'sections') and len(section.sections) > 0:
            for subsection in section.sections:
                result[subsection.name] = _section_to_dict(subsection)
        return result
    return _section_to_dict(metadata)
def DicCrawl(dic, key='', delimiter=';'):
    '''
    Flatten a nested dict into a single-level dict whose keys are the nesting
    levels joined with *delimiter* (optionally prefixed by *key*).
    '''
    flat_pairs = GetKeyList(dic, key=key, delimiter=delimiter)
    return KeylistToDic(flat_pairs)
def GetKeyList(dic, keylist=None, key='', delimiter=';'):
    '''
    Recursively flatten *dic* into a list of [flat_key, value] pairs, where the
    keys of nested dicts are joined with *delimiter* (prefixed by *key* if given).

    :param dic (dict): possibly nested dictionary to flatten
    :param keylist (list or None): list to append to; None (the default) starts a
        fresh list. BUGFIX: the previous signature used a mutable default ([]),
        so results silently accumulated across successive calls (e.g. every
        DicCrawl() call inherited the pairs of all earlier calls).
    :param key (string): prefix prepended to every generated key
    :param delimiter (string): separator between nesting levels
    :return keylist (list): list of [flattened_key, value] pairs
    '''
    if keylist is None:
        keylist = []
    for name, value in dic.items():
        # build the flattened key for this entry
        flat = str(name) if len(key) == 0 else key + delimiter + str(name)
        # exact type check kept from the original: only plain dicts recurse
        if type(value) == dict:
            keylist = GetKeyList(value, keylist, flat, delimiter)
        else:
            keylist.append([flat, value])
    return keylist
def KeylistToDic(keylist):
    '''Turn a list of [key, value] pairs into a dictionary (later keys win).'''
    return {pair[0]: pair[1] for pair in keylist}
def PNSubtraction(df, currenttrace='Current-1', newcurrenttrace='Current-2', kernelcurrenttrace='Current-3', delimiter=';'):
    '''
    Subtract the averaged, scaled P/N leak sweeps from every stimulus trace.
    Only for VoltageClamp experiments and WILL BE OBSOLETE, SOON.

    NOTE(review): this function calls hf.get_traces(), but 'hf' is neither
    imported nor defined in this file -- as written it raises a NameError.

    :param df (pandas.DataFrame): NixFrame.DataFrame
    :param currenttrace (string): column holding the raw current trace
    :param newcurrenttrace (string): column the P/N-corrected current is written to
    :param kernelcurrenttrace (string): column name used for the kernel-corrected
        current (computed but never merged into the result -- see NOTE below)
    :param delimiter (string): delimiter used for the nested column index
    :return df (pandas.DataFrame): copy of df with the corrected current column added
        and the raw P/N sweeps dropped
    '''
    df = deepcopy(df)
    if 'TraceId' not in df.columns:
        # no explicit trace ids: classify rows as P/N sweeps vs. real traces by position
        for id in np.unique(df.repro_tag_id):
            dfp = df[df.repro_tag_id == id]
            if np.isnan(dfp.pn.iloc[0]) or dfp.pn.iloc[0] == 0:
                continue
            pn = int(dfp.pn.iloc[0])
            # every |pn|+1 rows form one group: |pn| leak sweeps followed by the real trace
            for i in np.arange(0,len(dfp),np.abs(pn)+1, dtype=int):
                locs_pn = df[df.repro_tag_id == id].iloc[i:i+np.abs(pn)].index
                loc = locs_pn[-1] + 1
                if loc <= df[df.repro_tag_id == id].index[-1]:
                    df.loc[locs_pn, 'type'] = 'PNSubtraction'
                    df.loc[loc, 'type'] = 'Trace'
    trace_idx = np.where(df.type == 'Trace')[0]
    currentdic = [[]] * len(trace_idx)
    kerneldic = [[]] * len(trace_idx)
    # pad the new column names with delimiters to match the nesting depth of df's columns
    affix = ''
    affix_num = len(list(df)[0]) - 1
    for i in range(affix_num):
        affix += delimiter
    for idx,i in enumerate(trace_idx):
        if np.isnan(df.pn.iloc[i]) or df.pn.iloc[i] == 0:
            continue
        # find the P/N sweeps belonging to this trace
        if 'TraceId' in df.columns:
            trace_id = df.TraceId.iloc[i]
            idxvec = np.where((df.TraceId == trace_id) & (df.type == 'PNSubtraction'))[0]
        else:
            # fall back: the |pn| P/N sweeps immediately preceding this trace in time
            delays = (df['onset_times'][df.type == 'PNSubtraction']) - df['onset_times'].iloc[i]
            maxidx = delays[np.array(delays)<0].index[-1]
            idxvec = np.arange(maxidx-np.abs(df.pn.iloc[i])+1, maxidx+.1, dtype=int)
        pn_traces = hf.get_traces(currenttrace, df.iloc[idxvec])  # NOTE(review): 'hf' is undefined in this file
        I_trace = np.array(df[currenttrace].iloc[i])
        # average the leak sweeps, remove their baseline, and scale by pn
        pn_traces = np.mean(pn_traces, axis=1)
        pn_traces -= np.mean(pn_traces[:int(df.iloc[i].time_before_stimulus * df.iloc[i].samplingrate)])
        pn_trace = pn_traces*df.pn.iloc[i]
        I_trace -= np.mean(I_trace[:int(df.iloc[i].time_before_stimulus * df.iloc[i].samplingrate)])
        ''' try kernels '''
        # estimate a linear response kernel from the P/N step via FFT deconvolution
        Vpn_trace = hf.get_traces('V-1', df.iloc[idxvec])
        tpn = hf.get_traces('time', df.iloc[idxvec])[:, 0]
        Vtr_trace = np.array(df['V-1'].iloc[i])
        ttr = np.array(df['time'].iloc[i])
        kernel_length = 3  # kernel length, presumably in ms -- TODO confirm
        idx0 = np.nanargmin(np.abs(tpn)) - 1
        idx1 = np.nanargmin(np.abs(tpn - kernel_length / 1000))
        idx40 = np.nanargmin(np.abs(tpn - 0.04))
        idx60 = np.nanargmin(np.abs(tpn - 0.06))
        # idealized step potentials: baseline before t=0, plateau (40-60ms mean) after
        Vpn = tpn * 0.0 + np.mean(np.mean(Vpn_trace, axis=1)[:idx0])
        Vpn[tpn >= 0] = np.mean(np.mean(Vpn_trace, axis=1)[idx40:idx60])
        Vtr = ttr * 0.0 + np.mean(Vtr_trace[:idx0])
        Vtr[ttr >= 0] = np.mean(Vtr_trace[idx40:idx60])
        ftinput = np.fft.fft(np.diff(Vpn[idx0:idx1]))
        ftoutput = np.fft.fft(pn_traces[idx0:idx1 - 1])
        kernel = np.fft.ifft(ftoutput / ftinput)
        # pad the kernel to trace length with its own tail mean
        kernel = np.append(kernel, np.zeros(len(I_trace) - len(kernel))+np.mean(kernel[-20:]))
        kerneldic[idx] = {}
        # NOTE(review): kerneldic is filled here but never merged into the returned df
        kerneldic[idx][kernelcurrenttrace + affix] = I_trace - np.convolve(np.diff(Vtr), kernel)[:len(I_trace)] #- np.convolve(np.diff(Vtr),kernel2)[:len(I_trace)]
        ''' end of kernels '''
        # truncate both traces to the common length before subtracting
        if len(pn_trace) < len(I_trace):
            I_trace = I_trace[:len(pn_trace)]
        elif len(pn_trace) > len(I_trace):
            pn_trace = pn_trace[:len(I_trace)]
        currentdic[idx] = {}
        currentdic[idx][newcurrenttrace+affix] = I_trace - pn_trace
    currentdf = pd.DataFrame(currentdic, index=df[df.type == 'Trace'].index)
    currentdf.columns = currentdf.columns.str.split(';', expand=True)
    df = pd.concat([df, currentdf], axis=1)
    # drop the raw P/N sweeps from the result
    df = df.drop(index = df[df.type == 'PNSubtraction'].index)
    return df
def QualityControl(df, currenttrace='Current-1', potentialtrace='V-1', delimiter=';'):
    '''
    Attach the matching quality-control sweep (traces, holding potential and an
    estimated resistance) to every 'Trace' row of df.
    Only for VoltageClamp experiments.

    :param df (pandas.DataFrame): NixFrame.DataFrame containing 'QualityControl' rows
    :param currenttrace (string): column holding the current trace
    :param potentialtrace (string): column holding the potential trace
    :param delimiter (string): delimiter used for the nested column index
    :return df (pandas.DataFrame): df with 'qualitycontrol;...' columns added and
        the QualityControl rows dropped
    '''
    qc = [[]]*len(df[df.type == 'QualityControl'])
    prefix = 'qualitycontrol' + delimiter
    # pad the new column names with delimiters to match the column nesting depth
    affix = ''
    affix_num = len(list(df)[0]) - 2
    for i in range(affix_num):
        affix += delimiter
    # assume that stimulus has 10ms at holdingpotential followed by 10ms at holdingpotential-20
    samplingrate = df.samplingrate.iloc[0]
    idx10ms = int(0.01 * samplingrate)
    idx20ms = int(0.02 * samplingrate)
    idx1ms = int(0.001 * samplingrate)
    for i in range(len(qc)):
        # one dict per quality-control sweep
        qc[i] = {}
        qc[i][prefix + currenttrace + affix] = df[df.type == 'QualityControl'][currenttrace].iloc[i]
        qc[i][prefix + potentialtrace + affix] = df[df.type == 'QualityControl'][potentialtrace].iloc[i]
        qc[i][prefix + 'onset_times' + affix] = df[df.type == 'QualityControl']['onset_times'].iloc[i]
        qc[i][prefix + 'holdingpotential' + affix] = df[df.type == 'QualityControl']['tag_meta']['settings']['holdingpotential'].iloc[i]
        if 'TraceId' in df.columns:
            qc[i][prefix + 'TraceId' + affix] = df[df.type == 'QualityControl']['TraceId'].iloc[i]
        # mean currents in the last ~1ms of each 10ms step; resistance from the
        # 20(mV?) step as R = dU/dI -- units as stored in the file, TODO confirm
        I1 = np.mean(df[df.type == 'QualityControl'][currenttrace].iloc[i][idx10ms-idx1ms : idx10ms - 2])
        I2 = np.mean(df[df.type == 'QualityControl'][currenttrace].iloc[i][idx20ms-idx1ms : idx20ms - 2])
        R = 20/(I1-I2)
        qc[i][prefix + 'resistance' + affix] = R
    qc_df = pd.DataFrame(qc)
    if len(qc) == 0:
        return df
    # template row filled with NaN for rows that get no quality-control match
    qc_empty = [{}]
    for key in qc[0].keys():
        qc_empty[0][key] = np.nan
    # NOTE(review): list multiplication repeats references to the SAME dict; this is
    # harmless here because entries are only rebound below, never mutated in place
    qc_dic = qc_empty * len(df)
    for idx in df[df.type == 'Trace'].index:
        i = np.where(df.index == idx)[0][0]
        if 'TraceId' in df.columns:
            # match the quality-control sweep by TraceId
            qc_dic[i] = qc[np.where(qc_df[prefix + 'TraceId' + affix] == df.TraceId.loc[idx])[0][0]]
        else:
            None
            # fall back: match by onset time
            delays = qc_df[prefix + 'onset_times' + affix] - df['onset_times'].loc[idx]
            maxidx = np.where(delays[np.array(delays) < 0])[0][0]
            qc_dic[i] = qc[maxidx]
    qc = pd.DataFrame(qc_dic, index=df.index)
    qc.columns = qc.columns.str.split(';', expand=True)
    df = pd.concat([df, qc], axis=1)
    df = df.drop(index=df[df.type == 'QualityControl'].index)
    return df