fishBook/fishbook/backend/database.py
Jan Grewe 6e2bfde2dc [database] fix repro run counter ...
Sometimes repros do not put out stimuli, for whatever reason.
They are noted in stimuli.dat but not, e.g., in stimspikes.dat.
This leads to conflicts when fetching the data...
2020-08-14 17:19:02 +02:00

577 lines
20 KiB
Python

import os
import glob
import socket
import uuid

import numpy as np
import datajoint as dj
import nixio as nix
import yaml
from IPython import embed

from fishbook.backend.util import read_info_file, read_dataset_info, read_stimuli_file
from fishbook.backend.util import find_key_recursive, deep_get, find_mtags_for_tag
from fishbook.backend.util import mtag_settings_to_yaml, nix_metadata_to_yaml, mtag_features_to_yaml, progress

dj.config["enable_python_native_blobs"] = True
schema = dj.schema("fish_book", locals())


@schema
class Datasets(dj.Manual):
    definition = """ # _Dataset
    dataset_id : varchar(256)
    ----
    data_source : varchar(512)   # path to the dataset
    data_host : varchar(512)     # fully qualified domain name
    experimenter : varchar(512)
    setup : varchar(128)
    recording_date : date
    quality : varchar(512)
    comment : varchar(1024)
    duration : float
    has_nix : bool
    """

    @staticmethod
    def get_template_tuple(id=None):
        if id is not None:
            return dict((Datasets() & {"dataset_id": id}).fetch1())
        return dict(dataset_id=None, data_source="", data_host="", experimenter="", setup="",
                    recording_date=None, quality="", comment="", duration=0.0, has_nix=False)
    @staticmethod
    def get_nix_file(key):
        dset = (Datasets() & key).fetch1()
        # 'ignore' is not an attribute of the table definition; guard the lookup
        # so a missing key does not raise.
        if dset.get("ignore", False):
            return None
        file_path = os.path.join(dset["data_source"], dset["dataset_id"] + ".nix")
        if not os.path.exists(file_path):
            print("\t No nix file found for path: %s" % dset["data_source"])
            return None
        if not Datasets.check_file_integrity(file_path):
            return None
        return file_path

    @staticmethod
    def check_file_integrity(nix_file):
        sane = True
        try:
            f = nix.File.open(nix_file, nix.FileMode.ReadOnly)
            b = f.blocks[0]
            m = b.metadata
            if "Recording" not in m.sections:
                # 'Warning(...)' only constructed an exception without raising it
                print("\t Could not find Recording section in dataset: %s" % nix_file)
                sane = False
            f.close()
        except Exception:  # 'except ():' would match nothing
            print("file: %s is NOT SANE!" % nix_file)
            sane = False
        return sane
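

# Usage sketch (the dataset id below is hypothetical): without an argument,
# get_template_tuple returns an empty template dict; with an id it returns the
# stored tuple. get_nix_file returns the path to a sane nix file, or None.
# template = Datasets.get_template_tuple()
# nix_path = Datasets.get_nix_file({"dataset_id": "2010-06-18-aa"})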


@schema
class Subjects(dj.Manual):
    definition = """
    # Subjects
    subject_id : varchar(256)
    ----
    species : varchar(256)
    """

    @staticmethod
    def get_template_tuple(subject_id=None):
        if subject_id is not None:
            return dict((Subjects() & {"subject_id": subject_id}).fetch1())
        return dict(subject_id=None, species="")

    def make(self, key):
        file_path = Datasets.get_nix_file(key)
        if file_path is None:
            return
        nix_file = nix.File.open(file_path, nix.FileMode.ReadOnly)
        m = nix_file.blocks[0].metadata
        inserts = Subjects.get_template_tuple()
        subj_info = m["Recording"]["Subject"]
        inserts["subject_id"] = subj_info["Identifier"]
        inserts["species"] = subj_info["Species"][0]
        inserts["weight"] = subj_info["Weight"]
        inserts["size"] = subj_info["Size"]
        inserts["eod_frequency"] = np.round(subj_info["EOD Frequency"] * 10) / 10
        inserts.update(key)
        # weight, size, and eod_frequency are not attributes of this table;
        # ignore the extra fields on insert instead of raising.
        self.insert1(inserts, skip_duplicates=True, ignore_extra_fields=True)
        nix_file.close()

    # @property
    # def datasets(self):
    #     return ...


@schema
class SubjectDatasetMap(dj.Manual):
    definition = """
    # SubjectDatasetMap
    -> Subjects
    -> Datasets
    """


@schema
class SubjectProperties(dj.Manual):
    definition = """
    # _SubjectProperties
    id : int auto_increment
    ----
    -> Subjects
    recording_date : date
    weight : float
    size : float
    eod_frequency : float
    """

    @staticmethod
    def get_template_tuple(id=None):
        if id is not None:
            return dict((SubjectProperties() & {"id": id}).fetch1())
        return dict(id=None, subject_id=None, recording_date=None, weight=0.0, size=0.0,
                    eod_frequency=0.0)


@schema
class Cells(dj.Manual):
    definition = """
    # Table that stores information about recorded cells.
    cell_id : varchar(256)
    ----
    -> Subjects
    cell_type : varchar(256)
    firing_rate : float
    structure : varchar(256)
    region : varchar(256)
    subregion : varchar(256)
    depth : float
    lateral_pos : float
    transversal_section : float
    """

    @staticmethod
    def get_template_tuple(cell_id=None):
        if cell_id is not None:
            return dict((Cells() & {"cell_id": cell_id}).fetch1())
        return dict(cell_id=None, subject_id=None, cell_type="", firing_rate=0.0,
                    depth=0.0, region="", subregion="", structure="",
                    lateral_pos=0.0, transversal_section=0.0)


@schema
class CellDatasetMap(dj.Manual):
    definition = """
    # Table that maps recorded cells to datasets
    -> Datasets
    -> Cells
    """


@schema
class Repros(dj.Manual):
    definition = """
    repro_id : varchar(512)    # The name that was given to the RePro run by relacs
    run : smallint             # A counter counting the runs of the RePro in this dataset
    -> Cells
    ----
    repro_name : varchar(512)  # The original name of the RePro itself, not any name given by the user or relacs
    settings : varchar(3000)   # Yaml-formatted string containing the repro settings (tag.metadata in case of a nix file)
    start : float              # The start time of the repro
    duration : float           # The duration of the repro
    """

    @staticmethod
    def get_template_tuple(repro_id=None):
        if repro_id is not None:
            return dict((Repros() & {"repro_id": repro_id}).fetch1())
        return dict(repro_id=None, cell_id=None, run=0, repro_name="", settings=None,
                    start=None, duration=None)


@schema
class Stimuli(dj.Manual):
    definition = """
    stimulus_id : varchar(50)
    -> Repros
    ---
    stimulus_index : int
    stimulus_name : varchar(512)
    mtag_id : varchar(50)
    start_time : float
    start_index : int
    duration : float
    settings : varchar(3000)
    """

    @staticmethod
    def get_template_tuple(stimulus_id=None):
        if stimulus_id is not None:
            return dict((Stimuli & {"stimulus_id": stimulus_id}).fetch1())
        return dict(stimulus_id=None, stimulus_index=None, stimulus_name="", start_index=0,
                    start_time=0.0, duration=0.0, settings=None)
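

# Query sketch (the ids are hypothetical): the tables chain Datasets -> Cells ->
# Repros -> Stimuli, so stimuli can be fetched by restricting with an upstream
# query, e.g. all stimuli of a given cell and repro:
# stims = (Stimuli & (Repros & {"cell_id": "2010-06-18-aa", "repro_name": "SAM"})).fetch()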


def populate_datasets(data_path, update=False):
    if not os.path.exists(data_path):
        return False
    dset_name = os.path.split(data_path)[-1]
    experimenter, rec_date, quality, comment, has_nix, rec_duration, setup = read_dataset_info(os.path.join(data_path, 'info.dat'))
    if not experimenter:
        return False

    inserts = Datasets.get_template_tuple()
    inserts["dataset_id"] = dset_name
    inserts["data_source"] = os.path.abspath(data_path)
    inserts["data_host"] = socket.getfqdn()
    inserts["experimenter"] = experimenter
    inserts["recording_date"] = rec_date
    inserts["quality"] = quality if not isinstance(quality, dict) else ""
    inserts["comment"] = comment if not isinstance(comment, dict) else ""
    inserts["duration"] = rec_duration
    inserts["setup"] = setup
    inserts["has_nix"] = has_nix

    if len(Datasets & "dataset_id like '%s'" % inserts["dataset_id"]) > 0 and not update:
        print('\t\t %s is already in database!' % dset_name)
        return False
    Datasets().insert1(inserts, skip_duplicates=True)
    return True
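

# Usage sketch (the path is hypothetical): returns True when the dataset was
# inserted, False if the path does not exist, info.dat is unreadable, or the
# dataset is already in the database and update is False.
# populate_datasets("/data/apteronotus/2010-06-18-aa", update=False)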


def populate_subjects(data_path):
    print("\tImporting subject(s) of %s" % data_path)
    dset_name = os.path.split(data_path)[-1]
    info_file = os.path.join(data_path, 'info.dat')
    if not os.path.exists(info_file):
        return None, None, False
    info = read_info_file(info_file)

    p = []
    find_key_recursive(info, "Subject", p)
    subj = {}
    if len(p) > 0:
        subj = deep_get(info, p)

    inserts = Subjects.get_template_tuple()
    subj_id = None
    if "Identifier" in subj.keys():
        if isinstance(subj["Identifier"], dict):
            subj_id = "unspecified_" + dset_name
        else:
            subj_id = subj["Identifier"]
    elif "Identifier" in info.keys():
        if isinstance(info["Identifier"], dict):
            subj_id = "unspecified_" + dset_name
        else:
            subj_id = info["Identifier"]
    else:
        subj_id = "unspecified_" + dset_name
    inserts["subject_id"] = subj_id
    inserts["species"] = subj.get("Species", "")  # guard against a missing Species entry
    Subjects().insert1(inserts, skip_duplicates=True)

    # multi match entry
    dataset = dict((Datasets() & {"dataset_id": dset_name}).fetch1())
    mm = dict(dataset_id=dataset["dataset_id"], subject_id=inserts["subject_id"])
    SubjectDatasetMap.insert1(mm, skip_duplicates=True)

    # subject properties
    props = SubjectProperties.get_template_tuple()
    props["subject_id"] = inserts["subject_id"]
    props["recording_date"] = dataset["recording_date"]
    if "Weight" in subj.keys():
        props["weight"] = np.round(float(subj["Weight"][:-1]), 1)  # strip the unit, e.g. 'g'
    if "Size" in subj.keys():
        props["size"] = np.round(float(subj["Size"][:-2]), 1)  # strip the unit, e.g. 'cm'
    if "EOD Frequency" in subj.keys():
        props["eod_frequency"] = np.round(float(subj["EOD Frequency"][:-2]))  # strip the unit 'Hz'
    p = props.copy()
    p.pop("id")
    if len(SubjectProperties & p) == 0:
        SubjectProperties.insert1(props, skip_duplicates=True)


def populate_cells(data_path):
    print("\tImporting cell(s) of %s" % data_path)
    dset_name = os.path.split(data_path)[-1]
    info_file = os.path.join(data_path, 'info.dat')
    if not os.path.exists(info_file):
        return None, None, False
    info = read_info_file(info_file)

    p = []
    find_key_recursive(info, "Subject", p)
    subject_info = deep_get(info, p)
    p = []
    find_key_recursive(info, "Cell", p)
    cell_info = deep_get(info, p)
    p = []
    res = find_key_recursive(info, "Firing Rate1", p)
    if res:
        firing_rate = deep_get(info, p, default=0.0)
    else:
        firing_rate = 0.0
    if isinstance(firing_rate, str):
        firing_rate = float(firing_rate[:-2])  # strip the unit 'Hz'

    subj_id = None
    if "Identifier" in subject_info.keys():
        if isinstance(subject_info["Identifier"], dict):
            subj_id = "unspecified_" + dset_name
        else:
            subj_id = subject_info["Identifier"]
    elif "Identifier" in info.keys():
        if isinstance(info["Identifier"], dict):
            subj_id = "unspecified_" + dset_name
        else:
            subj_id = info["Identifier"]
    else:
        subj_id = "unspecified_" + dset_name

    dataset = dict((Datasets & {"dataset_id": dset_name}).fetch1())
    subject = dict((Subjects & {"subject_id": subj_id}).fetch1())
    dataset_id = dataset["dataset_id"]
    # the cell id is the dataset id reduced to its first four dash-separated parts;
    # the original check compared the string length, not the number of parts
    cell_id = "-".join(dataset_id.split("-")[:4]) if len(dataset_id.split("-")) > 4 else dataset_id

    cell_props = Cells.get_template_tuple()
    cell_props["subject_id"] = subject["subject_id"]
    cell_props["cell_id"] = cell_id
    cell_props["cell_type"] = cell_info["CellType"]
    cell_props["firing_rate"] = firing_rate
    if "Structure" in cell_info.keys():
        cell_props["structure"] = cell_info["Structure"]
    if "BrainRegion" in cell_info.keys():
        cell_props["region"] = cell_info["BrainRegion"]
    if "BrainSubRegion" in cell_info.keys():
        cell_props["subregion"] = cell_info["BrainSubRegion"]
    if "Depth" in cell_info.keys():
        cell_props["depth"] = float(cell_info["Depth"][:-2])  # strip the unit, e.g. 'um'
    if "Lateral position" in cell_info.keys():
        cell_props["lateral_pos"] = float(cell_info["Lateral position"][:-2])
    if "Transverse section" in cell_info.keys():
        cell_props["transversal_section"] = float(cell_info["Transverse section"])
    Cells.insert1(cell_props, skip_duplicates=True)

    # multi match entry
    mm = dict(dataset_id=dataset["dataset_id"], cell_id=cell_props["cell_id"])
    CellDatasetMap.insert1(mm, skip_duplicates=True)
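

# Note on the nix layout assumed by the scan below (inferred from the code, not
# from the nix documentation): repro runs are stored as tags whose type contains
# "relacs.repro_run"; the individual stimulus presentations of a run live in
# multi-tags whose positions and extents hold the start time and duration of
# each stimulus segment.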


def scan_nix_file_for_repros(dataset):
    print("\t\tscanning nix file")
    cell_id = (Cells * CellDatasetMap * (Datasets & "dataset_id = '%s'" % dataset["dataset_id"])).fetch("cell_id", limit=1)[0]
    nix_files = glob.glob(os.path.join(dataset["data_source"], "*.nix"))
    for nf in nix_files:
        if not Datasets.check_file_integrity(nf):
            print("\t\tfile is not sane!!!")
            continue
        f = nix.File.open(nf, nix.FileMode.ReadOnly)
        b = f.blocks[0]
        repro_runs = [t for t in b.tags if "relacs.repro_run" in t.type]
        total = len(repro_runs)
        for i, t in enumerate(repro_runs):
            rs = t.metadata.find_sections(lambda x: "Run" in x.props)
            # check for an empty result before indexing into it
            if len(rs) == 0:
                continue
            rs = rs[0]
            progress(i + 1, total, "Scanning repro run %s" % rs["RePro"])
            rp = Repros.get_template_tuple()
            rp["run"] = rs["Run"]
            rp["repro_name"] = rs["RePro"]
            rp["cell_id"] = cell_id
            rp["repro_id"] = t.name
            settings = t.metadata.find_sections(lambda x: "settings" in x.type)
            if len(settings) > 0:
                rp["settings"] = nix_metadata_to_yaml(settings[0])
            else:
                rp["settings"] = nix_metadata_to_yaml(t.metadata)
            rp["start"] = t.position[0]
            rp["duration"] = t.extent[0]
            Repros.insert1(rp, skip_duplicates=True)

            # import stimuli
            repro = dict((Repros & dict(repro_id=rp["repro_id"], cell_id=cell_id)).fetch1())
            repro.pop("settings")
            repro.pop("repro_name")
            repro.pop("start")
            repro.pop("duration")
            mtags, positions = find_mtags_for_tag(b, t)
            mt_settings_dict = {}
            positions_dict = {}
            extents_dict = {}
            for j, mt in enumerate(mtags):
                if mt.id in positions_dict.keys():
                    mt_positions = positions_dict[mt.id]
                    mt_extents = extents_dict[mt.id]
                    mdata_yaml = mt_settings_dict[mt.id]
                else:
                    mdata_yaml = nix_metadata_to_yaml(mt.metadata)
                    mt_settings_dict[mt.id] = mdata_yaml
                    mt_positions = np.atleast_2d(mt.positions[:])
                    mt_extents = np.atleast_2d(mt.extents[:])
                    if mt.positions.shape[0] != mt_positions.shape[0]:
                        mt_positions = mt_positions.T
                        mt_extents = mt_extents.T
                    # store the computed positions/extents so the cache above is
                    # actually hit on repeated multi-tags
                    positions_dict[mt.id] = mt_positions
                    extents_dict[mt.id] = mt_extents
                for p in positions[j]:
                    settings = mtag_features_to_yaml(mt, p, mdata_yaml)
                    stim_start = mt_positions[p, 0]
                    stim_duration = mt_extents[p, 0]
                    stim = Stimuli.get_template_tuple()
                    stim["stimulus_id"] = str(uuid.uuid1())
                    stim["stimulus_index"] = p
                    stim["start_time"] = stim_start
                    stim["start_index"] = -1
                    stim["duration"] = stim_duration
                    stim["settings"] = settings
                    stim["mtag_id"] = mt.id
                    stim["stimulus_name"] = mt.name
                    stim.update(repro)
                    Stimuli.insert1(stim, skip_duplicates=True)
        print(" " * 120, end="\r")
        print("\n")
        f.close()
        f = None


def scan_folder_for_repros(dataset):
    print("\t\tNo nix file, scanning directory!")
    repro_settings, stim_indices = read_stimuli_file(dataset["data_source"])
    repro_counts = {}    # internal counter for repro runs, for cases in which the repro does not have a run counter
    repros_skipped = {}  # internal counter to correct the run counter for repro runs in which no stimulus was put out;
                         # needed to stay aligned with e.g. the stimspikes file
    repros_without_stims = ["baselineactivity"]
    cell_id = (Cells * CellDatasetMap * (Datasets & "dataset_id = '%s'" % dataset["dataset_id"])).fetch("cell_id", limit=1)[0]
    for rs, si in zip(repro_settings, stim_indices):
        rp = Repros.get_template_tuple()
        path = []
        if not find_key_recursive(rs, "repro", path):
            find_key_recursive(rs, "RePro", path)
        rp["repro_name"] = deep_get(rs, path, "None")

        # count skipped runs: repros that are expected to put out stimuli but did not
        if rp["repro_name"] not in repros_skipped.keys():
            repros_skipped[rp["repro_name"]] = 0
        if rp["repro_name"].lower() not in repros_without_stims and len(si) == 0:
            repros_skipped[rp["repro_name"]] += 1
            continue

        if rp["repro_name"] in repro_counts.keys():
            repro_counts[rp["repro_name"]] += 1
        else:
            repro_counts[rp["repro_name"]] = 0

        path = []
        if not find_key_recursive(rs, "run", path):
            find_key_recursive(rs, "Run", path)
        if len(path) > 0:
            rp["run"] = int(deep_get(rs, path, 0)) - repros_skipped[rp["repro_name"]]
        else:  # the run information is missing and needs to be reconstructed from the counter
            rp["run"] = repro_counts[rp["repro_name"]] - repros_skipped[rp["repro_name"]]
rp["cell_id"] = cell_id
rp["repro_id"] = rp["repro_name"] + str(repro_counts[rp["repro_name"]])
rp["start"] = 0.
rp["duration"] = 0.
rp["settings"] = yaml.dump(rs).replace("'", "")
Repros.insert1(rp, skip_duplicates=True)
# import stimuli
repro = dict((Repros & dict(repro_id=rp["repro_id"], cell_id=cell_id)).fetch1())
repro.pop("settings")
repro.pop("repro_name")
repro.pop("start")
repro.pop("duration")
total = len(si.keys())
for j, k in enumerate(si.keys()):
progress(j+1, total, "scanning repro %s" % rp["repro_name"])
s = int(si[k])
stim_start = 0.
path = []
if not find_key_recursive(rs, "duration", path):
find_key_recursive(rs, "Duration", path)
if len(path) > 0 :
stim_duration = deep_get(rs, path, None)
if "sec" in stim_duration:
stim_duration = float(stim_duration[:stim_duration.index("sec")])
elif "ms" in stim_duration:
stim_duration = float(stim_duration[:stim_duration.index("ms")]) / 1000
else:
stim_duration = float(stim_duration[:stim_duration.index("s")])
else:
stim_duration = 0.0
stim = Stimuli.get_template_tuple()
stim["stimulus_id"] = str(uuid.uuid1())
stim["stimulus_index"] = j
stim["start_time"] = stim_start
stim["start_index"] = s
stim["duration"] = stim_duration
stim["settings"] = yaml.dump(rs).replace("'", "")
stim["mtag_id"] = ""
stim["stimulus_name"] = ""
stim.update(repro)
Stimuli.insert1(stim, skip_duplicates=True)
print(" " *120, end='\r')
#if i < len(repro_settings):
# print((" " * 150), end="\r")


def populate_repros(data_path):
    print("\tImporting RePro(s) of %s" % data_path)
    dset_name = os.path.split(data_path)[-1]
    if len(Datasets & {"dataset_id": dset_name}) != 1:
        return False
    dataset = dict((Datasets & {"dataset_id": dset_name}).fetch1())
    if dataset["has_nix"]:
        scan_nix_file_for_repros(dataset)
    else:
        scan_folder_for_repros(dataset)
    return True


def drop_tables():
    Datasets.drop()
    Subjects.drop()


def populate(datasets, update=False):
    for i, d in enumerate(datasets):
        print("Importing %i of %i: %s" % (i + 1, len(datasets), d))
        if not populate_datasets(d, update):
            continue
        populate_subjects(d)
        populate_cells(d)
        try:
            populate_repros(d)
        except Exception:  # 'except ():' would catch nothing
            print("\t\tsomething went wrong! %s" % d)


if __name__ == "__main__":
    data_dir = "/data/apteronotus"
    # data_dir = "../high_freq_chirps/data"
    # drop_tables()
    # datasets = glob.glob("/Users/jan/zwischenlager/2012-*")
    # note: os.path.join ignores data_dir when the second argument is an
    # absolute path, so the pattern must be relative to data_dir
    datasets = glob.glob(os.path.join(data_dir, '2010-06-18*'))
    populate(datasets, update=False)