# -*- coding: utf-8 -*-
"""
The :mod:`neurotic.datasets.metadata` module implements a class for reading
metadata files.
.. autoclass:: MetadataSelector
:members:
"""
import logging
import os
import urllib
import urllib.parse

import yaml
from packaging import version
from packaging.specifiers import SpecifierSet

from .. import __version__
from ..datasets.download import download
logger = logging.getLogger(__name__)
def _load_metadata(file = 'metadata.yml', local_data_root = None, remote_data_root = None):
    """
    Read metadata stored in a YAML file about available collections of data,
    assign defaults to missing parameters, and resolve absolute paths for
    local data stores and full URLs for remote data stores.

    ``local_data_root`` must be an absolute or relative path on the local
    system, or None. If it is a relative path, it is relative to the current
    working directory. If it is None, its value defaults to the directory
    containing ``file``.

    ``remote_data_root`` must be a full URL or None. If it is None, ``file``
    will be checked for a fallback value. "remote_data_root" may be provided
    in the YAML file under the reserved keyword "neurotic_config". Any
    non-None value passed to this function will override the value provided
    in the file. If both are unspecified, it is assumed that no remote data
    store exists.

    The "data_dir" property is optional for every data set in ``file`` and
    specifies the directory on the local system containing the data files.
    "data_dir" may be an absolute path or a relative path with respect to
    ``local_data_root``. If it is a relative path, it will be converted to
    an absolute path.

    The "remote_data_dir" property is optional for every data set in
    ``file`` and specifies the directory on a remote server containing the
    data files. "remote_data_dir" may be a full URL or a relative path with
    respect to ``remote_data_root``. If it is a relative path, it will be
    converted to a full URL.

    File paths (e.g., "data_file", "video_file") are assumed to be relative
    to both "data_dir" and "remote_data_dir" (i.e., the local and remote
    data stores mirror one another) and can be resolved with ``_abs_path``
    or ``_abs_url``.

    :param file: path to the YAML metadata file
    :param local_data_root: base for resolving relative "data_dir" entries,
        or None to use the directory containing ``file``
    :param remote_data_root: base URL for resolving relative
        "remote_data_dir" entries, or None to use the value from ``file``
    :return: dict mapping each data set key to its fully resolved metadata
    :raises ValueError: if a "remote_data_root" value is not a full URL, or
        if a data set has a null "data_dir"
    """
    assert file is not None, 'metadata file must be specified'
    assert os.path.exists(file), 'metadata file "{}" cannot be found'.format(file)

    # local_data_root defaults to the directory containing file
    if local_data_root is None:
        local_data_root = os.path.dirname(file)

    # load metadata from file
    with open(file) as f:
        md = yaml.safe_load(f)

    # remove special entry "neurotic_config" from the dict if it exists
    config = md.pop('neurotic_config', None)
    if isinstance(config, dict):
        # process global settings
        neurotic_version = config.get('neurotic_version', None)
        remote_data_root_from_file = config.get('remote_data_root', None)
    else:
        # use defaults for all global settings
        neurotic_version = None
        remote_data_root_from_file = None

    # check neurotic version requirements
    if neurotic_version is not None:
        version_spec = SpecifierSet(str(neurotic_version), prereleases=True)
        if version.parse(__version__) not in version_spec:
            logger.warning('the installed version of neurotic '
                           f'({__version__}) does not meet version '
                           'requirements specified in the metadata file: '
                           f'{version_spec}')

    # the remote_data_root passed to the function takes precedence over the
    # fallback value provided in the file; if both are None, it is assumed
    # that no remote data store exists
    if remote_data_root is not None:
        if not _is_url(remote_data_root):
            raise ValueError('"remote_data_root" passed to function is not a full URL: "{}"'.format(remote_data_root))
    elif remote_data_root_from_file is not None:
        if not _is_url(remote_data_root_from_file):
            raise ValueError('"remote_data_root" provided in file is not a full URL: "{}"'.format(remote_data_root_from_file))
        remote_data_root = remote_data_root_from_file

    # iterate over all data sets
    for key in md:

        assert isinstance(md[key], dict), 'File "{}" may be formatted incorrectly, especially beginning with entry "{}"'.format(file, key)

        # fill in missing metadata with default values
        defaults = _defaults_for_key(key)
        for k in defaults:
            md[key].setdefault(k, defaults[k])

        # determine the absolute path of the local data directory
        if md[key]['data_dir'] is None:
            # data_dir is a required property
            raise ValueError('"data_dir" missing for "{}"'.format(key))
        if os.path.isabs(md[key]['data_dir']):
            # data_dir is an absolute path already
            data_dir = md[key]['data_dir']
        else:
            # data_dir is specified relative to local_data_root
            data_dir = os.path.abspath(os.path.join(local_data_root, md[key]['data_dir']))
        md[key]['data_dir'] = os.path.normpath(data_dir)

        # determine the full URL to the remote data directory; when there is
        # no remote data store, remote_data_dir becomes None
        url = None
        if md[key]['remote_data_dir'] is not None:
            # remote_data_dir is either a full URL already or is specified
            # relative to remote_data_root
            if _is_url(md[key]['remote_data_dir']):
                url = md[key]['remote_data_dir']
            elif _is_url(remote_data_root):
                url = '/'.join([remote_data_root, md[key]['remote_data_dir']])
        md[key]['remote_data_dir'] = url

    return md
def _defaults_for_key(key):
"""
Default values for metadata.
"""
defaults = {
# store the key with the metadata
'key': key,
# description of data set
'description': None,
# the path of the directory containing the data on the local system
# - this may be an absolute or relative path, but not None since data
# must be located locally
# - if it is a relative path, it will be interpreted by _load_metadata
# as relative to local_data_root and will be converted to an absolute
# path
'data_dir': '.',
# the path of the directory containing the data on a remote server
# - this may be a full URL or a relative path, or None if there exists
# no remote data store
# - if it is a relative path, it will be interpreted by _load_metadata
# as relative to remote_data_root and will be converted to a full URL
'remote_data_dir': None,
# the ephys data file
# - path relative to data_dir and remote_data_dir
'data_file': None,
# the name of a Neo IO class
# - this parameter is optional and exists for overriding the IO class
# determined automatically from the data file's extension
'io_class': None,
# arguments for the Neo IO class
# - e.g. for AsciiSignalIO, {'delimiter': ',', 'sampling_rate': 1000, 'units': 'mV'}
'io_args': None,
# a real-world start time for the data_file, which overrides the value
# that may be stored in the data_file
# - e.g. '2020-01-01 13:14:15'
'rec_datetime': None,
# digital filters to apply before analysis and plotting
# 0 <= highpass <= lowpass < sample_rate/2
# - e.g. [{'channel': 'Channel A', 'highpass': 0, 'lowpass': 50}, ...]
'filters': None,
# the annotations file
# - path relative to data_dir and remote_data_dir
'annotations_file': None,
# the epoch encoder file
# - path relative to data_dir and remote_data_dir
'epoch_encoder_file': None,
# list of labels for epoch encoder
'epoch_encoder_possible_labels': [],
# list of dicts giving name, channel, units, amplitude window, epoch window, color for each unit
# - e.g. [{'name': 'Unit X', 'channel': 'Channel A', 'units': 'uV', 'amplitude': [75, 150], 'epoch': 'Type 1', 'color': 'ff0000'}, ...]
'amplitude_discriminators': None,
# list of dicts giving name of a spiketrain, start and stop firing rate
# thresholds in Hz for each burst
# - 'spiketrain' is required and used to find the appropriate spike
# train by name, whereas 'name' is option and is used to name the
# Epoch generated by load_dataset, defaults to the spiketrain's name
# with ' burst' appended
# - e.g. [{'spiketrain': 'Unit X', 'name': 'Unit X burst', 'thresholds': [10, 8]}, ...]
'burst_detectors': None,
# the output file of a tridesclous spike sorting analysis
# - path relative to data_dir and remote_data_dir
'tridesclous_file': None,
# dict mapping spike ids to lists of channel indices
# - e.g. {0: ['Channel A'], 1: ['Channel A'], ...} to indicate clusters 0 and 1 are both on channel A
# - e.g. {0: ['Channel A', 'Channel B'], ...} to indicate cluster 0 is on both channels A and B
'tridesclous_channels': None,
# list of lists of spike ids specifying how to merge clusters
# - e.g. [[0, 1, 2], [3, 4]] to merge clusters 1 and 2 into 0, merge 4 into 3, and discard all others
# - e.g. [[0], [1], [2], [3], [4]] to keep clusters 0-4 as they are and discard all others
'tridesclous_merge': None,
# list of dicts giving name of a spiketrain, name of a kernel to be
# convolved with the spiketrain, and the sigma parameter of the kernel
# in seconds
# - e.g. [{'name': 'Unit X', 'kernel': 'CausalAlphaKernel', 'sigma': 0.5}, ...]
'firing_rates': None,
# the video file
# - path relative to data_dir and remote_data_dir
'video_file': None,
# the video time offset in seconds
'video_offset': None,
# list of ordered pairs specifying times and durations that the ephys
# data collection was paused while the video continued recording
# - e.g. [[60, 10], [120, 10], [240, 10]] for three 10-second pauses
# occurring at times 1:00, 2:00, 3:00 according to the daq, which
# would correspond to times 1:00, 2:10, 3:20 according to the video
'video_jumps': None,
# a factor to multiply the video frame rate by to correct for async
# error that accumulates over time at a constant rate
# - a value less than 1 will decrease the frame rate and shift video
# events to later times
# - a value greater than 1 will increase the frame rate and shift video
# events to earlier times
# - a good estimate can be obtained by taking the amount of time
# between two events in the video and dividing by the amount of time
# between the same two events in the data
'video_rate_correction': None,
# list the channels in the order they should be plotted
# - e.g. [{'channel': 'Channel A', 'ylabel': 'My channel', 'ylim': [-120, 120], 'units': 'uV', 'color': 'ff0000'}, ...]
'plots': None,
# amount of time in seconds to plot initially
't_width': 40,
# proportion of the plot range, between 0 and 1, to the left of the
# current time (in the "past"), indicated by the position of the
# vertical line
'past_fraction': 0.3,
# factor to subtract from each signal before rectification when
# calculating rectified area under the curve (RAUC)
# - can be None, 'mean', or 'median'
'rauc_baseline': None,
# width of bins in seconds used for calculating rectified area under
# the curve (RAUC) for signals
'rauc_bin_duration': None,
}
return defaults
def _abs_path(metadata, file):
"""
Convert the relative path of file to an absolute path using data_dir
"""
if metadata.get(file, None) is None:
return None
else:
return os.path.normpath(os.path.join(metadata.get('data_dir', '.'), metadata[file]))
def _abs_url(metadata, file):
"""
Convert the relative path of file to a full URL using remote_data_dir
"""
if metadata.get(file, None) is None or metadata.get('remote_data_dir', None) is None:
return None
else:
file_path = metadata[file].replace(os.sep, '/')
url = '/'.join([metadata['remote_data_dir'], file_path])
# url = urllib.parse.unquote(url)
# url = urllib.parse.quote(url, safe='/:')
return url
def _is_url(url):
"""
Returns True only if the parameter begins with the form <scheme>://<netloc>
"""
try:
result = urllib.parse.urlparse(url)
return all([result.scheme, result.netloc])
except Exception:
return False
def _download_file(metadata, file, **kwargs):
    """
    Download a file.

    ``file`` is a key into ``metadata`` (e.g. "data_file") whose value is a
    path relative to both "data_dir" and "remote_data_dir". Logs an error
    and returns without downloading if "remote_data_dir" is not a full URL;
    silently does nothing if ``file`` is unset.

    See :func:`neurotic.datasets.download.download` for possible keyword
    arguments.
    """
    if not _is_url(metadata.get('remote_data_dir', None)):
        logger.error('metadata[remote_data_dir] is not a full URL')
        return

    if metadata.get(file, None):
        local_path = _abs_path(metadata, file)

        # create directories if necessary -- exist_ok avoids a race between
        # checking for the directory and creating it
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # download the file only if it does not already exist
        download(_abs_url(metadata, file), local_path, **kwargs)
def _download_all_data_files(metadata, **kwargs):
    """
    Download all files associated with metadata.

    See :func:`neurotic.datasets.download.download` for possible keyword
    arguments.
    """
    if not _is_url(metadata.get('remote_data_dir', None)):
        logger.error('metadata[remote_data_dir] is not a full URL')
        return

    # every metadata entry whose key ends in "_file" names a downloadable file
    for key in metadata:
        if key.endswith('_file'):
            _download_file(metadata, key, **kwargs)
    logger.info('Downloads complete')
def _selector_labels(all_metadata):
    """
    Build display labels for a data set selector.

    Each label combines three parts:

    - a symbol for local file availability: '◆' if all of the data set's
      files exist locally, '⬖' if some do, '◇' if none do
    - an exclamation point if "video_file" is set but "video_offset" is not
    - the data set's key, padded, followed by its description (if any)

    :param all_metadata: dict mapping data set keys to metadata dicts
    :return: list of label strings, one per data set
    """
    # indicate presence of local data files with symbols
    has_local_data = {}
    for key, metadata in all_metadata.items():
        filenames = [k for k in metadata if k.endswith('_file') and metadata[k] is not None]
        files_exist = [os.path.exists(_abs_path(metadata, f)) for f in filenames]
        if all(files_exist):
            has_local_data[key] = '◆'
        elif any(files_exist):
            has_local_data[key] = '⬖'
        else:
            has_local_data[key] = '◇'

    # indicate lack of video_offset with an exclamation point unless there is
    # no video_file
    has_video_offset = {}
    for key, metadata in all_metadata.items():
        if metadata.get('video_offset', None) is None and metadata.get('video_file', None) is not None:
            has_video_offset[key] = '!'
        else:
            has_video_offset[key] = ' '

    # create display text for the selector from keys and descriptions
    # (default=0 makes an empty all_metadata yield an empty label list
    # instead of raising ValueError)
    longest_key_length = max((len(k) for k in all_metadata), default=0)
    labels = [
        has_local_data[k] +
        has_video_offset[k] +
        ' ' +
        k.ljust(longest_key_length + 4) +
        str(all_metadata[k]['description']
            if all_metadata[k]['description'] else '')
        for k in all_metadata]
    return labels