# Source code for neurotic.datasets.metadata

# -*- coding: utf-8 -*-
"""
The :mod:`neurotic.datasets.metadata` module implements a class for reading
metadata files.

.. autoclass:: MetadataSelector
   :members:
"""

import os
import urllib
import urllib.parse

import yaml
from packaging import version
from packaging.specifiers import SpecifierSet

from .. import __version__
from ..datasets.download import download

import logging
logger = logging.getLogger(__name__)


class MetadataSelector:
    """
    Manage the contents of a metadata file and one selected metadata set.

    If a metadata file is given at initialization, it is read right away. The
    parsed file contents are kept as a dictionary in :attr:`all_metadata`.

    >>> metadata = MetadataSelector(file='metadata.yml')
    >>> print(metadata.all_metadata)

    After the file has been edited, or after ``file`` has been changed, the
    contents can be read again with the :meth:`load` method.

    >>> metadata = MetadataSelector()
    >>> metadata.file = 'metadata.yml'
    >>> metadata.load()

    One metadata set from the file can be chosen at initialization with
    ``initial_selection``, or afterwards with the :meth:`select` method. Once
    chosen, the metadata set is available at
    :meth:`metadata.selected_metadata <selected_metadata>`, e.g.

    >>> metadata = MetadataSelector(file='metadata.yml')
    >>> metadata.select('Data Set 5')
    >>> print(metadata.selected_metadata['data_file'])

    Indexing the selector directly is a shorthand for indexing the selected
    metadata set, e.g.

    >>> print(metadata['data_file'])

    Because of this, a MetadataSelector can be handed to functions that expect
    a plain dictionary describing a single metadata set; the selected metadata
    set is used automatically.

    Files belonging to the selected metadata set can be downloaded one at a
    time or all at once, e.g.

    >>> metadata.download('video_file')

    or

    >>> metadata.download_all_data_files()

    The absolute local path or the full remote URL of a file belonging to the
    selected metadata set can be obtained with the :meth:`abs_path` and
    :meth:`abs_url` methods, e.g.

    >>> print(metadata.abs_path('data_file'))
    >>> print(metadata.abs_url('data_file'))
    """

    # logged by the dict-like accessors whenever nothing has been selected yet
    _NO_SELECTION_MESSAGE = 'No metadata set is selected. Use the select() method first.'

    def __init__(self, file=None, local_data_root=None, remote_data_root=None, initial_selection=None):
        """
        Initialize a new MetadataSelector.

        The file is loaded immediately when ``file`` is not None, and
        ``initial_selection`` is selected afterwards when it is not None.
        """

        self.file = file
        self.local_data_root = local_data_root
        self.remote_data_root = remote_data_root

        self.all_metadata = None #: A dictionary containing the entire file contents, set by :meth:`load`.
        self._selection = None

        if self.file is not None:
            self.load()

        if initial_selection is not None:
            self.select(initial_selection)

    def load(self):
        """
        Read the metadata file.

        The current selection is cleared if it no longer exists in the file.
        """
        self.all_metadata = _load_metadata(
            self.file,
            self.local_data_root,
            self.remote_data_root,
        )
        selection_still_valid = self._selection in self.all_metadata
        if not selection_still_valid:
            self._selection = None

    def select(self, selection):
        """
        Select a metadata set.

        Logs an error if nothing has been loaded yet, and raises ValueError if
        ``selection`` is not a key of the loaded metadata.
        """
        if self.all_metadata is None:
            logger.error('Load metadata before selecting')
            return

        if selection not in self.all_metadata:
            raise ValueError(f'{selection} was not found in {self.file}')

        self._selection = selection

    @property
    def keys(self):
        """
        The available metadata keys.
        """
        return None if self.all_metadata is None else list(self.all_metadata.keys())

    @property
    def selected_metadata(self):
        """
        The access point for the selected metadata set.
        """
        if self._selection is None:
            return None
        return self.all_metadata[self._selection]

    def abs_path(self, file):
        """
        Convert the relative path of ``file`` to an absolute path using
        ``data_dir``.
        """
        return _abs_path(self.selected_metadata, file)

    def abs_url(self, file):
        """
        Convert the relative path of ``file`` to a full URL using
        ``remote_data_dir``.
        """
        return _abs_url(self.selected_metadata, file)

    def download(self, file, **kwargs):
        """
        Download a file associated with the selected metadata set.

        See :func:`neurotic.datasets.download.download` for possible keyword
        arguments.
        """
        _download_file(self.selected_metadata, file, **kwargs)

    def download_all_data_files(self, **kwargs):
        """
        Download all files associated with the selected metadata set.

        See :func:`neurotic.datasets.download.download` for possible keyword
        arguments.
        """
        _download_all_data_files(self.selected_metadata, **kwargs)

    # The dict-like methods below forward to the selected metadata set. When
    # nothing is selected they log an error and return None instead of raising.

    def __iter__(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.__iter__(*args)

    def __getitem__(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.__getitem__(*args)

    def __setitem__(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.__setitem__(*args)

    def __delitem__(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.__delitem__(*args)

    def get(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.get(*args)

    def setdefault(self, *args):
        selected = self.selected_metadata
        if selected is None:
            logger.error(self._NO_SELECTION_MESSAGE)
            return None
        return selected.setdefault(*args)
def _load_metadata(file='metadata.yml', local_data_root=None, remote_data_root=None):
    """
    Read metadata stored in a YAML file about available collections of data,
    assign defaults to missing parameters, and resolve absolute paths for local
    data stores and full URLs for remote data stores.

    ``local_data_root`` must be an absolute or relative path on the local
    system, or None. If it is a relative path, it is relative to the current
    working directory. If it is None, its value defaults to the directory
    containing ``file``.

    ``remote_data_root`` must be a full URL or None. If it is None, ``file``
    will be checked for a fallback value. "remote_data_root" may be provided in
    the YAML file under the reserved keyword "neurotic_config". Any non-None
    value passed to this function will override the value provided in the file.
    If both are unspecified, it is assumed that no remote data store exists.

    The "data_dir" property is optional for every data set in ``file`` and
    specifies the directory on the local system containing the data files.
    "data_dir" may be an absolute path or a relative path with respect to
    ``local_data_root``. If it is a relative path, it will be converted to an
    absolute path.

    The "remote_data_dir" property is optional for every data set in ``file``
    and specifies the directory on a remote server containing the data files.
    "remote_data_dir" may be a full URL or a relative path with respect to
    ``remote_data_root``. If it is a relative path, it will be converted to a
    full URL.

    File paths (e.g., "data_file", "video_file") are assumed to be relative to
    both "data_dir" and "remote_data_dir" (i.e., the local and remote data
    stores mirror one another) and can be resolved with ``_abs_path`` or
    ``_abs_url``.

    Returns a dictionary mapping each data set key to its metadata dictionary.
    An empty file yields an empty dictionary.

    Raises AssertionError if ``file`` is None, does not exist, or contains an
    entry that is not a mapping. Raises ValueError if the top level of the
    file is not a mapping, if a remote data root is not a full URL, or if
    "data_dir" is explicitly set to None for a data set.
    """

    # These checks are raised explicitly rather than with ``assert`` so that
    # they survive ``python -O``; the exception type and messages are the same
    # as before, so existing callers see no difference.
    if file is None:
        raise AssertionError('metadata file must be specified')
    if not os.path.exists(file):
        raise AssertionError('metadata file "{}" cannot be found'.format(file))

    # local_data_root defaults to the directory containing file
    if local_data_root is None:
        local_data_root = os.path.dirname(file)

    # load metadata from file
    # - the file is opened in binary mode so that the YAML reader determines
    #   the encoding itself instead of relying on the platform's default text
    #   encoding
    with open(file, 'rb') as f:
        md = yaml.safe_load(f)

    if md is None:
        # an empty file (or one containing only comments) parses to None
        md = {}
    elif not isinstance(md, dict):
        raise ValueError('File "{}" may be formatted incorrectly: the top level must be a mapping of data set names to metadata'.format(file))

    # remove special entry "neurotic_config" from the dict if it exists
    config = md.pop('neurotic_config', None)
    if isinstance(config, dict):
        # process global settings
        neurotic_version = config.get('neurotic_version', None)
        remote_data_root_from_file = config.get('remote_data_root', None)
    else:
        # use defaults for all global settings
        neurotic_version = None
        remote_data_root_from_file = None

    # check neurotic version requirements
    if neurotic_version is not None:
        version_spec = SpecifierSet(str(neurotic_version), prereleases=True)
        if version.parse(__version__) not in version_spec:
            logger.warning('the installed version of neurotic '
                           f'({__version__}) does not meet version '
                           'requirements specified in the metadata file: '
                           f'{version_spec}')

    # use remote_data_root passed to function preferentially
    if remote_data_root is not None:
        if not _is_url(remote_data_root):
            raise ValueError('"remote_data_root" passed to function is not a full URL: "{}"'.format(remote_data_root))
        # otherwise use the value passed to the function as is
    elif remote_data_root_from_file is not None:
        if not _is_url(remote_data_root_from_file):
            raise ValueError('"remote_data_root" provided in file is not a full URL: "{}"'.format(remote_data_root_from_file))
        # use the value provided in the file
        remote_data_root = remote_data_root_from_file
    # else: both potential sources of remote_data_root are None

    # iterate over all data sets
    for key in md:

        if not isinstance(md[key], dict):
            raise AssertionError('File "{}" may be formatted incorrectly, especially beginning with entry "{}"'.format(file, key))

        # fill in missing metadata with default values
        defaults = _defaults_for_key(key)
        for k in defaults:
            md[key].setdefault(k, defaults[k])

        # determine the absolute path of the local data directory
        if md[key]['data_dir'] is None:
            # data_dir is a required property
            raise ValueError('"data_dir" missing for "{}"'.format(key))
        # data_dir is either an absolute path already or is specified
        # relative to local_data_root
        if os.path.isabs(md[key]['data_dir']):
            local_dir = md[key]['data_dir']
        else:
            local_dir = os.path.abspath(os.path.join(local_data_root, md[key]['data_dir']))
        md[key]['data_dir'] = os.path.normpath(local_dir)

        # determine the full URL to the remote data directory
        if md[key]['remote_data_dir'] is not None:
            # remote_data_dir is either a full URL already or is specified
            # relative to remote_data_root
            if _is_url(md[key]['remote_data_dir']):
                url = md[key]['remote_data_dir']
            elif _is_url(remote_data_root):
                url = '/'.join([remote_data_root, md[key]['remote_data_dir']])
            else:
                url = None
        else:
            # there is no remote data store
            url = None
        md[key]['remote_data_dir'] = url

    return md


def _defaults_for_key(key):
    """
    Default values for metadata.

    Returns a new dictionary on every call, with ``key`` stored under 'key'.
    """

    defaults = {
        # store the key with the metadata
        'key': key,

        # description of data set
        'description': None,

        # the path of the directory containing the data on the local system
        # - this may be an absolute or relative path, but not None since data
        #   must be located locally
        # - if it is a relative path, it will be interpreted by _load_metadata
        #   as relative to local_data_root and will be converted to an absolute
        #   path
        'data_dir': '.',

        # the path of the directory containing the data on a remote server
        # - this may be a full URL or a relative path, or None if there exists
        #   no remote data store
        # - if it is a relative path, it will be interpreted by _load_metadata
        #   as relative to remote_data_root and will be converted to a full URL
        'remote_data_dir': None,

        # the ephys data file
        # - path relative to data_dir and remote_data_dir
        'data_file': None,

        # the name of a Neo IO class
        # - this parameter is optional and exists for overriding the IO class
        #   determined automatically from the data file's extension
        'io_class': None,

        # arguments for the Neo IO class
        # - e.g. for AsciiSignalIO, {'delimiter': ',', 'sampling_rate': 1000, 'units': 'mV'}
        'io_args': None,

        # a real-world start time for the data_file, which overrides the value
        # that may be stored in the data_file
        # - e.g. '2020-01-01 13:14:15'
        'rec_datetime': None,

        # digital filters to apply before analysis and plotting
        # 0 <= highpass <= lowpass < sample_rate/2
        # - e.g. [{'channel': 'Channel A', 'highpass': 0, 'lowpass': 50}, ...]
        'filters': None,

        # the annotations file
        # - path relative to data_dir and remote_data_dir
        'annotations_file': None,

        # the epoch encoder file
        # - path relative to data_dir and remote_data_dir
        'epoch_encoder_file': None,

        # list of labels for epoch encoder
        'epoch_encoder_possible_labels': [],

        # list of dicts giving name, channel, units, amplitude window, epoch window, color for each unit
        # - e.g. [{'name': 'Unit X', 'channel': 'Channel A', 'units': 'uV', 'amplitude': [75, 150], 'epoch': 'Type 1', 'color': 'ff0000'}, ...]
        'amplitude_discriminators': None,

        # list of dicts giving name of a spiketrain, start and stop firing rate
        # thresholds in Hz for each burst
        # - 'spiketrain' is required and used to find the appropriate spike
        #   train by name, whereas 'name' is optional and is used to name the
        #   Epoch generated by load_dataset, defaults to the spiketrain's name
        #   with ' burst' appended
        # - e.g. [{'spiketrain': 'Unit X', 'name': 'Unit X burst', 'thresholds': [10, 8]}, ...]
        'burst_detectors': None,

        # the output file of a tridesclous spike sorting analysis
        # - path relative to data_dir and remote_data_dir
        'tridesclous_file': None,

        # dict mapping spike ids to lists of channel indices
        # - e.g. {0: ['Channel A'], 1: ['Channel A'], ...} to indicate clusters 0 and 1 are both on channel A
        # - e.g. {0: ['Channel A', 'Channel B'], ...} to indicate cluster 0 is on both channels A and B
        'tridesclous_channels': None,

        # list of lists of spike ids specifying how to merge clusters
        # - e.g. [[0, 1, 2], [3, 4]] to merge clusters 1 and 2 into 0, merge 4 into 3, and discard all others
        # - e.g. [[0], [1], [2], [3], [4]] to keep clusters 0-4 as they are and discard all others
        'tridesclous_merge': None,

        # list of dicts giving name of a spiketrain, name of a kernel to be
        # convolved with the spiketrain, and the sigma parameter of the kernel
        # in seconds
        # - e.g. [{'name': 'Unit X', 'kernel': 'CausalAlphaKernel', 'sigma': 0.5}, ...]
        'firing_rates': None,

        # the video file
        # - path relative to data_dir and remote_data_dir
        'video_file': None,

        # the video time offset in seconds
        'video_offset': None,

        # list of ordered pairs specifying times and durations that the ephys
        # data collection was paused while the video continued recording
        # - e.g. [[60, 10], [120, 10], [240, 10]] for three 10-second pauses
        #   occurring at times 1:00, 2:00, 3:00 according to the daq, which
        #   would correspond to times 1:00, 2:10, 3:20 according to the video
        'video_jumps': None,

        # a factor to multiply the video frame rate by to correct for async
        # error that accumulates over time at a constant rate
        # - a value less than 1 will decrease the frame rate and shift video
        #   events to later times
        # - a value greater than 1 will increase the frame rate and shift video
        #   events to earlier times
        # - a good estimate can be obtained by taking the amount of time
        #   between two events in the video and dividing by the amount of time
        #   between the same two events in the data
        'video_rate_correction': None,

        # list the channels in the order they should be plotted
        # - e.g. [{'channel': 'Channel A', 'ylabel': 'My channel', 'ylim': [-120, 120], 'units': 'uV', 'color': 'ff0000'}, ...]
        'plots': None,

        # amount of time in seconds to plot initially
        't_width': 40,

        # proportion of the plot range, between 0 and 1, to the left of the
        # current time (in the "past"), indicated by the position of the
        # vertical line
        'past_fraction': 0.3,

        # factor to subtract from each signal before rectification when
        # calculating rectified area under the curve (RAUC)
        # - can be None, 'mean', or 'median'
        'rauc_baseline': None,

        # width of bins in seconds used for calculating rectified area under
        # the curve (RAUC) for signals
        'rauc_bin_duration': None,
    }

    return defaults


def _abs_path(metadata, file):
    """
    Convert the relative path of file to an absolute path using data_dir

    Returns None if ``metadata`` has no non-None entry for ``file``. If
    "data_dir" is absent from ``metadata``, '.' is used in its place.
    """
    if metadata.get(file, None) is None:
        return None
    return os.path.normpath(os.path.join(metadata.get('data_dir', '.'), metadata[file]))


def _abs_url(metadata, file):
    """
    Convert the relative path of file to a full URL using remote_data_dir

    Returns None if ``metadata`` has no non-None entry for ``file`` or for
    "remote_data_dir".
    """
    if metadata.get(file, None) is None or metadata.get('remote_data_dir', None) is None:
        return None

    # local path separators are converted to forward slashes; the URL is
    # returned as is, without percent-encoding
    file_path = metadata[file].replace(os.sep, '/')
    return '/'.join([metadata['remote_data_dir'], file_path])


def _is_url(url):
    """
    Returns True only if the parameter begins with the form <scheme>://<netloc>

    Returns False for anything that cannot be parsed as a URL, such as None or
    other non-string values.
    """
    try:
        result = urllib.parse.urlparse(url)
    except (AttributeError, TypeError, ValueError):
        # non-string input or a malformed URL
        return False
    return bool(result.scheme and result.netloc)


def _download_file(metadata, file, **kwargs):
    """
    Download a file.

    Nothing is downloaded if "remote_data_dir" in ``metadata`` is not a full
    URL (an error is logged) or if ``metadata`` has no value for ``file``.

    See :func:`neurotic.datasets.download.download` for possible keyword
    arguments.
    """

    if not _is_url(metadata.get('remote_data_dir', None)):
        logger.error('metadata[remote_data_dir] is not a full URL')
        return

    if metadata.get(file, None):

        local_path = _abs_path(metadata, file)

        # create directories if necessary
        # - exist_ok avoids a race between checking for and creating the
        #   directory
        # - the parent is an empty string when the local path has no directory
        #   component, in which case there is nothing to create
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # hand off to download(), which receives the remote URL, the local
        # destination, and any keyword arguments
        download(_abs_url(metadata, file), local_path, **kwargs)


def _download_all_data_files(metadata, **kwargs):
    """
    Download all files associated with metadata.

    Every key of ``metadata`` ending in "_file" is passed to
    ``_download_file``.

    See :func:`neurotic.datasets.download.download` for possible keyword
    arguments.
    """

    if not _is_url(metadata.get('remote_data_dir', None)):
        logger.error('metadata[remote_data_dir] is not a full URL')
        return

    for file in [k for k in metadata if k.endswith('_file')]:
        _download_file(metadata, file, **kwargs)

    logger.info('Downloads complete')


def _selector_labels(all_metadata):
    """
    Build one display label per metadata set, in the order of ``all_metadata``.

    Each label consists of a symbol indicating whether all, some, or none of
    the set's files exist locally, an exclamation point if a video file is
    given without a video offset, the key padded to a common width, and the
    description (if any). Returns an empty list if ``all_metadata`` is empty.
    """

    # indicate presence of local data files with symbols
    has_local_data = {}
    for key, metadata in all_metadata.items():
        filenames = [k for k in metadata if k.endswith('_file') and metadata[k] is not None]
        files_exist = [os.path.exists(_abs_path(metadata, file)) for file in filenames]
        if all(files_exist):
            has_local_data[key] = '◆'
        elif any(files_exist):
            has_local_data[key] = '⬖'
        else:
            has_local_data[key] = '◇'

    # indicate lack of video_offset with an exclamation point unless there is
    # no video_file
    has_video_offset = {}
    for key, metadata in all_metadata.items():
        if metadata.get('video_offset', None) is None and metadata.get('video_file', None) is not None:
            has_video_offset[key] = '!'
        else:
            has_video_offset[key] = ' '

    # create display text for the selector from keys and descriptions
    # - default=0 keeps max() from raising when there are no metadata sets
    longest_key_length = max((len(k) for k in all_metadata), default=0)
    labels = [
        has_local_data[k] +
        has_video_offset[k] +
        ' ' +
        k.ljust(longest_key_length + 4) +
        str(all_metadata[k]['description']
            if all_metadata[k]['description'] else '')
        for k in all_metadata]

    return labels