From 5a3e3b14f28d0dd452912c919769604ddf6b4c97 Mon Sep 17 00:00:00 2001 From: Moritz Gerster <45031224+moritz-gerster@users.noreply.github.com> Date: Tue, 22 Nov 2022 17:40:09 +0100 Subject: [PATCH] [ENH] Add function to enable smart path search #1098 (#1103) --- CITATION.cff | 4 + doc/api.rst | 1 + doc/authors.rst | 1 + doc/whats_new.rst | 3 +- examples/read_bids_datasets.py | 23 ++- examples/update_bids_datasets.py | 19 +-- mne_bids/__init__.py | 3 +- mne_bids/path.py | 261 +++++++++++++++++++++++++------ mne_bids/tests/test_path.py | 66 +++++++- 9 files changed, 309 insertions(+), 72 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index b22885eca..d0ad70af0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -151,6 +151,10 @@ authors: family-names: Engemann affiliation: 'Roche Pharma Research and Early Development (pRED), Basel, Switzerland' orcid: 'https://orcid.org/0000-0002-7223-1014' + - given-names: Moritz + family-names: Gerster + affiliation: 'Max Planck Institute for Human Cognitive and Brain Sciences, Leipzig, Germany' + orcid: 'https://orcid.org/0000-0001-9343-6986' - given-names: Alexandre family-names: Gramfort affiliation: 'Université Paris-Saclay, Inria, CEA, Palaiseau, France' diff --git a/doc/api.rst b/doc/api.rst index fdd567d58..d8874d6d2 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -44,6 +44,7 @@ mne_bids inspect_dataset update_sidecar_json anonymize_dataset + find_matching_paths mne_bids.stats -------------- diff --git a/doc/authors.rst b/doc/authors.rst index a4104d278..c9d454e88 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -39,3 +39,4 @@ .. _Denis Engemann: https://github.com/dengemann .. _Bruno Hebling Vieira: https://bhvieira.github.io/ .. _Daniel McCloy: http://dan.mccloy.info +.. _Moritz Gerster: http://moritz-gerster.com diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 5be8385d9..2e8f160d3 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -24,7 +24,7 @@ Version 0.12 (unreleased) The following authors contributed for the first time. Thank you so much! 🤩 -* ... +* `Moritz Gerster`_ The following authors had contributed before. Thank you for sticking around! 🤘 @@ -44,6 +44,7 @@ Detailed list of changes - Add :meth:`mne_bids.BIDSPath.find_matching_sidecar` to find the sidecar file associated with a given file path by `Eric Larson`_ (:gh:`1093`) - When writing data via :func:`~mne_bids.write_raw_bids`, it is now possible to specify a custom mapping of :class:`mne.Annotations` descriptions to event codes via the ``event_id`` parameter. Previously, passing this parameter would always require to also pass ``events``, and using a custom event code mapping for annotations was impossible, by `Richard Höchenberger`_ (:gh:`1084`) - Improve error message when :obj:`~mne_bids.BIDSPath.fpath` cannot be uniquely resolved by `Eric Larson`_ (:gh:`1097`) +- Add :func:`mne_bids.find_matching_paths` to retrieve all `BIDSPaths` matching user-specified entities. 
The functionality partially overlaps with what's offered through :meth:`mne_bids.BIDSPath.match()`, but is more versatile, by `Moritz Gerster`_ (:gh:`1103`) 🧐 API and behavior changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/read_bids_datasets.py b/examples/read_bids_datasets.py index d40bb9a84..809ece6a7 100644 --- a/examples/read_bids_datasets.py +++ b/examples/read_bids_datasets.py @@ -39,7 +39,8 @@ import openneuro from mne.datasets import sample -from mne_bids import BIDSPath, read_raw_bids, print_dir_tree, make_report +from mne_bids import (BIDSPath, read_raw_bids, print_dir_tree, make_report, + find_matching_paths, get_entity_vals) # %% # Download a subject's data from an OpenNeuro BIDS dataset @@ -92,17 +93,24 @@ # For now, we're interested only in the EEG data in the BIDS root directory # of the Parkinson's disease patient dataset. There were two sessions, one # where the patients took their regular anti-Parkinsonian medications and -# one where they abstained for more than twelve hours. Let's start with the -# off-medication session. +# one where they abstained for more than twelve hours. For now, we are +# not interested in the on-medication session. +sessions = get_entity_vals(bids_root, 'session', ignore_sessions='on') datatype = 'eeg' -session = 'off' -bids_path = BIDSPath(root=bids_root, session=session, datatype=datatype) +extensions = [".bdf", ".tsv"] # ignore .json files +bids_paths = find_matching_paths(bids_root, datatypes=datatype, + sessions=sessions, extensions=extensions) # %% # We can now retrieve a list of all MEG-related files in the dataset: +print(bids_paths) -print(bids_path.match()) +# %% +# Note that this is the same as running: +session = 'off' +bids_path = BIDSPath(root=bids_root, session=session, datatype=datatype) +print(bids_path.match(ignore_json=True)) # %% # The returned list contains ``BIDSpaths`` of 3 files: @@ -127,8 +135,7 @@ task = 'rest' suffix = 'eeg' -bids_path = BIDSPath(subject=subject, session=session, task=task, - suffix=suffix, datatype=datatype, root=bids_root) +bids_path = bids_path.update(subject=subject, task=task, suffix=suffix) # %% # Now let's print the contents of ``bids_path``. diff --git a/examples/update_bids_datasets.py b/examples/update_bids_datasets.py index 9589a10d2..0a6446c42 100644 --- a/examples/update_bids_datasets.py +++ b/examples/update_bids_datasets.py @@ -21,7 +21,7 @@ # We are importing everything we need for this example: from mne.datasets import somato -from mne_bids import (BIDSPath, read_raw_bids, +from mne_bids import (read_raw_bids, find_matching_paths, print_dir_tree, make_report, update_sidecar_json) # %% @@ -60,20 +60,15 @@ # ``BIDSPath`` object. We then pass in a dictionary (or JSON file) to update # all matching metadata fields within the BIDS dataset. -# create a BIDSPath object +# Search for all matching BIDSPaths in the root directory bids_root = somato.data_path() -datatype = 'meg' -subject = '01' -task = 'somato' suffix = 'meg' +extension = '.fif' -bids_path = BIDSPath(subject=subject, task=task, suffix=suffix, - datatype=datatype, root=bids_root) -sidecar_path = bids_path.copy().update(extension='.json') - +bids_paths = find_matching_paths(bids_root, suffixes=suffix, + extensions=extension) # We can now retrieve a list of all MEG-related files in the dataset: -# we will specifically now update the sidecar json file. -print(bids_path.match()) +print(bids_paths) # Define a sidecar update as a dictionary entries = { @@ -90,6 +85,8 @@ # ``entries``. 
# # Now update all sidecar fields according to our updating dictionary +bids_path = bids_paths[0] +sidecar_path = bids_path.copy().update(extension='.json') update_sidecar_json(bids_path=sidecar_path, entries=entries) # %% diff --git a/mne_bids/__init__.py b/mne_bids/__init__.py index dbccfb8ec..71b290701 100644 --- a/mne_bids/__init__.py +++ b/mne_bids/__init__.py @@ -5,7 +5,8 @@ from mne_bids.report import make_report from mne_bids.path import (BIDSPath, get_datatypes, get_entity_vals, print_dir_tree, get_entities_from_fname, - search_folder_for_text, get_bids_path_from_fname) + search_folder_for_text, get_bids_path_from_fname, + find_matching_paths) from mne_bids.read import get_head_mri_trans, read_raw_bids from mne_bids.utils import get_anonymization_daysback from mne_bids.write import (make_dataset_description, write_anat, diff --git a/mne_bids/path.py b/mne_bids/path.py index cd984372e..aacc67c02 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -789,7 +789,7 @@ def update(self, *, check=None, **kwargs): raise e return self - def match(self, check=False): + def match(self, ignore_json=True, check=False): """Get a list of all matching paths in the root directory. Performs a recursive search, starting in ``.root`` (if set), based on @@ -797,6 +797,8 @@ def match(self, check=False): Parameters ---------- + ignore_json : bool + If ``True``, ignores json files. Defaults to ``True``. check : bool If ``True``, only returns paths that conform to BIDS. If ``False`` (default), the ``.check`` attribute of the returned @@ -814,45 +816,14 @@ def match(self, check=False): 'BIDS root directory path to `root` via ' 'BIDSPath.update().') - # allow searching by datatype - # all other entities are filtered below - if self.datatype is not None: - search_str = f'*/{self.datatype}/*' - else: - search_str = '*.*' + paths = _return_root_paths(self.root, datatype=self.datatype, + ignore_json=ignore_json) - paths = self.root.rglob(search_str) - # Only keep files (not directories), and omit the JSON sidecars. - paths = [p for p in paths - if p.is_file() and p.suffix != '.json'] fnames = _filter_fnames(paths, suffix=self.suffix, extension=self.extension, **self.entities) - bids_paths = [] - for fname in fnames: - # Form the BIDSPath object. - # To check whether the BIDSPath is conforming to BIDS if - # check=True, we first instantiate without checking and then run - # the check manually, allowing us to be more specific about the - # exception to catch - datatype = _infer_datatype_from_path(fname) - bids_path = get_bids_path_from_fname(fname, check=False) - bids_path.root = self.root - bids_path.datatype = datatype - bids_path.check = True - - try: - bids_path._check() - except ValueError: - # path is not BIDS-compatible - if check: # skip! - continue - else: - bids_path.check = False - - bids_paths.append(bids_path) - + bids_paths = _fnames_to_bidspaths(fnames, self.root, check=check) return bids_paths def _check(self): @@ -1935,6 +1906,8 @@ def _filter_fnames(fnames, *, subject=None, session=None, task=None, extension=None): """Filter a list of BIDS filenames / paths based on BIDS entity values. + Input can be str or list of str. 
+
     Parameters
     ----------
     fnames : iterable of pathlib.Path | iterable of str
@@ -1944,25 +1917,48 @@ def _filter_fnames(fnames, *, subject=None, session=None, task=None,
     list of pathlib.Path

     """
+    subject = _ensure_tuple(subject)
+    session = _ensure_tuple(session)
+    task = _ensure_tuple(task)
+    acquisition = _ensure_tuple(acquisition)
+    run = _ensure_tuple(run)
+    processing = _ensure_tuple(processing)
+    space = _ensure_tuple(space)
+    recording = _ensure_tuple(recording)
+    split = _ensure_tuple(split)
+    description = _ensure_tuple(description)
+    suffix = _ensure_tuple(suffix)
+    extension = _ensure_tuple(extension)
+
     leading_path_str = r'.*\/?'  # nothing or something ending with a `/`
-    sub_str = f'sub-{subject}' if subject else r'sub-([^_]+)'
-    ses_str = f'_ses-{session}' if session else r'(|_ses-([^_]+))'
-    task_str = f'_task-{task}' if task else r'(|_task-([^_]+))'
-    acq_str = f'_acq-{acquisition}' if acquisition else r'(|_acq-([^_]+))'
-    run_str = f'_run-{run}' if run else r'(|_run-([^_]+))'
-    proc_str = f'_proc-{processing}' if processing else r'(|_proc-([^_]+))'
-    rec_str = f'_rec-{recording}' if recording else r'(|_rec-([^_]+))'
-    space_str = f'_space-{space}' if space else r'(|_space-([^_]+))'
-    split_str = f'_split-{split}' if split else r'(|_split-([^_]+))'
-    desc_str = f'_desc-{description}' if description else r'(|_desc-([^_]+))'
-    suffix_str = (f'_{suffix}' if suffix
-                  else r'_(' + '|'.join(ALLOWED_FILENAME_SUFFIX) + ')')
-    ext_str = extension if extension else r'.([^_]+)'
+    sub_str = (r'sub-(' + '|'.join(subject) + ')'
+               if subject else r'sub-([^_]+)')
+    ses_str = (r'_ses-(' + '|'.join(session) + ')'
+               if session else r'(|_ses-([^_]+))')
+    task_str = (r'_task-(' + '|'.join(task) + ')'
+                if task else r'(|_task-([^_]+))')
+    acq_str = (r'_acq-(' + '|'.join(acquisition) + ')'
+               if acquisition else r'(|_acq-([^_]+))')
+    run_str = (r'_run-(' + '|'.join(run) + ')'
+               if run else r'(|_run-([^_]+))')
+    proc_str = (r'_proc-(' + '|'.join(processing) + ')'
+                if processing else r'(|_proc-([^_]+))')
+    space_str = (r'_space-(' + '|'.join(space) + ')'
+                 if space else r'(|_space-([^_]+))')
+    rec_str = (r'_rec-(' + '|'.join(recording) + ')'
+               if recording else r'(|_rec-([^_]+))')
+    split_str = (r'_split-(' + '|'.join(split) + ')'
+                 if split else r'(|_split-([^_]+))')
+    desc_str = (r'_desc-(' + '|'.join(description) + ')'
+                if description else r'(|_desc-([^_]+))')
+    suffix_str = (r'_(' + '|'.join(suffix) + ')' if suffix
+                  else r'_([^_]+)')
+    ext_str = r'(' + '|'.join(extension) + ')' if extension else r'.([^_]+)'

     regexp = (
         leading_path_str +
         sub_str + ses_str + task_str + acq_str + run_str + proc_str +
-        rec_str + space_str + split_str + desc_str + suffix_str + ext_str
+        space_str + rec_str + split_str + desc_str + suffix_str + ext_str
     )

     # Convert to str so we can apply the regexp ...
@@ -1974,3 +1970,170 @@ def _filter_fnames(fnames, *, subject=None, session=None, task=None,
     # ... and return Paths.
     fnames_filtered = [Path(f) for f in fnames_filtered]
     return fnames_filtered
+
+
+def find_matching_paths(root, subjects=None, sessions=None, tasks=None,
+                        acquisitions=None, runs=None, processings=None,
+                        recordings=None, spaces=None, splits=None,
+                        descriptions=None, suffixes=None, extensions=None,
+                        datatypes=None, check=False):
+    """Get a list of all matching paths for all matching entity values.
+
+    Input can be str or list of str. None matches all found values.
+
+    Performs a recursive search, starting in ``root``, based on the entity
+    values passed to this function.
+
+    Parameters
+    ----------
+    root : pathlib.Path | str
+        The root of the BIDS path.
+    subjects : str | array-like of str | None
+        The subject ID. Corresponds to "sub".
+    sessions : str | array-like of str | None
+        The acquisition session. Corresponds to "ses".
+    tasks : str | array-like of str | None
+        The experimental task. Corresponds to "task".
+    acquisitions : str | array-like of str | None
+        The acquisition parameters. Corresponds to "acq".
+    runs : str | array-like of str | None
+        The run number. Corresponds to "run".
+    processings : str | array-like of str | None
+        The processing label. Corresponds to "proc".
+    recordings : str | array-like of str | None
+        The recording name. Corresponds to "rec".
+    spaces : str | array-like of str | None
+        The coordinate space for anatomical and sensor location
+        files (e.g., ``*_electrodes.tsv``, ``*_markers.mrk``).
+        Corresponds to "space".
+        Note that valid values for ``space`` must come from a list
+        of BIDS keywords as described in the BIDS specification.
+    splits : str | array-like of str | None
+        The split of the continuous recording file for ``.fif`` data.
+        Corresponds to "split".
+    descriptions : str | array-like of str | None
+        This corresponds to the BIDS entity ``desc``. It is used to provide
+        additional information for derivative data, e.g., preprocessed data
+        may be assigned ``description='cleaned'``.
+
+        .. versionadded:: 0.11
+    suffixes : str | array-like of str | None
+        The filename suffix. This is the entity after the
+        last ``_`` before the extension. E.g., ``'channels'``.
+        The following filename suffixes are accepted:
+        'meg', 'markers', 'eeg', 'ieeg', 'T1w',
+        'participants', 'scans', 'electrodes', 'coordsystem',
+        'channels', 'events', 'headshape', 'digitizer',
+        'beh', 'physio', 'stim'
+    extensions : str | array-like of str | None
+        The extension of the filename. E.g., ``'.json'``.
+    datatypes : str | array-like of str | None
+        The BIDS data type, e.g., ``'anat'``, ``'func'``, ``'eeg'``, ``'meg'``,
+        ``'ieeg'``.
+    check : bool
+        If ``True``, only returns paths that conform to BIDS. If ``False``
+        (default), the ``.check`` attribute of the returned
+        `mne_bids.BIDSPath` object will be set to ``True`` for paths that
+        do conform to BIDS, and to ``False`` for those that don't.
+
+    Returns
+    -------
+    bids_paths : list of mne_bids.BIDSPath
+        The matching paths.
+
+    """
+    fpaths = _return_root_paths(root, datatype=datatypes, ignore_json=False)
+
+    fpaths_filtered = _filter_fnames(fpaths,
+                                     subject=subjects,
+                                     session=sessions,
+                                     task=tasks,
+                                     acquisition=acquisitions,
+                                     run=runs,
+                                     processing=processings,
+                                     recording=recordings,
+                                     space=spaces,
+                                     split=splits,
+                                     description=descriptions,
+                                     suffix=suffixes,
+                                     extension=extensions)
+
+    bids_paths = _fnames_to_bidspaths(fpaths_filtered, root, check=check)
+    return bids_paths
+
+
+def _return_root_paths(root, datatype=None, ignore_json=True):
+    """Return all file paths in root.
+
+    Can be filtered by datatype, which is present in the path but not in
+    the BIDSPath basename. ``datatype`` can also be a list of datatypes.
+
+    Parameters
+    ----------
+    root : pathlib.Path | str
+        The root of the BIDS path.
+    datatype : str | array-like of str | None
+        The BIDS data type, e.g., ``'anat'``, ``'func'``, ``'eeg'``, ``'meg'``,
+        ``'ieeg'``.
+    """
+    root = Path(root)  # if root is str
+
+    if datatype is not None:
+        # glob patterns do not support alternation, so run one search
+        # per datatype and combine the results
+        datatype = _ensure_tuple(datatype)
+        paths = [p for dt in datatype
+                 for p in root.rglob(f'*/{dt}/*')]
+    else:
+        paths = root.rglob('*.*')
+
+    # Only keep files (not directories), and omit the JSON sidecars
+    # if ignore_json is True.
+    if ignore_json:
+        paths = [p for p in paths
+                 if p.is_file() and p.suffix != '.json']
+    else:
+        paths = [p for p in paths if p.is_file()]
+
+    return paths
+
+
+def _fnames_to_bidspaths(fnames, root, check=False):
+    """Make BIDSPaths from file names.
+
+    To check whether the BIDSPath is conforming to BIDS if check=True, we
+    first instantiate without checking and then run the check manually,
+    allowing us to be more specific about the exception to catch.
+
+    Parameters
+    ----------
+    fnames : list of str
+        Filenames as list of strings.
+    root : path-like | None
+        The root directory of the BIDS dataset.
+    check : bool
+        If ``True``, only returns paths that conform to BIDS. If ``False``
+        (default), the ``.check`` attribute of the returned
+        `mne_bids.BIDSPath` object will be set to ``True`` for paths that
+        do conform to BIDS, and to ``False`` for those that don't.
+
+    Returns
+    -------
+    bids_paths : list of mne_bids.BIDSPath
+        BIDS paths.
+
+    """
+    bids_paths = []
+    for fname in fnames:
+        datatype = _infer_datatype_from_path(fname)
+        bids_path = get_bids_path_from_fname(fname, check=False)
+        bids_path.root = root
+        bids_path.datatype = datatype
+        bids_path.check = True
+
+        try:
+            bids_path._check()
+        except ValueError:
+            # path is not BIDS-compatible
+            if check:  # skip!
+                continue
+            else:
+                bids_path.check = False
+
+        bids_paths.append(bids_path)
+    return bids_paths
diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py
index 0ddcbac38..7c84c6b39 100644
--- a/mne_bids/tests/test_path.py
+++ b/mne_bids/tests/test_path.py
@@ -21,7 +21,7 @@
 from mne_bids.path import (_parse_ext, get_entities_from_fname,
                            _find_best_candidates, _filter_fnames,
                            search_folder_for_text,
-                           get_bids_path_from_fname)
+                           get_bids_path_from_fname, find_matching_paths)
 from mne_bids.config import ALLOWED_PATH_ENTITIES_SHORT
 from test_read import _read_raw_fif, warning_str

@@ -742,7 +742,8 @@ def test_make_filenames():
     (dict(suffix='meg'), 4),
     (dict(acquisition='lowres'), 1),
     (dict(task='test', processing='ica', suffix='eeg'), 2),
-    (dict(subject='5', task='test', processing='ica', suffix='eeg'), 1)
+    (dict(subject='5', task='test', processing='ica', suffix='eeg'), 1),
+    (dict(subject=['01', '02']), 3),  # test multiple input
 ])
 def test_filter_fnames(entities, expected_n_matches):
     """Test filtering filenames based on BIDS entities works."""
@@ -851,6 +852,67 @@ def test_match(return_bids_test_dir):
     assert bids_path_01.match(check=False)[0].fpath.name == 'sub-01_foo.eeg'


+@testing.requires_testing_data
+def test_find_matching_paths(return_bids_test_dir):
+    """Test that find_matching_paths returns the same results as
+    BIDSPath.match(), which is extensively tested above."""
+    bids_root = Path(return_bids_test_dir)
+
+    # Check a few exemplary entities
+    bids_path_01 = BIDSPath(root=bids_root)
+    paths_match = bids_path_01.match(ignore_json=False)
+    paths_find = find_matching_paths(bids_root)
+    assert paths_match == paths_find
+
+    # Datatype is important because it is handled differently
+    bids_path_01 = BIDSPath(root=bids_root, datatype="meg")
+    paths_match = bids_path_01.match(ignore_json=False)
+    paths_find = find_matching_paths(bids_root, datatypes="meg")
+    assert paths_match == paths_find
+
+    bids_path_01 = BIDSPath(root=bids_root, run="02")
+    paths_match = bids_path_01.match(ignore_json=False)
+    paths_find = find_matching_paths(bids_root, runs="02")
+    assert paths_match == paths_find
+
+    # Check list of str as input
+    bids_path_01 = BIDSPath(root=bids_root, extension=".tsv")
+    bids_path_02 = BIDSPath(root=bids_root, extension=".json")
+    paths_match1 = bids_path_01.match(ignore_json=False)
+    paths_match2 = bids_path_02.match(ignore_json=False)
+    paths_match = paths_match1 + paths_match2
+    paths_match = sorted([str(f.fpath) for f in paths_match])
+    paths_find = find_matching_paths(bids_root, extensions=[".tsv", ".json"])
+    paths_find = sorted([str(f.fpath) for f in paths_find])
+    assert paths_match == paths_find
+
+    # Test ignore_json parameter
+    bids_path_01 = BIDSPath(root=bids_root)
+    paths_match = bids_path_01.match(ignore_json=True)
+    paths_find = find_matching_paths(bids_root, extensions=[".tsv", ".fif",
+                                                            ".dat", ".eeg"])
+    assert paths_match == paths_find
+
+    # Test `check` parameter
+    bids_path_01 = _bids_path.copy()
+    bids_path_01.update(
+        root=bids_root, session=None, task=None, run=None,
+        suffix='foo', extension='.eeg', check=False
+    )
+    bids_path_01.fpath.touch()
+    paths_match = bids_path_01.match(check=True)
+    paths_find = find_matching_paths(bids_root, sessions=None, tasks=None,
+                                     runs=None, suffixes='foo',
+                                     extensions='.eeg', check=True)
+    assert paths_match == paths_find
+
+    paths_match = bids_path_01.match(check=False)
+    paths_find = find_matching_paths(bids_root, sessions=None, tasks=None,
+                                     runs=None, suffixes='foo',
+                                     extensions='.eeg', check=False)
+    assert paths_match == paths_find
+
+
 @pytest.mark.filterwarnings(warning_str['meas_date_set_to_none'])
 @pytest.mark.filterwarnings(warning_str['channel_unit_changed'])
 @testing.requires_testing_data
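For reference, a minimal usage sketch of the ``find_matching_paths`` API introduced by this patch (illustrative only, not part of the diff; the BIDS root path below is a hypothetical placeholder):

    from mne_bids import find_matching_paths

    bids_root = "/data/my_bids_dataset"  # hypothetical BIDS root

    # All EEG recordings of subjects 01 and 02, limited to raw-data extensions
    bids_paths = find_matching_paths(bids_root,
                                     subjects=["01", "02"],
                                     datatypes="eeg",
                                     extensions=[".vhdr", ".bdf", ".fif"])
    for bids_path in bids_paths:
        print(bids_path.fpath)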