Define new scheme for ID generation and set immutable_id and `last_modified`

ml-evs committed Apr 17, 2024
1 parent 83bc873 commit f3a5c3b
Showing 2 changed files with 114 additions and 7 deletions.
70 changes: 63 additions & 7 deletions src/optimake/convert.py
@@ -4,6 +4,7 @@
"""

import datetime
import os
import warnings
from collections import defaultdict
@@ -258,9 +259,55 @@ def _parse_entries(
f"None of the provided parsers {ENTRY_PARSERS[entry_type]} could parse {_path}. Errors: {exceptions}"
)

if len(set(entry_ids)) != len(entry_ids):
raise RuntimeError(
"Duplicate entry IDs found even when generated directly from filepaths. This should not be possible."
)

return parsed_entries, entry_ids


def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]:
    """Attempt to make the simplest unique set of entry IDs possible,
    following a series of deterministic rules.

    Parameters:
        entry_ids: A list of entry IDs derived from file paths.

    Returns:
        A list of unique entry IDs.

    """

    # Start from the bare filenames and add parent directories until a
    # unique set arises
    new_ids: list[str] = [entry_id.split("/")[-1] for entry_id in entry_ids]
    target_num_ids = len(entry_ids)
    depth: int = 1
    max_depth: int = 10  # somewhat arbitrary upper limit
    while len(set(new_ids)) != target_num_ids and depth < max_depth:
        for i, id in enumerate(entry_ids):
            new_ids[i] = "/".join(id.split("/")[-1 - depth :])
        depth += 1

    # Now try to ablate any common file names, e.g., subfolders of POSCARs
    # (1/POSCAR, 2/POSCAR), keeping just the distinguishing parent directory.
    # This only applies when every ID is exactly one directory deep, so that
    # deeper paths (e.g., data1/structures/1/POSCAR) are left untouched.
    if all(len(new_id.split("/")) == 2 for new_id in new_ids):
        new_ids_sans_common_filenames = [
            new_id.split("/")[0] for new_id in new_ids
        ]
        if len(set(new_ids_sans_common_filenames)) == target_num_ids:
            new_ids = new_ids_sans_common_filenames

    # Now try to ablate any file extensions
    new_ids_sans_extensions = [id.split(".")[0] for id in new_ids]
    if len(set(new_ids_sans_extensions)) == target_num_ids:
        return new_ids_sans_extensions

    # Otherwise fall back to the original file-path IDs
    if len(set(new_ids)) != target_num_ids:
        return entry_ids

    return new_ids
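
To illustrate the intended behaviour (a sketch with made-up paths, mirroring the unit tests added below):

    from optimake.convert import _set_unique_entry_ids

    # Common directories and file extensions are ablated away...
    _set_unique_entry_ids(["data/structures/1.cif", "data/structures/2.cif"])
    # -> ["1", "2"]

    # ...but only while the result stays unique: here the leading directory
    # is the only distinguishing component, so the full paths are kept.
    _set_unique_entry_ids(["data1/structures/1/POSCAR", "data2/structures/1/POSCAR"])
    # -> ["data1/structures/1/POSCAR", "data2/structures/1/POSCAR"]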


def _parse_and_assign_properties(
    optimade_entries: dict[str, EntryResource],
    property_matches_by_file: dict[str | None, list[Path]],
@@ -370,22 +417,27 @@ def construct_entries(
    _check_missing(entry_matches_by_file)

    # Parse into intermediate format
-    parsed_entries, entry_ids = _parse_entries(
+    parsed_entries, file_path_entry_ids = _parse_entries(
        archive_path,
        entry_matches_by_file,
        entry_config.entry_type,
    )

+    # Generate a better set of entry IDs
+    unique_entry_ids = _set_unique_entry_ids(file_path_entry_ids)
+
    # Parse properties
    property_matches_by_file: dict[str | None, list[Path]] = _get_matches(
        archive_path, entry_config.property_paths
    )
    _check_missing(property_matches_by_file)

+    timestamp = datetime.datetime.now().isoformat()
+
    # Construct OPTIMADE entries from intermediate format
    optimade_entries: dict[str, EntryResource] = {}
-    for entry_id, entry in tqdm.tqdm(
-        zip(entry_ids, parsed_entries),
+    for file_path_entry_id, unique_entry_id, entry in tqdm.tqdm(
+        zip(file_path_entry_ids, unique_entry_ids, parsed_entries),
        desc=f"Constructing OPTIMADE {entry_config.entry_type} entries",
    ):
        exceptions = {}
@@ -407,12 +459,16 @@
            entry = entry.dict()

        if not entry["id"]:
-            entry["id"] = entry_id
+            entry["id"] = unique_entry_id

-        if entry_id in optimade_entries:
-            raise RuntimeError(f"Duplicate entry ID found: {entry_id}")
+        if entry["id"] in optimade_entries:
+            raise RuntimeError(f"Duplicate entry ID found: {entry['id']}")

-        optimade_entries[entry_id] = entry
+        optimade_entries[entry["id"]] = entry
+
+        if not entry["attributes"].get("immutable_id"):
+            entry["attributes"]["immutable_id"] = file_path_entry_id
+        entry["attributes"]["last_modified"] = timestamp

    # Now try to parse the properties and assign them to OPTIMADE entries
    _parse_and_assign_properties(
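
The net effect on each converted entry (a hypothetical sketch, not actual repository output): the shortened unique ID becomes the OPTIMADE `id`, while the original file path and the conversion timestamp are preserved in the attributes.

    # e.g. for a structure parsed from "data/structures/1.cif":
    entry = {
        "id": "1",
        "attributes": {
            "immutable_id": "data/structures/1.cif",  # original file path
            "last_modified": "2024-04-17T12:00:00",   # conversion timestamp
            # ... remaining OPTIMADE attributes
        },
    }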
51 changes: 51 additions & 0 deletions tests/test_convert.py
@@ -87,3 +87,54 @@ def check_arrays(reference, test, field):
    assert json.dumps(
        first_entry["attributes"], sort_keys=True, indent=2
    ) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2)


def test_unique_id_generator():
    """Unit tests for some common cases of the unique ID generator."""

    from optimake.convert import _set_unique_entry_ids

    entry_ids = [
        "data/structures/1.cif",
        "data/structures/2.cif",
        "data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = ["data/structures/1", "data/structures/2", "data/structures/3"]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data/structures/1/POSCAR",
        "data/structures/2/POSCAR",
        "data/structures/3/POSCAR",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data1/structures/1/POSCAR",
        "data2/structures/1/POSCAR",
        "data3/structures/1/POSCAR",
    ]
    assert _set_unique_entry_ids(entry_ids) == entry_ids

    entry_ids = [
        "data.zip/data/structures/1.cif",
        "data.zip/data/structures/2.cif",
        "data.zip/data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data.tar.gz/data/structures/1.cif",
        "data.tar.gz/data/structures/2.cif",
        "data.tar.gz/data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data.tar.gz/data/structures/1.cif.gz",
        "data.tar.gz/data/structures/2.cif.gz",
        "data.tar.gz/data/structures/3.cif.gz",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
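
One behaviour the committed tests leave implicit (an illustrative sketch, not part of the test suite): extensions are only stripped when the result stays unique, so colliding stems keep their extensions.

    from optimake.convert import _set_unique_entry_ids

    # "1" would be ambiguous here, so the extensions are retained
    assert _set_unique_entry_ids(["data/1.cif", "data/1.txt"]) == ["1.cif", "1.txt"]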
