Define new scheme for ID generation and set immutable_id and `last_modified`

ml-evs committed Apr 17, 2024
1 parent 83bc873 commit f3a5c3b
Showing 2 changed files with 114 additions and 7 deletions.
70 changes: 63 additions & 7 deletions src/optimake/convert.py
@@ -4,6 +4,7 @@
"""

import datetime
import os
import warnings
from collections import defaultdict
@@ -258,9 +259,55 @@ def _parse_entries(
f"None of the provided parsers {ENTRY_PARSERS[entry_type]} could parse {_path}. Errors: {exceptions}"
)

if len(set(entry_ids)) != len(entry_ids):
raise RuntimeError(
"Duplicate entry IDs found even when generated directly from filepaths. This should not be possible."
)

return parsed_entries, entry_ids


def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]:
    """Attempt to make the simplest unique set of entry IDs possible,
    following a series of deterministic rules.

    Parameters:
        entry_ids: A list of entry IDs derived from file paths.

    Returns:
        A list of unique entry IDs.

    """

    # Start from the bare filenames and add parent directories until a
    # unique set arises
    new_ids: list[str] = [entry_id.split("/")[-1] for entry_id in entry_ids]
    target_num_ids = len(entry_ids)
    depth: int = 1
    max_depth: int = 10  # somewhat arbitrary upper limit
    while len(set(new_ids)) != target_num_ids and depth < max_depth:
        for i, id in enumerate(entry_ids):
            new_ids[i] = "/".join(id.split("/")[-1 - depth :])
        depth += 1

    # Now try to ablate any common file names, e.g., subfolders of POSCARs
    # (1/POSCAR, 2/POSCAR), keeping just the distinguishing parent directory.
    # This only applies when every ID is exactly one directory deep, so that
    # deeper paths (e.g., data1/structures/1/POSCAR) are left untouched.
    if all(len(new_id.split("/")) == 2 for new_id in new_ids):
        new_ids_sans_common_filenames = [
            new_id.split("/")[0] for new_id in new_ids
        ]
        if len(set(new_ids_sans_common_filenames)) == target_num_ids:
            new_ids = new_ids_sans_common_filenames

    # Now try to ablate any file extensions
    new_ids_sans_extensions = [id.split(".")[0] for id in new_ids]
    if len(set(new_ids_sans_extensions)) == target_num_ids:
        return new_ids_sans_extensions

    # Otherwise fall back to the original file-path IDs
    if len(set(new_ids)) != target_num_ids:
        return entry_ids

    return new_ids
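
To illustrate the intended behaviour (a sketch with made-up paths, mirroring the unit tests added below):

    from optimake.convert import _set_unique_entry_ids

    # Common directories and file extensions are ablated away...
    _set_unique_entry_ids(["data/structures/1.cif", "data/structures/2.cif"])
    # -> ["1", "2"]

    # ...but only while the result stays unique: here the leading directory
    # is the only distinguishing component, so the full paths are kept.
    _set_unique_entry_ids(["data1/structures/1/POSCAR", "data2/structures/1/POSCAR"])
    # -> ["data1/structures/1/POSCAR", "data2/structures/1/POSCAR"]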


def _parse_and_assign_properties(
    optimade_entries: dict[str, EntryResource],
    property_matches_by_file: dict[str | None, list[Path]],
@@ -370,22 +417,27 @@ def construct_entries(
    _check_missing(entry_matches_by_file)

    # Parse into intermediate format
-    parsed_entries, entry_ids = _parse_entries(
+    parsed_entries, file_path_entry_ids = _parse_entries(
        archive_path,
        entry_matches_by_file,
        entry_config.entry_type,
    )

+    # Generate a better set of entry IDs
+    unique_entry_ids = _set_unique_entry_ids(file_path_entry_ids)
+
    # Parse properties
    property_matches_by_file: dict[str | None, list[Path]] = _get_matches(
        archive_path, entry_config.property_paths
    )
    _check_missing(property_matches_by_file)

+    timestamp = datetime.datetime.now().isoformat()
+
    # Construct OPTIMADE entries from intermediate format
    optimade_entries: dict[str, EntryResource] = {}
-    for entry_id, entry in tqdm.tqdm(
-        zip(entry_ids, parsed_entries),
+    for file_path_entry_id, unique_entry_id, entry in tqdm.tqdm(
+        zip(file_path_entry_ids, unique_entry_ids, parsed_entries),
        desc=f"Constructing OPTIMADE {entry_config.entry_type} entries",
    ):
        exceptions = {}
@@ -407,12 +459,16 @@
            entry = entry.dict()

        if not entry["id"]:
-            entry["id"] = entry_id
+            entry["id"] = unique_entry_id

-        if entry_id in optimade_entries:
-            raise RuntimeError(f"Duplicate entry ID found: {entry_id}")
+        if entry["id"] in optimade_entries:
+            raise RuntimeError(f"Duplicate entry ID found: {entry['id']}")

-        optimade_entries[entry_id] = entry
+        optimade_entries[entry["id"]] = entry
+
+        if not entry["attributes"].get("immutable_id"):
+            entry["attributes"]["immutable_id"] = file_path_entry_id
+        entry["attributes"]["last_modified"] = timestamp

    # Now try to parse the properties and assign them to OPTIMADE entries
    _parse_and_assign_properties(
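
The net effect on each converted entry (a hypothetical sketch, not actual repository output): the shortened unique ID becomes the OPTIMADE `id`, while the original file path and the conversion timestamp are preserved in the attributes.

    # e.g. for a structure parsed from "data/structures/1.cif":
    entry = {
        "id": "1",
        "attributes": {
            "immutable_id": "data/structures/1.cif",  # original file path
            "last_modified": "2024-04-17T12:00:00",   # conversion timestamp
            # ... remaining OPTIMADE attributes
        },
    }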
51 changes: 51 additions & 0 deletions tests/test_convert.py
@@ -87,3 +87,54 @@ def check_arrays(reference, test, field):
    assert json.dumps(
        first_entry["attributes"], sort_keys=True, indent=2
    ) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2)


def test_unique_id_generator():
    """Unit tests for some common cases of the unique ID generator."""

    from optimake.convert import _set_unique_entry_ids

    entry_ids = [
        "data/structures/1.cif",
        "data/structures/2.cif",
        "data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = ["data/structures/1", "data/structures/2", "data/structures/3"]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data/structures/1/POSCAR",
        "data/structures/2/POSCAR",
        "data/structures/3/POSCAR",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data1/structures/1/POSCAR",
        "data2/structures/1/POSCAR",
        "data3/structures/1/POSCAR",
    ]
    assert _set_unique_entry_ids(entry_ids) == entry_ids

    entry_ids = [
        "data.zip/data/structures/1.cif",
        "data.zip/data/structures/2.cif",
        "data.zip/data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data.tar.gz/data/structures/1.cif",
        "data.tar.gz/data/structures/2.cif",
        "data.tar.gz/data/structures/3.cif",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]

    entry_ids = [
        "data.tar.gz/data/structures/1.cif.gz",
        "data.tar.gz/data/structures/2.cif.gz",
        "data.tar.gz/data/structures/3.cif.gz",
    ]
    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
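
One behaviour the committed tests leave implicit (an illustrative sketch, not part of the test suite): extensions are only stripped when the result stays unique, so colliding stems keep their extensions.

    from optimake.convert import _set_unique_entry_ids

    # "1" would be ambiguous here, so the extensions are retained
    assert _set_unique_entry_ids(["data/1.cif", "data/1.txt"]) == ["1.cif", "1.txt"]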
