diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9838b5b..1974713 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,6 @@ +### Version 1.3.5 +- Fixed a bug that under some circumstances could cause incomplete file metadata to be returned. + ### Version 1.3.4 - Alter the behavior of the bulk specification file writers to return an error if the input `types` parameter is empty. diff --git a/staging_service/app.py b/staging_service/app.py index 7d6b858..25dda8e 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.4" +VERSION = "1.3.5" _DATATYPE_MAPPINGS = None diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 87bc8d7..110a771 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -48,7 +48,8 @@ async def _generate_metadata(path: Path, source: str): else: data = {} # first ouptut of md5sum is the checksum - data["source"] = source + if source: + data["source"] = source try: md5 = hashlib.md5(open(path.full_path, "rb").read()).hexdigest() except: @@ -165,7 +166,7 @@ async def some_metadata(path: Path, desired_fields=False, source=None): os.stat(path.metadata_path).st_mtime < file_stats["mtime"] / 1000 ): # if metadata does not exist or older than file: regenerate - if source is None: + if source is None: # TODO BUGFIX this will overwrite any source in the file source = _determine_source(path) data = await _generate_metadata(path, source) else: # metadata already exists and is up to date @@ -174,9 +175,11 @@ async def some_metadata(path: Path, desired_fields=False, source=None): data = await f.read() data = decoder.decode(data) # due to legacy code, some file has corrupted metadata file + # Also if a file is listed or checked for existence before the upload completes + # this code block will be triggered expected_keys = ["source", "md5", "lineCount", "head", "tail"] - if 
set(expected_keys) > set(data.keys()): - if source is None: + if not set(expected_keys) <= set(data.keys()): + if source is None and "source" not in data: source = _determine_source(path) data = await _generate_metadata(path, source) data = {**data, **file_stats} diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..599f0ce --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,80 @@ +""" Unit tests for the metadata handling routines. """ + +import json +import uuid + +from collections.abc import Generator +from pathlib import Path as PyPath +from pytest import fixture + +from staging_service.utils import Path +from staging_service.metadata import some_metadata + +from tests.test_app import FileUtil + +# TODO TEST add more unit tests here. + + +@fixture(scope="module") +def temp_dir() -> Generator[PyPath, None, None]: + with FileUtil() as fu: + childdir = PyPath(fu.make_dir(str(uuid.uuid4()))).resolve() + + yield childdir + + # FileUtil will auto delete after exiting + + +async def test_incomplete_metadata_file_update(temp_dir: Path): + """ + Tests the case where a file is listed or checked for existence prior to completing + upload, and then an UPA is added to the file. This previously caused incomplete metadata + to be returned as the logic for whether to run the metadata regeneration code based on + the contents of the current metadata file was incorrect. 
+ See https://kbase-jira.atlassian.net/browse/PTV-1767 + """ + await _incomplete_metadata_file_update( + temp_dir, + {"source": "some place", "UPA": "1/2/3"}, + "some place" + ) + + await _incomplete_metadata_file_update( + temp_dir, + {"UPA": "1/2/3"}, + "Unknown" + ) + +async def _incomplete_metadata_file_update(temp_dir, metadict, source): + target = Path( + str(temp_dir / "full"), + str(temp_dir / "meta"), + "user_path", + "myfilename", + "super_fake_jgi_path") + + with open(target.full_path, "w") as p: + p.writelines(make_test_lines(1, 6)) + with open(target.metadata_path, "w") as p: + p.write(json.dumps(metadict)) + + res = await some_metadata(target) + + assert "mtime" in res # just check the file time is there + del res["mtime"] + + assert res == { + "source": source, + "md5": "9f07da9655223b777121141ff7735f25", + "head": "".join(make_test_lines(1, 5)), + "tail": "".join(make_test_lines(2, 6)), + "UPA": "1/2/3", + "isFolder": False, + "lineCount": "5", + "name": "myfilename", + "path": "user_path", + "size": 1280 + } + +def make_test_lines(start, stop): + return [str(i) + "a" * (256 - len(str(i)) - 1) + "\n" for i in range(start, stop)]