From a4561222905898807262bd86ab8e48306705b2d6 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Mon, 11 Nov 2024 13:50:07 -0500
Subject: [PATCH 01/12] add .json files (dts import manifest) to import specification list

---
 deployment/conf/supported_apps_w_extensions.json | 10 ++++++++--
 staging_service/app.py                           |  5 +++--
 staging_service/autodetect/Mappings.py           |  6 +++++-
 tests/test_app.py                                |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/deployment/conf/supported_apps_w_extensions.json b/deployment/conf/supported_apps_w_extensions.json
index 205fc0b0..8dacbb99 100644
--- a/deployment/conf/supported_apps_w_extensions.json
+++ b/deployment/conf/supported_apps_w_extensions.json
@@ -121,7 +121,8 @@
             "csv",
             "tsv",
             "xls",
-            "xlsx"
+            "xlsx",
+            "json"
         ],
         "media": [
             "tsv",
@@ -1122,8 +1123,13 @@
                 "id": "escher_map",
                 "title": "EscherMap",
                 "app_weight": 1
+            },
+            {
+                "id": "import_specification",
+                "title": "Import Specification",
+                "app_weight": 1
             }
         ]
     }
 }
-}
\ No newline at end of file
+}
diff --git a/staging_service/app.py b/staging_service/app.py
index b7e90f24..2ce06400 100644
--- a/staging_service/app.py
+++ b/staging_service/app.py
@@ -12,7 +12,7 @@

 from .app_error_formatter import format_import_spec_errors
 from .auth2Client import KBaseAuth2
-from .autodetect.Mappings import CSV, EXCEL, TSV
+from .autodetect.Mappings import CSV, EXCEL, TSV, DTS_MANIFEST
 from .AutoDetectUtils import AutoDetectUtils
 from .globus import assert_globusid_exists, is_globusid
 from .import_specifications.file_parser import (
@@ -26,7 +26,7 @@
     write_excel,
     write_tsv,
 )
-from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv
+from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv, parse_dts_manifest
 from .JGIMetadata import read_metadata_for
 from .metadata import add_upa, dir_info, similar, some_metadata
 from .utils import AclManager, Path, run_command
@@ -43,6 +43,7 @@
     CSV: parse_csv,
     TSV: parse_tsv,
     EXCEL: parse_excel,
+    DTS_MANIFEST: parse_dts_manifest
 }

 _IMPSPEC_FILE_TO_WRITER = {
diff --git a/staging_service/autodetect/Mappings.py b/staging_service/autodetect/Mappings.py
index af08e54c..37edbf4e 100644
--- a/staging_service/autodetect/Mappings.py
+++ b/staging_service/autodetect/Mappings.py
@@ -7,6 +7,9 @@
 EXCEL = "EXCEL"
 ZIP = "CompressedFileFormatArchive"

+# Data Transfer Service Manifest format (which is a specific JSON format)
+DTS_MANIFEST = "JSON"
+
 # BIOINFORMATICS FORMATS
 FASTA = "FASTA"
 FASTQ = "FASTQ"
@@ -99,7 +102,8 @@ def _add_gzip(extension_list):
 #         "stockholm",
 #     ],
     TSV: ["tsv"],  # See Note 1 below
-    CSV: ["csv"],  # See Note 1 below
+    CSV: ["csv"],  # See Note 1 below,
+    DTS_MANIFEST: ["json"],
     JSON: ["json"],
     EXCEL: ["xls", "xlsx"],  # See Note 1 below
     ZIP: ["zip", "tar", "tgz", "tar.gz", "7z", "gz", "gzip", "rar"],
diff --git a/tests/test_app.py b/tests/test_app.py
index 289de0ad..54b6535b 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -1654,7 +1654,7 @@ async def test_importer_filetypes():
     a2f = js["datatype_to_filetype"]
     assert a2f["assembly"] == ["FASTA"]
     assert a2f["gff_genome"] == ["FASTA", "GFF"]
-    assert a2f["import_specification"] == ["CSV", "EXCEL", "TSV"]
+    assert a2f["import_specification"] == ["CSV", "EXCEL", "JSON", "TSV"]

     f2e = js["filetype_to_extensions"]
     assert f2e["FASTA"] == [

From d2765a468a7cc09cde549c01c7a0448484fef11f Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Mon, 11 Nov 2024 15:33:25 -0500
Subject: [PATCH 02/12] add dummy parse_dts_manifest function

---
 .../individual_parsers.py | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
index 00eee18b..e4ff8471 100644
--- a/staging_service/import_specifications/individual_parsers.py
+++ b/staging_service/import_specifications/individual_parsers.py
@@ -3,6 +3,7 @@
 """

 import csv
+import json
 import math
 import re
 from pathlib import Path
@@ -316,3 +317,55 @@ def parse_excel(path: Path) -> ParseResults:
             return ParseResults(frozendict(results))
     else:
         return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
+
+def parse_dts_manifest(path: Path) -> ParseResults:
+    """
+    Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
+    The manifest should have roughly this format, with expected keys included:
+    {
+        "resources": [{
+            "id": str,
+            "name": str,
+            "path": str,
+            "format": str,
+            "instructions": {
+                "data_type": str,
+                "parameters": {
+                    "<param_name>": value,
+                    "<param_name>": value
+                }
+            }
+        }]
+    }
+    The parameters under the "instructions"."parameters" dictionary are arbitrary keys with
+    arbitrary values and will be returned as-is. They are expected to be PRIMITIVE_TYPEs and
+    will fail otherwise.
+    """
+    spcsrc = SpecificationSource(path)
+    errors = []
+    # dummy for now
+    results = {}
+    try:
+        with open(path, "r") as manifest:
+            manifest_json = json.load(manifest)
+            if not isinstance(manifest_json, dict):
+                errors.append(
+                    Error(
+                        ErrorType.PARSE_FAIL,
+                        "Manifest is not a dictionary",
+                        spcsrc
+                    )
+                )
+
+    except FileNotFoundError:
+        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
+    except IsADirectoryError:
+        return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
+    except _ParseException as e:
+        return _error(e.args[0])
+    if errors:
+        return ParseResults(errors=tuple(errors))
+    elif results:
+        return ParseResults(frozendict(results))
+    else:
+        return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc))

From 2f5b2d06a91c0525da37a755e25ae373df53b8ba Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Wed, 13 Nov 2024 13:04:29 -0500
Subject: [PATCH 03/12] progress on dts_manifest base error tests

---
 .../test_individual_parsers.py | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
index 8c9c445a..09bb0a84 100644
--- a/tests/import_specifications/test_individual_parsers.py
+++ b/tests/import_specifications/test_individual_parsers.py
@@ -1,7 +1,9 @@
+import json
 import os
 import uuid
 from collections.abc import Callable, Generator
 from pathlib import Path
+from typing import Union

 # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
 from frozendict import frozendict
@@ -14,6 +16,7 @@
     ParseResults,
     SpecificationSource,
     parse_csv,
+    parse_dts_manifest,
    parse_excel,
     parse_tsv,
 )
@@ -765,3 +768,105 @@ def test_excel_parse_fail_unequal_rows():
         ),
     ],
 )
+
+def test_dts_manifest_parse_success():
+    f = _get_test_file("manifest_small.json")
+    res = parse_dts_manifest(f)
+    assert res.results
+    assert res.errors is None
+    assert list(res.results.keys()) == ["gff_metagenome"]
+    assert res.results["gff_metagenome"]
+    assert res.results["gff_metagenome"].source.file == f
+    assert len(res.results["gff_metagenome"].result) == 3
+    for parsed in res.results["gff_metagenome"].result:
+        assert parsed == {
+            "param1": "value1",
+            "param2": "value2"
"value2" + } + +@fixture(scope="module") +def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[Union[dict,list]], Path]: + def manifest_writer(input_json: Union[dict,list]) -> Path: + file_path = temp_dir / str(uuid.uuid4()) + with open(file_path, "w", encoding="utf-8") as outfile: + json.dump(input_json, outfile) + return file_path + return manifest_writer + +def _dts_manifest_parse_fail(input_file: Path, error_infos: list[dict[str, str]]=None): + """ + Tests a failing DTS manifest parse. + input_file - the path to the input file. Might be a directory or not exist. + error_infos - a list of error info dicts, expected to be in the order generated by the call to + parse_dts_manifest. Expected keys are 'type' (will fail without) and 'text' which + defaults to None. + + This auto-generates the tuple of Error objects for testing against a failed parse_dts_manifest + call. + """ + res = parse_dts_manifest(input_file) + errors = None + if error_infos is not None: + errors = tuple([ + Error(e["type"], e.get("text"), SpecificationSource(input_file)) for e in error_infos + ]) + assert res.results is None + assert res.errors == errors + +def test_dts_manifest_non_dict(write_dts_manifest: Callable[[Union[dict,list]], Path]): + manifest_path = write_dts_manifest(["wrong_format"]) + _dts_manifest_parse_fail( + manifest_path, + error_infos=[{ + "type": ErrorType.PARSE_FAIL, + "text": "Manifest is not a dictionary" + }] + ) + +def test_dts_manifest_not_found(temp_dir: Generator[Path, None, None]): + manifest_path = temp_dir / "not_a_file" + _dts_manifest_parse_fail( + manifest_path, + error_infos = [{ + "type": ErrorType.FILE_NOT_FOUND, + }] + ) + +# def test_dts_manifest_parse_missing_resource(write_dts_manifest): +# manifest_path = write_dts_manifest({"not_a_manifest": "ok"}) +# _dts_manifest_parse_fail( +# manifest_path, +# tuple([ +# Error( +# ErrorType.PARSE_FAIL, +# "Manifest is missing a list of file resources", +# SpecificationSource(manifest_path) +# ) +# ]) +# ) + +# def test_dts_manifest_parse_missing_keys(): +# # note that this includes one valid entry, which should not be returned +# manifest_path = _get_test_file("manifest_errors.json") +# _dts_manifest_parse_fail( +# manifest_path, +# tuple([ +# Error( +# ErrorType.PARSE_FAIL, +# "resource missing key(s) instructions", +# SpecificationSource(manifest_path) +# ) +# ]) +# ) + +# @pytest.mark.parametrize() +# def test_dts_manifest_parse_missing_instructions_keys(): +# manifest_path = write_dts_manifest({"resources": [{ +# "id": "foo", +# "path": "bar", +# "name": "some_object", +# "format": "some_format", +# }]}) + +# def test_dts_manifest_parse_malformed_instructions(): +# pass From ab6bbc7e49b4f130d3eee14738caa72d4b6987da Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 13 Nov 2024 14:51:04 -0500 Subject: [PATCH 04/12] add dummy dts manifest parser and basic fail tests --- .../individual_parsers.py | 2 - .../test_data/manifest_small.json | 122 ++++++++++++++++++ .../test_individual_parsers.py | 96 +++++--------- 3 files changed, 151 insertions(+), 69 deletions(-) create mode 100644 tests/import_specifications/test_data/manifest_small.json diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index e4ff8471..44158cc8 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -361,8 +361,6 @@ def parse_dts_manifest(path: Path) -> ParseResults: return 
         return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
     except IsADirectoryError:
         return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
-    except _ParseException as e:
-        return _error(e.args[0])
     if errors:
         return ParseResults(errors=tuple(errors))
     elif results:
         return ParseResults(frozendict(results))
     else:
diff --git a/tests/import_specifications/test_data/manifest_small.json b/tests/import_specifications/test_data/manifest_small.json
new file mode 100644
index 00000000..e87472d5
--- /dev/null
+++ b/tests/import_specifications/test_data/manifest_small.json
@@ -0,0 +1,122 @@
+{
+    "name": "manifest",
+    "resources": [
+        {
+            "id": "JDP:555518eb0d8785178e712d88",
+            "name": "61564.assembled",
+            "path": "img/submissions/61564/61564.assembled.gff",
+            "format": "gff",
+            "media_type": "text/plain",
+            "bytes": 455161,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518eb0d8785178e712d88",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        },
+        {
+            "id": "JDP:555518eb0d8785178e712d84",
+            "name": "61564.assembled",
+            "path": "img/submissions/61564/61564.assembled.fna",
+            "format": "fasta",
+            "media_type": "text/plain",
+            "bytes": 6354414,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518eb0d8785178e712d84",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        },
+        {
+            "id": "JDP:555518ec0d8785178e712d9f",
+            "name": "61567.assembled",
+            "path": "img/submissions/61567/61567.assembled.gff",
+            "format": "gff",
+            "media_type": "text/plain",
+            "bytes": 545583,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518ec0d8785178e712d9f",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        }
+    ]
+}
diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
index 09bb0a84..6b21f2b7 100644
--- a/tests/import_specifications/test_individual_parsers.py
+++ b/tests/import_specifications/test_individual_parsers.py
@@ -3,7 +3,6 @@
 import uuid
 from collections.abc import Callable, Generator
 from pathlib import Path
-from typing import Union

 # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
 from frozendict import frozendict
@@ -772,28 +771,26 @@ def test_excel_parse_fail_unequal_rows():
 def test_dts_manifest_parse_success():
     f = _get_test_file("manifest_small.json")
     res = parse_dts_manifest(f)
-    assert res.results
-    assert res.errors is None
-    assert list(res.results.keys()) == ["gff_metagenome"]
-    assert res.results["gff_metagenome"]
-    assert res.results["gff_metagenome"].source.file == f
-    assert len(res.results["gff_metagenome"].result) == 3
-    for parsed in res.results["gff_metagenome"].result:
-        assert parsed == {
-            "param1": "value1",
-            "param2": "value2"
-        }
+    # fails for now
+    assert res.results is None
+    assert res.errors == tuple([
+        Error(
+            ErrorType.PARSE_FAIL,
+            "No import specification data in file",
+            SpecificationSource(f)
+        )
+    ])

 @fixture(scope="module")
-def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[Union[dict,list]], Path]:
-    def manifest_writer(input_json: Union[dict,list]) -> Path:
+def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict|list], Path]:
+    def manifest_writer(input_json: dict|list) -> Path:
         file_path = temp_dir / str(uuid.uuid4())
         with open(file_path, "w", encoding="utf-8") as outfile:
             json.dump(input_json, outfile)
         return file_path
     return manifest_writer

-def _dts_manifest_parse_fail(input_file: Path, error_infos: list[dict[str, str]]=None):
+def _dts_manifest_parse_fail(input_file: Path, errors: list[Error]):
     """
     Tests a failing DTS manifest parse.
     input_file - the path to the input file. Might be a directory or not exist.
     error_infos - a list of error info dicts, expected to be in the order generated by the call to
         parse_dts_manifest. Expected keys are 'type' (will fail without) and 'text' which
         defaults to None.

     This auto-generates the tuple of Error objects for testing against a failed parse_dts_manifest
     call.
     """
     res = parse_dts_manifest(input_file)
-    errors = None
-    if error_infos is not None:
-        errors = tuple([
-            Error(e["type"], e.get("text"), SpecificationSource(input_file)) for e in error_infos
-        ])
     assert res.results is None
-    assert res.errors == errors
+    assert res.errors == tuple(errors)

-def test_dts_manifest_non_dict(write_dts_manifest: Callable[[Union[dict,list]], Path]):
+def test_dts_manifest_non_dict(write_dts_manifest: Callable[[dict|list], Path]):
     manifest_path = write_dts_manifest(["wrong_format"])
     _dts_manifest_parse_fail(
         manifest_path,
-        error_infos=[{
-            "type": ErrorType.PARSE_FAIL,
-            "text": "Manifest is not a dictionary"
-        }]
+        [
+            Error(
+                ErrorType.PARSE_FAIL,
+                "Manifest is not a dictionary",
+                SpecificationSource(manifest_path)
+            )
+        ]
     )

 def test_dts_manifest_not_found(temp_dir: Generator[Path, None, None]):
     manifest_path = temp_dir / "not_a_file"
     _dts_manifest_parse_fail(
         manifest_path,
-        error_infos = [{
-            "type": ErrorType.FILE_NOT_FOUND,
-        }]
+        [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(manifest_path))]
     )

-# def test_dts_manifest_parse_missing_resource(write_dts_manifest):
-#     manifest_path = write_dts_manifest({"not_a_manifest": "ok"})
-#     _dts_manifest_parse_fail(
-#         manifest_path,
-#         tuple([
-#             Error(
-#                 ErrorType.PARSE_FAIL,
-#                 "Manifest is missing a list of file resources",
-#                 SpecificationSource(manifest_path)
-#             )
-#         ])
-#     )
-
-# def test_dts_manifest_parse_missing_keys():
-#     # note that this includes one valid entry, which should not be returned
-#     manifest_path = _get_test_file("manifest_errors.json")
-#     _dts_manifest_parse_fail(
-#         manifest_path,
-#         tuple([
-#             Error(
-#                 ErrorType.PARSE_FAIL,
-#                 "resource missing key(s) instructions",
-#                 SpecificationSource(manifest_path)
-#             )
-#         ])
-#     )
-
-# @pytest.mark.parametrize()
-# def test_dts_manifest_parse_missing_instructions_keys():
-#     manifest_path = write_dts_manifest({"resources": [{
-#         "id": "foo",
-#         "path": "bar",
"name": "some_object", -# "format": "some_format", -# }]}) - -# def test_dts_manifest_parse_malformed_instructions(): -# pass +def test_dts_manifest_file_is_directory(temp_dir: Generator[Path, None, None]): + test_file = temp_dir / "testdir.json" + os.makedirs(test_file, exist_ok=True) + _dts_manifest_parse_fail( + test_file, + [Error(ErrorType.PARSE_FAIL, "The given path is a directory", SpecificationSource(test_file))] + ) From ae4a74dda814f732f584263d40197ec86a26e899 Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 13 Nov 2024 15:04:05 -0500 Subject: [PATCH 05/12] add JSON format check and test --- .../import_specifications/individual_parsers.py | 2 ++ .../test_individual_parsers.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 44158cc8..4e372d10 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -357,6 +357,8 @@ def parse_dts_manifest(path: Path) -> ParseResults: ) ) + except json.JSONDecodeError: + return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 6b21f2b7..9ca1ccc5 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -805,6 +805,21 @@ def _dts_manifest_parse_fail(input_file: Path, errors: list[Error]): assert res.results is None assert res.errors == tuple(errors) +def test_dts_manifest_non_json(temp_dir: Generator[Path, None, None]): + test_file = temp_dir / str(uuid.uuid4()) + with open(test_file, "w", encoding="utf-8") as outfile: + outfile.write("totally not json") + _dts_manifest_parse_fail( + test_file, + [ + Error( + ErrorType.PARSE_FAIL, + "File must be in JSON format", + SpecificationSource(test_file) + ) + ] + ) + def test_dts_manifest_non_dict(write_dts_manifest: Callable[[dict|list], Path]): manifest_path = write_dts_manifest(["wrong_format"]) _dts_manifest_parse_fail( From 5d945ff1229fecd0c6227b346804600811ed130e Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 13 Nov 2024 15:07:34 -0500 Subject: [PATCH 06/12] fix test fixture docstrong --- tests/import_specifications/test_individual_parsers.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 9ca1ccc5..201845ac 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -794,12 +794,8 @@ def _dts_manifest_parse_fail(input_file: Path, errors: list[Error]): """ Tests a failing DTS manifest parse. input_file - the path to the input file. Might be a directory or not exist. - error_infos - a list of error info dicts, expected to be in the order generated by the call to - parse_dts_manifest. Expected keys are 'type' (will fail without) and 'text' which - defaults to None. - - This auto-generates the tuple of Error objects for testing against a failed parse_dts_manifest - call. + errors - a list of Error objects expected to be in the order generated by + the call to parse_dts_manifest. 
""" res = parse_dts_manifest(input_file) assert res.results is None From ccf17613f75c389c78eb26f740f5aaeb86d1e384 Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 13 Nov 2024 15:11:39 -0500 Subject: [PATCH 07/12] ruff formatting --- staging_service/app.py | 9 +++- .../individual_parsers.py | 9 +--- .../test_individual_parsers.py | 48 ++++++++++++------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/staging_service/app.py b/staging_service/app.py index 2ce06400..e403f73b 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -26,7 +26,12 @@ write_excel, write_tsv, ) -from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv, parse_dts_manifest +from .import_specifications.individual_parsers import ( + parse_csv, + parse_excel, + parse_tsv, + parse_dts_manifest, +) from .JGIMetadata import read_metadata_for from .metadata import add_upa, dir_info, similar, some_metadata from .utils import AclManager, Path, run_command @@ -43,7 +48,7 @@ CSV: parse_csv, TSV: parse_tsv, EXCEL: parse_excel, - DTS_MANIFEST: parse_dts_manifest + DTS_MANIFEST: parse_dts_manifest, } _IMPSPEC_FILE_TO_WRITER = { diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 4e372d10..b5961f66 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -318,6 +318,7 @@ def parse_excel(path: Path) -> ParseResults: else: return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) + def parse_dts_manifest(path: Path) -> ParseResults: """ Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise. @@ -349,13 +350,7 @@ def parse_dts_manifest(path: Path) -> ParseResults: with open(path, "r") as manifest: manifest_json = json.load(manifest) if not isinstance(manifest_json, dict): - errors.append( - Error( - ErrorType.PARSE_FAIL, - "Manifest is not a dictionary", - spcsrc - ) - ) + errors.append(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc)) except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 201845ac..61214cfd 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -768,28 +768,32 @@ def test_excel_parse_fail_unequal_rows(): ], ) + def test_dts_manifest_parse_success(): f = _get_test_file("manifest_small.json") res = parse_dts_manifest(f) # fails for now assert res.results is None - assert res.errors == tuple([ - Error( - ErrorType.PARSE_FAIL, - "No import specification data in file", - SpecificationSource(f) - ) - ]) + assert res.errors == tuple( + [ + Error( + ErrorType.PARSE_FAIL, "No import specification data in file", SpecificationSource(f) + ) + ] + ) + @fixture(scope="module") -def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict|list], Path]: - def manifest_writer(input_json: dict|list) -> Path: +def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict | list], Path]: + def manifest_writer(input_json: dict | list) -> Path: file_path = temp_dir / str(uuid.uuid4()) with open(file_path, "w", encoding="utf-8") as outfile: json.dump(input_json, outfile) return file_path + return manifest_writer + def 
 def _dts_manifest_parse_fail(input_file: Path, errors: list[Error]):
     """
     Tests a failing DTS manifest parse.
     input_file - the path to the input file. Might be a directory or not exist.
     errors - a list of Error objects expected to be in the order generated by
         the call to parse_dts_manifest.
     """
     res = parse_dts_manifest(input_file)
     assert res.results is None
     assert res.errors == tuple(errors)

+
 def test_dts_manifest_non_json(temp_dir: Generator[Path, None, None]):
     test_file = temp_dir / str(uuid.uuid4())
     with open(test_file, "w", encoding="utf-8") as outfile:
         outfile.write("totally not json")
     _dts_manifest_parse_fail(
         test_file,
         [
             Error(
-                ErrorType.PARSE_FAIL,
-                "File must be in JSON format",
-                SpecificationSource(test_file)
+                ErrorType.PARSE_FAIL, "File must be in JSON format", SpecificationSource(test_file)
             )
-        ]
+        ],
     )

+
 def test_dts_manifest_non_dict(write_dts_manifest: Callable[[dict | list], Path]):
     manifest_path = write_dts_manifest(["wrong_format"])
     _dts_manifest_parse_fail(
         manifest_path,
         [
             Error(
                 ErrorType.PARSE_FAIL,
                 "Manifest is not a dictionary",
-                SpecificationSource(manifest_path)
+                SpecificationSource(manifest_path),
             )
-        ]
+        ],
     )

+
 def test_dts_manifest_not_found(temp_dir: Generator[Path, None, None]):
     manifest_path = temp_dir / "not_a_file"
     _dts_manifest_parse_fail(
         manifest_path,
-        [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(manifest_path))]
+        [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(manifest_path))],
     )

+
 def test_dts_manifest_file_is_directory(temp_dir: Generator[Path, None, None]):
     test_file = temp_dir / "testdir.json"
     os.makedirs(test_file, exist_ok=True)
     _dts_manifest_parse_fail(
         test_file,
-        [Error(ErrorType.PARSE_FAIL, "The given path is a directory", SpecificationSource(test_file))]
+        [
+            Error(
+                ErrorType.PARSE_FAIL,
+                "The given path is a directory",
+                SpecificationSource(test_file),
+            )
+        ],
     )

From 4f6044dcc2f04158651788669376f7b28f68fd94 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Thu, 14 Nov 2024 16:32:25 -0500
Subject: [PATCH 08/12] fix silliness around dts manifest mapping

---
 README.md                                        | 10 +++++++
 .../conf/supported_apps_w_extensions.json        | 28 +++++++++----------
 staging_service/app.py                           |  4 +--
 .../autodetect/GenerateMappings.py               | 11 +++++++-
 staging_service/autodetect/Mappings.py           |  4 ---
 5 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 67aae27d..4b140ded 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,16 @@ gzip, bzip2, md5sum, head, tail, wc
 in the docker container
 all of these should be available

+## Updating importer mapping types
+The types returned by the `importer_mappings` endpoint (see [Get Importer Mappings](#get-importer-mappings) below)
+are generated by the `GenerateMappings.py` script here: `staging_service/autodetect/GenerateMappings.py`.
+Running this script will build the `supported_apps_w_extensions.json` file that should be placed in the
+`deployment/conf` directory.
+
+To add new mappings, update the `GenerateMappings.py` script. New file types and object types should be added
+to the `staging_service.autodetect.Mappings` module and included from there. See `GenerateMappings.py`
+docstrings for more details.
+
 ## API

 all paths should be specified treating the user's home directory as root

diff --git a/deployment/conf/supported_apps_w_extensions.json b/deployment/conf/supported_apps_w_extensions.json
index 8dacbb99..a4649424 100644
--- a/deployment/conf/supported_apps_w_extensions.json
+++ b/deployment/conf/supported_apps_w_extensions.json
@@ -1102,34 +1102,34 @@
             }
         ]
     },
-    "smbl": {
+    "json": {
         "file_ext_type": [
-            "SBML"
+            "JSON"
         ],
         "mappings": [
             {
-                "id": "fba_model",
-                "title": "FBA Model",
+                "id": "import_specification",
+                "title": "Import Specification",
+                "app_weight": 1
+            },
+            {
+                "id": "escher_map",
+                "title": "EscherMap",
                 "app_weight": 1
             }
         ]
     },
-    "json": {
+    "smbl": {
         "file_ext_type": [
-            "JSON"
+            "SBML"
         ],
         "mappings": [
             {
-                "id": "escher_map",
-                "title": "EscherMap",
-                "app_weight": 1
-            },
-            {
-                "id": "import_specification",
-                "title": "Import Specification",
+                "id": "fba_model",
+                "title": "FBA Model",
                 "app_weight": 1
             }
         ]
     }
 }
-}
+}
\ No newline at end of file
diff --git a/staging_service/app.py b/staging_service/app.py
index e403f73b..c7c0e699 100644
--- a/staging_service/app.py
+++ b/staging_service/app.py
@@ -12,7 +12,7 @@

 from .app_error_formatter import format_import_spec_errors
 from .auth2Client import KBaseAuth2
-from .autodetect.Mappings import CSV, EXCEL, TSV, DTS_MANIFEST
+from .autodetect.Mappings import CSV, EXCEL, TSV, JSON
 from .AutoDetectUtils import AutoDetectUtils
 from .globus import assert_globusid_exists, is_globusid
 from .import_specifications.file_parser import (
@@ -48,7 +48,7 @@
     CSV: parse_csv,
     TSV: parse_tsv,
     EXCEL: parse_excel,
-    DTS_MANIFEST: parse_dts_manifest,
+    JSON: parse_dts_manifest,
 }

 _IMPSPEC_FILE_TO_WRITER = {
diff --git a/staging_service/autodetect/GenerateMappings.py b/staging_service/autodetect/GenerateMappings.py
index ce11458a..4d2287f7 100644
--- a/staging_service/autodetect/GenerateMappings.py
+++ b/staging_service/autodetect/GenerateMappings.py
@@ -23,6 +23,15 @@
 * Note: We should serve the generated content from memory
 * Note: This doesn't handle if we want to have different output types based on file
   extensions feeding into the same app
+
+Adding new file types and app ids:
+* file types and app ids should be added to staging_service.autodetect.Mappings
+* file types should be all upper-case and represent the file suffix.
+* app ids should be in snake_case, and represent either an object import type, or some
+  internal application to be run on selection (i.e. decompress or import specification).
+* app ids that map to actual object import maps should match the app ids in the narrative
+  interface configuration here:
+  https://github.com/kbase/narrative/blob/main/kbase-extension/static/kbase/config/staging_upload.json
 """

 from collections import defaultdict
@@ -120,7 +129,7 @@
     fba_model_id,
     import_specification,
 ]
-file_format_to_app_mapping[JSON] = [escher_map_id]
+file_format_to_app_mapping[JSON] = [escher_map_id, import_specification]
 file_format_to_app_mapping[SBML] = [fba_model_id]

 app_id_to_extensions = defaultdict(list)
diff --git a/staging_service/autodetect/Mappings.py b/staging_service/autodetect/Mappings.py
index 37edbf4e..cada2387 100644
--- a/staging_service/autodetect/Mappings.py
+++ b/staging_service/autodetect/Mappings.py
@@ -7,9 +7,6 @@
 EXCEL = "EXCEL"
 ZIP = "CompressedFileFormatArchive"

-# Data Transfer Service Manifest format (which is a specific JSON format)
-DTS_MANIFEST = "JSON"
-
 # BIOINFORMATICS FORMATS
 FASTA = "FASTA"
 FASTQ = "FASTQ"
@@ -102,8 +99,7 @@ def _add_gzip(extension_list):
 #         "stockholm",
 #     ],
     TSV: ["tsv"],  # See Note 1 below
     CSV: ["csv"],  # See Note 1 below,
-    DTS_MANIFEST: ["json"],
     JSON: ["json"],
     EXCEL: ["xls", "xlsx"],  # See Note 1 below
     ZIP: ["zip", "tar", "tgz", "tar.gz", "7z", "gz", "gzip", "rar"],

From 1e4fc01441eb03e1e2ca8464c3ab60f5c2820b66 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Thu, 14 Nov 2024 16:36:09 -0500
Subject: [PATCH 09/12] add link from API call to instructions on how to update mappings

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 4b140ded..e79e1ba8 100644
--- a/README.md
+++ b/README.md
@@ -1089,6 +1089,8 @@ For example,
 - for files for which there is no predicted app, the return is a null value
 - this endpoint is used to power the dropdowns for the staging service window in the Narrative

+Note: to update these mappings see instructions [here](#updating-importer-mapping-types)
+
 **URL** : `ci.kbase.us/services/staging_service/importer_mappings`

 **local URL** : `localhost:3000/importer_mappings`

From 1fc9a2c7edaed4b50ba5e9c04b74a4566ea48e68 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Thu, 14 Nov 2024 18:16:33 -0500
Subject: [PATCH 10/12] fix typo

---
 staging_service/autodetect/Mappings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/staging_service/autodetect/Mappings.py b/staging_service/autodetect/Mappings.py
index cada2387..af08e54c 100644
--- a/staging_service/autodetect/Mappings.py
+++ b/staging_service/autodetect/Mappings.py
@@ -99,7 +99,7 @@ def _add_gzip(extension_list):
 #         "stockholm",
 #     ],
     TSV: ["tsv"],  # See Note 1 below
-    CSV: ["csv"],  # See Note 1 below,
+    CSV: ["csv"],  # See Note 1 below
     JSON: ["json"],
     EXCEL: ["xls", "xlsx"],  # See Note 1 below
     ZIP: ["zip", "tar", "tgz", "tar.gz", "7z", "gz", "gzip", "rar"],

From a19d227a33d8ff083e203d9afc6453cac9f4d437 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Mon, 18 Nov 2024 14:44:31 -0500
Subject: [PATCH 11/12] update docstrings, add TODOS

---
 staging_service/app.py    |  4 +++
 .../individual_parsers.py | 28 ++++++++++---------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/staging_service/app.py b/staging_service/app.py
index c7c0e699..cd668dc7 100644
--- a/staging_service/app.py
+++ b/staging_service/app.py
@@ -116,6 +116,10 @@ async def bulk_specification(request: web.Request) -> web.json_response:
     type, in the `types` key.

     :param request: contains a comma separated list of files, e.g. folder1/file1.txt,file2.txt
+
+    TODO: since JSON files are rather generic and we might want to use a different JSON bulk-spec
+    format later, add a separate query parameter to request that the selected file is treated as a
+    Data Transfer Service manifest.
     """
     username = await authorize_request(request)
     files = parse_qs(request.query_string).get("files", [])
diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
index b5961f66..48e4d9c1 100644
--- a/staging_service/import_specifications/individual_parsers.py
+++ b/staging_service/import_specifications/individual_parsers.py
@@ -324,23 +324,25 @@ def parse_dts_manifest(path: Path) -> ParseResults:
     Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
     The manifest should have roughly this format, with expected keys included:
     {
-        "resources": [{
-            "id": str,
-            "name": str,
-            "path": str,
-            "format": str,
-            "instructions": {
+        "resources": [{ file manifest info isn't currently relevant }],
+        "instructions": {
+            "protocol": "KBase narrative import",
+            "objects": [{
                 "data_type": str,
                 "parameters": {
                     "<param_name>": value,
-                    "<param_name>": value
-                }
-            }
-        }]
+                    "<param_name>": value,
+                }, ...
+            }]
+        }
     }
-    The parameters under the "instructions"."parameters" dictionary are arbitrary keys with
-    arbitrary values and will be returned as-is. They are expected to be PRIMITIVE_TYPEs and
-    will fail otherwise.
+
+    This will get parsed and returned in line with the other parsers as a valid
+    ParseResults object, i.e. each result will be keyed on the data type string,
+    and its value will be a Tuple of frozendicts of the parameters. Also, in keeping
+    with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.
+
+    TODO: include further details here, and in separate documentation - ADR?
     """

From bd557911884d228b569fd55c153118852cc95369 Mon Sep 17 00:00:00 2001
From: Bill Riehl
Date: Tue, 19 Nov 2024 16:15:11 -0500
Subject: [PATCH 12/12] remove 'roughly'

---
 staging_service/import_specifications/individual_parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
index 48e4d9c1..bf98a52b 100644
--- a/staging_service/import_specifications/individual_parsers.py
+++ b/staging_service/import_specifications/individual_parsers.py
@@ -322,7 +322,7 @@ def parse_excel(path: Path) -> ParseResults:
 def parse_dts_manifest(path: Path) -> ParseResults:
     """
     Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
-    The manifest should have roughly this format, with expected keys included:
+    The manifest should have this format, with expected keys included:
     {
         "resources": [{ file manifest info isn't currently relevant }],
         "instructions": {
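
A closing illustration: the parser is still a stub at the end of this series, but the PATCH 11/12
docstring pins down the intended mapping from a manifest's "instructions"."objects" list to a
ParseResults value — one ParseResult per data type, holding the source file and a tuple of
frozendict parameter rows, which is also the shape the original PATCH 03/12 success test asserted.
The Python sketch below illustrates only that mapping; it is not one of the commits above, the
import path for the result types is assumed (app.py imports them from file_parser), and the error
handling the real parser performs is elided.

    from collections import defaultdict
    from pathlib import Path

    from frozendict import frozendict

    # assumed import path; ParseResult/ParseResults/SpecificationSource are
    # imported from file_parser elsewhere in this service
    from staging_service.import_specifications.file_parser import (
        ParseResult,
        ParseResults,
        SpecificationSource,
    )


    def results_from_instructions(manifest_json: dict, path: Path) -> ParseResults:
        # group one frozendict "row" of parameters per object under its data type,
        # mirroring how the xsv parsers group rows per data type tab
        spcsrc = SpecificationSource(path)
        by_type = defaultdict(list)
        for obj in manifest_json.get("instructions", {}).get("objects", []):
            by_type[obj["data_type"]].append(frozendict(obj["parameters"]))
        return ParseResults(
            frozendict({dt: ParseResult(spcsrc, tuple(rows)) for dt, rows in by_type.items()})
        )

Run against the manifest_small.json test data from PATCH 04/12, a completed parser behaving this
way would yield a single "gff_metagenome" result whose tuple holds three identical frozendicts of
{"param1": "value1", "param2": "value2"} — exactly what the PATCH 03/12 success test checked for.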