diff --git a/README.md b/README.md
index 67aae27d..e79e1ba8 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,16 @@ gzip, bzip2, md5sum, head, tail, wc
 
 in the docker container all of these should be available
 
+## Updating importer mapping types
+The types returned by the `importer_mappings` endpoint (see [Get Importer Mappings](#get-importer-mappings) below)
+are generated by the `GenerateMappings.py` script at `staging_service/autodetect/GenerateMappings.py`.
+Running this script builds the `supported_apps_w_extensions.json` file, which should be placed in the
+`deployment/conf` directory.
+
+To add new mappings, update the `GenerateMappings.py` script. New file types and object types should be added
+to the `staging_service.autodetect.Mappings` module and imported from there. See the `GenerateMappings.py`
+docstrings for more details.
+
 ## API
 
 all paths should be specified treating the user's home directory as root
@@ -1079,6 +1089,8 @@ For example,
 - for files for which there is no predicted app, the return is a null value
 - this endpoint is used to power the dropdowns for the staging service window in the Narrative
 
+Note: to update these mappings, see the instructions [here](#updating-importer-mapping-types)
+
 **URL** : `ci.kbase.us/services/staging_service/importer_mappings`
 
 **local URL** : `localhost:3000/importer_mappings`
diff --git a/deployment/conf/supported_apps_w_extensions.json b/deployment/conf/supported_apps_w_extensions.json
index 205fc0b0..a4649424 100644
--- a/deployment/conf/supported_apps_w_extensions.json
+++ b/deployment/conf/supported_apps_w_extensions.json
@@ -121,7 +121,8 @@
                 "csv",
                 "tsv",
                 "xls",
-                "xlsx"
+                "xlsx",
+                "json"
             ],
             "media": [
                 "tsv",
@@ -1101,26 +1102,31 @@
                 }
             ]
         },
-        "smbl": {
+        "json": {
             "file_ext_type": [
-                "SBML"
+                "JSON"
             ],
             "mappings": [
                 {
-                    "id": "fba_model",
-                    "title": "FBA Model",
+                    "id": "import_specification",
+                    "title": "Import Specification",
+                    "app_weight": 1
+                },
+                {
+                    "id": "escher_map",
+                    "title": "EscherMap",
                     "app_weight": 1
                 }
             ]
         },
-        "json": {
+        "smbl": {
             "file_ext_type": [
-                "JSON"
+                "SBML"
             ],
             "mappings": [
                 {
-                    "id": "escher_map",
-                    "title": "EscherMap",
+                    "id": "fba_model",
+                    "title": "FBA Model",
                     "app_weight": 1
                 }
             ]
diff --git a/staging_service/app.py b/staging_service/app.py
index b7e90f24..cd668dc7 100644
--- a/staging_service/app.py
+++ b/staging_service/app.py
@@ -12,7 +12,7 @@
 
 from .app_error_formatter import format_import_spec_errors
 from .auth2Client import KBaseAuth2
-from .autodetect.Mappings import CSV, EXCEL, TSV
+from .autodetect.Mappings import CSV, EXCEL, TSV, JSON
 from .AutoDetectUtils import AutoDetectUtils
 from .globus import assert_globusid_exists, is_globusid
 from .import_specifications.file_parser import (
@@ -26,7 +26,12 @@
     write_excel,
     write_tsv,
 )
-from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv
+from .import_specifications.individual_parsers import (
+    parse_csv,
+    parse_excel,
+    parse_tsv,
+    parse_dts_manifest,
+)
 from .JGIMetadata import read_metadata_for
 from .metadata import add_upa, dir_info, similar, some_metadata
 from .utils import AclManager, Path, run_command
@@ -43,6 +48,7 @@
     CSV: parse_csv,
     TSV: parse_tsv,
     EXCEL: parse_excel,
+    JSON: parse_dts_manifest,
 }
 
 _IMPSPEC_FILE_TO_WRITER = {
@@ -110,6 +116,13 @@ async def bulk_specification(request: web.Request) -> web.json_response:
     type, in the `types` key.
 
     :param request: contains a comma separated list of files, e.g.
        folder1/file1.txt,file2.txt
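+
+    Example request (file names here are hypothetical):
+    `GET /bulk_specification/?files=file1.csv,folder1/file2.json`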
+
+    TODO: since JSON files are rather generic and we might want to use a different JSON bulk-spec
+    format later, add a separate query parameter to request that the selected file be treated as a
+    Data Transfer Service manifest.
    """
    username = await authorize_request(request)
    files = parse_qs(request.query_string).get("files", [])
diff --git a/staging_service/autodetect/GenerateMappings.py b/staging_service/autodetect/GenerateMappings.py
index ce11458a..4d2287f7 100644
--- a/staging_service/autodetect/GenerateMappings.py
+++ b/staging_service/autodetect/GenerateMappings.py
@@ -23,6 +23,15 @@
 * Note: We should serve the generated content from memory
 * Note: This doesn't handle if we want to have different output types based on file extensions
   feeding into the same app
+
+Adding new file types and app ids:
+* file types and app ids should be added to staging_service.autodetect.Mappings
+* file types should be all upper-case and represent the file suffix.
+* app ids should be in snake_case, and represent either an object import type, or some
+  internal application to be run on selection (e.g. decompress or import specification).
+* app ids that map to actual object import apps should match the app ids in the narrative
+  interface configuration here:
+  https://github.com/kbase/narrative/blob/main/kbase-extension/static/kbase/config/staging_upload.json
 """
 
 from collections import defaultdict
@@ -120,7 +129,7 @@
     fba_model_id,
     import_specification,
 ]
-file_format_to_app_mapping[JSON] = [escher_map_id]
+file_format_to_app_mapping[JSON] = [escher_map_id, import_specification]
 file_format_to_app_mapping[SBML] = [fba_model_id]
 
 app_id_to_extensions = defaultdict(list)
diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
index 00eee18b..bf98a52b 100644
--- a/staging_service/import_specifications/individual_parsers.py
+++ b/staging_service/import_specifications/individual_parsers.py
@@ -3,6 +3,7 @@
 """
 
 import csv
+import json
 import math
 import re
 from pathlib import Path
@@ -316,3 +317,63 @@ def parse_excel(path: Path) -> ParseResults:
         return ParseResults(frozendict(results))
     else:
         return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
+
+
+def parse_dts_manifest(path: Path) -> ParseResults:
+    """
+    Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
+    The manifest should have this format, with expected keys included:
+    {
+        "resources": [{ file manifest info isn't currently relevant }],
+        "instructions": {
+            "protocol": "KBase narrative import",
+            "objects": [{
+                "data_type": str,
+                "parameters": {
+                    "<param name 1>": <value>,
+                    "<param name 2>": <value>,
+                },
+                ...
+            }]
+        }
+    }
+
+    This will get parsed and returned, in line with the other parsers, as a valid
+    ParseResults object, i.e. each result will be keyed on the data type string,
+    and its value will be a tuple of frozendicts of the parameters. Also, in keeping
+    with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.
+
+    TODO: include further details here, and in separate documentation - ADR?
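+
+    For example, assuming a manifest containing a single "gff_metagenome" object with
+    parameters {"param1": "value1"}, a fully implemented parser would be expected to
+    return something like (shown for illustration only; the parser below is a stub):
+
+    ParseResults(frozendict({
+        "gff_metagenome": ParseResult(
+            SpecificationSource(path),
+            (frozendict({"param1": "value1"}),)
+        )
+    }))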
+    """
+    spcsrc = SpecificationSource(path)
+    errors = []
+    # dummy for now
+    results = {}
+    try:
+        with open(path, "r") as manifest:
+            manifest_json = json.load(manifest)
+            if not isinstance(manifest_json, dict):
+                errors.append(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc))
+
+    except json.JSONDecodeError:
+        return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc))
+    except FileNotFoundError:
+        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
+    except IsADirectoryError:
+        return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
+    if errors:
+        return ParseResults(errors=tuple(errors))
+    elif results:
+        return ParseResults(frozendict(results))
+    else:
+        return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc))
diff --git a/tests/import_specifications/test_data/manifest_small.json b/tests/import_specifications/test_data/manifest_small.json
new file mode 100644
index 00000000..e87472d5
--- /dev/null
+++ b/tests/import_specifications/test_data/manifest_small.json
@@ -0,0 +1,122 @@
+{
+    "name": "manifest",
+    "resources": [
+        {
+            "id": "JDP:555518eb0d8785178e712d88",
+            "name": "61564.assembled",
+            "path": "img/submissions/61564/61564.assembled.gff",
+            "format": "gff",
+            "media_type": "text/plain",
+            "bytes": 455161,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518eb0d8785178e712d88",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        },
+        {
+            "id": "JDP:555518eb0d8785178e712d84",
+            "name": "61564.assembled",
+            "path": "img/submissions/61564/61564.assembled.fna",
+            "format": "fasta",
+            "media_type": "text/plain",
+            "bytes": 6354414,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518eb0d8785178e712d84",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        },
+        {
+            "id": "JDP:555518ec0d8785178e712d9f",
+            "name": "61567.assembled",
+            "path": "img/submissions/61567/61567.assembled.gff",
+            "format": "gff",
+            "media_type": "text/plain",
+            "bytes": 545583,
+            "hash": "",
+            "credit": {
+                "comment": "",
+                "content_url": "",
+                "contributors": null,
+                "credit_metadata_source": "",
+                "dates": null,
+                "descriptions": null,
+                "funding": null,
+                "identifier": "JDP:555518ec0d8785178e712d9f",
+                "license": {
+                    "id": "",
+                    "url": ""
+                },
+                "publisher": {
+                    "organization_id": "",
+                    "organization_name": ""
+                },
+                "related_identifiers": null,
+                "resource_type": "dataset",
+                "titles": null,
+                "url": "",
+                "version": ""
+            },
+            "instructions": {
+                "data_type": "gff_metagenome",
+                "parameters": {
+                    "param1": "value1",
+                    "param2": "value2"
+                }
+            }
+        }
+    ]
+}
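(Editorial aside, not part of this changeset: the sketch below illustrates one way the per-resource `instructions` blocks in a manifest like the test file above could be grouped into the data-type-to-parameters shape the parser is expected to produce once implemented. The helper name `summarize_manifest`, and the assumption that `instructions` lives on each resource rather than at the top level, are illustrative only.)

```python
import json
from collections import defaultdict
from pathlib import Path


def summarize_manifest(path: Path) -> dict[str, list[dict]]:
    """Group each resource's `instructions` parameters by data type."""
    with open(path, "r", encoding="utf-8") as infile:
        manifest = json.load(infile)
    by_type: dict[str, list[dict]] = defaultdict(list)
    for resource in manifest.get("resources", []):
        instructions = resource.get("instructions", {})
        if "data_type" in instructions:
            by_type[instructions["data_type"]].append(instructions.get("parameters", {}))
    return dict(by_type)


# For manifest_small.json above this would yield three entries under one key:
# {"gff_metagenome": [{"param1": "value1", "param2": "value2"}] * 3}
```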
diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
index 8c9c445a..61214cfd 100644
--- a/tests/import_specifications/test_individual_parsers.py
+++ b/tests/import_specifications/test_individual_parsers.py
@@ -1,3 +1,4 @@
+import json
 import os
 import uuid
 from collections.abc import Callable, Generator
@@ -14,6 +15,7 @@
     ParseResults,
     SpecificationSource,
     parse_csv,
+    parse_dts_manifest,
     parse_excel,
     parse_tsv,
 )
@@ -765,3 +767,91 @@ def test_excel_parse_fail_unequal_rows():
             ),
         ],
     )
+
+
+def test_dts_manifest_parse_success():
+    f = _get_test_file("manifest_small.json")
+    res = parse_dts_manifest(f)
+    # fails for now
+    assert res.results is None
+    assert res.errors == tuple(
+        [
+            Error(
+                ErrorType.PARSE_FAIL, "No import specification data in file", SpecificationSource(f)
+            )
+        ]
+    )
+
+
+@fixture(scope="module")
+def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict | list], Path]:
+    def manifest_writer(input_json: dict | list) -> Path:
+        file_path = temp_dir / str(uuid.uuid4())
+        with open(file_path, "w", encoding="utf-8") as outfile:
+            json.dump(input_json, outfile)
+        return file_path
+
+    return manifest_writer
+
+
+def _dts_manifest_parse_fail(input_file: Path, errors: list[Error]):
+    """
+    Tests a failing DTS manifest parse.
+    input_file - the path to the input file. Might be a directory or might not exist.
+    errors - a list of Error objects expected to be in the order generated by
+        the call to parse_dts_manifest.
+    """
+    res = parse_dts_manifest(input_file)
+    assert res.results is None
+    assert res.errors == tuple(errors)
+
+
+def test_dts_manifest_non_json(temp_dir: Generator[Path, None, None]):
+    test_file = temp_dir / str(uuid.uuid4())
+    with open(test_file, "w", encoding="utf-8") as outfile:
+        outfile.write("totally not json")
+    _dts_manifest_parse_fail(
+        test_file,
+        [
+            Error(
+                ErrorType.PARSE_FAIL, "File must be in JSON format", SpecificationSource(test_file)
+            )
+        ],
+    )
+
+
+def test_dts_manifest_non_dict(write_dts_manifest: Callable[[dict | list], Path]):
+    manifest_path = write_dts_manifest(["wrong_format"])
+    _dts_manifest_parse_fail(
+        manifest_path,
+        [
+            Error(
+                ErrorType.PARSE_FAIL,
+                "Manifest is not a dictionary",
+                SpecificationSource(manifest_path),
+            )
+        ],
+    )
+
+
+def test_dts_manifest_not_found(temp_dir: Generator[Path, None, None]):
+    manifest_path = temp_dir / "not_a_file"
+    _dts_manifest_parse_fail(
+        manifest_path,
+        [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(manifest_path))],
+    )
+
+
+def test_dts_manifest_file_is_directory(temp_dir: Generator[Path, None, None]):
+    test_file = temp_dir / "testdir.json"
+    os.makedirs(test_file, exist_ok=True)
+    _dts_manifest_parse_fail(
+        test_file,
+        [
+            Error(
+                ErrorType.PARSE_FAIL,
+                "The given path is a directory",
+                SpecificationSource(test_file),
+            )
+        ],
+    )
diff --git a/tests/test_app.py b/tests/test_app.py
index 289de0ad..54b6535b 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -1654,7 +1654,7 @@ async def test_importer_filetypes():
     a2f = js["datatype_to_filetype"]
     assert a2f["assembly"] == ["FASTA"]
     assert a2f["gff_genome"] == ["FASTA", "GFF"]
-    assert a2f["import_specification"] == ["CSV", "EXCEL", "TSV"]
+    assert a2f["import_specification"] == ["CSV", "EXCEL", "JSON", "TSV"]
 
     f2e = js["filetype_to_extensions"]
     assert f2e["FASTA"] == [