Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DTS manifest file part 1 #205

Merged
merged 12 commits into from
Nov 27, 2024
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ gzip, bzip2, md5sum, head, tail, wc

in the docker container all of these should be available

## Updating importer mapping types
The types returned by the `importer_mappings` endpoint (see [Get Importer Mappings](#get-importer-mappings) below)
are generated by the `GenerateMappings.py` script here: `staging_service/autodetect/GenerateMappings.py`.
Running this script will build the `supported_apps_w_extensions.json` file that should be placed in the
`deployment/conf` directory.

To add new mappings, update the `GenerateMappings.py` script. New file types and object types should be added
to the `staging_service.autodetect.Mappings` module and included from there. See `GenerateMappings.py`
docstrings for more details.

## API

all paths should be specified treating the user's home directory as root
Expand Down Expand Up @@ -1079,6 +1089,8 @@ For example,
- for files for which there is no predicted app, the return is a null value
- this endpoint is used to power the dropdowns for the staging service window in the Narrative

Note: to update these mappings, see the instructions under [Updating importer mapping types](#updating-importer-mapping-types).

**URL** : `ci.kbase.us/services/staging_service/importer_mappings`

**local URL** : `localhost:3000/importer_mappings`
Expand Down
24 changes: 15 additions & 9 deletions deployment/conf/supported_apps_w_extensions.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@
"csv",
"tsv",
"xls",
"xlsx"
"xlsx",
"json"
],
"media": [
"tsv",
Expand Down Expand Up @@ -1101,26 +1102,31 @@
}
]
},
"smbl": {
"json": {
"file_ext_type": [
"SBML"
"JSON"
],
"mappings": [
{
"id": "fba_model",
"title": "FBA Model",
"id": "import_specification",
"title": "Import Specification",
"app_weight": 1
},
{
"id": "escher_map",
"title": "EscherMap",
"app_weight": 1
}
]
},
"json": {
"smbl": {
"file_ext_type": [
"JSON"
"SBML"
],
"mappings": [
{
"id": "escher_map",
"title": "EscherMap",
"id": "fba_model",
"title": "FBA Model",
"app_weight": 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is supposed to be autogenerated IIRC but I don't see any code changes that would cause this to be different

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC you need to update the mapping here:

file_format_to_app_mapping[JSON] = [escher_map_id]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well that'd be real cool if that were documented anywhere. I'll add that and fix it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#96

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some documentation to the README. Not perfect, but should be enough for now.

}
]
Expand Down
14 changes: 12 additions & 2 deletions staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .app_error_formatter import format_import_spec_errors
from .auth2Client import KBaseAuth2
from .autodetect.Mappings import CSV, EXCEL, TSV
from .autodetect.Mappings import CSV, EXCEL, TSV, JSON
from .AutoDetectUtils import AutoDetectUtils
from .globus import assert_globusid_exists, is_globusid
from .import_specifications.file_parser import (
Expand All @@ -26,7 +26,12 @@
write_excel,
write_tsv,
)
from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv
from .import_specifications.individual_parsers import (
parse_csv,
parse_excel,
parse_tsv,
parse_dts_manifest,
)
from .JGIMetadata import read_metadata_for
from .metadata import add_upa, dir_info, similar, some_metadata
from .utils import AclManager, Path, run_command
Expand All @@ -43,6 +48,7 @@
CSV: parse_csv,
TSV: parse_tsv,
EXCEL: parse_excel,
JSON: parse_dts_manifest,
}

_IMPSPEC_FILE_TO_WRITER = {
Expand Down Expand Up @@ -110,6 +116,10 @@ async def bulk_specification(request: web.Request) -> web.json_response:
type, in the `types` key.

:param request: contains a comma separated list of files, e.g. folder1/file1.txt,file2.txt

TODO: since JSON files are rather generic and we might want to use a different JSON bulk-spec
format later, add a separate query parameter to request that the selected file is treated as a
Data Transfer Service manifest.
"""
username = await authorize_request(request)
files = parse_qs(request.query_string).get("files", [])
Expand Down
11 changes: 10 additions & 1 deletion staging_service/autodetect/GenerateMappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@
* Note: We should serve the generated content from memory
* Note: This doesn't handle if we want to have different output types based on file extensions
feeding into the same app

Adding new file types and app ids:
* file types and app ids should be added to staging_service.autodetect.Mappings
* file types should be all upper-case and represent the file suffix.
* app ids should be in snake_case, and represent either an object import type, or some
internal application to be run on selection (e.g. decompress or import specification).
* app ids that map to actual object import maps should match the app ids in the narrative
interface configuration here:
https://github.com/kbase/narrative/blob/main/kbase-extension/static/kbase/config/staging_upload.json
"""

from collections import defaultdict
Expand Down Expand Up @@ -120,7 +129,7 @@
fba_model_id,
import_specification,
]
file_format_to_app_mapping[JSON] = [escher_map_id]
file_format_to_app_mapping[JSON] = [escher_map_id, import_specification]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I'm wondering if we should have a separate type in the dropdown for DTS manifests vs. import specifications. Maybe just DTS manifest? That would be forward compatible if we ever wanted to support standard import specifications with JSON input.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I thought of that. I guess that would be a trivial change, but I'm not sure the best thing to call it. I also wonder if it would be confusing to users at all? They'd have to read instructions on the whole DTS process anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I take it back, that's not trivial at all. It means a narrative change at minimum, possibly an API change if we want to be really thorough. Maybe a new API endpoint?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make it a query param probably

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking here - you're saying the separate type and query param is the way you're going to go?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, if it's going to work that way, there probably shouldn't be a JSON: dts_parser entry in the parser mapping

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how it works right now, as of this PR, and everything else that's in it. A later PR that updates the endpoint to use a query param will probably undo this. Also, the Narrative will need to have a mapping of some sort, which is what this sets up. So it might not be from .json -> Import Specification, but it'll need to be something similar.

I don't want this to get more complicated and would like to keep that in a single PR with the bulk_specification endpoint change.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough, as long as the plan is clear

file_format_to_app_mapping[SBML] = [fba_model_id]

app_id_to_extensions = defaultdict(list)
Expand Down
50 changes: 50 additions & 0 deletions staging_service/import_specifications/individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import csv
import json
import math
import re
from pathlib import Path
Expand Down Expand Up @@ -316,3 +317,52 @@ def parse_excel(path: Path) -> ParseResults:
return ParseResults(frozendict(results))
else:
return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))


def parse_dts_manifest(path: Path) -> ParseResults:
    """
    Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
    The manifest should have this format, with expected keys included:
    {
        "resources": [{ file manifest info isn't currently relevant }],
        "instructions": {
            "protocol": "KBase narrative import",
            "objects": [{
                "data_type": str,
                "parameters": {
                    "<param1>": value,
                    "<param2>": value,
                }, ...
            }]
        }
    }

    This will get parsed and returned in line with the other parsers as a valid
    ParseResults object, i.e. each result will be keyed on the data type string,
    and its value will be a Tuple of frozendicts of the parameters. Also, in keeping
    with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.

    NOTE: extraction of results is not implemented yet (`results` is a placeholder that is
    never populated), so a manifest that is a well-formed JSON object currently yields a
    "No import specification data in file" PARSE_FAIL error.

    :param path: the path of the manifest file to parse.
    :returns: a ParseResults containing either the parsed results or the errors found:
        * FILE_NOT_FOUND if the path does not exist
        * PARSE_FAIL if the path is a directory, the file is not UTF-8 encoded JSON,
          the top level JSON value is not an object, or no data was found.

    TODO: include further details here, and in separate documentation - ADR?
    """
    spcsrc = SpecificationSource(path)
    errors = []
    # dummy for now
    results = {}
    try:
        # JSON text is UTF-8 by specification, so don't depend on the platform default encoding.
        with open(path, "r", encoding="utf-8") as manifest:
            manifest_json = json.load(manifest)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # UnicodeDecodeError is raised for binary / non-UTF-8 files before the JSON decoder
        # ever sees the content; report it like any other non-JSON file instead of crashing.
        return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc))
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
    except IsADirectoryError:
        return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
    # Valid JSON may still be a list, string, number, etc. - only an object is a manifest.
    if not isinstance(manifest_json, dict):
        errors.append(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc))
    if errors:
        return ParseResults(errors=tuple(errors))
    elif results:
        return ParseResults(frozendict(results))
    else:
        return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc))
122 changes: 122 additions & 0 deletions tests/import_specifications/test_data/manifest_small.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem to match the documentation in individual_parsers but I assume the format is still changing

"name": "manifest",
"resources": [
{
"id": "JDP:555518eb0d8785178e712d88",
"name": "61564.assembled",
"path": "img/submissions/61564/61564.assembled.gff",
"format": "gff",
"media_type": "text/plain",
"bytes": 455161,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518eb0d8785178e712d88",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
},
{
"id": "JDP:555518eb0d8785178e712d84",
"name": "61564.assembled",
"path": "img/submissions/61564/61564.assembled.fna",
"format": "fasta",
"media_type": "text/plain",
"bytes": 6354414,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518eb0d8785178e712d84",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
},
{
"id": "JDP:555518ec0d8785178e712d9f",
"name": "61567.assembled",
"path": "img/submissions/61567/61567.assembled.gff",
"format": "gff",
"media_type": "text/plain",
"bytes": 545583,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518ec0d8785178e712d9f",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
}
]
}
Loading
Loading