diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 9eeab3d1..9838b5b3 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,3 +1,10 @@
+### Version 1.3.4
+- Altered the behavior of the bulk specification file writers to return an error if the
+  input `types` parameter is empty.
+- Fixed a bug in the csv/tsv bulk specification parser that would cause a failure if the
+  first header of a file had trailing separators. This occurs when a csv/tsv file is
+  opened and resaved by Excel.
+
 ### Version 1.3.3
 - Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry
   for each empty line in the file.
diff --git a/staging_service/app.py b/staging_service/app.py
index a4472da5..7d6b858d 100644
--- a/staging_service/app.py
+++ b/staging_service/app.py
@@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
 routes = web.RouteTableDef()
 
-VERSION = "1.3.3"
+VERSION = "1.3.4"
 
 _DATATYPE_MAPPINGS = None
 
@@ -59,7 +59,7 @@ async def importer_filetypes(request: web.Request) -> web.json_response:
     Returns the file types for the configured datatypes. The returned JSON contains two keys:
     * datatype_to_filetype, which maps import datatypes (like gff_genome) to their accepted
         filetypes (like [FASTA, GFF])
-    * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. 
+    * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g.
         *.fa, *.fasta, *.fa.gz, etc.)
 
     This information is currently static over the life of the server.
@@ -197,7 +197,7 @@ async def write_bulk_specification(request: web.Request) -> web.json_response:
 def _createJSONErrorResponse(error_text: str, error_class=web.HTTPBadRequest):
     err = json.dumps({"error": error_text})
     return error_class(text=err, content_type=_APP_JSON)
-    
+
 
 @routes.get("/add-acl-concierge")
 async def add_acl_concierge(request: web.Request):
diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py
index 377e7c8b..160b32e2 100644
--- a/staging_service/import_specifications/file_writers.py
+++ b/staging_service/import_specifications/file_writers.py
@@ -23,7 +23,7 @@ entry in the list corresponds to a row in the resulting import specification,
 and the order of the list defines the order of the rows.
 Leave the `data` list empty to write an empty template.
 
-:returns: A mapping of the data types to the files to which they were written. 
+:returns: A mapping of the data types to the files to which they were written.
 """
 # note that we can't use an f string here to interpolate the variables below, e.g.
 # order_and_display, etc.
@@ -82,7 +82,7 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]):
     Leave the {_DATA} list empty to write an empty template.
     """
     if not types:
-        return
+        raise ImportSpecWriteException("At least one data type must be specified")
     for datatype in types:
         # replace this with jsonschema? don't worry about it for now
         _check_string(datatype, "A data type")
@@ -235,4 +235,4 @@ class ImportSpecWriteException(Exception):
     """
     An exception thrown when writing an import specification fails.
     """
-    pass
\ No newline at end of file
+    pass
diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
index bb15d083..47836a6c 100644
--- a/staging_service/import_specifications/individual_parsers.py
+++ b/staging_service/import_specifications/individual_parsers.py
@@ -10,7 +10,7 @@
 # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
 from frozendict.core import frozendict
 from pathlib import Path
-from typing import TextIO, Optional as O, Union, Any
+from typing import Optional as O, Union, Any
 
 from staging_service.import_specifications.file_parser import (
     PRIMITIVE_TYPE,
@@ -35,7 +35,7 @@
 _HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} "
     + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)")
 
-_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty"}
+_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"}
 
 
 class _ParseException(Exception):
@@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version
     return match[1], int(match[2])
 
 
-def _required_next(
-    input_: Union[TextIO, Any],  # Any really means a csv reader object
-    spec_source: SpecificationSource,
-    error: str
-) -> Union[str, list[str]]:
-    # returns a string for a TextIO input or a list for a Reader input
-    try:
-        return next(input_)
-    except StopIteration:
-        raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
-
 def _csv_next(
-    input_: Union[TextIO, Any],  # Any really means a csv reader object
+    input_: Any,  # Any really means a csv reader object
     line_number: int,
-    expected_line_count: int,
+    expected_line_count: Union[None, int],  # None = skip columns check
     spec_source: SpecificationSource,
     error: str
 ) -> list[str]:
-    line = _required_next(input_, spec_source, error)
-    if len(line) != expected_line_count:
+    try:
+        line = next(input_)
+    except StopIteration:
+        raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
+    if expected_line_count and len(line) != expected_line_count:
         raise _ParseException(Error(
             ErrorType.INCORRECT_COLUMN_COUNT,
             f"Incorrect number of items in line {line_number}, "
@@ -91,15 +83,6 @@
     return line
 
 
-def _get_datatype(input_: TextIO, spec_source: SpecificationSource, maximum_version: int
-) -> tuple[str, int]:
-    # return is (data type, column count)
-    return _parse_header(
-        _required_next(input_, spec_source, "Missing data type / version header").strip(),
-        spec_source,
-        maximum_version)
-
-
 def _error(error: Error) -> ParseResults:
     return ParseResults(errors = tuple([error]))
 
@@ -155,11 +138,13 @@ def _normalize_headers(
 def _parse_xsv(path: Path, sep: str) -> ParseResults:
     spcsrc = SpecificationSource(path)
     try:
-        if magic.from_file(str(path), mime=True) not in _MAGIC_TEXT_FILES:
-            return _error(Error(ErrorType.PARSE_FAIL, "Not a text file", spcsrc))
+        filetype = magic.from_file(str(path), mime=True)
+        if filetype not in _MAGIC_TEXT_FILES:
+            return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc))
         with open(path, newline='') as input_:
-            datatype, columns = _get_datatype(input_, spcsrc, _VERSION)
             rdr = csv.reader(input_, delimiter=sep)  # let parser handle quoting
+            dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header")
+            datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION)
             hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line")
             param_ids = _normalize_headers(hd1, 2, spcsrc)
             _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line")
diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py
index acb5c48e..7f2cf26f 100644
--- a/tests/import_specifications/test_file_writers.py
+++ b/tests/import_specifications/test_file_writers.py
@@ -64,9 +64,6 @@ def temp_dir() -> Generator[Path, None, None]:
     }
 }
 
-def test_noop():
-    assert write_csv(Path("."), {}) == {}
-    assert write_tsv(Path("."), {}) == {}
 
 def test_write_csv(temp_dir: Path):
     res = write_csv(temp_dir, _TEST_DATA)
@@ -194,6 +191,7 @@ def test_file_writers_fail():
     E = ImportSpecWriteException
     file_writers_fail(None, {}, ValueError("The folder cannot be null"))
     file_writers_fail(p, None, E("The types value must be a mapping"))
+    file_writers_fail(p, {}, E("At least one data type must be specified"))
     file_writers_fail(
         p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string"))
     file_writers_fail(
@@ -281,4 +279,4 @@ def file_writers_fail(path: Path, types: dict, expected: Exception):
         assert_exception_correct(got.value, expected)
     with raises(Exception) as got:
         write_excel(path, types)
-    assert_exception_correct(got.value, expected)
\ No newline at end of file
+    assert_exception_correct(got.value, expected)
diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
index eb3d6c1a..f7caace9 100644
--- a/tests/import_specifications/test_individual_parsers.py
+++ b/tests/import_specifications/test_individual_parsers.py
@@ -39,6 +39,9 @@ def temp_dir() -> Generator[Path, None, None]:
 
 
 def test_xsv_parse_success(temp_dir: Path):
+    # Tests a special case: if an xSV file is opened and resaved by Excel, Excel adds
+    # separators to the end of the 1st header line. This previously caused the parse
+    # to fail.
     _xsv_parse_success(temp_dir, ',', parse_csv)
     _xsv_parse_success(temp_dir, '\t', parse_tsv)
 
@@ -48,7 +51,7 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]):
     input_ = temp_dir / str(uuid.uuid4())
     with open(input_, "w") as test_file:
         test_file.writelines([
-            "Data type: some_type; Columns: 4; Version: 1\n",
+            f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n",
            f"spec1{s} spec2{s} spec3 {s} spec4\n",  # test trimming
            f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n",
            f"val1 {s} val2 {s} 7 {s} 3.2\n",  # test trimming
@@ -166,7 +169,10 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path):
     res = parse_csv(test_file)
 
     assert res == ParseResults(errors=tuple([
-        Error(ErrorType.PARSE_FAIL, "Not a text file", source_1=SpecificationSource(test_file))
+        Error(
+            ErrorType.PARSE_FAIL,
+            "Not a text file: application/vnd.ms-excel",
+            source_1=SpecificationSource(test_file))
     ]))
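
Reviewer note: a minimal sketch of the new writer behavior (the `/tmp/specs` output folder is hypothetical; assumes the `staging_service` package is importable). As the deleted `test_noop` shows, an empty `types` mapping was previously a silent no-op returning `{}`; it now raises `ImportSpecWriteException`:

```python
from pathlib import Path

from staging_service.import_specifications.file_writers import (
    ImportSpecWriteException,
    write_csv,
)

# Before 1.3.4 this call was a no-op that returned {} (see the removed
# test_noop); as of this change an empty `types` mapping is rejected.
try:
    write_csv(Path("/tmp/specs"), {})  # hypothetical output folder
except ImportSpecWriteException as e:
    print(e)  # At least one data type must be specified
```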
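Reviewer note: a stdlib-only sketch of the Excel round-trip case the parser fix targets (the header string mirrors the test fixture above). On resave, Excel pads the first header row with separators so all rows have the same column count; reading that row through `csv.reader`, as `_parse_xsv` now does, splits the padding into empty fields so only the clean first field reaches `_parse_header`:

```python
import csv
import io

# What Excel writes back for a one-field header row in a 4-column file:
excel_header = "Data type: some_type; Columns: 4; Version: 1,,,\n"

# csv.reader turns the trailing separators into empty fields...
row = next(csv.reader(io.StringIO(excel_header), delimiter=","))
print(row)     # ['Data type: some_type; Columns: 4; Version: 1', '', '', '']

# ...so row[0] is the bare header string the header regex expects.
print(row[0])  # Data type: some_type; Columns: 4; Version: 1
```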