From 12e6f04a4828bf0eecd6f6c752475fe3da7b8b34 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 11 Mar 2022 08:36:19 -0800 Subject: [PATCH 1/5] DATAUP-727: Fix an import spec parse bug re trailing separators (#168) * Fix an import spec parse bug re trailing separators If an xSV file is opened and saved by Excel, it'll add trailing separators to the end of the first header line. This fix ignores those separators. * Bump version, add release notes * Fix unused import * Provide more info if a file isn't an expected type * Add csv file type to list of allowed files Not quite sure why tests were passing everywhere up till now, or why tests pass locally - maybe a `magic` update * Remove redundant file type check doh * Fix tests on mac ... maybe? Tests still don't pass on Boris's machine, this might fix it --- RELEASE_NOTES.md | 5 +++ staging_service/app.py | 2 +- .../individual_parsers.py | 43 ++++++------------- .../test_individual_parsers.py | 10 ++++- 4 files changed, 28 insertions(+), 32 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9eeab3d1..d5895652 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,8 @@ +### Version 1.3.4 +- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the + first header of a file had trailing separators. This occurs if a csv/tsv file is opened and + saved by Excel. + ### Version 1.3.3 - Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry for each empty line in the file. 
diff --git a/staging_service/app.py b/staging_service/app.py index a4472da5..e082d0f1 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.3" +VERSION = "1.3.4" _DATATYPE_MAPPINGS = None diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index bb15d083..47836a6c 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -10,7 +10,7 @@ # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict from pathlib import Path -from typing import TextIO, Optional as O, Union, Any +from typing import Optional as O, Union, Any from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, @@ -35,7 +35,7 @@ _HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)") -_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty"} +_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} class _ParseException(Exception): @@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version return match[1], int(match[2]) -def _required_next( - input_: Union[TextIO, Any], # Any really means a csv reader object - spec_source: SpecificationSource, - error: str -) -> Union[str, list[str]]: - # returns a string for a TextIO input or a list for a Reader input - try: - return next(input_) - except StopIteration: - raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) - def _csv_next( - input_: Union[TextIO, Any], # Any really means a csv reader object + input_: Any, # Any really means a csv reader object line_number: int, - expected_line_count: int, + expected_line_count: 
Union[None, int], # None = skip columns check spec_source: SpecificationSource, error: str ) -> list[str]: - line = _required_next(input_, spec_source, error) - if len(line) != expected_line_count: + try: + line = next(input_) + except StopIteration: + raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) + if expected_line_count and len(line) != expected_line_count: raise _ParseException(Error( ErrorType.INCORRECT_COLUMN_COUNT, f"Incorrect number of items in line {line_number}, " @@ -91,15 +83,6 @@ def _csv_next( return line -def _get_datatype(input_: TextIO, spec_source: SpecificationSource, maximum_version: int -) -> tuple[str, int]: - # return is (data type, column count) - return _parse_header( - _required_next(input_, spec_source, "Missing data type / version header").strip(), - spec_source, - maximum_version) - - def _error(error: Error) -> ParseResults: return ParseResults(errors = tuple([error])) @@ -155,11 +138,13 @@ def _normalize_headers( def _parse_xsv(path: Path, sep: str) -> ParseResults: spcsrc = SpecificationSource(path) try: - if magic.from_file(str(path), mime=True) not in _MAGIC_TEXT_FILES: - return _error(Error(ErrorType.PARSE_FAIL, "Not a text file", spcsrc)) + filetype = magic.from_file(str(path), mime=True) + if filetype not in _MAGIC_TEXT_FILES: + return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) with open(path, newline='') as input_: - datatype, columns = _get_datatype(input_, spcsrc, _VERSION) rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting + dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") + datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION) hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line") param_ids = _normalize_headers(hd1, 2, spcsrc) _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line") diff --git a/tests/import_specifications/test_individual_parsers.py 
b/tests/import_specifications/test_individual_parsers.py index eb3d6c1a..f7caace9 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -39,6 +39,9 @@ def temp_dir() -> Generator[Path, None, None]: def test_xsv_parse_success(temp_dir: Path): + # Tests a special case where if an xSV file is opened by Excel and then resaved, + # Excel will add separators to the end of the 1st header line. This previously caused + # the parse to fail. _xsv_parse_success(temp_dir, ',', parse_csv) _xsv_parse_success(temp_dir, '\t', parse_tsv) @@ -48,7 +51,7 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: test_file.writelines([ - "Data type: some_type; Columns: 4; Version: 1\n", + f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming @@ -166,7 +169,10 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path): res = parse_csv(test_file) assert res == ParseResults(errors=tuple([ - Error(ErrorType.PARSE_FAIL, "Not a text file", source_1=SpecificationSource(test_file)) + Error( + ErrorType.PARSE_FAIL, + "Not a text file: application/vnd.ms-excel", + source_1=SpecificationSource(test_file)) ])) From 4355c54268d78b8cc0abb2ee777405f0fde40c42 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Fri, 22 Apr 2022 10:48:06 -0700 Subject: [PATCH 2/5] Throw an error if the types parameter is an empty dictionary --- staging_service/import_specifications/file_writers.py | 6 +++--- tests/import_specifications/test_file_writers.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 377e7c8b..160b32e2 100644 --- 
a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -23,7 +23,7 @@ entry in the list corresponds to a row in the resulting import specification, and the order of the list defines the order of the rows. Leave the `data` list empty to write an empty template. -:returns: A mapping of the data types to the files to which they were written. +:returns: A mapping of the data types to the files to which they were written. """ # note that we can't use an f string here to interpolate the variables below, e.g. # order_and_display, etc. @@ -82,7 +82,7 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): Leave the {_DATA} list empty to write an empty template. """ if not types: - return + raise ImportSpecWriteException("At least one data type must be specified") for datatype in types: # replace this with jsonschema? don't worry about it for now _check_string(datatype, "A data type") @@ -235,4 +235,4 @@ class ImportSpecWriteException(Exception): """ An exception thrown when writing an import specification fails. 
""" - pass \ No newline at end of file + pass diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index acb5c48e..b68da60d 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -194,6 +194,7 @@ def test_file_writers_fail(): E = ImportSpecWriteException file_writers_fail(None, {}, ValueError("The folder cannot be null")) file_writers_fail(p, None, E("The types value must be a mapping")) + file_writers_fail(p, {}, E("At least one data type must be specified")) file_writers_fail( p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) file_writers_fail( @@ -281,4 +282,4 @@ def file_writers_fail(path: Path, types: dict, expected: Exception): assert_exception_correct(got.value, expected) with raises(Exception) as got: write_excel(path, types) - assert_exception_correct(got.value, expected) \ No newline at end of file + assert_exception_correct(got.value, expected) From 42331f0dcc5217592f43bd49b603b6f4bd6155f7 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Fri, 22 Apr 2022 11:05:06 -0700 Subject: [PATCH 3/5] Remove noop tests -- no `types` now throws an error --- tests/import_specifications/test_file_writers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index b68da60d..7f2cf26f 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -64,9 +64,6 @@ def temp_dir() -> Generator[Path, None, None]: } } -def test_noop(): - assert write_csv(Path("."), {}) == {} - assert write_tsv(Path("."), {}) == {} def test_write_csv(temp_dir: Path): res = write_csv(temp_dir, _TEST_DATA) From aacbfbfa66f605d7fa4bad4d0089d93cc2b0dd6f Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 25 Apr 2022 09:28:08 -0700 Subject: [PATCH 4/5] Add release notes and bump 
version number --- RELEASE_NOTES.md | 4 ++++ staging_service/app.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index d5895652..fc8866e7 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,7 @@ +### Version 1.3.5 +- Alter the behavior of the bulk specification file writers to return an error if the + input `types` parameter is empty. + ### Version 1.3.4 - Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the first header of a file had trailing separators. This occurs if a csv/tsv file is opened and diff --git a/staging_service/app.py b/staging_service/app.py index e082d0f1..25dda8eb 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.4" +VERSION = "1.3.5" _DATATYPE_MAPPINGS = None @@ -59,7 +59,7 @@ async def importer_filetypes(request: web.Request) -> web.json_response: Returns the file types for the configured datatypes. The returned JSON contains two keys: * datatype_to_filetype, which maps import datatypes (like gff_genome) to their accepted filetypes (like [FASTA, GFF]) - * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. + * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. *.fa, *.fasta, *.fa.gz, etc.) This information is currently static over the life of the server. 
@@ -197,7 +197,7 @@ async def write_bulk_specification(request: web.Request) -> web.json_response: def _createJSONErrorResponse(error_text: str, error_class=web.HTTPBadRequest): err = json.dumps({"error": error_text}) return error_class(text=err, content_type=_APP_JSON) - + @routes.get("/add-acl-concierge") async def add_acl_concierge(request: web.Request): From 87ed3c46a0b697e361bf79701445e0ea0df123ad Mon Sep 17 00:00:00 2001 From: Gavin Date: Thu, 28 Apr 2022 13:00:22 -0700 Subject: [PATCH 5/5] Merge versions to 1.3.4 1.3.4 was never deployed to CI, let alone prod, so no need for another version bump --- RELEASE_NOTES.md | 4 +--- staging_service/app.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index fc8866e7..9838b5b3 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,8 +1,6 @@ -### Version 1.3.5 +### Version 1.3.4 - Alter the behavior of the bulk specification file writers to return an error if the input `types` parameter is empty. - -### Version 1.3.4 - Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the first header of a file had trailing separators. This occurs if a csv/tsv file is opened and saved by Excel. diff --git a/staging_service/app.py b/staging_service/app.py index 25dda8eb..7d6b858d 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.5" +VERSION = "1.3.4" _DATATYPE_MAPPINGS = None