From b92bba551146586d510da03cc581037dc4c4c05e Mon Sep 17 00:00:00 2001
From: Lars Reimann <mail@larsreimann.com>
Date: Tue, 28 Mar 2023 10:45:32 +0200
Subject: [PATCH] feat: remove `OrdinalEncoder` (#107)

### Summary of Changes

The `OrdinalEncoder` was a bit of an outlier compared to the other
`Transformer` classes:

* It could only be applied to a single column instead of a list of
columns. Because of this, it was not possible to implement #61.
* Nothing was "learned" since the user had to specify the value order
explicitly. The `fit` step was completely unnecessary.

Therefore, I've removed the class `OrdinalEncoder`. Instead, the
`transform_column` method on a `Table` can be used. If we eventually find
this to be too cumbersome, we can implement a new method
`transform_column_into_ordered_labels` on `Table`.

---------

Co-authored-by: lars-reimann <lars-reimann@users.noreply.github.com>
---
 src/safeds/data/tabular/containers/_table.py  |  75 +++------
 .../data/tabular/transformation/__init__.py   |   1 -
 .../transformation/_ordinal_encoder.py        | 144 ------------------
 .../_ordinal_encoder/__init__.py              |   0
 .../_ordinal_encoder/test_fit_transform.py    |  50 ------
 .../test_inverse_transform.py                 |  37 -----
 .../_ordinal_encoder/test_transform.py        |  20 ---
 7 files changed, 19 insertions(+), 308 deletions(-)
 delete mode 100644 src/safeds/data/tabular/transformation/_ordinal_encoder.py
 delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py
 delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py
 delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py
 delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py

diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
index 3e27edc29..3cec5c269 100644
--- a/src/safeds/data/tabular/containers/_table.py
+++ b/src/safeds/data/tabular/containers/_table.py
@@ -109,9 +109,7 @@ def from_json_file(path: str) -> Table:
         except FileNotFoundError as exception:
             raise FileNotFoundError(f'File "{path}" does not exist') from exception
         except Exception as exception:
-            raise ValueError(
-                f'Could not read file from "{path}" as JSON'
-            ) from exception
+            raise ValueError(f'Could not read file from "{path}" as JSON') from exception
 
     @staticmethod
     def from_columns(columns: list[Column]) -> Table:
@@ -143,9 +141,7 @@ def from_columns(columns: list[Column]) -> Table:
         for column in columns:
             if column._data.size != columns[0]._data.size:
                 raise ColumnLengthMismatchError(
-                    "\n".join(
-                        [f"{column.name}: {column._data.size}" for column in columns]
-                    )
+                    "\n".join([f"{column.name}: {column._data.size}" for column in columns])
                 )
             dataframe[column.name] = column._data
 
@@ -193,9 +189,7 @@ def from_rows(rows: list[Row]) -> Table:
     # ------------------------------------------------------------------------------------------------------------------
 
     def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
-        self._data: pd.Dataframe = (
-            data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
-        )
+        self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
         if schema is None:
             if self.count_columns() == 0:
                 raise MissingSchemaError()
@@ -272,9 +266,7 @@ def get_column(self, column_name: str) -> Column:
         if self._schema.has_column(column_name):
             output_column = Column(
                 column_name,
-                self._data.iloc[
-                    :, [self._schema._get_column_index_by_name(column_name)]
-                ].squeeze(),
+                self._data.iloc[:, [self._schema._get_column_index_by_name(column_name)]].squeeze(),
                 self._schema.get_type_of_column(column_name),
             )
             return output_column
@@ -533,9 +525,7 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table:
         for row in rows:
             if self._schema != row.schema:
                 raise SchemaMismatchError()
-        result = pd.concat(
-            [result, *[row._data.to_frame().T for row in rows]]
-        ).infer_objects()
+        result = pd.concat([result, *[row._data.to_frame().T for row in rows]]).infer_objects()
         result.columns = self._schema.get_column_names()
         return Table(result)
 
@@ -568,9 +558,7 @@ def drop_columns(self, column_names: list[str]) -> Table:
         if len(invalid_columns) != 0:
             raise UnknownColumnNameError(invalid_columns)
         transformed_data = self._data.drop(labels=column_indices, axis="columns")
-        transformed_data.columns = list(
-            name for name in self._schema.get_column_names() if name not in column_names
-        )
+        transformed_data.columns = list(name for name in self._schema.get_column_names() if name not in column_names)
         return Table(transformed_data)
 
     def drop_columns_with_missing_values(self) -> Table:
@@ -582,9 +570,7 @@ def drop_columns_with_missing_values(self) -> Table:
         table : Table
             A table without the columns that contain missing values.
         """
-        return Table.from_columns(
-            [column for column in self.to_columns() if not column.has_missing_values()]
-        )
+        return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()])
 
     def drop_columns_with_non_numerical_values(self) -> Table:
         """
@@ -596,9 +582,7 @@ def drop_columns_with_non_numerical_values(self) -> Table:
             A table without the columns that contain non-numerical values.
 
         """
-        return Table.from_columns(
-            [column for column in self.to_columns() if column.type.is_numeric()]
-        )
+        return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()])
 
     def drop_duplicate_rows(self) -> Table:
         """
@@ -642,9 +626,7 @@ def drop_rows_with_outliers(self) -> Table:
         copy = self._data.copy(deep=True)
 
         table_without_nonnumericals = self.drop_columns_with_non_numerical_values()
-        z_scores = np.absolute(
-            stats.zscore(table_without_nonnumericals._data, nan_policy="omit")
-        )
+        z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
         filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)
 
         return Table(copy[filter_], self._schema)
@@ -699,9 +681,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
         if len(invalid_columns) != 0:
             raise UnknownColumnNameError(invalid_columns)
         transformed_data = self._data[column_indices]
-        transformed_data.columns = list(
-            name for name in self._schema.get_column_names() if name in column_names
-        )
+        transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names)
         return Table(transformed_data)
 
     def rename_column(self, old_name: str, new_name: str) -> Table:
@@ -769,10 +749,7 @@ def replace_column(self, old_column_name: str, new_column: Column) -> Table:
         if old_column_name not in self._schema.get_column_names():
             raise UnknownColumnNameError([old_column_name])
 
-        if (
-            new_column.name in self._schema.get_column_names()
-            and new_column.name != old_column_name
-        ):
+        if new_column.name in self._schema.get_column_names() and new_column.name != old_column_name:
             raise DuplicateColumnNameError(new_column.name)
 
         if self.count_rows() != new_column._data.size:
@@ -838,13 +815,7 @@ def slice(
         if end is None:
             end = self.count_rows()
 
-        if (
-            start < 0
-            or end < 0
-            or start >= self.count_rows()
-            or end > self.count_rows()
-            or end < start
-        ):
+        if start < 0 or end < 0 or start >= self.count_rows() or end > self.count_rows() or end < start:
             raise ValueError("the given index is out of bounds")
 
         new_df = self._data.iloc[start:end:step]
@@ -853,9 +824,7 @@ def slice(
 
     def sort_columns(
         self,
-        comparator: Callable[[Column, Column], int] = lambda col1, col2: (
-            col1.name > col2.name
-        )
+        comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name)
         - (col1.name < col2.name),
     ) -> Table:
         """
@@ -891,9 +860,9 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
 
         The comparator is a function that takes two rows `row1` and `row2` and returns an integer:
 
-        * If `col1` should be ordered before `col2`, the function should return a negative number.
-        * If `col1` should be ordered after `col2`, the function should return a positive number.
-        * If the original order of `col1` and `col2` should be kept, the function should return 0.
+        * If `row1` should be ordered before `row2`, the function should return a negative number.
+        * If `row1` should be ordered after `row2`, the function should return a positive number.
+        * If the original order of `row1` and `row2` should be kept, the function should return 0.
 
         Parameters
         ----------
@@ -933,9 +902,7 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
             self.slice(round(percentage_in_first * self.count_rows())),
         )
 
-    def transform_column(
-        self, name: str, transformer: Callable[[Row], typing.Any]
-    ) -> Table:
+    def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table:
         """
         Transform provided column by calling provided transformer.
 
@@ -1103,9 +1070,7 @@ def to_rows(self) -> list[Row]:
         rows : list[Row]
             List of rows.
         """
-        return [
-            Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()
-        ]
+        return [Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()]
 
     # ------------------------------------------------------------------------------------------------------------------
     # Other
@@ -1123,7 +1088,5 @@ def _ipython_display_(self) -> DisplayHandle:
         tmp = self._data.copy(deep=True)
         tmp.columns = self.get_column_names()
 
-        with pd.option_context(
-            "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
-        ):
+        with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]):
             return display(tmp)
diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py
index 1d06d8de2..17e046538 100644
--- a/src/safeds/data/tabular/transformation/__init__.py
+++ b/src/safeds/data/tabular/transformation/__init__.py
@@ -1,4 +1,3 @@
 from ._imputer import Imputer
 from ._label_encoder import LabelEncoder
 from ._one_hot_encoder import OneHotEncoder
-from ._ordinal_encoder import OrdinalEncoder
diff --git a/src/safeds/data/tabular/transformation/_ordinal_encoder.py b/src/safeds/data/tabular/transformation/_ordinal_encoder.py
deleted file mode 100644
index 5ccfb4d3f..000000000
--- a/src/safeds/data/tabular/transformation/_ordinal_encoder.py
+++ /dev/null
@@ -1,144 +0,0 @@
-from safeds import exceptions
-from safeds.data.tabular.containers import Table
-from sklearn import preprocessing
-
-
-# noinspection PyProtectedMember
-class OrdinalEncoder:
-    """
-    This OrdinalEncoder encodes one or more given columns into ordinal numbers. The encoding order must be provided.
-
-    Parameters
-    --------
-    order : list[str]
-        The order in which the ordinal encoder encodes the values.
-    """
-
-    def __init__(self, order: list[str]) -> None:
-        self._is_fitted = 0
-        self._oe = preprocessing.OrdinalEncoder(categories=[order])
-        self._order = order
-
-    def fit(self, table: Table, column_name: str) -> None:
-        """
-        Fit the ordinal encoder with the values in the given table.
-
-        Parameters
-        ----------
-        table : Table
-            The table containing the data used to fit the ordinal encoder.
-        column_name : str
-            The column which should be ordinal-encoded.
-
-        Returns
-        -------
-        None
-            This function does not return any value. It updates the internal state of the ordinal encoder object.
-
-        Raises
-        -------
-        LearningError
-            If the model could not be fitted correctly.
-        """
-
-        p_df = table._data
-        p_df.columns = table.schema.get_column_names()
-        try:
-            self._oe.fit(p_df[[column_name]])
-        except exceptions.NotFittedError as exc:
-            raise exceptions.LearningError("") from exc
-
-    def transform(self, table: Table, column_name: str) -> Table:
-        """
-        Transform the given table to an ordinal-encoded table.
-
-        Parameters
-        ----------
-        table : Table
-            The table with target values.
-        column_name : str
-            The name of the column.
-
-        Returns
-        -------
-        table : Table
-            The table with ordinal encodings.
-
-        Raises
-        ------
-        NotFittedError
-            If the model was not fitted before transforming.
-        """
-        p_df = table._data.copy()
-        p_df.columns = table.schema.get_column_names()
-        try:
-            p_df[[column_name]] = self._oe.transform(p_df[[column_name]])
-            p_df[column_name] = p_df[column_name].astype(dtype="int64", copy=False)
-            return Table(p_df)
-        except Exception as exc:
-            raise exceptions.NotFittedError from exc
-
-    def fit_transform(self, table: Table, columns: list[str]) -> Table:
-        """
-        Oridnal-encode a given table with the given ordinal encoder.
-        The order is provided in the constructor. A new order will not be inferred from other columns.
-
-        Parameters
-        ----------
-        table : Table
-            The table which will be transformed.
-        columns : list[str]
-            The list of column names to be considered while encoding.
-
-        Returns
-        -------
-        table : Table
-            A new Table object which is ordinal-encoded.
-
-        Raises
-        -------
-        NotFittedError
-            If the encoder was not fitted before transforming.
-        KeyError
-            If the column does not exist.
-
-        """
-        try:
-            for col in columns:
-                # Fit the Ordinal Encoder on the Column
-                self.fit(table, col)
-                # transform the column using the trained Ordinal Encoder
-                table = self.transform(table, col)
-            return table
-        except exceptions.NotFittedError as exc:
-            raise exceptions.NotFittedError from exc
-
-    def inverse_transform(self, table: Table, column_name: str) -> Table:
-        """
-        Inverse the transformed table back to original encodings.
-
-        Parameters
-        ----------
-        table : Table
-            The table to be inverse-transformed.
-        column_name : str
-            The column which should be inverse-transformed.
-
-        Returns
-        -------
-        table : Table
-            The inverse-transformed table.
-
-        Raises
-        -------
-        NotFittedError
-            If the encoder was not fitted before transforming.
-        """
-
-        p_df = table._data.copy()
-        p_df.columns = table.schema.get_column_names()
-        try:
-            p_df[[column_name]] = self._oe.inverse_transform(p_df[[column_name]])
-            return Table(p_df)
-        except exceptions.NotFittedError as exc:
-            raise exceptions.NotFittedError from exc
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py
deleted file mode 100644
index 863bdf349..000000000
--- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import pandas as pd
-import pytest
-from safeds.data.tabular.containers import Table
-from safeds.data.tabular.transformation import OrdinalEncoder
-from safeds.data.tabular.typing import IntColumnType
-from safeds.exceptions import NotFittedError
-
-
-def test_fit_transform_valid() -> None:
-    test_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"],
-            }
-        )
-    )
-    check_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": [1, 0, 0, 1, 2],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": [0, 0, 1, 1, 0],
-            }
-        )
-    )
-    oe = OrdinalEncoder(["kalt", "warm", "heiss"])
-    test_table = oe.fit_transform(test_table, ["temperatur", "temperatur_2"])
-    assert test_table.schema.get_column_names() == check_table.schema.get_column_names()
-    assert isinstance(test_table.schema.get_type_of_column("temperatur"), IntColumnType)
-    assert isinstance(
-        test_table.schema.get_type_of_column("temperatur_2"), IntColumnType
-    )
-    assert test_table == check_table
-
-
-def test_fit_transform_invalid() -> None:
-    oe = OrdinalEncoder(["test", "test"])
-    test_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"],
-            }
-        )
-    )
-    with pytest.raises(NotFittedError):
-        oe.transform(test_table, "test")
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py
deleted file mode 100644
index 94a95fc60..000000000
--- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pandas as pd
-from safeds.data.tabular.containers import Table
-from safeds.data.tabular.transformation import OrdinalEncoder
-from safeds.data.tabular.typing import StringColumnType
-
-
-def test_inverse_transform() -> None:
-    test_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": [1, 0, 0, 1, 2],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": [0, 0, 1, 1, 0],
-            }
-        )
-    )
-    check_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"],
-            }
-        )
-    )
-    oe = OrdinalEncoder(["kalt", "warm", "heiss"])
-    oe.fit(check_table, "temperatur")
-    test_table = oe.inverse_transform(test_table, "temperatur")
-    test_table = oe.inverse_transform(test_table, "temperatur_2")
-    assert test_table.schema.get_column_names() == check_table.schema.get_column_names()
-    assert isinstance(
-        test_table.schema.get_type_of_column("temperatur"), StringColumnType
-    )
-    assert isinstance(
-        test_table.schema.get_type_of_column("temperatur_2"), StringColumnType
-    )
-    assert test_table == check_table
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py
deleted file mode 100644
index 4b7a7a574..000000000
--- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pandas as pd
-import pytest
-from safeds.data.tabular.containers import Table
-from safeds.data.tabular.transformation import OrdinalEncoder
-from safeds.exceptions import NotFittedError
-
-
-def test_transform_invalid() -> None:
-    test_table = Table(
-        pd.DataFrame(
-            {
-                "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"],
-                "gedöns": ["1", "2", "3", "4", "5"],
-                "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"],
-            }
-        )
-    )
-    ode = OrdinalEncoder(["kalt", "warm", "heiss"])
-    with pytest.raises(NotFittedError):
-        ode.transform(test_table, "temperatur")