From b92bba551146586d510da03cc581037dc4c4c05e Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 28 Mar 2023 10:45:32 +0200 Subject: [PATCH] feat: remove `OrdinalEncoder` (#107) ### Summary of Changes The `OrdinalEncoder` was a bit of an outlier compared to the other `Transformer` classes: * It could only be applied to a single column instead of a list of columns. Because of this, it was not possible to implement #61. * Nothing was "learned" since the user had to specify the value order explicitly. The `fit` step was completely unnecessary. Therefore, I've removed the class `OrdinalEncoder`. Instead, the `transform_column` method on a `Table` can be used. If we eventually find this to be too cumbersome, we can implement a new method `transform_column_into_ordered_labels` on `Table`. --------- Co-authored-by: lars-reimann --- src/safeds/data/tabular/containers/_table.py | 75 +++------ .../data/tabular/transformation/__init__.py | 1 - .../transformation/_ordinal_encoder.py | 144 ------------------ .../_ordinal_encoder/__init__.py | 0 .../_ordinal_encoder/test_fit_transform.py | 50 ------ .../test_inverse_transform.py | 37 ----- .../_ordinal_encoder/test_transform.py | 20 --- 7 files changed, 19 insertions(+), 308 deletions(-) delete mode 100644 src/safeds/data/tabular/transformation/_ordinal_encoder.py delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 3e27edc29..3cec5c269 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -109,9 +109,7 @@ def from_json_file(path: str) 
-> Table: except FileNotFoundError as exception: raise FileNotFoundError(f'File "{path}" does not exist') from exception except Exception as exception: - raise ValueError( - f'Could not read file from "{path}" as JSON' - ) from exception + raise ValueError(f'Could not read file from "{path}" as JSON') from exception @staticmethod def from_columns(columns: list[Column]) -> Table: @@ -143,9 +141,7 @@ def from_columns(columns: list[Column]) -> Table: for column in columns: if column._data.size != columns[0]._data.size: raise ColumnLengthMismatchError( - "\n".join( - [f"{column.name}: {column._data.size}" for column in columns] - ) + "\n".join([f"{column.name}: {column._data.size}" for column in columns]) ) dataframe[column.name] = column._data @@ -193,9 +189,7 @@ def from_rows(rows: list[Row]) -> Table: # ------------------------------------------------------------------------------------------------------------------ def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None): - self._data: pd.Dataframe = ( - data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) - ) + self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data) if schema is None: if self.count_columns() == 0: raise MissingSchemaError() @@ -272,9 +266,7 @@ def get_column(self, column_name: str) -> Column: if self._schema.has_column(column_name): output_column = Column( column_name, - self._data.iloc[ - :, [self._schema._get_column_index_by_name(column_name)] - ].squeeze(), + self._data.iloc[:, [self._schema._get_column_index_by_name(column_name)]].squeeze(), self._schema.get_type_of_column(column_name), ) return output_column @@ -533,9 +525,7 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table: for row in rows: if self._schema != row.schema: raise SchemaMismatchError() - result = pd.concat( - [result, *[row._data.to_frame().T for row in rows]] - ).infer_objects() + result = pd.concat([result, *[row._data.to_frame().T for row in 
rows]]).infer_objects() result.columns = self._schema.get_column_names() return Table(result) @@ -568,9 +558,7 @@ def drop_columns(self, column_names: list[str]) -> Table: if len(invalid_columns) != 0: raise UnknownColumnNameError(invalid_columns) transformed_data = self._data.drop(labels=column_indices, axis="columns") - transformed_data.columns = list( - name for name in self._schema.get_column_names() if name not in column_names - ) + transformed_data.columns = list(name for name in self._schema.get_column_names() if name not in column_names) return Table(transformed_data) def drop_columns_with_missing_values(self) -> Table: @@ -582,9 +570,7 @@ def drop_columns_with_missing_values(self) -> Table: table : Table A table without the columns that contain missing values. """ - return Table.from_columns( - [column for column in self.to_columns() if not column.has_missing_values()] - ) + return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()]) def drop_columns_with_non_numerical_values(self) -> Table: """ @@ -596,9 +582,7 @@ def drop_columns_with_non_numerical_values(self) -> Table: A table without the columns that contain non-numerical values. 
""" - return Table.from_columns( - [column for column in self.to_columns() if column.type.is_numeric()] - ) + return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()]) def drop_duplicate_rows(self) -> Table: """ @@ -642,9 +626,7 @@ def drop_rows_with_outliers(self) -> Table: copy = self._data.copy(deep=True) table_without_nonnumericals = self.drop_columns_with_non_numerical_values() - z_scores = np.absolute( - stats.zscore(table_without_nonnumericals._data, nan_policy="omit") - ) + z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit")) filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1) return Table(copy[filter_], self._schema) @@ -699,9 +681,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table: if len(invalid_columns) != 0: raise UnknownColumnNameError(invalid_columns) transformed_data = self._data[column_indices] - transformed_data.columns = list( - name for name in self._schema.get_column_names() if name in column_names - ) + transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names) return Table(transformed_data) def rename_column(self, old_name: str, new_name: str) -> Table: @@ -769,10 +749,7 @@ def replace_column(self, old_column_name: str, new_column: Column) -> Table: if old_column_name not in self._schema.get_column_names(): raise UnknownColumnNameError([old_column_name]) - if ( - new_column.name in self._schema.get_column_names() - and new_column.name != old_column_name - ): + if new_column.name in self._schema.get_column_names() and new_column.name != old_column_name: raise DuplicateColumnNameError(new_column.name) if self.count_rows() != new_column._data.size: @@ -838,13 +815,7 @@ def slice( if end is None: end = self.count_rows() - if ( - start < 0 - or end < 0 - or start >= self.count_rows() - or end > self.count_rows() - or end < start - ): + if start < 0 or end < 0 or start >= self.count_rows() or end > 
self.count_rows() or end < start: raise ValueError("the given index is out of bounds") new_df = self._data.iloc[start:end:step] @@ -853,9 +824,7 @@ def slice( def sort_columns( self, - comparator: Callable[[Column, Column], int] = lambda col1, col2: ( - col1.name > col2.name - ) + comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name) - (col1.name < col2.name), ) -> Table: """ @@ -891,9 +860,9 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table: The comparator is a function that takes two rows `row1` and `row2` and returns an integer: - * If `col1` should be ordered before `col2`, the function should return a negative number. - * If `col1` should be ordered after `col2`, the function should return a positive number. - * If the original order of `col1` and `col2` should be kept, the function should return 0. + * If `row1` should be ordered before `row2`, the function should return a negative number. + * If `row1` should be ordered after `row2`, the function should return a positive number. + * If the original order of `row1` and `row2` should be kept, the function should return 0. Parameters ---------- @@ -933,9 +902,7 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]: self.slice(round(percentage_in_first * self.count_rows())), ) - def transform_column( - self, name: str, transformer: Callable[[Row], typing.Any] - ) -> Table: + def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table: """ Transform provided column by calling provided transformer. @@ -1103,9 +1070,7 @@ def to_rows(self) -> list[Row]: rows : list[Row] List of rows. 
""" - return [ - Row(series_row, self._schema) for (_, series_row) in self._data.iterrows() - ] + return [Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()] # ------------------------------------------------------------------------------------------------------------------ # Other @@ -1123,7 +1088,5 @@ def _ipython_display_(self) -> DisplayHandle: tmp = self._data.copy(deep=True) tmp.columns = self.get_column_names() - with pd.option_context( - "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] - ): + with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]): return display(tmp) diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 1d06d8de2..17e046538 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -1,4 +1,3 @@ from ._imputer import Imputer from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder -from ._ordinal_encoder import OrdinalEncoder diff --git a/src/safeds/data/tabular/transformation/_ordinal_encoder.py b/src/safeds/data/tabular/transformation/_ordinal_encoder.py deleted file mode 100644 index 5ccfb4d3f..000000000 --- a/src/safeds/data/tabular/transformation/_ordinal_encoder.py +++ /dev/null @@ -1,144 +0,0 @@ -from safeds import exceptions -from safeds.data.tabular.containers import Table -from sklearn import preprocessing - - -# noinspection PyProtectedMember -class OrdinalEncoder: - """ - This OrdinalEncoder encodes one or more given columns into ordinal numbers. The encoding order must be provided. - - Parameters - -------- - order : list[str] - The order in which the ordinal encoder encodes the values. 
- """ - - def __init__(self, order: list[str]) -> None: - self._is_fitted = 0 - self._oe = preprocessing.OrdinalEncoder(categories=[order]) - self._order = order - - def fit(self, table: Table, column_name: str) -> None: - """ - Fit the ordinal encoder with the values in the given table. - - Parameters - ---------- - table : Table - The table containing the data used to fit the ordinal encoder. - column_name : str - The column which should be ordinal-encoded. - - Returns - ------- - None - This function does not return any value. It updates the internal state of the ordinal encoder object. - - Raises - ------- - LearningError - If the model could not be fitted correctly. - """ - - p_df = table._data - p_df.columns = table.schema.get_column_names() - try: - self._oe.fit(p_df[[column_name]]) - except exceptions.NotFittedError as exc: - raise exceptions.LearningError("") from exc - - def transform(self, table: Table, column_name: str) -> Table: - """ - Transform the given table to an ordinal-encoded table. - - Parameters - ---------- - table : Table - The table with target values. - column_name : str - The name of the column. - - Returns - ------- - table : Table - The table with ordinal encodings. - - Raises - ------ - NotFittedError - If the model was not fitted before transforming. - """ - p_df = table._data.copy() - p_df.columns = table.schema.get_column_names() - try: - p_df[[column_name]] = self._oe.transform(p_df[[column_name]]) - p_df[column_name] = p_df[column_name].astype(dtype="int64", copy=False) - return Table(p_df) - except Exception as exc: - raise exceptions.NotFittedError from exc - - def fit_transform(self, table: Table, columns: list[str]) -> Table: - """ - Oridnal-encode a given table with the given ordinal encoder. - The order is provided in the constructor. A new order will not be inferred from other columns. - - Parameters - ---------- - table : Table - The table which will be transformed. 
- columns : list[str] - The list of column names to be considered while encoding. - - Returns - ------- - table : Table - A new Table object which is ordinal-encoded. - - Raises - ------- - NotFittedError - If the encoder was not fitted before transforming. - KeyError - If the column does not exist. - - """ - try: - for col in columns: - # Fit the Ordinal Encoder on the Column - self.fit(table, col) - # transform the column using the trained Ordinal Encoder - table = self.transform(table, col) - return table - except exceptions.NotFittedError as exc: - raise exceptions.NotFittedError from exc - - def inverse_transform(self, table: Table, column_name: str) -> Table: - """ - Inverse the transformed table back to original encodings. - - Parameters - ---------- - table : Table - The table to be inverse-transformed. - column_name : str - The column which should be inverse-transformed. - - Returns - ------- - table : Table - The inverse-transformed table. - - Raises - ------- - NotFittedError - If the encoder was not fitted before transforming. 
- """ - - p_df = table._data.copy() - p_df.columns = table.schema.get_column_names() - try: - p_df[[column_name]] = self._oe.inverse_transform(p_df[[column_name]]) - return Table(p_df) - except exceptions.NotFittedError as exc: - raise exceptions.NotFittedError from exc diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py deleted file mode 100644 index 863bdf349..000000000 --- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OrdinalEncoder -from safeds.data.tabular.typing import IntColumnType -from safeds.exceptions import NotFittedError - - -def test_fit_transform_valid() -> None: - test_table = Table( - pd.DataFrame( - { - "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"], - } - ) - ) - check_table = Table( - pd.DataFrame( - { - "temperatur": [1, 0, 0, 1, 2], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": [0, 0, 1, 1, 0], - } - ) - ) - oe = OrdinalEncoder(["kalt", "warm", "heiss"]) - test_table = oe.fit_transform(test_table, ["temperatur", "temperatur_2"]) - assert test_table.schema.get_column_names() == check_table.schema.get_column_names() - assert isinstance(test_table.schema.get_type_of_column("temperatur"), IntColumnType) - assert isinstance( - test_table.schema.get_type_of_column("temperatur_2"), IntColumnType - ) - assert test_table == check_table - - -def test_fit_transform_invalid() -> None: - oe = OrdinalEncoder(["test", 
"test"]) - test_table = Table( - pd.DataFrame( - { - "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"], - } - ) - ) - with pytest.raises(NotFittedError): - oe.transform(test_table, "test") diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py deleted file mode 100644 index 94a95fc60..000000000 --- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_inverse_transform.py +++ /dev/null @@ -1,37 +0,0 @@ -import pandas as pd -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OrdinalEncoder -from safeds.data.tabular.typing import StringColumnType - - -def test_inverse_transform() -> None: - test_table = Table( - pd.DataFrame( - { - "temperatur": [1, 0, 0, 1, 2], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": [0, 0, 1, 1, 0], - } - ) - ) - check_table = Table( - pd.DataFrame( - { - "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"], - } - ) - ) - oe = OrdinalEncoder(["kalt", "warm", "heiss"]) - oe.fit(check_table, "temperatur") - test_table = oe.inverse_transform(test_table, "temperatur") - test_table = oe.inverse_transform(test_table, "temperatur_2") - assert test_table.schema.get_column_names() == check_table.schema.get_column_names() - assert isinstance( - test_table.schema.get_type_of_column("temperatur"), StringColumnType - ) - assert isinstance( - test_table.schema.get_type_of_column("temperatur_2"), StringColumnType - ) - assert test_table == check_table diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py deleted file mode 100644 index 
4b7a7a574..000000000 --- a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_transform.py +++ /dev/null @@ -1,20 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OrdinalEncoder -from safeds.exceptions import NotFittedError - - -def test_transform_invalid() -> None: - test_table = Table( - pd.DataFrame( - { - "temperatur": ["warm", "kalt", "kalt", "warm", "heiss"], - "gedöns": ["1", "2", "3", "4", "5"], - "temperatur_2": ["kalt", "kalt", "warm", "warm", "kalt"], - } - ) - ) - ode = OrdinalEncoder(["kalt", "warm", "heiss"]) - with pytest.raises(NotFittedError): - ode.transform(test_table, "temperatur")