From 4a059e09a4c66397cf266f71c4248543e2e972e5 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Fri, 3 May 2024 21:13:39 +0200 Subject: [PATCH] feat: replace other values than NaN with imputer (#707) Closes #643 ### Summary of Changes * Add an optional argument to `Imputer` to configure the `value_to_replace`. This can be an int, float, or string. --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- docs/tutorials/data_processing.ipynb | 72 ++--- .../data/tabular/transformation/_imputer.py | 249 ++++++++++------ src/safeds/data/tabular/typing/__init__.py | 3 - .../data/tabular/typing/_imputer_strategy.py | 55 ---- .../tabular/transformation/test_imputer.py | 268 +++++++++++------- 5 files changed, 359 insertions(+), 288 deletions(-) delete mode 100644 src/safeds/data/tabular/typing/_imputer_strategy.py diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb index 1e2f87661..c08fc31db 100644 --- a/docs/tutorials/data_processing.ipynb +++ b/docs/tutorials/data_processing.ipynb @@ -32,7 +32,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.containers import Table\n", "\n", @@ -40,7 +39,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -54,7 +54,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice = titanic.slice_rows(end=10)\n", "\n", @@ -62,7 +61,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -76,13 +76,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.get_row(0)" ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -96,13 +96,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.get_column(\"name\")" ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -116,7 +116,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "Table.from_rows([\n", " titanic_slice.get_row(0),\n", @@ -125,7 +124,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -139,7 +139,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "Table.from_columns([\n", " titanic_slice.get_column(\"name\"),\n", @@ -148,7 +147,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -162,7 +162,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.remove_columns([\n", " \"id\",\n", @@ -175,7 +174,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -189,13 +189,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.keep_only_columns([\"name\", \"survived\"])" ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -211,13 +211,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.sort_columns()" ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -231,7 +231,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.sort_columns(\n", " lambda column1, column2:\n", @@ -240,7 +239,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -254,7 +254,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic.filter_rows(\n", " lambda row:\n", @@ -263,7 +262,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -278,7 +278,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.transformation import Imputer\n", "\n", @@ -287,7 +286,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -301,7 +301,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.transformation import LabelEncoder\n", "\n", @@ -310,7 +309,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -324,7 +324,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", @@ -333,7 +332,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -347,7 +347,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.transformation import RangeScaler\n", "\n", @@ -356,7 +355,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -370,7 +370,6 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "from safeds.data.tabular.transformation import StandardScaler\n", "\n", @@ -379,7 +378,8 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -394,13 +394,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.transform_column(\"sex\", lambda row: 1 if row.get_value(\"sex\") == \"female\" else 0)\n" ], "metadata": { "collapsed": false - } + }, + "outputs": [] }, { "cell_type": "markdown", @@ -414,13 +414,13 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], "source": [ "titanic_slice.transform_column(\"parents_children\", lambda row: \"No\" if row.get_value(\"parents_children\") == 0 else \"Yes\")\n" ], "metadata": { "collapsed": false - } + }, + "outputs": [] } ], "metadata": { diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py index 9602712e8..f8bd25439 100644 --- a/src/safeds/data/tabular/transformation/_imputer.py +++ b/src/safeds/data/tabular/transformation/_imputer.py @@ -2,11 +2,14 @@ import sys import warnings +from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any +import pandas as pd + +from safeds._utils import _structural_hash from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation._table_transformer import TableTransformer -from safeds.data.tabular.typing import ImputerStrategy from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError if TYPE_CHECKING: @@ -15,12 +18,14 @@ class Imputer(TableTransformer): """ - Replace missing values with the given strategy. + Replace missing values using the given strategy. Parameters ---------- strategy: - The strategy used to impute missing values. Use the classes nested inside `Imputer.Strategy` to specify it. + How to replace missing values. + value_to_replace: + The value that should be replaced. Examples -------- @@ -37,105 +42,80 @@ class Imputer(TableTransformer): >>> transformed_table = transformer.fit_and_transform(table) """ - class Strategy: - class Constant(ImputerStrategy): + class Strategy(ABC): + """Various strategies to replace missing values. Use the static methods to create instances of this class.""" + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass # pragma: no cover + + @abstractmethod + def __hash__(self) -> int: + pass # pragma: no cover + + @abstractmethod + def _apply(self, imputer: sk_SimpleImputer) -> None: """ - An imputation strategy for imputing missing data with given constant values. + Set the imputer strategy of the given imputer. Parameters ---------- - value: - The given value to impute missing values. + imputer: + The imputer to augment. """ - def __eq__(self, other: object) -> bool: - if not isinstance(other, Imputer.Strategy.Constant): - return NotImplemented - if self is other: - return True - return self._value == other._value - - __hash__ = ImputerStrategy.__hash__ - - def __init__(self, value: Any): - self._value = value - - def __sizeof__(self) -> int: - """ - Return the complete size of this object. - - Returns - ------- - size: - Size of this object in bytes. - """ - return sys.getsizeof(self._value) - - def __str__(self) -> str: - return f"Constant({self._value})" - - def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "constant" - imputer.fill_value = self._value - - class Mean(ImputerStrategy): - """An imputation strategy for imputing missing data with mean values.""" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, Imputer.Strategy.Mean): - return NotImplemented - return True - - __hash__ = ImputerStrategy.__hash__ - - def __str__(self) -> str: - return "Mean" - - def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "mean" - - class Median(ImputerStrategy): - """An imputation strategy for imputing missing data with median values.""" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, Imputer.Strategy.Median): - return NotImplemented - return True - - __hash__ = ImputerStrategy.__hash__ - - def __str__(self) -> str: - return "Median" - - def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "median" - - class Mode(ImputerStrategy): + @staticmethod + def Constant(value: Any) -> Imputer.Strategy: # noqa: N802 """ - An imputation strategy for imputing missing data with mode values. + Replace missing values with the given constant value. - The lowest value will be used if there are multiple values with the same highest count. + Parameters + ---------- + value: + The value to replace missing values. """ + return _Constant(value) # pragma: no cover - def __eq__(self, other: object) -> bool: - if not isinstance(other, Imputer.Strategy.Mode): - return NotImplemented - return True + @staticmethod + def Mean() -> Imputer.Strategy: # noqa: N802 + """Replace missing values with the mean of each column.""" + return _Mean() # pragma: no cover - __hash__ = ImputerStrategy.__hash__ + @staticmethod + def Median() -> Imputer.Strategy: # noqa: N802 + """Replace missing values with the median of each column.""" + return _Median() # pragma: no cover - def __str__(self) -> str: - return "Mode" + @staticmethod + def Mode() -> Imputer.Strategy: # noqa: N802 + """Replace missing values with the mode of each column.""" + return _Mode() # pragma: no cover - def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: - imputer.strategy = "most_frequent" + def __init__(self, strategy: Imputer.Strategy, *, value_to_replace: float | str | None = None): + if value_to_replace is None: + value_to_replace = pd.NA - def __init__(self, strategy: ImputerStrategy): self._strategy = strategy + self._value_to_replace = value_to_replace self._wrapped_transformer: sk_SimpleImputer | None = None self._column_names: list[str] | None = None + @property + def strategy(self) -> Imputer.Strategy: + """The strategy used to replace missing values.""" + return self._strategy + + @property + def value_to_replace(self) -> Any: + """The value that should be replaced.""" + return self._value_to_replace + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + # noinspection PyProtectedMember def fit(self, table: Table, column_names: list[str] | None) -> Imputer: """ @@ -176,7 +156,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer: if table.number_of_rows == 0: raise ValueError("The Imputer cannot be fitted because the table contains 0 rows") - if (isinstance(self._strategy, Imputer.Strategy.Mean | Imputer.Strategy.Median)) and table.keep_only_columns( + if (isinstance(self._strategy, _Mean | _Median)) and table.keep_only_columns( column_names, ).remove_columns_with_non_numerical_values().number_of_columns < len( column_names, @@ -194,7 +174,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer: ), ) - if isinstance(self._strategy, Imputer.Strategy.Mode): + if isinstance(self._strategy, _Mode): multiple_most_frequent = {} for name in column_names: if len(table.get_column(name).mode()) > 1: @@ -209,7 +189,8 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer: ) wrapped_transformer = sk_SimpleImputer() - self._strategy._augment_imputer(wrapped_transformer) + self._strategy._apply(wrapped_transformer) + wrapped_transformer.missing_values = self._value_to_replace wrapped_transformer.fit(table._data[column_names]) result = Imputer(self._strategy) @@ -265,11 +246,6 @@ def transform(self, table: Table) -> Table: ) return Table._from_pandas_dataframe(data, table.schema) - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - def get_names_of_added_columns(self) -> list[str]: """ Get the names of all new columns that have been added by the Imputer. @@ -288,7 +264,6 @@ def get_names_of_added_columns(self) -> list[str]: raise TransformerNotFittedError return [] - # (Must implement abstract method, cannot instantiate class otherwise.) def get_names_of_changed_columns(self) -> list[str]: """ Get the names of all columns that may have been changed by the Imputer. @@ -324,3 +299,93 @@ def get_names_of_removed_columns(self) -> list[str]: if not self.is_fitted: raise TransformerNotFittedError return [] + + +# ---------------------------------------------------------------------------------------------------------------------- +# Imputation strategies +# ---------------------------------------------------------------------------------------------------------------------- + + +class _Constant(Imputer.Strategy): + def __init__(self, value: Any): + self._value = value + + @property + def value(self) -> Any: + return self._value + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Constant): + return NotImplemented + if self is other: + return True + return self._value == other._value + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __sizeof__(self) -> int: + return sys.getsizeof(self._value) + + def __str__(self) -> str: + return f"Constant({self._value})" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "constant" + imputer.fill_value = self._value + + +class _Mean(Imputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mean): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Mean" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "mean" + + +class _Median(Imputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Median): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Median" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "median" + + +class _Mode(Imputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mode): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Mode" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "most_frequent" + + +# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. +# This is needed for the DSL, where imputer strategies are variants of an enum. +Imputer.Strategy.Constant = _Constant # type: ignore[method-assign] +Imputer.Strategy.Mean = _Mean # type: ignore[method-assign] +Imputer.Strategy.Median = _Median # type: ignore[method-assign] +Imputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/typing/__init__.py b/src/safeds/data/tabular/typing/__init__.py index 5b6db59a2..ab6a79399 100644 --- a/src/safeds/data/tabular/typing/__init__.py +++ b/src/safeds/data/tabular/typing/__init__.py @@ -6,7 +6,6 @@ if TYPE_CHECKING: from ._column_type import Anything, Boolean, ColumnType, Integer, Nothing, RealNumber, String - from ._imputer_strategy import ImputerStrategy from ._schema import Schema apipkg.initpkg( @@ -15,7 +14,6 @@ "Anything": "._column_type:Anything", "Boolean": "._column_type:Boolean", "ColumnType": "._column_type:ColumnType", - "ImputerStrategy": "._imputer_strategy:ImputerStrategy", "Integer": "._column_type:Integer", "Nothing": "._column_type:Nothing", "RealNumber": "._column_type:RealNumber", @@ -28,7 +26,6 @@ "Anything", "Boolean", "ColumnType", - "ImputerStrategy", "Integer", "Nothing", "RealNumber", diff --git a/src/safeds/data/tabular/typing/_imputer_strategy.py b/src/safeds/data/tabular/typing/_imputer_strategy.py deleted file mode 100644 index f35cf9d25..000000000 --- a/src/safeds/data/tabular/typing/_imputer_strategy.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from safeds._utils import _structural_hash - -if TYPE_CHECKING: - from sklearn.impute import SimpleImputer as sk_SimpleImputer - - -class ImputerStrategy(ABC): - """ - The abstract base class of the different imputation strategies supported by the `Imputer`. - - This class is only needed for type annotations. Use the subclasses nested inside `Imputer.Strategy` instead. - """ - - @abstractmethod - def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: - """ - Set the imputer strategy of the given imputer. - - Parameters - ---------- - imputer: - The imputer to augment. - """ - - @abstractmethod - def __eq__(self, other: object) -> bool: - """ - Compare two imputer strategies. - - Parameters - ---------- - other: - other object to compare to - - Returns - ------- - equals: - Whether the two imputer strategies are equal - """ - - def __hash__(self) -> int: - """ - Return a deterministic hash value for this imputer strategy. - - Returns - ------- - hash: - The hash value. - """ - return _structural_hash(self.__class__.__qualname__) diff --git a/tests/safeds/data/tabular/transformation/test_imputer.py b/tests/safeds/data/tabular/transformation/test_imputer.py index 1d986192b..55699318d 100644 --- a/tests/safeds/data/tabular/transformation/test_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_imputer.py @@ -4,11 +4,11 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import Imputer -from safeds.data.tabular.typing import ImputerStrategy +from safeds.data.tabular.transformation._imputer import _Mode from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -def strategies() -> list[ImputerStrategy]: +def strategies() -> list[Imputer.Strategy]: """ Return the list of imputer strategies to test. @@ -17,13 +17,107 @@ def strategies() -> list[ImputerStrategy]: Returns ------- - strategies : list[ImputerStrategy] + strategies : list[Imputer.Strategy] The list of classifiers to test. """ return [Imputer.Strategy.Constant(2), Imputer.Strategy.Mean(), Imputer.Strategy.Median(), Imputer.Strategy.Mode()] -class TestStrategy: +class TestStrategyClass: + def test_should_be_able_to_get_value_of_constant_strategy(self) -> None: + assert Imputer.Strategy.Constant(1).value == 1 # type: ignore[attr-defined] + + @pytest.mark.parametrize( + ("strategy", "type_", "expected"), + [ + (Imputer.Strategy.Constant(0), Imputer.Strategy.Constant, True), + (Imputer.Strategy.Mean(), Imputer.Strategy.Mean, True), + (Imputer.Strategy.Median(), Imputer.Strategy.Median, True), + (Imputer.Strategy.Mode(), Imputer.Strategy.Mode, True), + (Imputer.Strategy.Mode(), Imputer.Strategy.Mean, False), + ], + ) + def test_should_be_able_to_use_strategy_in_isinstance( + self, + strategy: Imputer.Strategy, + type_: type, + expected: bool, + ) -> None: + assert isinstance(strategy, type_) == expected + + class TestEq: + @pytest.mark.parametrize( + ("strategy1", "strategy2"), + ([(x, y) for x in strategies() for y in strategies() if x.__class__ == y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_equal_strategy( + self, + strategy1: Imputer.Strategy, + strategy2: Imputer.Strategy, + ) -> None: + assert strategy1 == strategy2 + + @pytest.mark.parametrize( + "strategy", + ([x for x in strategies() if x.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_equal_identity_strategy( + self, + strategy: Imputer.Strategy, + ) -> None: + assert strategy == strategy # noqa: PLR0124 + + @pytest.mark.parametrize( + ("strategy1", "strategy2"), + ([(x, y) for x in strategies() for y in strategies() if x.__class__ != y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_unequal_strategy( + self, + strategy1: Imputer.Strategy, + strategy2: Imputer.Strategy, + ) -> None: + assert strategy1 != strategy2 + + class TestHash: + @pytest.mark.parametrize( + ("strategy1", "strategy2"), + ([(x, y) for x in strategies() for y in strategies() if x.__class__ == y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_should_return_same_hash_for_equal_strategy( + self, + strategy1: Imputer.Strategy, + strategy2: Imputer.Strategy, + ) -> None: + assert hash(strategy1) == hash(strategy2) + + @pytest.mark.parametrize( + ("strategy1", "strategy2"), + ([(x, y) for x in strategies() for y in strategies() if x.__class__ != y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_should_return_different_hash_for_unequal_strategy( + self, + strategy1: Imputer.Strategy, + strategy2: Imputer.Strategy, + ) -> None: + assert hash(strategy1) != hash(strategy2) + + class TestSizeof: + @pytest.mark.parametrize( + "strategy", + ([Imputer.Strategy.Constant(1)]), + ids=lambda x: x.__class__.__name__, + ) + def test_sizeof_strategy( + self, + strategy: Imputer.Strategy, + ) -> None: + assert sys.getsizeof(strategy) > sys.getsizeof(object()) + class TestStr: @pytest.mark.parametrize( ("strategy", "expected"), @@ -33,15 +127,34 @@ class TestStr: (Imputer.Strategy.Median(), "Median"), (Imputer.Strategy.Mode(), "Mode"), ], - ids=["Constant", "Mean", "Median", "Mode"], + ids=lambda x: x.__class__.__name__, ) - def test_should_return_correct_string_representation(self, strategy: ImputerStrategy, expected: str) -> None: + def test_should_return_correct_string_representation(self, strategy: Imputer.Strategy, expected: str) -> None: assert str(strategy) == expected +class TestStrategyProperty: + @pytest.mark.parametrize( + "strategy", + strategies(), + ids=lambda x: x.__class__.__name__, + ) + def test_should_return_correct_strategy(self, strategy: Imputer.Strategy) -> None: + assert Imputer(strategy).strategy == strategy + + +class TestValueToReplaceProperty: + @pytest.mark.parametrize( + "value_to_replace", + [0], + ) + def test_should_return_correct_value_to_replace(self, value_to_replace: float | str | None) -> None: + assert Imputer(Imputer.Strategy.Mode(), value_to_replace=value_to_replace).value_to_replace == value_to_replace + + class TestFit: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_column_not_found(self, strategy: ImputerStrategy) -> None: + def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> None: table = Table( { "a": [1, 3, None], @@ -52,7 +165,7 @@ def test_should_raise_if_column_not_found(self, strategy: ImputerStrategy) -> No Imputer(strategy).fit(table, ["b", "c"]) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_table_contains_no_rows(self, strategy: ImputerStrategy) -> None: + def test_should_raise_if_table_contains_no_rows(self, strategy: Imputer.Strategy) -> None: with pytest.raises(ValueError, match=r"The Imputer cannot be fitted because the table contains 0 rows"): Imputer(strategy).fit(Table({"col1": []}), ["col1"]) @@ -68,7 +181,7 @@ def test_should_raise_if_table_contains_non_numerical_data( self, table: Table, col_names: list[str], - strategy: ImputerStrategy, + strategy: Imputer.Strategy, ) -> None: with pytest.raises( NonNumericColumnError, @@ -100,7 +213,7 @@ def test_should_warn_if_multiple_mode_values(self, table: Table, most_frequent: Imputer(Imputer.Strategy.Mode()).fit(table, None) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_not_change_original_transformer(self, strategy: ImputerStrategy) -> None: + def test_should_not_change_original_transformer(self, strategy: Imputer.Strategy) -> None: table = Table( { "a": [1, 3, 3, None], @@ -116,7 +229,7 @@ def test_should_not_change_original_transformer(self, strategy: ImputerStrategy) class TestTransform: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_column_not_found(self, strategy: ImputerStrategy) -> None: + def test_should_raise_if_column_not_found(self, strategy: Imputer.Strategy) -> None: table_to_fit = Table( { "a": [1, 3, 3, None], @@ -124,7 +237,7 @@ def test_should_raise_if_column_not_found(self, strategy: ImputerStrategy) -> No }, ) - if isinstance(strategy, Imputer.Strategy.Mode): + if isinstance(strategy, _Mode): with warnings.catch_warnings(): warnings.filterwarnings( action="ignore", @@ -145,12 +258,12 @@ def test_should_raise_if_column_not_found(self, strategy: ImputerStrategy) -> No transformer.transform(table_to_transform) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_table_contains_no_rows(self, strategy: ImputerStrategy) -> None: + def test_should_raise_if_table_contains_no_rows(self, strategy: Imputer.Strategy) -> None: with pytest.raises(ValueError, match=r"The Imputer cannot transform the table because it contains 0 rows"): Imputer(strategy).fit(Table({"col1": [1, 2, 2]}), ["col1"]).transform(Table({"col1": []})) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_raise_if_not_fitted(self, strategy: ImputerStrategy) -> None: + def test_should_raise_if_not_fitted(self, strategy: Imputer.Strategy) -> None: table = Table( { "a": [1, 3, None], @@ -165,12 +278,12 @@ def test_should_raise_if_not_fitted(self, strategy: ImputerStrategy) -> None: class TestIsFitted: @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_return_false_before_fitting(self, strategy: ImputerStrategy) -> None: + def test_should_return_false_before_fitting(self, strategy: Imputer.Strategy) -> None: transformer = Imputer(strategy) assert not transformer.is_fitted @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_return_true_after_fitting(self, strategy: ImputerStrategy) -> None: + def test_should_return_true_after_fitting(self, strategy: Imputer.Strategy) -> None: table = Table( { "a": [1, 3, 3, None], @@ -184,7 +297,7 @@ def test_should_return_true_after_fitting(self, strategy: ImputerStrategy) -> No class TestFitAndTransform: @pytest.mark.parametrize( - ("table", "column_names", "strategy", "expected"), + ("table", "column_names", "strategy", "value_to_replace", "expected"), [ ( Table( @@ -194,6 +307,7 @@ class TestFitAndTransform: ), None, Imputer.Strategy.Constant(0.0), + None, Table( { "a": [1.0, 3.0, 0.0], @@ -208,6 +322,7 @@ class TestFitAndTransform: ), None, Imputer.Strategy.Mean(), + None, Table( { "a": [1.0, 3.0, 2.0], @@ -222,6 +337,7 @@ class TestFitAndTransform: ), None, Imputer.Strategy.Median(), + None, Table( { "a": [1.0, 3.0, 1.0, 1.0], @@ -236,6 +352,7 @@ class TestFitAndTransform: ), None, Imputer.Strategy.Mode(), + None, Table( { "a": [1.0, 3.0, 3.0, 3.0], @@ -251,6 +368,7 @@ class TestFitAndTransform: ), ["a"], Imputer.Strategy.Constant(0.0), + None, Table( { "a": [1.0, 3.0, 0.0], @@ -266,8 +384,24 @@ class TestFitAndTransform: ), ["a"], Imputer.Strategy.Mode(), + None, Table({"a": [1.0, 1.0, 2.0, 2.0, 1.0]}), ), + ( + Table( + { + "a": [0.0, 1.0, 2.0], + }, + ), + None, + Imputer.Strategy.Constant(1.0), + 0.0, + Table( + { + "a": [1.0, 1.0, 2.0], + }, + ), + ), ], ids=[ "constant strategy", @@ -276,28 +410,35 @@ class TestFitAndTransform: "mode strategy", "constant strategy multiple columns", "mode strategy multiple most frequent values", + "other value to replace", ], ) def test_should_return_transformed_table( self, table: Table, column_names: list[str] | None, - strategy: ImputerStrategy, + strategy: Imputer.Strategy, + value_to_replace: float | str | None, expected: Table, ) -> None: - if isinstance(strategy, Imputer.Strategy.Mode): + if isinstance(strategy, _Mode): with warnings.catch_warnings(): warnings.filterwarnings( action="ignore", message=r"There are multiple most frequent values in a column given to the Imputer\..*", category=UserWarning, ) - assert Imputer(strategy).fit_and_transform(table, column_names) == expected + assert ( + Imputer(strategy, value_to_replace=value_to_replace).fit_and_transform(table, column_names) + == expected + ) else: - assert Imputer(strategy).fit_and_transform(table, column_names) == expected + assert ( + Imputer(strategy, value_to_replace=value_to_replace).fit_and_transform(table, column_names) == expected + ) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_should_not_change_original_table(self, strategy: ImputerStrategy) -> None: + def test_should_not_change_original_table(self, strategy: Imputer.Strategy) -> None: table = Table( { "a": [1, None, None], @@ -315,7 +456,7 @@ def test_should_not_change_original_table(self, strategy: ImputerStrategy) -> No assert table == expected @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_added_columns(self, strategy: ImputerStrategy) -> None: + def test_get_names_of_added_columns(self, strategy: Imputer.Strategy) -> None: transformer = Imputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_added_columns() @@ -330,7 +471,7 @@ def test_get_names_of_added_columns(self, strategy: ImputerStrategy) -> None: assert transformer.get_names_of_added_columns() == [] @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_changed_columns(self, strategy: ImputerStrategy) -> None: + def test_get_names_of_changed_columns(self, strategy: Imputer.Strategy) -> None: transformer = Imputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_changed_columns() @@ -344,7 +485,7 @@ def test_get_names_of_changed_columns(self, strategy: ImputerStrategy) -> None: assert transformer.get_names_of_changed_columns() == ["a", "b"] @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_removed_columns(self, strategy: ImputerStrategy) -> None: + def test_get_names_of_removed_columns(self, strategy: Imputer.Strategy) -> None: transformer = Imputer(strategy=strategy) with pytest.raises(TransformerNotFittedError): transformer.get_names_of_removed_columns() @@ -357,80 +498,3 @@ def test_get_names_of_removed_columns(self, strategy: ImputerStrategy) -> None: ) transformer = transformer.fit(table, None) assert transformer.get_names_of_removed_columns() == [] - - -class TestHash: - @pytest.mark.parametrize( - ("strategy1", "strategy2"), - ([(x, y) for x in strategies() for y in strategies() if x.__class__ == y.__class__]), - ids=lambda x: x.__class__.__name__, - ) - def test_should_return_same_hash_for_equal_strategy( - self, - strategy1: ImputerStrategy, - strategy2: ImputerStrategy, - ) -> None: - assert hash(strategy1) == hash(strategy2) - - @pytest.mark.parametrize( - ("strategy1", "strategy2"), - ([(x, y) for x in strategies() for y in strategies() if x.__class__ != y.__class__]), - ids=lambda x: x.__class__.__name__, - ) - def test_should_return_different_hash_for_unequal_strategy( - self, - strategy1: ImputerStrategy, - strategy2: ImputerStrategy, - ) -> None: - assert hash(strategy1) != hash(strategy2) - - -class TestEq: - - @pytest.mark.parametrize( - ("strategy1", "strategy2"), - ([(x, y) for x in strategies() for y in strategies() if x.__class__ == y.__class__]), - ids=lambda x: x.__class__.__name__, - ) - def test_equal_strategy( - self, - strategy1: ImputerStrategy, - strategy2: ImputerStrategy, - ) -> None: - assert strategy1 == strategy2 - - @pytest.mark.parametrize( - "strategy", - ([x for x in strategies() if x.__class__]), - ids=lambda x: x.__class__.__name__, - ) - def test_equal_identity_strategy( - self, - strategy: ImputerStrategy, - ) -> None: - assert strategy == strategy # noqa: PLR0124 - - @pytest.mark.parametrize( - ("strategy1", "strategy2"), - ([(x, y) for x in strategies() for y in strategies() if x.__class__ != y.__class__]), - ids=lambda x: x.__class__.__name__, - ) - def test_unequal_strategy( - self, - strategy1: ImputerStrategy, - strategy2: ImputerStrategy, - ) -> None: - assert strategy1 != strategy2 - - -class TestSizeof: - @pytest.mark.parametrize( - "strategy", - ([Imputer.Strategy.Constant(1)]), - ids=lambda x: x.__class__.__name__, - ) - def test_sizeof_strategy( - self, - strategy: ImputerStrategy, - ) -> None: - assert sys.getsizeof(strategy) > sys.getsizeof(object())