From ab6b1b2169a3741b12c8d281efa6b4d7d5ce5f9c Mon Sep 17 00:00:00 2001
From: Henry Webel
Date: Tue, 22 Oct 2024 20:15:34 +0200
Subject: [PATCH] :construction: fix ci runs (mamba and numpy related) (#81)

* :construction: switch mamba installation
  - see if snakemake envs are somehow cached
* :bug: specify python version, move ls
* :construction: deactivate some workflows, run relative ls command
* try not to cache
* :construction: test using venv created by codespace with python 3.12
  - might be that I need to create it (not sure what changed in runner configurations)
* try to use full snakemake installation
* :construction: use miniconda for pypi installation test
* try miniconda again
  - snakemake environment has its own mamba installation
  - auto-activate environment "test"
* install build dependencies, fix ubuntu first
* :bug: try to pin mamba below 2.0
  - see https://github.com/snakemake/snakemake/issues/3108
* test should be activated by default
* :construction: conda env not activated...
* :bug: pip does not install into the environment
* :construction: experiment
* :bug: shell was not initiated
* :bug: test installing njab separately
* :bug: order matters!
* try again with new order, add umap-learn explicitly
* :bug: do not re-install njab
* restrict scipy (trapz missing in lifelines)
  - latest scipy is not supported by lifelines
* :bug: exclude numpy 2.0 for now
* numpy, try two
* swap numpy and njab, adapt other packages to what they were before
* add back umap-learn, relax constraints
* :construction: in the package requirements, single packages cannot be specified
  to just ignore their dependencies
* :heavy_minus_sign: remove scipy dependency
  - leave it to njab to install dependencies in a second step
* :arrow_up: remove support for python 3.8 (end-of-life)
* :art: setuptools_scm uses tags to determine the version, add tags
* :bug: tags are not fetched without the entire history (a local check is
  sketched at the end of this message)
  - see https://github.com/actions/checkout/issues/1471
* :art: clean up workflow file
* :sparkles: add njab after update to requirements
  - enable more workflows again (using the mamba-constrained snakemake environment)
* :fire: remove comments, :rewind: add back tests
* :bug: make order explicit (by feature frequency, or by bin and bin count)
* :bug: fix order of example more explicitly
* :bug: actually test latest version of pimms, remove comments
* :bug: runs natively in colab without issues
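A quick local check of the tag-based versioning — a minimal sketch, not part
of this patch; it assumes a shallow CI-style clone and that setuptools_scm is
installed:

    # fetch tags plus the full history, as fetch-tags / fetch-depth: 0 do in CI
    git fetch --tags --unshallow
    # print the version setuptools_scm derives from the latest tag
    python -m setuptools_scm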
---
 .github/workflows/ci.yaml                |  34 ++---
 .github/workflows/ci_workflow.yaml       |   7 +-
 .github/workflows/test_pkg_on_colab.yaml |   5 +-
 .github/workflows/workflow_website.yaml  |   4 +-
 .readthedocs.yaml                        |   4 +-
 environment.yml                          |   2 +-
 pimmslearn/imputation.py                 | 162 +----------------------
 pimmslearn/pandas/__init__.py            |   3 +-
 project/workflow/envs/pimms.yaml         |   2 +-
 pyproject.toml                           |  15 +--
 snakemake_env.yml                        |   2 +-
 tests/pandas/test_calc_errors.py         | 105 +++++++--------
 tests/test_imputation.py                 |  41 +-----
 13 files changed, 96 insertions(+), 290 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 7e16f5b60..7cb8b26a3 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -21,23 +21,19 @@ jobs:
           "macos-13",
           # "windows-latest" # rrcovNA cannot be build from source on windows-server
         ]
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - name: Checkout
         uses: actions/checkout@v4
       - name: Set up Miniconda
-        # ! change action https://github.com/mamba-org/setup-micromamba
        uses: conda-incubator/setup-miniconda@v3
         with:
-          miniforge-variant: Mambaforge
-          # miniforge-version: latest
-          use-mamba: true
-          channel-priority: disabled
           python-version: ${{ matrix.python-version }}
+          channel-priority: strict
           environment-file: snakemake_env.yml
           activate-environment: snakemake
           auto-activate-base: true
-          # auto-update-conda: true
+          auto-update-conda: true
       - name: inspect-conda-environment
         run: |
           conda info
           conda list
           conda env export --from-history --no-builds > environment.yml
           conda env export --no-builds
           conda env export --no-builds > environment_w_versions.yml
-      # - name: test-r-kernel-imports
-      #   run: |
-      #     Rscript -e "library(stringi)"
-      #     Rscript -e "library(stringr)"
-      #     Rscript -e "library(reshape2)"
       - name: Dry-Run demo workflow (integration test)
         run: |
           cd project
@@ -75,8 +66,8 @@ jobs:
           name: ${{ matrix.os }}-${{ matrix.python-version }}-example-workflow-results
           path: |
             project/runs/example/
-            environment.yml
-            environment_w_versions.yml
+            snakemake_env
+            project/.snakemake/conda/

  run-unit-local-pip-installation:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-tags: true
+          fetch-depth: 0
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
       - name: install pimms
-        run: python -m pip install .
-
+        run: pip install .
+
       - name: Install pytest
-        run: python -m pip install pytest pytest-cov
+        run: pip install pytest pytest-cov
       - name: Run pytest
         run: pytest .
       - name: Install papermill
-        run: python -m pip install papermill ipykernel
+        run: pip install papermill ipykernel
       - name: View papermill help message for notebooks (as scripts)
         run: |
@@ -141,4 +135,4 @@ jobs:
       - uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/ci_workflow.yaml b/.github/workflows/ci_workflow.yaml
index 20a64f2db..7475ccdc4 100644
--- a/.github/workflows/ci_workflow.yaml
+++ b/.github/workflows/ci_workflow.yaml
@@ -1,4 +1,4 @@
-name: run workflow with conda envs
+name: run workflow (v1) with conda envs
 on:
   push:
     branches: [main, dev]
@@ -31,13 +31,12 @@ jobs:
         # ! change action https://github.com/mamba-org/setup-micromamba
         uses: conda-incubator/setup-miniconda@v3
         with:
-          miniforge-variant: Mambaforge
-          use-mamba: true
-          channel-priority: disabled
+          channel-priority: strict
           python-version: ${{ matrix.python-version }}
           environment-file: snakemake_env.yml
           activate-environment: snakemake
           auto-activate-base: true
+          auto-update-conda: true
       - name: inspect-conda-environment
         run: |
           conda info
diff --git a/.github/workflows/test_pkg_on_colab.yaml b/.github/workflows/test_pkg_on_colab.yaml
index 9fae4d14a..6109c0810 100644
--- a/.github/workflows/test_pkg_on_colab.yaml
+++ b/.github/workflows/test_pkg_on_colab.yaml
@@ -20,11 +20,12 @@ jobs:
       - name: Install pimms-learn (from branch) and papermill
         if: github.event_name == 'pull_request'
         run: |
-          python3 -m pip install pimms-learn papermill
+          pip install .
+          pip install papermill
       - name: Install pimms-learn (from PyPI) and papermill
         if: github.event_name == 'schedule'
         run: |
-          python3 -m pip install pimms-learn papermill
+          pip install pimms-learn papermill
       - name: Run tutorial
         run: |
           cd project
diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml
index 1d794bdd7..aa9fcaac2 100644
--- a/.github/workflows/workflow_website.yaml
+++ b/.github/workflows/workflow_website.yaml
@@ -1,4 +1,4 @@
-name: Build workflow website on public Alzheimer dataset (for protein groups)
+name: Build workflow (v2) website on public Alzheimer dataset (for protein groups)
 on:
   pull_request:
     branches: [main, dev]
@@ -73,4 +73,4 @@ jobs:
         uses: peaceiris/actions-gh-pages@v4
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: project/runs/alzheimer_study/_build/
\ No newline at end of file
+          publish_dir: project/runs/alzheimer_study/_build/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 6e817d6be..3199f225a 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: "3.10"
     # You can also specify other tool versions:
     # nodejs: "19"
     # rust: "1.64"
@@ -32,4 +32,4 @@ python:
     - method: pip
       path: .
       extra_requirements:
-        - docs
\ No newline at end of file
+        - docs
diff --git a/environment.yml b/environment.yml
index aad91462b..7aa4526ed 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,7 +9,7 @@ channels:
   - plotly
   # - defaults
 dependencies:
-  - python>=3.8,<=3.12
+  - python>=3.9,<=3.12
   - numpy
   - pandas>=1
   - scipy>=1.6
diff --git a/pimmslearn/imputation.py b/pimmslearn/imputation.py
index 4dd553ae9..36b065516 100644
--- a/pimmslearn/imputation.py
+++ b/pimmslearn/imputation.py
@@ -5,12 +5,11 @@
 """
 
-from typing import Tuple, Dict
-from sklearn.neighbors import NearestNeighbors
-import scipy
+import logging
+from typing import Dict, Tuple
+
 import numpy as np
 import pandas as pd
-import logging
 
 logger = logging.getLogger(__name__)
 
@@ -18,152 +17,6 @@
 RANDOMSEED = 123
 
 
-def impute_missing(protein_values, mean=None, std=None):
-    """
-    Imputation is based on the mean and standard deviation
-    from the protein_values.
-    If mean and standard deviation (std) are given,
-    missing values are imputed and protein_values are returned imputed.
-    If no mean and std are given, the mean and std are computed from
-    the non-missing protein_values.
-
-    Parameters
-    ----------
-    protein_values: Iterable
-    mean: float
-    std: float
-
-    Returns
-    ------
-    protein_values: pandas.Series
-    """
-    raise NotImplementedError('Will be the main function combining features')
-    # clip by zero?
-
-
-def _select_data(data: pd.DataFrame, threshold: float):
-    """Select (protein-) columns for imputation.
-
-    Based on the threshold representing the minimum proportion of available
-    data per protein, the columns of a `pandas.DataFrame` are selected.
-
-    Parameters
-    ----------
-    data: pandas.DataFrame
-    threshold: float
-        Threshold of percentage of non-missing values to select a column/feature.
-    """
-    columns_to_impute = data.notnull().mean() >= threshold
-    return columns_to_impute
-
-
-def _sparse_coo_array(data: pd.DataFrame):
-    """Return a sparse scipy matrix from dense `pandas.DataFrame` with many
-    missing values.
-    """
-    indices = np.nonzero(~np.isnan(data.to_numpy()))
-    data_selected_sparse = data.to_numpy()
-    data_selected_sparse = scipy.sparse.coo_matrix(
-        (data_selected_sparse[indices], indices),
-        shape=data_selected_sparse.shape)
-    return data_selected_sparse
-
-
-def _get_weighted_mean(distances, data):
-    """Compute weighted mean ignoring
-    identical entries"""
-    mask = distances > 0.0
-    weights = distances[mask] / distances[mask].sum()
-    weighted_sum = data.loc[mask].mul(weights, axis=0)
-    mean_imputed = weighted_sum.sum() / sum(mask)
-    return mean_imputed
-
-
-# define imputation methods
-# could be done in PCA transformed space
-def imputation_KNN(data, alone=True, threshold=0.5):
-    """
-
-
-    Parameters
-    ----------
-    data: pandas.DataFrame
-    alone: bool  # is not used
-    threshold: float
-        Threshold of missing data by column in interval (0, 1)
-    """
-    mask_selected = _select_data(data=data, threshold=threshold)
-    data_selected = data.loc[:, mask_selected].copy()
-    data_selected_sparse = _sparse_coo_array(data_selected)
-    # impute
-    knn_fitted = NearestNeighbors(n_neighbors=3, algorithm='brute').fit(
-        data_selected_sparse)
-    fit_distances, fit_neighbors = knn_fitted.kneighbors(data_selected_sparse)
-    for i, (distances, ids) in enumerate(zip(fit_distances, fit_neighbors)):
-        mean_imputed = _get_weighted_mean(distances, data_selected.loc[ids])
-        if all(distances == 0.0):
-            logger.warning(f"Did not find any neighbor for int-id: {i}")
-        else:
-            assert i == ids[distances == 0.0], (
-                "None or more then one identical data points "
-                "for ids: {}".format(ids[distances == 0.0])
-            )
-        mask = data_selected.iloc[i].isna()
-        data_selected.loc[i, mask] = mean_imputed.loc[mask]  # SettingWithCopyError
-
-    data.update(data_selected)
-    return data
-
-
-def imputation_normal_distribution(log_intensities: pd.Series,
-                                   mean_shift=1.8,
-                                   std_shrinkage=0.3,
-                                   copy=True):
-    """Impute missing log-transformed intensity values of a single feature.
-    Samples one value for imputation for all samples.
-
-    Parameters
-    ----------
-    log_intensities: pd.Series
-        Series of normally distributed values of a single feature (for all samples/runs).
-        Here usually log-transformed intensities.
-    mean_shift: integer, float
-        Shift the mean of the log_intensities by factors of their standard
-        deviation to the negative.
-    std_shrinkage: float
-        Value greater than zero by which to shrink (or inflate) the
-        standard deviation of the log_intensities.
-    """
-    np.random.seed(RANDOMSEED)
-    if not isinstance(log_intensities, pd.Series):
-        try:
-            log_intensities.Series(log_intensities)
-            logger.warning("Series created of Iterable.")
-        except BaseException:
-            raise ValueError(
-                "Plese provided data which is a pandas.Series or an Iterable")
-    if mean_shift < 0:
-        raise ValueError(
-            "Please specify a positive float as the std.-dev. is non-negative.")
-    if std_shrinkage <= 0:
-        raise ValueError(
-            "Please specify a positive float as shrinkage factor for std.-dev.")
-    if std_shrinkage >= 1:
-        logger.warning("Standard Deviation will increase for imputed values.")
-
-    mean = log_intensities.mean()
-    std = log_intensities.std()
-
-    mean_shifted = mean - (std * mean_shift)
-    std_shrinked = std * std_shrinkage
-
-    if copy:
-        log_intensities = log_intensities.copy(deep=True)
-
-    return log_intensities.where(log_intensities.notna(),
-                                 np.random.normal(mean_shifted, std_shrinked))
-
-
 def impute_shifted_normal(df_wide: pd.DataFrame,
                           mean_shift: float = 1.8,
                           std_shrinkage: float = 0.3,
@@ -224,15 +77,6 @@ def impute_shifted_normal(df_wide: pd.DataFrame,
     return imputed_shifted_normal
 
 
-def imputation_mixed_norm_KNN(data):
-    # impute columns with less than 50% missing values with KNN
-    data = imputation_KNN(data, alone=False)  # ToDo: Alone is not used.
-    # impute remaining columns based on the distribution of the protein
-    data = imputation_normal_distribution(
-        data, mean_shift=1.8, std_shrinkage=0.3)
-    return data
-
-
 def compute_moments_shift(observed: pd.Series, imputed: pd.Series,
                           names: Tuple[str, str] = ('observed', 'imputed')) -> Dict[str, float]:
     """Summary of overall shift of mean and std. dev. of predictions for a imputation method."""
diff --git a/pimmslearn/pandas/__init__.py b/pimmslearn/pandas/__init__.py
index 4be42b68d..fa69cd7af 100644
--- a/pimmslearn/pandas/__init__.py
+++ b/pimmslearn/pandas/__init__.py
@@ -7,7 +7,8 @@
 import omegaconf
 import pandas as pd
 
-from pimmslearn.pandas.calc_errors import calc_errors_per_feat, get_absolute_error
+from pimmslearn.pandas.calc_errors import (calc_errors_per_feat,
+                                           get_absolute_error)
 
 __all__ = [
     'calc_errors_per_feat',
diff --git a/project/workflow/envs/pimms.yaml b/project/workflow/envs/pimms.yaml
index 9d1c927f3..a2ab6f0a6 100644
--- a/project/workflow/envs/pimms.yaml
+++ b/project/workflow/envs/pimms.yaml
@@ -9,7 +9,7 @@ channels:
   - plotly
   # - defaults
 dependencies:
-  - python>=3.8,<=3.12
+  - python>=3.9,<=3.12
   - numpy
   - pandas>=1
   - scipy>=1.6
diff --git a/pyproject.toml b/pyproject.toml
index 571d9cb63..73bcb27f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ name = "pimms-learn"
 # See the section below: [tools.setuptools.dynamic]
 dynamic = ["version"]
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 # These are keywords
 classifiers = [
     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
@@ -16,20 +16,17 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
 dependencies = [
-    "njab>=0.0.8",
-    "numpy",
-    "matplotlib",
     "pandas",
-    "plotly",
+    "numpy",
     "torch",
+    "fastai",
     "scikit-learn>=1.0",
-    "scipy",
     "seaborn",
-    "fastai",
+    "matplotlib",
+    "plotly",
     # not used in library, but workflow
     "omegaconf",
-    "tqdm",
-    "mrmr-selection",
     "pingouin",
+    "njab>=0.1"
 ]

 [project.scripts]
diff --git a/snakemake_env.yml b/snakemake_env.yml
index 7713b7b18..fc0ab023f 100644
--- a/snakemake_env.yml
+++ b/snakemake_env.yml
@@ -5,4 +5,4 @@ channels:
   - defaults
 dependencies:
   - snakemake-minimal
-  - mamba
+  - mamba<2.0
diff --git a/tests/pandas/test_calc_errors.py b/tests/pandas/test_calc_errors.py
index 63b47adad..8af9ffd6f 100644
--- a/tests/pandas/test_calc_errors.py
+++ b/tests/pandas/test_calc_errors.py
@@ -24,11 +24,9 @@ def example_data():
                         columns=['observed'] + ['model_' + str(i + 1) for i in range(4)])
     data.columns.name = 'model'
     data.index.name = 'feat'
-    data['freq_feat'] = [4, 5, 5, 4, 6, 7, 7, 9, 8, 6]
+    data['freq_feat'] = [4, 4, 5, 5, 5, 6, 7, 9, 8, 6]
     return data
 
-# %%
-
 
 def test_get_absolute_error(example_data):
     expected = {'feat': {0: 'feat_0',
@@ -89,58 +87,57 @@ def test_get_absolute_error(example_data):
 def test_calc_errors_per_feat(example_data):
     expected = {'feat': {0: 'feat_0',
-                         1: 'feat_1',
-                         2: 'feat_0',
+                         1: 'feat_0',
+                         2: 'feat_1',
                          3: 'feat_1',
                          4: 'feat_1',
-                         5: 'feat_6',
-                         6: 'feat_2',
+                         5: 'feat_2',
+                         6: 'feat_6',
                          7: 'feat_3',
                          8: 'feat_5',
                          9: 'feat_4'},
             'model_1': {0: 1.0836015099999994,
-                        1: 0.38399649333333247,
-                        2: 1.0836015099999994,
+                        1: 1.0836015099999994,
+                        2: 0.38399649333333247,
                         3: 0.38399649333333247,
                         4: 0.38399649333333247,
-                        5: 0.3581477100000008,
-                        6: 1.0785032900000004,
+                        5: 1.0785032900000004,
+                        6: 0.3581477100000008,
                         7: 0.5197284500000023,
                         8: 0.35989225000000147,
                         9: 0.25562937999999846},
             'model_2': {0: 0.6558889949999998,
-                        1: 0.30025493000000125,
-                        2: 0.6558889949999998,
+                        1: 0.6558889949999998,
+                        2: 0.30025493000000125,
                         3: 0.30025493000000125,
                         4: 0.30025493000000125,
-                        5: 0.10481768000000002,
-                        6: 0.6079609700000006,
+                        5: 0.6079609700000006,
+                        6: 0.10481768000000002,
                         7: 0.48225405000000166,
                         8: 0.3109490500000014,
                         9: 0.24097977999999998},
             'model_3': {0: 1.8424256349999997,
-                        1: 0.3030794033333339,
-                        2: 1.8424256349999997,
+                        1: 1.8424256349999997,
+                        2: 0.3030794033333339,
                         3: 0.3030794033333339,
                         4: 0.3030794033333339,
-                        5: 0.025569629999999677,
-                        6: 1.3011469200000008,
+                        5: 1.3011469200000008,
+                        6: 0.025569629999999677,
                         7: 0.6282909300000021,
                         8: 0.749302710000002,
                         9: 0.04352294999999984},
             'model_4': {0: 1.3207320749999987,
-                        1: 0.6042852166666677,
-                        2: 1.3207320749999987,
+                        1: 1.3207320749999987,
+                        2: 0.6042852166666677,
                         3: 0.6042852166666677,
                         4: 0.6042852166666677,
-                        5: 0.1415143900000011,
-                        6: 1.2042582899999985,
+                        5: 1.2042582899999985,
+                        6: 0.1415143900000011,
                         7: 0.8281038200000026,
                         8: 0.5444545000000005,
                         9: 0.06842009000000004},
-            'freq_feat': {0: 4, 1: 4, 2: 5, 3: 5, 4: 6, 5: 6, 6: 7, 7: 7, 8: 8, 9: 9},
-            'n_obs': {0: 2, 1: 3, 2: 2, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}}
-
+            'freq_feat': {0: 4, 1: 4, 2: 5, 3: 5, 4: 5, 5: 6, 6: 6, 7: 7, 8: 8, 9: 9},
+            'n_obs': {0: 2, 1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}}
     actual = calc_errors.calc_errors_per_feat(
         pred=example_data.drop('freq_feat', axis=1),
         freq_feat=example_data['freq_feat']).reset_index().to_dict()
     assert actual == expected
@@ -151,56 +148,56 @@ def test_calc_errors_per_feat(example_data):
 def test_calc_errors_per_bin(example_data):
     expected = {'feat': {0: 'feat_0',
                          1: 'feat_0',
                          2: 'feat_1',
-                         3: 'feat_2',
-                         4: 'feat_3',
+                         3: 'feat_5',
+                         4: 'feat_2',
                          5: 'feat_4',
-                         6: 'feat_5',
+                         6: 'feat_3',
                          7: 'feat_1',
-                         8: 'feat_1',
-                         9: 'feat_6'},
+                         8: 'feat_6',
+                         9: 'feat_1'},
             'model_1': {0: 1.7588900899999977,
                         1: 0.408312930000001,
                         2: 0.03497017999999841,
-                        3: 1.0785032900000004,
-                        4: 0.5197284500000023,
+                        3: 0.35989225000000147,
+                        4: 1.0785032900000004,
                         5: 0.25562937999999846,
-                        6: 0.35989225000000147,
+                        6: 0.5197284500000023,
                         7: 0.31798253999999915,
-                        8: 0.7990367599999999,
-                        9: 0.3581477100000008},
+                        8: 0.3581477100000008,
+                        9: 0.7990367599999999},
             'model_2': {0: 0.9619296899999981,
                         1: 0.34984830000000144,
                         2: 0.04799503999999999,
-                        3: 0.6079609700000006,
-                        4: 0.48225405000000166,
+                        3: 0.3109490500000014,
+                        4: 0.6079609700000006,
                         5: 0.24097977999999998,
-                        6: 0.3109490500000014,
+                        6: 0.48225405000000166,
                         7: 0.055784630000001556,
-                        8: 0.7969851200000022,
-                        9: 0.10481768000000002},
+                        8: 0.10481768000000002,
+                        9: 0.7969851200000022},
             'model_3': {0: 2.9334374200000006,
                         1: 0.7514138499999987,
                         2: 0.023260270000001526,
-                        3: 1.3011469200000008,
-                        4: 0.6282909300000021,
+                        3: 0.749302710000002,
+                        4: 1.3011469200000008,
                         5: 0.04352294999999984,
-                        6: 0.749302710000002,
+                        6: 0.6282909300000021,
                         7: 0.18840471000000036,
-                        8: 0.6975732299999997,
-                        9: 0.025569629999999677},
+                        8: 0.025569629999999677,
+                        9: 0.6975732299999997},
             'model_4': {0: 2.1805211699999987,
                         1: 0.46094297999999867,
                         2: 0.1140570700000012,
-                        3: 1.2042582899999985,
-                        4: 0.8281038200000026,
+                        3: 0.5444545000000005,
+                        4: 1.2042582899999985,
                         5: 0.06842009000000004,
-                        6: 0.5444545000000005,
+                        6: 0.8281038200000026,
                         7: 0.7145071600000001,
-                        8: 0.9842914200000017,
-                        9: 0.1415143900000011},
+                        8: 0.1415143900000011,
+                        9: 0.9842914200000017},
-            'bin': {0: 25, 1: 30, 2: 31, 3: 26, 4: 29, 5: 29, 6: 26, 7: 28, 8: 28, 9: 28},
+            'bin': {0: 25, 1: 30, 2: 31, 3: 26, 4: 26, 5: 29, 6: 29, 7: 28, 8: 28, 9: 28},
             'n_obs': {0: 1, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3}}
-    actual = calc_errors.calc_errors_per_bin(
-        example_data.drop('freq_feat', axis=1)).reset_index().to_dict()
-
+    actual = calc_errors.calc_errors_per_bin(
+        example_data.drop('freq_feat', axis=1)
+    ).sort_values(["n_obs", "bin", "model_1"]).reset_index().to_dict()
     assert actual == expected
diff --git a/tests/test_imputation.py b/tests/test_imputation.py
index 0c98f77bd..61cd3068f 100644
--- a/tests/test_imputation.py
+++ b/tests/test_imputation.py
@@ -1,9 +1,3 @@
-from pathlib import Path
-import numpy as np
-import pandas as pd
-import pytest
-
-from pimmslearn.imputation import imputation_KNN, imputation_normal_distribution, impute_shifted_normal
 """
 # Test Data set was created from a sample by shuffling:
 
 import numpy
 import pandas
 
 fpath = 'tests/test_data.csv'
 data = pandas.read_csv(fpath, index_col='id')
 data.apply(numpy.random.shuffle, axis=1)
 data.to_csv('test_data.csv')
 """
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from pimmslearn.imputation import impute_shifted_normal
 
 
 @pytest.fixture
@@ -29,34 +30,6 @@ def example_data():
     example_data_path = Path(__file__).resolve().parent / 'test_data.csv'
     return pd.read_csv(example_data_path, index_col='id')
 
-# def test_impute_missing():
-#     pass
-
-
-def test_imputation_KNN(example_data):
-    threshold = 0.55
-    data = example_data.copy()
-    data_transformed = imputation_KNN(data, threshold=threshold)
-    columns_to_impute = data.notnull().mean() >= threshold
-    columns_to_impute = columns_to_impute[columns_to_impute].index
-    assert all(data_transformed.loc[:, columns_to_impute].isna().sum() < 15)
-    n_not_to_impute = data.loc[:,
-                               data.notnull().mean() < threshold].isna().sum()
-    assert all(data_transformed.loc[:, n_not_to_impute.index].isna().sum()
-               == n_not_to_impute)
-
-
-def test_imputation_normal_dist():
-    log_intensities = pd.Series([26.0, np.nan, 24.0, 25.0, np.nan])
-    imputed = imputation_normal_distribution(log_intensities)
-    imputed = round(imputed, ndigits=5)
-    assert imputed.equals(
-        pd.Series([26.0, 22.87431, 24.0, 25.0, 22.87431])
-    )
-
-# def test_imputation_mixed_norm_KNN():
-#     pass
-
 
 @pytest.mark.parametrize('axis', [0, 1])
 def test_impute_shifted_normal(example_data, axis):