diff --git a/docker/openproblems-python-tf2.4/Dockerfile b/docker/openproblems-python-tf2.4/Dockerfile new file mode 100644 index 0000000000..688aee580d --- /dev/null +++ b/docker/openproblems-python-tf2.4/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# Install Python packages +COPY ./docker/openproblems-python-tf2.4/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-tf2.4/README.md b/docker/openproblems-python-tf2.4/README.md new file mode 100644 index 0000000000..2704835345 --- /dev/null +++ b/docker/openproblems-python-tf2.4/README.md @@ -0,0 +1,14 @@ +# openproblems-python-tf2.4 Docker image + +Base image: singlecellopenproblems/openproblems + +OS: Debian Stretch + +Python: 3.8 + +Python packages: + + +* keras >=2.4,<2.6 +* tensorflow >=2.4,<2.5 +* dca diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt new file mode 100644 index 0000000000..2948af862e --- /dev/null +++ b/docker/openproblems-python-tf2.4/requirements.txt @@ -0,0 +1,4 @@ +dca==0.3.* +keras>=2.4,<2.6 # pinned in dca +pyyaml==5.4.1 # pinned in #431 +tensorflow==2.4.* # pinned in dca diff --git a/openproblems/tasks/denoising/README.md b/openproblems/tasks/denoising/README.md index 41fffaf193..7584024ec5 100644 --- a/openproblems/tasks/denoising/README.md +++ b/openproblems/tasks/denoising/README.md @@ -8,10 +8,10 @@ A key challenge in evaluating denoising methods is the general lack of a ground # The metrics -Metrics for data denoising aim to +Metrics for data denoising aim to assess denoising accuracy by comparing the denoised *training* set to the randomly sampled *test* set. 
-* **TODO**: TODO +* **MSE**: The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset after reweighting by the train/test ratio. +* **Poisson**: The Poisson log likelihood of observing the true counts of the test dataset given the distribution defined by the denoised dataset. ## API diff --git a/openproblems/tasks/denoising/datasets/utils.py b/openproblems/tasks/denoising/datasets/utils.py index b902fe42b7..ec31002e72 100644 --- a/openproblems/tasks/denoising/datasets/utils.py +++ b/openproblems/tasks/denoising/datasets/utils.py @@ -26,6 +26,12 @@ def split_data( X_train, X_test = molecular_cross_validation.util.split_molecules( X, 0.9, 0.0, random_state ) + # remove features (columns) with zero total counts in the training split + is_missing = X_train.sum(axis=0) == 0 + X_train, X_test = X_train[:, ~is_missing], X_test[:, ~is_missing] + + adata = adata[:, ~is_missing].copy() adata.obsm["train"] = scipy.sparse.csr_matrix(X_train).astype(float) adata.obsm["test"] = scipy.sparse.csr_matrix(X_test).astype(float) + return adata diff --git a/openproblems/tasks/denoising/methods/__init__.py b/openproblems/tasks/denoising/methods/__init__.py index 1452b21cc2..a2596a2a70 100644 --- a/openproblems/tasks/denoising/methods/__init__.py +++ b/openproblems/tasks/denoising/methods/__init__.py @@ -1,4 +1,5 @@ from .alra import alra +from .dca import dca from .magic import magic from .magic import magic_approx from .no_denoising import no_denoising diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py new file mode 100644 index 0000000000..eee6c78354 --- /dev/null +++ b/openproblems/tasks/denoising/methods/dca.py @@ -0,0 +1,35 @@ +from ....tools.decorators import method +from ....tools.utils import check_version + +import scanpy as sc + + +def _dca(adata, test=False, epochs=None): + if test: + epochs = epochs or 30 + else: # pragma: nocover + epochs = epochs or 300 + from dca.api import dca + + # make adata 
object with train counts + adata_train = sc.AnnData(adata.obsm["train"]) + # run DCA + dca(adata_train, epochs=epochs) + + # store the matrix held in adata_train.X as the denoised output + adata.obsm["denoised"] = adata_train.X + # record the dca version returned by check_version + adata.uns["method_code_version"] = check_version("dca") + return adata + + +@method( + method_name="DCA", + paper_name="Single-cell RNA-seq denoising using a deep count autoencoder", + paper_url="https://www.nature.com/articles/s41467-018-07931-2", + paper_year=2019, + code_url="https://github.com/theislab/dca", + image="openproblems-python-tf2.4", +) +def dca(adata, test=False, epochs=None): + return _dca(adata, test=test, epochs=epochs)