From 903469a532b96b25ef7d727e687c5b4263966323 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 12 Jul 2022 20:35:36 -0400 Subject: [PATCH] Task denoising method dca (#431) * Create alra.py Add alra.py (includes existing bug). * pre-commit * import alra * pre-commit * set alra version * split up alra oneliner * debug * fix syntax error * pre-commit * use dgCMatrix * output is stored in obsm * remove prints * pre-commit * Update alra.py add to_csr() to fix coo matrix error * fix csr casting * Update alra.py try adding custom exception to catch shape mismatch from ALRA * pre-commit * add ValueError * pre-commit * simplify ValueError to avoid errors * pre-commit * cast to array for MSE Now getting an error in MSE--seems like this was already the case with earlier code, but attempting to fix regardless! * pre-commit * separate error line functions Seems something about ALRA is failing tests. Separate out obsm call to get cleaner traceback * Remove to_array() * pre-commit * try casting to a matrix one more time * notate that wes' ALRA fork must be used instead * pre-commit * source from wes' code * fix URL * shorten line lengths * Check output is ndarray * Fix typo * Return dense data * don't need tocsr now that the data is dense * Return directly to denoised * code cleanup * Revert debugging * Don't edit adata.obsm['train'] * access train_norm * Add warning about editing adata.obsm['train'] * pre-commit * check train and test are not modified * pre-commit * Retry ALRA on failure * pre-commit * Switch t(as.matrix()) order * Check dense data * Return sparse data * Check input data is sparse * Fix typo * pre-commit * Don't send the full AnnData to R * Expect sparse input, dense array output * train and test must be floats * Convert back to float * Fail on final attempt * put the retry inside python * Remove the retry from R * pre-commit * layers['counts'] might not be sparse * pre-commit * Log error each time * import 
logging * pre-commit * Better way to check matrices * pre-commit * fix array equal comparison * add explicit comment * More explicit toarray * Can't check for untouched train/test * Don't import scprep * Just use a fixed target_sum * Sample data should match API * pre-commit * flake8 * no_denoising still needs to densify * convert to csc * pre-commit * Convert to csr * conversion of sparse doesn't work, try anndata * accept sce * pre-commit * Convert to dense * pre-commit * Convert to dense * pre-commit * Try `.tocsr()` * Create dca.py * pre-commit * Create dca.py * pre-commit * add dca * add dca * Update dca.py * Update dca.py update import statement for DCA. Note that the main function, DCA(), might need to share the same name as the overall file (?), i.e. if it is DCA(), the file might need to be DCA.py * pre-commit * Update dca.py * Update dca.py * Delete dca.py * Update requirements.txt * Update __init__.py * pre-commit * Update dca.py Try just importing dca * pre-commit * Update dca.py * pre-commit * put dca import inside method * pre-commit * Update dca.py * Update requirements.txt * pre-commit * Create README.md * Update README.md * Create Dockerfile * Create requirements.txt * pre-commit * Create requirements.txt * pre-commit * remove dca from python-extras readme * fix image specification * remove dca from here * Update Dockerfile * pin dca 0.3* used ==, uncertain if = would've sufficed * Update dca.py * Update __init__.py * Update requirements.txt * Update README.md * Update README.md * Update README.md * Update requirements.txt * Update `check_version` api * Require pyyaml==5.4.1 to prevent kopt error Due to https://github.com/yaml/pyyaml/issues/576 * pre-commit * Fix keras version * Update dca.py Remove scprep normalization commands. 
make adata2 object, which is adata made from just adata.obsm['train'] * pre-commit * Update dca.py * pre-commit * Update dca.py * pre-commit * Update dca.py * pre-commit * Add test args * fix thread count and pass epochs to dca * pre-commit * add in masking * pre-commit * Update README.md * Update README.md * add removezeros and insert_at functions * pre-commit * Update dca.py * pre-commit * Remove zero counts from train data * Remove filtering from DCA * Remove unused code * pre-commit * Don't need a line break * Update utils.py * pre-commit * Use epochs if passed * Fix metric descriptions * don't compute coverage on non-test args Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante Co-authored-by: Scott Gigante Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-tf2.4/Dockerfile | 15 ++++++++ docker/openproblems-python-tf2.4/README.md | 14 ++++++++ .../requirements.txt | 4 +++ openproblems/tasks/denoising/README.md | 6 ++-- .../tasks/denoising/datasets/utils.py | 6 ++++ .../tasks/denoising/methods/__init__.py | 1 + openproblems/tasks/denoising/methods/dca.py | 35 +++++++++++++++++++ 7 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 docker/openproblems-python-tf2.4/Dockerfile create mode 100644 docker/openproblems-python-tf2.4/README.md create mode 100644 docker/openproblems-python-tf2.4/requirements.txt create mode 100644 openproblems/tasks/denoising/methods/dca.py diff --git a/docker/openproblems-python-tf2.4/Dockerfile b/docker/openproblems-python-tf2.4/Dockerfile new file mode 100644 index 0000000000..688aee580d --- /dev/null +++ b/docker/openproblems-python-tf2.4/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# Install Python packages +COPY 
./docker/openproblems-python-tf2.4/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-tf2.4/README.md b/docker/openproblems-python-tf2.4/README.md new file mode 100644 index 0000000000..2704835345 --- /dev/null +++ b/docker/openproblems-python-tf2.4/README.md @@ -0,0 +1,14 @@ +# openproblems-python-tf2.4 Docker image + +Base image: singlecellopenproblems/openproblems + +OS: Debian Stretch + +Python: 3.8 + +Python packages: + + +* keras >=2.4,<2.6 +* tensorflow >=2.4,<2.5 +* dca diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt new file mode 100644 index 0000000000..2948af862e --- /dev/null +++ b/docker/openproblems-python-tf2.4/requirements.txt @@ -0,0 +1,4 @@ +dca==0.3.* +keras>=2.4,<2.6 # pinned in dca +pyyaml==5.4.1 # pinned in #431 +tensorflow==2.4.* # pinned in dca diff --git a/openproblems/tasks/denoising/README.md b/openproblems/tasks/denoising/README.md index 41fffaf193..7584024ec5 100644 --- a/openproblems/tasks/denoising/README.md +++ b/openproblems/tasks/denoising/README.md @@ -8,10 +8,10 @@ A key challenge in evaluating denoising methods is the general lack of a ground # The metrics -Metrics for data denoising aim to +Metrics for data denoising aim to assess denoising accuracy by comparing the denoised *training* set to the randomly sampled *test* set. -* **TODO**: TODO -* **TODO**: TODO +* **MSE**: The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset after reweighting by the train/test ratio. +* **Poisson**: The Poisson log likelihood of observing the true counts of the test dataset given the distribution given in the denoised dataset. 
def _dca(adata, test=False, epochs=None):
    """Denoise the training counts with DCA.

    Runs the deep count autoencoder on ``adata.obsm["train"]`` and stores
    the denoised matrix in ``adata.obsm["denoised"]``.
    """
    # Short training budget for test runs, full budget otherwise.
    if test:
        epochs = epochs or 30
    else:  # pragma: nocover
        epochs = epochs or 300

    # Deferred import: the dca package is only required when this method runs.
    from dca.api import dca

    # Wrap the training counts in their own AnnData, since DCA denoises .X
    # in place.
    train_adata = sc.AnnData(adata.obsm["train"])
    dca(train_adata, epochs=epochs)

    # Copy the denoised matrix back and record the dca package version.
    adata.obsm["denoised"] = train_adata.X
    adata.uns["method_code_version"] = check_version("dca")
    return adata


@method(
    method_name="DCA",
    paper_name="Single-cell RNA-seq denoising using a deep count autoencoder",
    paper_url="https://www.nature.com/articles/s41467-018-07931-2",
    paper_year=2019,
    code_url="https://github.com/theislab/dca",
    image="openproblems-python-tf2.4",
)
def dca(adata, test=False, epochs=None):
    """Denoising method entry point: delegate to the DCA runner."""
    return _dca(adata, test=test, epochs=epochs)