diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..9591dc03
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,16 @@
+version: 2
+
+build:
+ os: ubuntu-20.04
+ tools:
+ python: "3.9"
+
+sphinx:
+ configuration: docs/source/conf.py
+
+python:
+ install:
+ - requirements: docs/requirements.txt
+ - requirements: requirements.txt
+ - method: pip
+ path: .
diff --git a/README.md b/README.md
index a4f16fd8..eb15abfa 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ The code in this repository can be used to run our Multi-Omics Variational
autoEncoder (MOVE) framework for integration of omics and clinical variabels
spanning both categorial and continuous data. Our approach includes training
ensemble VAE models and using *in silico* perturbation experiments to identify
-cross omics associations. The manuscript has been accepted and we will provide
+cross omics associations. The manuscript has been accepted and we will provide
the link when it is published.
We developed the method based on a Type 2 Diabetes cohort from the IMI DIRECT
@@ -68,29 +68,8 @@ MOVE has five-six steps:
## How to run MOVE
-You can run the move-dl pipeline from the command line or within a Jupyter
-notebook.
-
-You can run MOVE as Python module with the following command. Details on how
-to set up the configuration for the data and task can be found our
-[tutorial](https://github.com/RasmussenLab/MOVE/tree/main/tutorial) folder.
-
-```bash
->>> move-dl data=[name of data config] task=[name of task config]
-```
-
-Feel free to
-[open an issue](https://github.com/RasmussenLab/MOVE/issues/new/choose) if you
-need any help.
-
-### How to use MOVE with your data
-
-Your data files should be tab separated, include a header and the first column
-should be the IDs of your samples. The configuration of MOVE is done using YAML
-files that describe the input data and the task specification. These should be
-placed in a `config` directory in the working directory. Please see the
-[tutorial](https://github.com/RasmussenLab/MOVE/tree/main/tutorial)
-for more information.
+Please refer to our [**documentation**](https://move-dl.readthedocs.io/) for
+examples and tutorials on how to run MOVE.
# Data sets
@@ -110,5 +89,13 @@ available [here](https://directdiabetes.org).
## Simulated and publicaly available data sets
-We have therefore provided two datasets to test the workflow: a simulated
+We have therefore provided two datasets to test the workflow: a simulated
dataset and a publicly-available maize rhizosphere microbiome data set.
+
+# Citation
+
+To cite MOVE, use the following information:
+
+Allesøe, R.L., Lundgaard, A.T., Hernández Medina, R. et al. Discovery of
+drug–omics associations in type 2 diabetes with generative deep-learning models.
+*Nat Biotechnol* (2023). https://doi.org/10.1038/s41587-022-01520-x
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 4676a887..1478543a 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,2 +1,4 @@
sphinx==5.3.0
-sphinx_rtd_theme=1.1.1
\ No newline at end of file
+sphinx-rtd-theme
+sphinx-autodoc-typehints
+sphinxemoji
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 56619bf6..0c127b86 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -11,18 +11,23 @@
sys.path.insert(0, str(Path("../src").resolve()))
-project = "move-dl"
-copyright = "2022, Valentas Brasas, Ricardo Hernandez Medina"
-author = "Valentas Brasas, Ricardo Hernandez Medina"
-release = "1.0.0"
+import move
+
+project = "MOVE"
+copyright = "2022, Rasmussen Lab"
+author = "Rasmussen Lab"
+release = ".".join(map(str, move.__version__))
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"sphinx.ext.autodoc",
+ "sphinx.ext.autosectionlabel",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
+ "sphinx_autodoc_typehints",
+ "sphinxemoji.sphinxemoji",
]
templates_path = ["_templates"]
@@ -32,6 +37,9 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"
+html_theme_options = {
+ "collapse_navigation" : False,
+}
html_static_path = []
# -- Napoleon settings --------------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8f6a769a..abe3dca0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,15 +1,22 @@
-.. move-dl documentation master file, created by
- sphinx-quickstart on Sat Nov 5 15:48:56 2022.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to move-dl's documentation!
-===================================
+Welcome to MOVE's documentation!
+================================
.. toctree::
+ :hidden:
:maxdepth: 1
:caption: Contents:
- pages/installation
- pages/tutorial
- pages/api/API
+ install
+ method
+ tutorial/index
+
+MOVE (**m**\ ulti-\ **o**\ mics **v**\ ariational auto\ **e**\ ncoder) is a
+framework for integration of omics and other data modalities (including both
+categorical and continuous data). Our approach consists of training an ensemble
+of VAE (variational autoencoder) models and performing *in silico* perturbation
+experiments to identify associations across the different omics datasets.
+
+We invite you to read `our publication`_ presenting this method, or read
+about the method :doc:`here <method>`.
+
+.. _`our publication`: https://www.nature.com/articles/s41587-022-01520-x
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 00000000..901f72e7
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,47 @@
+Install
+=======
+
+MOVE is distributed as ``move-dl``, a Python package.
+
+It requires Python 3.9 (or later) and third-party libraries, such as `PyTorch`_
+and `Hydra`_. These dependencies will be installed automatically when you
+install with ``pip``.
+
+Install the stable version
+--------------------------
+
+We recommend installing ``move-dl`` in a fresh virtual environment. If you wish
+to learn how to create and manage virtual environments with Conda, please
+follow `these instructions`_.
+
+The latest stable version of ``move-dl`` can be installed with ``pip``.
+
+.. code-block:: bash
+
+ >>> pip install move-dl
+
+Install the development version
+-------------------------------
+
+If you wish to install the development version of ``move-dl``, create a new
+virtual environment, and do:
+
+.. code-block:: bash
+
+ >>> pip install git+https://github.com/RasmussenLab/MOVE@developer
+
+Alternatively, you can clone ``move-dl`` from `GitHub`_ and install by
+running the following command from the top-level source directory:
+
+.. code-block:: bash
+
+ >>> pip install -e .
+
+The ``-e`` flag installs the project in "editable" mode, so you can follow the
+development branch and update your installation by pulling from GitHub.
+
+.. _PyTorch: https://pytorch.org/
+.. _Hydra: https://hydra.cc/
+.. _GitHub: https://github.com/RasmussenLab/MOVE
+
+.. _these instructions: https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html
diff --git a/docs/source/method.rst b/docs/source/method.rst
new file mode 100644
index 00000000..d9982600
--- /dev/null
+++ b/docs/source/method.rst
@@ -0,0 +1,98 @@
+About the method
+================
+
+MOVE is based on the VAE (variational autoencoder) model, a deep learning model
+that transforms high-dimensional data into a lower-dimensional space (so-called
+latent representation). The autoencoder is made up of two neural networks: an
+encoder, which compresses the input variables; and a decoder, which tries to
+reconstruct the original input from the compressed representation. In doing so,
+the model learns the structure and associations between the input variables.
+
+In `our publication`_, we used this type of model to integrate different data
+modalities, including: genomics, transcriptomics, proteomics, metabolomics,
+microbiomes, medication data, diet questionnaires, and clinical measurements.
+Once we obtained a trained model, we exploited the decoder network to identify
+cross-omics associations.
+
+Our approach consists of performing *in silico* perturbations of the original
+data and using either univariate statistical methods or Bayesian decision
+theory to identify significant differences between the reconstruction with or
+without perturbation. Thus, we are able to detect associations between the
+input variables.
+
+.. _`our publication`: https://www.nature.com/articles/s41587-022-01520-x
+
+.. image:: method/fig1.svg
+
+VAE design
+-----------
+
+The VAE was designed to account for a variable number of fully-connected hidden
+layers in both encoder and decoder. Each hidden layer is followed by batch
+normalization, dropout, and a leaky rectified linear unit (leaky ReLU).
+
+To integrate different modalities, each dataset is reshaped and concatenated
+into an input matrix. Moreover, error calculation is done on a dataset
+basis: binary cross-entropy for binary and categorical datasets and mean squared
+error for continuous datasets. Each error :math:`E_i` is then multiplied by a
+given weight :math:`W_i` and added up to form the loss function:
+
+:math:`L = \sum_i W_i E_i + W_\textnormal{KL} D_\textnormal{KL}`
+
+Note that the :math:`D_\textnormal{KL}` (Kullback–Leibler divergence) penalizes
+deviance of the latent representation from the standard normal distribution. It
+is also subject to a weight :math:`W_\textnormal{KL}`, which warms up as the
+model is trained.
+
+Extracting associations
+-----------------------
+
+After determining the right set of hyperparameters, associations are extracted
+by perturbing the original input data and passing it through an ensemble of
+trained models. The reason behind using an ensemble is that VAE models are
+stochastic, so we need to ensure that the results we obtain are not a product
+of chance.
+
+We perturbed categorical data by changing its value from one category to
+another (e.g., drug status changed from "not received" to "received"). Then, we
+compare the change between the reconstruction generated from the original data
+and the perturbed data. To achieve this, we proposed two approaches: using
+*t*\ -test and Bayes factors. Both are described below:
+
+MOVE *t*\ -test
+^^^^^^^^^^^^^^^
+
+#. Perturb a variable in one dataset.
+#. Repeat 10 times for 4 different latent space sizes:
+
+ #. Train VAE model with original data.
+ #. Obtain reconstruction of original data (baseline reconstruction).
+ #. Obtain 10 additional reconstructions of original data and calculate
+ difference from the first (baseline difference).
+ #. Obtain reconstruction of perturbed data (perturbed reconstruction) and
+ subtract from baseline reconstruction (perturbed difference).
+ #. Compute p-value between baseline and perturbed differences with t-test.
+
+#. Correct p-values using Bonferroni method.
+#. Select features that are significant (p-value lower than 0.05).
+#. Select significant features that overlap in at least half of the refits and
+ 3 out of 4 architectures. These features are associated with the
+ perturbed variable.
+
+MOVE Bayes
+^^^^^^^^^^
+
+#. Perturb a variable in one dataset.
+#. Repeat 30 times:
+
+ #. Train VAE model with original data.
+ #. Obtain reconstruction of original data (baseline reconstruction).
+ #. Obtain reconstruction of perturbed data (perturbed reconstruction).
+ #. Record difference between baseline and perturbed reconstruction.
+
+#. Compute probability of difference being greater than 0.
+#. Compute Bayes factor from probability: :math:`K = \log p - \log (1 - p)`.
+#. Sort probabilities by Bayes factor, from highest to lowest.
+#. Compute false discovery rate (FDR) as cumulative evidence.
+#. Select features whose FDR is above desired threshold (e.g., 0.05). These
+ features are associated with the perturbed variable.
\ No newline at end of file
diff --git a/docs/source/method/fig1.svg b/docs/source/method/fig1.svg
new file mode 100644
index 00000000..87654877
--- /dev/null
+++ b/docs/source/method/fig1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/source/pages/installation.rst b/docs/source/pages/installation.rst
deleted file mode 100644
index 11e44375..00000000
--- a/docs/source/pages/installation.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Installation
-============
diff --git a/docs/source/pages/tutorial.rst b/docs/source/pages/tutorial.rst
deleted file mode 100644
index d51454e0..00000000
--- a/docs/source/pages/tutorial.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Tutorial(s)
-============
\ No newline at end of file
diff --git a/docs/source/tutorial/data_preparation.rst b/docs/source/tutorial/data_preparation.rst
new file mode 100644
index 00000000..06690ffe
--- /dev/null
+++ b/docs/source/tutorial/data_preparation.rst
@@ -0,0 +1,191 @@
+Data preparation
+================
+
+In this tutorial, we explain how to make your data compatible with the
+``move-dl`` commands.
+
+For this tutorial we will work with a dataset taken from Walters et al. (2008)
+[#]_. In their work, they report soil microbiome census data along with
+environmental data (e.g., temperature and precipitation) of different cultivars
+of maize.
+
+We will start by downloading the files corresponding to their `OTU table`_
+and `metadata`_.
+
+Formatting omics data
+---------------------
+
+The ``move-dl`` pipeline requires continuous omics input to be formatted as a
+TSV file with one column per feature and one row per sample.
+
+If we load the microbiome OTU table from the maize rhizosphere dataset, it will
+look something like this:
+
+.. table:: Original OTU table
+
+ ======== ====================== ====================== ======================
+ otuids 11116.C02A66.1194587 11116.C06A63.1195666 11116.C08A61.1197689
+ ======== ====================== ====================== ======================
+ 4479944 70 8 18
+ 513055 2 16 1
+ 519510 22 15 12
+ 810959 5 0 3
+ 849092 5 2 1
+ ======== ====================== ====================== ======================
+
+We have columns corresponding to samples and rows corresponding to features
+(OTUs), so we need to **transpose** this table for MOVE.
+
+.. table:: Transposed OTU table
+
+ ==================== ========= ======== ======== ======== ========
+ sampleids 4479944 513055 519510 810959 849092
+ ==================== ========= ======== ======== ======== ========
+ 11116.C02A66.1194587 70 2 22 5 5
+ 11116.C06A63.1195666 8 16 15 0 2
+ 11116.C08A61.1197689 18 1 12 3 1
+ ==================== ========= ======== ======== ======== ========
+
+Now, we can save our table as a TSV and we are ready to go. No need to do any
+further processing.
+
+Formatting other continuous data
+--------------------------------
+
+Other non-omics continuous data is formatted in a similar way.
+
+For this tutorial, we are going to extract some continuous data from the maize
+metadata table. Let us load the table and take a peek:
+
+.. table:: Original metadata table
+
+ ==================== ==================== ========= =============== ==============
+ X.SampleID Precipitation3Days INBREDS Maize_Line Description1
+ ==================== ==================== ========= =============== ==============
+ 11116.C02A66.1194587 0.14 Oh7B Non_Stiff_Stalk rhizosphere
+ 11116.C06A63.1195666 0.14 P39 Sweet_Corn rhizosphere
+ 11116.C08A61.1197689 0.14 CML333 Tropical rhizosphere
+ 11116.C08A63.1196825 0.14 CML333 Tropical rhizosphere
+ 11116.C12A64.1197667 0.14 Il14H Sweet_Corn rhizosphere
+ ==================== ==================== ========= =============== ==============
+
+The original metadata table contains both categorical (e.g., ``Maize_Line``)
+and continuous data (e.g., ``Precipitation3Days``). We need to separate these
+into different files.
+
+In this example, we select three columns: ``age``, ``Precipitation3Days``, and
+``Temperature``.
+
+.. table:: Extracted continuous data
+
+ ==================== ===== ============= ====================
+ X.SampleID age Temperature Precipitation3Days
+ ==================== ===== ============= ====================
+ 11116.C02A66.1194587 12 76 0.14
+ 11116.C06A63.1195666 12 76 0.14
+ 11116.C08A61.1197689 12 76 0.14
+ 11116.C08A63.1196825 12 76 0.14
+ 11116.C12A64.1197667 12 76 0.14
+ ==================== ===== ============= ====================
+
+Once again, we can save this table as a TSV, and we are ready to continue.
+
+Formatting categorical data
+---------------------------
+
+Categorical data like binary variables (e.g., with/without treatment) or
+discrete categories needs to be formatted in individual files.
+
+The metadata table contains several discrete variables that can be useful for
+classification, such as maize line, cultivar, and type of soil. For each one of
+these, we need to create a separate TSV file that will look something like:
+
+.. table:: Extracted maize line data
+
+ ==================== ===============
+ X.SampleID Maize_Line
+ ==================== ===============
+ 11116.C02A66.1194587 Non_Stiff_Stalk
+ 11116.C06A63.1195666 Sweet_Corn
+ 11116.C08A61.1197689 Tropical
+ 11116.C08A63.1196825 Tropical
+ 11116.C12A64.1197667 Sweet_Corn
+ ==================== ===============
+
+Creating a data config file
+---------------------------
+
+We are missing two components to make our data compatible with ``move-dl``.
+First, we need to create an additional text file with all the sample IDs (one
+ID per line, see example below). This file tells MOVE which samples to use, so
+the IDs in this file must be present in all the other input files.
+
+.. code-block:: text
+ :caption: Maize sample IDs
+
+ 11116.C02A66.1194587
+ 11116.C06A63.1195666
+ 11116.C08A61.1197689
+ 11116.C08A63.1196825
+ 11116.C12A64.1197667
+
+Finally, we need to create a data config YAML file. The purpose of this file is
+to tell MOVE which files to load, where to find them, and where to save any
+output files.
+
+The data config file for this tutorial would look like this:
+
+.. literalinclude:: /../../tutorial/config/data/maize.yaml
+ :language: yaml
+
+Here we break down the fields of this file:
+
+* ``defaults`` indicates this file is a config file. It should be left intact.
+* ``raw_data_path`` points to the raw data location (i.e., the files we
+ created in this tutorial).
+* ``interim_data_path`` points to the directory where intermediary files will
+ be deposited.
+* ``results_path`` points to the folder where results will be saved.
+* ``sample_names`` is the file name of the file containing all valid sample
+ IDs. This file must have a ``txt`` extension.
+* ``categorical_inputs`` is a list of file names containing categorical data.
+ Each element of the list should have a name ``name`` and may optionally have
+ a ``weight``. All referenced files should have a ``tsv`` extension.
+* ``continuous_inputs`` lists the continuous data files. Same format as
+ ``categorical_inputs``.
+
+The data config file can have any name, but it must be saved in ``config/data``
+directory. The final workspace structure should look like this::
+
+ tutorial/
+ │
+ ├── maize/
+ │ └── data/
+ │ ├── maize_field.tsv <- Type of soil data
+ │ ├── maize_ids.txt <- Sample IDs
+ │ ├── maize_line.tsv <- Maize line data
+ │ ├── maize_metadata.tsv <- Age, temperature, precipitation data
+ │ ├── maize_microbiome.tsv <- OTU table
+ │ └── maize_variety.tsv <- Maize variety data
+ │
+ └── config/
+ └── data/
+ └── maize.yaml <- Data configuration file
+
+With your data formatted and ready, we can continue to run MOVE and explore
+the associations between the different variables in your datasets. Have a look
+at our :doc:`introductory tutorial <introduction>` for more
+information on this.
+
+References
+----------
+
+.. [#] Walters WA, Jin Z, Youngblut N, Wallace JG, Sutter J, Zhang W, et al.
+ Large-scale replicated field study of maize rhizosphere identifies heritable
+ microbes. `Proc Natl Acad Sci U S A`. 2018; 115: 7368–7373.
+ `doi:10.1073/pnas.1800918115`_
+
+.. _`doi:10.1073/pnas.1800918115`: https://doi.org/10.1073/pnas.1800918115
+
+.. _`OTU table`: https://github.com/jorgemf/DeepLatentMicrobiome/raw/91e384b7115978bb3cd0f61c7dd3d8ffc866efc3/Datasets/otu_table_all_80.csv
+.. _`metadata`: https://github.com/jorgemf/DeepLatentMicrobiome/raw/91e384b7115978bb3cd0f61c7dd3d8ffc866efc3/Datasets/metadata_table_all_80.csv
diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst
new file mode 100644
index 00000000..4f2d3592
--- /dev/null
+++ b/docs/source/tutorial/index.rst
@@ -0,0 +1,12 @@
+Tutorials
+=========
+
+Here, we provide several guides and tutorials to help you start working with
+MOVE.
+
+.. toctree::
+ :maxdepth: 1
+
+ introduction
+ data_preparation
+ model_tuning
diff --git a/docs/source/tutorial/introduction.rst b/docs/source/tutorial/introduction.rst
new file mode 100644
index 00000000..259f0266
--- /dev/null
+++ b/docs/source/tutorial/introduction.rst
@@ -0,0 +1,210 @@
+Introduction to MOVE
+====================
+
+This guide can help you get started with MOVE.
+
+About the MOVE pipeline
+-----------------------
+
+MOVE has the following four steps (also called tasks):
+
+1. **Encoding data**: taking input data and converting it into a format MOVE
+ can read.
+2. **Tuning the hyperparameters of the VAE model**: training multiple models to
+ find the set of hyperparameters that produces the best reconstructions or
+ most stable latent space.
+3. **Analyzing the latent space**: training a model and inspecting the latent
+ representation it creates.
+4. **Identifying associations**: using an ensemble of VAE to find associations
+ between the input datasets.
+
+Simulated dataset
+-----------------
+
+For this short tutorial, we provide a `simulated dataset`_ (available from our
+GitHub repository). This dataset consists of pretend proteomics and
+metagenomics measurements for 500 fictitious individuals. We also report
+whether or not these individuals have taken 20 imaginary drugs.
+
+All values were randomly generated, but we have added 200
+associations between different pairs of drugs and omics features. Let us find
+these associations with MOVE!
+
+.. _simulated dataset: https://download-directory.github.io/?url=https%3A%2F%2Fgithub.com%2FRasmussenLab%2FMOVE%2Ftree%2Fmain%2Ftutorial
+
+Workspace structure
+-------------------
+
+First, we take a look at how to organize our data and configuration::
+
+ tutorial/
+ │
+ ├── data/
+ │ ├── changes.small.txt <- Ground-truth associations (200 links)
+ │ ├── random.small.drugs.tsv <- Drug dataset (20 drugs)
+ │ ├── random.small.ids.tsv <- Sample IDs (500 samples)
+ │ ├── random.small.proteomics.tsv <- Proteomics dataset (200 proteins)
+ │ └── random.small.metagenomics.tsv <- Metagenomics dataset (1000 taxa)
+ │
+ └── config/ <- Stores user configuration files
+ ├── data/
+ │ └── random_small.yaml <- Configuration to read in the necessary
+ │ data files.
+ ├── experiment/ <- Configuration for experiments (e.g.,
+ │ └── random_small__tune.yaml for tuning hyperparameters).
+ │
+ └── task/ <- Configuration for tasks: such as
+ | latent space or identify associations
+ │ using the t-test or Bayesian approach
+ ├── random_small__id_assoc_bayes.yaml
+ ├── random_small__id_assoc_ttest.yaml
+ └── random_small__latent.yaml
+
+The data directory
+^^^^^^^^^^^^^^^^^^
+
+All "raw" data files should be placed inside the same directory. These files
+are TSVs (tab-separated value tables) containing discrete values (e.g., for
+binary or categorical datasets) or continuous values.
+
+Additionally, make sure each sample has an assigned ID and provide an ID
+table containing a list of all valid IDs (must appear in every dataset).
+
+The ``config`` directory
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Configuration is composed and managed by `Hydra`_.
+
+User-defined configuration must be stored in a ``config`` folder. This folder
+can contain a ``data`` and ``task`` folder to store the configuration files for
+your dataset and tasks.
+
+.. _`Hydra`: https://hydra.cc/
+
+Data configuration
+""""""""""""""""""
+
+Let us take a look at the configuration for our dataset. It is a YAML file,
+specifying: the directories to look for raw data and store intermediary and
+final output files, and the list of categorical and continuous datasets we
+have.
+
+.. literalinclude:: /../../tutorial/config/data/random_small.yaml
+ :language: yaml
+
+Note that we do not recommend changing the ``defaults`` field, otherwise the
+configuration file will not be properly recognized by MOVE.
+
+Task configuration
+""""""""""""""""""
+
+Similarly, the ``task`` folder contains YAML files to configure the tasks of
+MOVE. In this tutorial, we provided two examples for running the method to
+identify associations using our t-test and Bayesian approach, and an example to
+perform latent space analysis.
+
+For example, for the t-test approach (``random_small__id_assoc_ttest.yaml``),
+we define the following values: batch size, number of refits, name of dataset to
+perturb, target perturb value, configuration for VAE model, and configuration
+for training loop.
+
+.. literalinclude:: /../../tutorial/config/task/random_small__id_assoc_ttest.yaml
+ :language: yaml
+
+Note that the ``random_small__id_assoc_bayes.yaml`` looks pretty similar, but
+declares a different ``defaults``. This tells MOVE which algorithm to use!
+
+Running MOVE
+------------
+
+Encoding data
+^^^^^^^^^^^^^
+
+Make sure you are on the parent directory of the ``config`` folder (in this
+example, it is the ``tutorial`` folder), and proceed to run:
+
+.. code-block:: bash
+
+ >>> cd tutorial
+ >>> move-dl data=random_small task=encode_data
+
+|:arrow_up:| This command will encode the datasets. The ``random.small.drugs``
+dataset (defined in ``config/data/random_small.yaml``) will be one-hot encoded,
+whereas the other two omics datasets will be standardized. Encoded data will
+be placed in the intermediary folder defined in the
+:ref:`data config <data configuration>`.
+
+|:loud_sound:| Every ``move-dl`` command will generate a ``logs`` folder to
+store log files timestamping the program's current doings.
+
+Tuning the model's hyperparameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once the data has been encoded, we can proceed with the first step of our
+pipeline: tuning the hyperparameters of our deep learning model. This process
+can be time-consuming, because several models will be trained and tested. For
+this short tutorial, you may choose to skip it and proceed to
+:ref:`analyze the latent space <analyzing the latent space>`.
+
+Analyzing the latent space
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Next, we will train a variational autoencoder and analyze how good it is at
+reconstructing our input data and generating an informative latent space. Run:
+
+.. code-block:: bash
+
+ >>> move-dl data=random_small task=random_small__latent
+
+|:arrow_up:| This command will create a ``latent_space`` directory in the
+results folder defined in the :ref:`data config <data configuration>`. This
+folder will contain the following plots:
+
+* **Loss curve** shows the overall loss, KLD term, binary cross-entropy term,
+ and sum of squared errors term over number of training epochs.
+* **Reconstructions metrics boxplot** shows a score (accuracy or cosine
+ similarity for categorical and continuous datasets, respectively) per
+ reconstructed dataset.
+* **Latent space scatterplot** shows a reduced representation of the latent
+ space. To generate this visualization, the latent space is reduced to two
+ dimensions using TSNE (or another user-defined algorithm, e.g., UMAP).
+* **Feature importance swarmplot** displays the impact perturbing a feature has
+ on the latent space.
+
+Additionally, TSV files corresponding to each plot will be generated. These can
+be used, for example, to re-create the plots manually or with different
+styling.
+
+Identifying associations
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Next step is to find associations between the drugs taken by each individual
+and the omics features. Run:
+
+.. code-block:: bash
+
+ >>> move-dl data=random_small task=random_small__id_assoc_ttest
+
+|:arrow_up:| This command will create a ``results_sig_assoc.tsv`` file, listing
+each pair of associated features and the corresponding median p-value for such
+association. There should be ~120 associations found. Due to the nature of the
+method, this number may slightly fluctuate.
+
+|:warning:| Note that the value after ``task=`` matches the name of our
+configuration file. We can create multiple configuration files (for example,
+changing hyperparameters like learning rate) and call them by their name here.
+
+|:stopwatch:| This command takes approximately 45 min to run on a work laptop
+(Intel Core i7-10610U @ 1.80 GHz, 32 MB RAM). You can track the progress by
+checking the corresponding log file created in the ``logs`` folder.
+
+If you want to try the Bayesian approach instead, run:
+
+.. code-block:: bash
+
+ >>> move-dl data=random_small task=random_small__id_assoc_bayes
+
+Again, it should generate similar results with ~120 associations found.
+
+Take a look at the ``changes.small.txt`` file and compare your results against
+it. Did MOVE find any false positives?
diff --git a/docs/source/tutorial/model_tuning.rst b/docs/source/tutorial/model_tuning.rst
new file mode 100644
index 00000000..af773a5d
--- /dev/null
+++ b/docs/source/tutorial/model_tuning.rst
@@ -0,0 +1,95 @@
+Tuning models
+=============
+
+The second step of MOVE's pipeline consists of training multiple models with
+different hyperparameters in order to determine which set is optimal, i.e.,
+produces models that generate the most accurate reconstructions and/or the most
+stable latent representations.
+
+The hyperparameters can be anything from number of training epochs, to size of
+samples per batch, to number and size of hidden layers in the encoder-decoder
+architecture.
+
+The ``experiment`` config
+-------------------------
+
+To start with this step, we define an experiment configuration (please
+first consult the :doc:`introductory tutorial <introduction>` and
+:doc:`data preparation tutorial <data_preparation>` if you have not
+set up your workspace and data). This type of config references a data
+config, a task config, and the values of hyperparameters to test out.
+
+The first lines of our config should look like:
+
+.. literalinclude:: /../../tutorial/config/experiment/random_small__tune_reconstruction.yaml
+ :language: yaml
+ :lines: 1-8
+
+The ``override`` directives indicate (1) the name of our data config (in this
+example we reference the config of our simulated dataset, see tutorial for
+more info about this dataset) and (2) the name of the tuning task. There are
+two possible values for tuning task:
+
+- ``tune_model_reconstruction``, which reports the reconstruction accuracy of
+ models trained with different hyperparameter combinations; and
+- ``tune_model_stability``, which reports the stability of the latent space of
+ differently hyperparameterized models.
+
+Next, we have to define the hyperparameters that we wish to test out. An
+example would be:
+
+.. literalinclude:: /../../tutorial/config/experiment/random_small__tune_reconstruction.yaml
+ :language: yaml
+ :lines: 15-21
+
+The above config would result in 12 hyperparameter combinations (2 options of
+batch size times 2 options of encoder-decoder architecture times 3 options of
+training epochs).
+
+Any parameter of the training loop, model, and task can be swept. However, do
+note that the more options you provide, the more models that will be trained,
+and the more resource-intensive this task will become.
+
+Below is a list of hyperparameters that we recommend tuning:
+
+.. list-table:: Tunable hyperparameters
+ :width: 100
+ :widths: 40 60
+ :header-rows: 1
+
+ * - Hyperparameter
+ - Description
+ * - ``task.batch_size``
+ - Number of samples per training batch
+ * - ``task.model.num_hidden``
+ - Architecture of the encoder network
+ (reversed for the decoder network)
+ * - ``task.model.num_latent``
+ - Number of units of the latent space
+ * - ``task.model.beta``
+ - Weight applied to the KLD term in the loss function
+ * - ``task.model.dropout``
+ - Dropout
+ * - ``task.training_loop.num_epochs``
+ - Number of training epochs
+ * - ``task.training_loop.lr``
+ - Learning rate
+ * - ``task.training_loop.kld_warmup_steps``
+ - Epochs at which KLD is warmed
+ * - ``task.training_loop.batch_dilation_steps``
+ - Epochs at which batch size is increased
+ * - ``task.training_loop.early_stopping``
+ - Whether early stopping is triggered
+
+Finally, to run the tuning:
+
+.. code-block:: bash
+
+ >>> cd tutorial
+ >>> move-dl experiment=random_small__tune_reconstruction
+
+This process may take a while (depending on the number of hyperparameter
+combinations that will be trained and tested), and it will produce a TSV table
+in a ``{results_path}/tune_model`` directory summarizing the metrics (either
+reconstruction metrics like accuracy or stability). These metrics can be
+plotted to visualize and select the optimal hyperparameter combination.
diff --git a/requirements.txt b/requirements.txt
index 14f6a2d6..775f5c46 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,7 @@
hydra-core>=1.2.0
-numpy>=1.19.5
-pandas>=1.1.5
-torch==1.9.0
+numpy>=1.21.5
+pandas>=1.4.2
+torch>=1.11.0
+matplotlib>=3.5.2
+seaborn>=0.12.1
+scikit-learn>=1.0.2
diff --git a/setup.cfg b/setup.cfg
index 27f76841..f269f4a2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,7 +22,7 @@ install_requires =
seaborn
scikit-learn
scipy
-
+
package_dir =
= src
packages = find:
diff --git a/src/move/__init__.py b/src/move/__init__.py
index f13a81a5..636eb345 100644
--- a/src/move/__init__.py
+++ b/src/move/__init__.py
@@ -1,7 +1,7 @@
from __future__ import annotations
__license__ = "MIT"
-__version__ = (1, 3, 0)
+__version__ = (1, 4, 0)
__all__ = ["conf", "data", "models", "training_loop", "VAE"]
HYDRA_VERSION_BASE = "1.2"
diff --git a/src/move/__main__.py b/src/move/__main__.py
index 6cd62054..485e5bac 100644
--- a/src/move/__main__.py
+++ b/src/move/__main__.py
@@ -34,7 +34,7 @@ def main(config: MOVEConfig) -> None:
logger.info("No task specified.")
elif task_type is EncodeDataConfig:
move.tasks.encode_data(config.data)
- elif task_type is TuneModelConfig:
+ elif issubclass(task_type, TuneModelConfig):
move.tasks.tune_model(config)
elif task_type is AnalyzeLatentConfig:
move.tasks.analyze_latent(config)
diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py
index fac7d697..c9dee984 100644
--- a/src/move/conf/schema.py
+++ b/src/move/conf/schema.py
@@ -2,7 +2,8 @@
"MOVEConfig",
"EncodeDataConfig",
"AnalyzeLatentConfig",
- "TuneModelConfig",
+ "TuneModelReconstructionConfig",
+ "TuneModelStabilityConfig",
"IdentifyAssociationsConfig",
"IdentifyAssociationsBayesConfig",
"IdentifyAssociationsTTestConfig",
@@ -104,6 +105,20 @@ class TuneModelConfig(TaskConfig):
...
+@dataclass
+class TuneModelStabilityConfig(TuneModelConfig):
+ """Configure the "tune model" task."""
+
+ num_refits: int = MISSING
+
+
+@dataclass
+class TuneModelReconstructionConfig(TuneModelConfig):
+ """Configure the "tune model" task."""
+
+ ...
+
+
@dataclass
class AnalyzeLatentConfig(TaskConfig):
"""Configure the "analyze latents" task.
@@ -194,8 +209,14 @@ def extract_names(configs: list[InputConfig]) -> list[str]:
)
cs.store(
group="task",
- name="tune_model_schema",
- node=TuneModelConfig,
+ name="tune_model_reconstruction_schema",
+ node=TuneModelReconstructionConfig,
+)
+
+cs.store(
+ group="task",
+ name="tune_model_stability_schema",
+ node=TuneModelStabilityConfig,
)
cs.store(
group="task",
diff --git a/src/move/conf/task/tune_model.yaml b/src/move/conf/task/tune_model_reconstruction.yaml
similarity index 91%
rename from src/move/conf/task/tune_model.yaml
rename to src/move/conf/task/tune_model_reconstruction.yaml
index 7f620e52..677dc052 100644
--- a/src/move/conf/task/tune_model.yaml
+++ b/src/move/conf/task/tune_model_reconstruction.yaml
@@ -1,5 +1,5 @@
defaults:
- - tune_model_schema
+ - tune_model_reconstruction_schema
batch_size: 10
diff --git a/src/move/conf/task/tune_model_stability.yaml b/src/move/conf/task/tune_model_stability.yaml
new file mode 100644
index 00000000..3df9f3cf
--- /dev/null
+++ b/src/move/conf/task/tune_model_stability.yaml
@@ -0,0 +1,30 @@
+defaults:
+ - tune_model_stability_schema
+
+batch_size: 10
+num_refits: 3
+
+model:
+ categorical_weights: ${weights:${data.categorical_inputs}}
+ continuous_weights: ${weights:${data.continuous_inputs}}
+ num_hidden:
+ - 1000
+ num_latent: 150
+ beta: 0.0001
+ dropout: 0.1
+ cuda: false
+
+training_loop:
+ lr: 1e-4
+ num_epochs: 40
+ batch_dilation_steps:
+ - 50
+ - 100
+ - 150
+ kld_warmup_steps:
+ - 15
+ - 20
+ - 25
+ early_stopping: false
+ patience: 0
+
\ No newline at end of file
diff --git a/src/move/models/vae.py b/src/move/models/vae.py
index 801a4da3..a76e1a75 100644
--- a/src/move/models/vae.py
+++ b/src/move/models/vae.py
@@ -582,11 +582,14 @@ def _validate_batch(self, batch: tuple[torch.Tensor, torch.Tensor]) -> torch.Ten
a formed batch
"""
cat, con = batch
+ cat = cat.to(self.device)
+ con = con.to(self.device)
+
if self.num_categorical is None:
return con
elif self.num_continuous is None:
return cat
- return torch.cat(batch, dim=1)
+ return torch.cat((cat, con), dim=1)
@torch.no_grad()
def project(self, dataloader: DataLoader) -> FloatArray:
@@ -604,7 +607,7 @@ def project(self, dataloader: DataLoader) -> FloatArray:
batch = self._validate_batch(batch)
*_, mu, _ = self(batch)
embedding.append(mu)
- embedding = torch.cat(embedding, dim=0).numpy()
+ embedding = torch.cat(embedding, dim=0).cpu().numpy()
return embedding
@torch.no_grad()
@@ -630,8 +633,8 @@ def reconstruct(
for i, cat in enumerate(cat_recon):
cat_recons[i].append(torch.argmax(cat, dim=1))
con_recons.append(con_recon)
- cat_recons = [torch.cat(cats, dim=0).numpy() for cats in cat_recons]
- con_recons = torch.cat(con_recons, dim=0).numpy()
+ cat_recons = [torch.cat(cats, dim=0).cpu().numpy() for cats in cat_recons]
+ con_recons = torch.cat(con_recons, dim=0).cpu().numpy()
return cat_recons, con_recons
@torch.no_grad()
diff --git a/src/move/tasks/analyze_latent.py b/src/move/tasks/analyze_latent.py
index 97e2d256..67399bcb 100644
--- a/src/move/tasks/analyze_latent.py
+++ b/src/move/tasks/analyze_latent.py
@@ -98,19 +98,24 @@ def analyze_latent(config: MOVEConfig) -> None:
df_index = pd.Index(sample_names, name="sample")
assert task_config.model is not None
+ device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
model: VAE = hydra.utils.instantiate(
task_config.model,
continuous_shapes=test_dataset.con_shapes,
categorical_shapes=test_dataset.cat_shapes,
)
+
logger.debug(f"Model: {model}")
model_path = output_path / "model.pt"
if model_path.exists():
logger.debug("Re-loading model")
model.load_state_dict(torch.load(model_path))
+ model.to(device)
else:
logger.debug("Training model")
+
+ model.to(device)
train_dataloader = make_dataloader(
cat_list,
con_list,
@@ -133,6 +138,7 @@ def analyze_latent(config: MOVEConfig) -> None:
fig_df = pd.DataFrame(dict(zip(viz.LOSS_LABELS, losses)))
fig_df.index.name = "epoch"
fig_df.to_csv(output_path / "loss_curve.tsv", sep="\t")
+
model.eval()
logger.info("Projecting into latent space")
diff --git a/src/move/tasks/identify_associations.py b/src/move/tasks/identify_associations.py
index 3f485117..93ea65cc 100644
--- a/src/move/tasks/identify_associations.py
+++ b/src/move/tasks/identify_associations.py
@@ -118,6 +118,9 @@ def identify_associations(config: MOVEConfig):
feature_mask = np.all(target_dataset == target_value, axis=2) # 2D: N x P
feature_mask |= np.sum(target_dataset, axis=2) == 0
+ assert task_config.model is not None
+ device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+
def _bayes_approach(
task_config: IdentifyAssociationsBayesConfig,
) -> tuple[IntArray, FloatArray]:
@@ -141,8 +144,11 @@ def _bayes_approach(
if model_path.exists():
logger.debug(f"Re-loading refit {j + 1}/{task_config.num_refits}")
model.load_state_dict(torch.load(model_path))
+ model.to(device)
else:
logger.debug(f"Training refit {j + 1}/{task_config.num_refits}")
+
+ model.to(device)
hydra.utils.call(
task_config.training_loop,
model=model,
@@ -216,8 +222,10 @@ def _ttest_approach(
if model_path.exists():
logger.debug(f"Re-loading refit {j + 1}/{task_config.num_refits}")
model.load_state_dict(torch.load(model_path))
+ model.to(device)
else:
logger.debug(f"Training refit {j + 1}/{task_config.num_refits}")
+ model.to(device)
hydra.utils.call(
task_config.training_loop,
model=model,
diff --git a/src/move/tasks/tune_model.py b/src/move/tasks/tune_model.py
index 6861bf02..49b66826 100644
--- a/src/move/tasks/tune_model.py
+++ b/src/move/tasks/tune_model.py
@@ -2,28 +2,50 @@
from pathlib import Path
from random import shuffle
-from typing import Any, cast
+from typing import Any, Literal, cast
import hydra
import numpy as np
import pandas as pd
+import torch
from hydra.core.hydra_config import HydraConfig
from hydra.types import RunMode
from matplotlib.cbook import boxplot_stats
+from numpy.typing import ArrayLike
+from omegaconf import OmegaConf
+from sklearn.metrics.pairwise import cosine_similarity
from move.analysis.metrics import (
calculate_accuracy,
calculate_cosine_similarity,
)
-from move.conf.schema import MOVEConfig, TuneModelConfig
+from move.conf.schema import (
+ MOVEConfig,
+ TuneModelConfig,
+ TuneModelReconstructionConfig,
+ TuneModelStabilityConfig,
+)
from move.core.logging import get_logger
from move.core.typing import BoolArray, FloatArray
from move.data import io
from move.data.dataloaders import MOVEDataset, make_dataloader, split_samples
from move.models.vae import VAE
+TaskType = Literal["reconstruction", "stability"]
+
+
+def _get_task_type(
+ task_config: TuneModelConfig,
+) -> TaskType:
+ task_type = OmegaConf.get_type(task_config)
+ if task_type is TuneModelReconstructionConfig:
+ return "reconstruction"
+ if task_type is TuneModelStabilityConfig:
+ return "stability"
+ raise ValueError("Unsupported type of task!")
+
-def _get_record(values: FloatArray, **kwargs) -> dict[str, Any]:
+def _get_record(values: ArrayLike, **kwargs) -> dict[str, Any]:
record = kwargs
bxp_stats, *_ = boxplot_stats(values)
bxp_stats.pop("fliers")
@@ -34,6 +56,7 @@ def _get_record(values: FloatArray, **kwargs) -> dict[str, Any]:
def tune_model(config: MOVEConfig) -> float:
"""Train multiple models to tune the model hyperparameters."""
hydra_config = HydraConfig.get()
+
if hydra_config.mode != RunMode.MULTIRUN:
raise ValueError("This task must run in multirun mode.")
@@ -45,9 +68,11 @@ def tune_model(config: MOVEConfig) -> float:
job_num = hydra_config.job.num + 1
logger = get_logger(__name__)
- logger.info(f"Beginning task: tune model {job_num}")
- logger.info(f"Job name: {hydra_config.job.override_dirname}")
task_config = cast(TuneModelConfig, config.task)
+ task_type = _get_task_type(task_config)
+
+ logger.info(f"Beginning task: tune model {task_type} {job_num}")
+ logger.info(f"Job name: {hydra_config.job.override_dirname}")
interim_path = Path(config.data.interim_data_path)
output_path = Path(config.data.results_path) / "tune_model"
@@ -61,86 +86,167 @@ def tune_model(config: MOVEConfig) -> float:
config.data.continuous_names,
)
- split_path = interim_path / "split_mask.npy"
- if split_path.exists():
- split_mask: BoolArray = np.load(split_path)
- else:
- num_samples = cat_list[0].shape[0] if cat_list else con_list[0].shape[0]
- split_mask = split_samples(num_samples, 0.9)
- np.save(split_path, split_mask)
-
- train_dataloader = make_dataloader(
- cat_list,
- con_list,
- split_mask,
- shuffle=True,
- batch_size=task_config.batch_size,
- drop_last=True,
- )
- train_dataset = cast(MOVEDataset, train_dataloader.dataset)
-
assert task_config.model is not None
- model: VAE = hydra.utils.instantiate(
- task_config.model,
- continuous_shapes=train_dataset.con_shapes,
- categorical_shapes=train_dataset.cat_shapes,
- )
- logger.debug(f"Model: {model}")
+ device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
- logger.debug("Training model")
- hydra.utils.call(
- task_config.training_loop,
- model=model,
- train_dataloader=train_dataloader,
- )
- model.eval()
-
- logger.info("Reconstructing")
- logger.info("Computing reconstruction metrics")
- label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")]
- records = []
- splits = zip(["train", "test"], [split_mask, ~split_mask])
- for split_name, mask in splits:
- dataloader = make_dataloader(
+ def _tune_stability(
+ task_config: TuneModelStabilityConfig,
+ ):
+ label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")]
+
+ train_dataloader = make_dataloader(
+ cat_list,
+ con_list,
+ shuffle=True,
+ batch_size=task_config.batch_size,
+ drop_last=True,
+ )
+
+ test_dataloader = make_dataloader(
cat_list,
con_list,
- mask,
shuffle=False,
- batch_size=np.count_nonzero(mask),
+ batch_size=1,
+ drop_last=False,
)
- cat_recons, con_recons = model.reconstruct(dataloader)
- con_recons = np.split(con_recons, model.continuous_shapes[:-1], axis=1)
- for cat, cat_recon, dataset_name in zip(
- cat_list, cat_recons, config.data.categorical_names
- ):
- accuracy = calculate_accuracy(cat[mask], cat_recon)
- record = _get_record(
- accuracy,
- job_num=job_num,
- **dict(label),
- metric="accuracy",
- dataset=dataset_name,
- split=split_name,
+
+ train_dataset = cast(MOVEDataset, train_dataloader.dataset)
+
+ logger.info(f"Training {task_config.num_refits} refits")
+
+ cosine_sim0 = None
+ cosine_sim_diffs = []
+ for j in range(task_config.num_refits):
+ logger.debug(f"Refit: {j+1}/{task_config.num_refits}")
+ model: VAE = hydra.utils.instantiate(
+ task_config.model,
+ continuous_shapes=train_dataset.con_shapes,
+ categorical_shapes=train_dataset.cat_shapes,
)
- records.append(record)
- for con, con_recon, dataset_name in zip(
- con_list, con_recons, config.data.continuous_names
- ):
- cosine_sim = calculate_cosine_similarity(con[mask], con_recon)
- record = _get_record(
- cosine_sim,
- job_num=job_num,
- **dict(label),
- metric="cosine_similarity",
- dataset=dataset_name,
- split=split_name,
+ model.to(device)
+
+ hydra.utils.call(
+ task_config.training_loop,
+ model=model,
+ train_dataloader=train_dataloader,
)
- records.append(record)
- logger.info("Writing results")
- df_path = output_path / "reconstruction_stats.tsv"
- header = not df_path.exists()
- df = pd.DataFrame.from_records(records)
- df.to_csv(df_path, sep="\t", mode="a", header=header, index=False)
+ model.eval()
+ latent, *_ = model.latent(test_dataloader, kld_weight=1)
+
+ if cosine_sim0 is None:
+ cosine_sim0 = cosine_similarity(latent)
+ else:
+ cosine_sim = cosine_similarity(latent)
+ D = np.absolute(cosine_sim - cosine_sim0)
+ # removing the diagonal element (cos_sim with itself)
+ diff = D[~np.eye(D.shape[0], dtype=bool)].reshape(D.shape[0], -1)
+ mean_diff = np.mean(diff)
+ cosine_sim_diffs.append(mean_diff)
+
+ record = _get_record(
+ cosine_sim_diffs,
+ job_num=job_num,
+ **dict(label),
+ metric="mean_diff_cosine_similarity",
+ num_refits=task_config.num_refits,
+ )
+ logger.info("Writing results")
+ df_path = output_path / "stability_stats.tsv"
+ header = not df_path.exists()
+ df = pd.DataFrame.from_records([record])
+ df.to_csv(df_path, sep="\t", mode="a", header=header, index=False)
+
+ def _tune_reconstruction(
+ task_config: TuneModelReconstructionConfig,
+ ):
+ split_path = interim_path / "split_mask.npy"
+ if split_path.exists():
+ split_mask: BoolArray = np.load(split_path)
+ else:
+ num_samples = cat_list[0].shape[0] if cat_list else con_list[0].shape[0]
+ split_mask = split_samples(num_samples, 0.9)
+ np.save(split_path, split_mask)
+
+ train_dataloader = make_dataloader(
+ cat_list,
+ con_list,
+ split_mask,
+ shuffle=True,
+ batch_size=task_config.batch_size,
+ drop_last=True,
+ )
+
+ train_dataset = cast(MOVEDataset, train_dataloader.dataset)
+
+ model: VAE = hydra.utils.instantiate(
+ task_config.model,
+ continuous_shapes=train_dataset.con_shapes,
+ categorical_shapes=train_dataset.cat_shapes,
+ )
+ model.to(device)
+ logger.debug(f"Model: {model}")
+
+ logger.debug("Training model")
+ hydra.utils.call(
+ task_config.training_loop,
+ model=model,
+ train_dataloader=train_dataloader,
+ )
+ model.eval()
+ logger.info("Reconstructing")
+ logger.info("Computing reconstruction metrics")
+ label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")]
+ records = []
+ splits = zip(["train", "test"], [split_mask, ~split_mask])
+ for split_name, mask in splits:
+ dataloader = make_dataloader(
+ cat_list,
+ con_list,
+ mask,
+ shuffle=False,
+ batch_size=np.count_nonzero(mask),
+ )
+ cat_recons, con_recons = model.reconstruct(dataloader)
+ con_recons = np.split(con_recons, model.continuous_shapes[:-1], axis=1)
+ for cat, cat_recon, dataset_name in zip(
+ cat_list, cat_recons, config.data.categorical_names
+ ):
+ accuracy = calculate_accuracy(cat[mask], cat_recon)
+ record = _get_record(
+ accuracy,
+ job_num=job_num,
+ **dict(label),
+ metric="accuracy",
+ dataset=dataset_name,
+ split=split_name,
+ )
+ records.append(record)
+ for con, con_recon, dataset_name in zip(
+ con_list, con_recons, config.data.continuous_names
+ ):
+ cosine_sim = calculate_cosine_similarity(con[mask], con_recon)
+ record = _get_record(
+ cosine_sim,
+ job_num=job_num,
+ **dict(label),
+ metric="cosine_similarity",
+ dataset=dataset_name,
+ split=split_name,
+ )
+ records.append(record)
+
+ logger.info("Writing results")
+ df_path = output_path / "reconstruction_stats.tsv"
+ header = not df_path.exists()
+ df = pd.DataFrame.from_records(records)
+ df.to_csv(df_path, sep="\t", mode="a", header=header, index=False)
+
+ if task_type == "reconstruction":
+ task_config = cast(TuneModelReconstructionConfig, task_config)
+ _tune_reconstruction(task_config)
+ elif task_type == "stability":
+ task_config = cast(TuneModelStabilityConfig, task_config)
+ _tune_stability(task_config)
return 0.0
diff --git a/tutorial/config/data/random_small.yaml b/tutorial/config/data/random_small.yaml
index ad935b99..e1cc182d 100644
--- a/tutorial/config/data/random_small.yaml
+++ b/tutorial/config/data/random_small.yaml
@@ -7,14 +7,14 @@ defaults:
raw_data_path: data/ # where raw data is stored
interim_data_path: interim_data/ # where intermediate files will be stored
-results_path: results/ # where result files will be placed
+results_path: results/ # where result files will be placed
-sample_names: random.small.ids # names/IDs of each sample, must appear in the
- # other datasets
+sample_names: random.small.ids # names/IDs of each sample, must appear in
+ # the other datasets
-categorical_inputs: # a list of categorical datasets
+categorical_inputs: # a list of categorical datasets
- name: random.small.drugs
-continuous_inputs: # a list of continuous datasets
+continuous_inputs: # a list of continuous datasets
- name: random.small.proteomics
- name: random.small.metagenomics
diff --git a/tutorial/config/experiment/random_small__tune.yaml b/tutorial/config/experiment/random_small__tune_reconstruction.yaml
similarity index 92%
rename from tutorial/config/experiment/random_small__tune.yaml
rename to tutorial/config/experiment/random_small__tune_reconstruction.yaml
index 480080c7..d83938dd 100644
--- a/tutorial/config/experiment/random_small__tune.yaml
+++ b/tutorial/config/experiment/random_small__tune_reconstruction.yaml
@@ -4,7 +4,7 @@
defaults:
- override /data: random_small
- - override /task: tune_model
+ - override /task: tune_model_reconstruction
# Configure which hyperarameters to vary
# This will run and log the metrics of 12 models (combination of 3 hyperparams
diff --git a/tutorial/config/experiment/random_small__tune_stability.yaml b/tutorial/config/experiment/random_small__tune_stability.yaml
new file mode 100644
index 00000000..62e87610
--- /dev/null
+++ b/tutorial/config/experiment/random_small__tune_stability.yaml
@@ -0,0 +1,26 @@
+# @package _global_
+
+# Define the default configuration for the data and task (model and training)
+
+defaults:
+ - override /data: random_small
+ - override /task: tune_model_stability
+
+# Define how many models to train to calculate stability
+
+task:
+ num_refits: 10
+
+# Configure which hyperparameters to vary
+# This will run and log the metrics of 12 models (combination of 3 hyperparams
+# with 2-3 levels: 2 * 2 * 3)
+
+# Any field defined in the task configuration can be configured below.
+
+hydra:
+ mode: MULTIRUN
+ sweeper:
+ params:
+ task.batch_size: 10, 50
+ task.model.num_hidden: "[500],[1000]"
+ task.training_loop.num_epochs: 40, 60, 100
diff --git a/tutorial/notebooks/04 Latent space analysis.ipynb b/tutorial/notebooks/04 Latent space analysis.ipynb
index 016e2c95..88a43db2 100644
--- a/tutorial/notebooks/04 Latent space analysis.ipynb
+++ b/tutorial/notebooks/04 Latent space analysis.ipynb
@@ -193,7 +193,7 @@
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
- "results_path = Path(config.data.processed_data_path) / \"latent_space\"\n",
+ "results_path = Path(config.data.results_path) / \"latent_space\"\n",
"\n",
"print(\"Saved plot files:\")\n",
"for plot_path in results_path.glob(\"*.png\"):\n",
diff --git a/tutorial/notebooks/05 Identify associations.ipynb b/tutorial/notebooks/05 Identify associations.ipynb
index 5b69736b..2823d841 100644
--- a/tutorial/notebooks/05 Identify associations.ipynb
+++ b/tutorial/notebooks/05 Identify associations.ipynb
@@ -74,7 +74,7 @@
"import pandas as pd\n",
"\n",
"raw_data_path = Path(config.data.raw_data_path)\n",
- "results_path = Path(config.data.processed_data_path) / \"identify_associations\"\n",
+ "results_path = Path(config.data.results_path) / \"identify_associations\"\n",
"\n",
"truth = pd.read_csv(raw_data_path / \"changes.small.txt\", sep=\"\\t\", index_col=0)\n",
"results = pd.read_csv(results_path / \"results_sig_assoc.tsv\", sep=\"\\t\")"
@@ -120,7 +120,7 @@
"source": [
"We can also run the t-test method, and compare both results.\n",
"\n",
- "Note that here we override the `processed_data_path` field to prevent overwriting\n",
+ "Note that here we override the `results_path` field to prevent overwriting\n",
"the previous results."
]
},
@@ -142,7 +142,7 @@
}
],
"source": [
- "ttest_config = io.read_config(\"random_small\", \"random_small__id_assoc_ttest\", \"data.processed_data_path=results_ttest\")\n",
+ "ttest_config = io.read_config(\"random_small\", \"random_small__id_assoc_ttest\", \"data.results_path=results_ttest\")\n",
"identify_associations(ttest_config)"
]
},
@@ -162,7 +162,7 @@
"metadata": {},
"outputs": [],
"source": [
- "results_ttest_path = Path(config.data.processed_data_path) / \"identify_associations\"\n",
+ "results_ttest_path = Path(config.data.results_path) / \"identify_associations\"\n",
"\n",
"results_ttest = pd.read_csv(results_ttest_path / \"results_sig_assoc.tsv\", sep=\"\\t\")"
]