diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..9591dc03 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,16 @@ +version: 2 + +build: + os: ubuntu-20.04 + tools: + python: "3.9" + +sphinx: + configuration: docs/source/conf.py + +python: + install: + - requirements: docs/requirements.txt + - requirements: requirements.txt + - method: pip + path: . diff --git a/README.md b/README.md index a4f16fd8..eb15abfa 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The code in this repository can be used to run our Multi-Omics Variational autoEncoder (MOVE) framework for integration of omics and clinical variabels spanning both categorial and continuous data. Our approach includes training ensemble VAE models and using *in silico* perturbation experiments to identify -cross omics associations. The manuscript has been accepted and we will provide +cross omics associations. The manuscript has been accepted and we will provide the link when it is published. We developed the method based on a Type 2 Diabetes cohort from the IMI DIRECT @@ -68,29 +68,8 @@ MOVE has five-six steps: ## How to run MOVE -You can run the move-dl pipeline from the command line or within a Jupyter -notebook. - -You can run MOVE as Python module with the following command. Details on how -to set up the configuration for the data and task can be found our -[tutorial](https://github.com/RasmussenLab/MOVE/tree/main/tutorial) folder. - -```bash ->>> move-dl data=[name of data config] task=[name of task config] -``` - -Feel free to -[open an issue](https://github.com/RasmussenLab/MOVE/issues/new/choose) if you -need any help. - -### How to use MOVE with your data - -Your data files should be tab separated, include a header and the first column -should be the IDs of your samples. The configuration of MOVE is done using YAML -files that describe the input data and the task specification. These should be -placed in a `config` directory in the working directory. 
Please see the -[tutorial](https://github.com/RasmussenLab/MOVE/tree/main/tutorial) -for more information. +Please refer to our [**documentation**](https://move-dl.readthedocs.io/) for +examples and tutorials on how to run MOVE. # Data sets @@ -110,5 +89,13 @@ available [here](https://directdiabetes.org). ## Simulated and publicaly available data sets -We have therefore provided two datasets to test the workflow: a simulated +We have therefore provided two datasets to test the workflow: a simulated dataset and a publicly-available maize rhizosphere microbiome data set. + +# Citation + +To cite MOVE, use the following information: + +Allesøe, R.L., Lundgaard, A.T., Hernández Medina, R. et al. Discovery of +drug–omics associations in type 2 diabetes with generative deep-learning models. +*Nat Biotechnol* (2023). https://doi.org/10.1038/s41587-022-01520-x diff --git a/docs/requirements.txt b/docs/requirements.txt index 4676a887..1478543a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,4 @@ sphinx==5.3.0 -sphinx_rtd_theme=1.1.1 \ No newline at end of file +sphinx-rtd-theme +sphinx-autodoc-typehints +sphinxemoji diff --git a/docs/source/conf.py b/docs/source/conf.py index 56619bf6..0c127b86 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -11,18 +11,23 @@ sys.path.insert(0, str(Path("../src").resolve())) -project = "move-dl" -copyright = "2022, Valentas Brasas, Ricardo Hernandez Medina" -author = "Valentas Brasas, Ricardo Hernandez Medina" -release = "1.0.0" +import move + +project = "MOVE" +copyright = "2022, Rasmussen Lab" +author = "Rasmussen Lab" +release = ".".join(map(str, move.__version__)) # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ "sphinx.ext.autodoc", + "sphinx.ext.autosectionlabel", "sphinx.ext.autosummary", "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + 
"sphinxemoji.sphinxemoji", ] templates_path = ["_templates"] @@ -32,6 +37,9 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "sphinx_rtd_theme" +html_theme_options = { + "collapse_navigation" : False, +} html_static_path = [] # -- Napoleon settings -------------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 8f6a769a..abe3dca0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,15 +1,22 @@ -.. move-dl documentation master file, created by - sphinx-quickstart on Sat Nov 5 15:48:56 2022. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to move-dl's documentation! -=================================== +Welcome to MOVE's documentation! +================================ .. toctree:: + :hidden: :maxdepth: 1 :caption: Contents: - pages/installation - pages/tutorial - pages/api/API + install + method + tutorial/index + +MOVE (**m**\ ulti-\ **o**\ mics **v**\ ariational auto\ **e**\ ncoder) is a +framework for integration of omics and other data modalities (including both +categorical and continuous data). Our approach consists of training an ensemble +of VAE (variational autoencoder) models and performing *in silico* perturbation +experiments to identify associations across the different omics datasets. + +We invite you to read `our publication`_ presenting this method, or read +about the method :doc:`here`. + +.. _`our publication`: https://www.nature.com/articles/s41587-022-01520-x diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 00000000..901f72e7 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,47 @@ +Install +======= + +MOVE is distributed as ``move-dl``, a Python package. + +It requires Python 3.9 (or later) and third-party libraries, such as `PyTorch`_ +and `Hydra`_. 
These dependencies will be installed automatically when you +install with ``pip``. + +Install the stable version +-------------------------- + +We recommend installing ``move-dl`` in a fresh virtual environment. If you wish +to learn how to create and manage virtual environments with Conda, please +follow `these instructions`_. + +The latest stable version of ``move-dl`` can be installed with ``pip``. + +.. code-block:: bash + + >>> pip install move-dl + +Install the development version +------------------------------- + +If you wish to install the development version of ``move-dl``, create a new virtual +environment, and do: + +.. code-block:: bash + + >>> pip install git+https://github.com/RasmussenLab/MOVE@developer + +Alternatively, you can clone ``move-dl`` from `GitHub`_ and install by +running the following command from the top-level source directory: + +.. code-block:: bash + + >>> pip install -e . + +The ``-e`` flag installs the project in "editable" mode, so you can follow the +development branch and update your installation by pulling from GitHub. + +.. _PyTorch: https://pytorch.org/ +.. _Hydra: https://hydra.cc/ +.. _GitHub: https://github.com/RasmussenLab/MOVE + +.. _these instructions: https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html diff --git a/docs/source/method.rst b/docs/source/method.rst new file mode 100644 index 00000000..d9982600 --- /dev/null +++ b/docs/source/method.rst @@ -0,0 +1,98 @@ +About the method +================ + +MOVE is based on the VAE (variational autoencoder) model, a deep learning model +that transforms high-dimensional data into a lower-dimensional space (so-called +latent representation). The autoencoder is made up of two neural networks: an +encoder, which compresses the input variables; and a decoder, which tries to +reconstruct the original input from the compressed representation. In doing so, +the model learns the structure and associations between the input variables. 
+ +In `our publication`_, we used this type of model to integrate different data +modalities, including: genomics, transcriptomics, proteomics, metabolomics, +microbiomes, medication data, diet questionnaires, and clinical measurements. +Once we obtained a trained model, we exploited the decoder network to identify +cross-omics associations. + +Our approach consists of performing *in silico* perturbations of the original +data and using either univariate statistical methods or Bayesian decision +theory to identify significant differences between the reconstruction with or +without perturbation. Thus, we are able to detect associations between the +input variables. + +.. _`our publication`: https://www.nature.com/articles/s41587-022-01520-x + +.. image:: method/fig1.svg + +VAE design +----------- + +The VAE was designed to account for a variable number of fully-connected hidden +layers in both encoder and decoder. Each hidden layer is followed by batch +normalization, dropout, and a leaky rectified linear unit (leaky ReLU). + +To integrate different modalities, each dataset is reshaped and concatenated +into an input matrix. Moreover, error calculation is done on a dataset +basis: binary cross-entropy for binary and categorical datasets and mean squared +error for continuous datasets. Each error :math:`E_i` is then multiplied by a +given weight :math:`W_i` and added up to form the loss function: + +:math:`L = \sum_i W_i E_i + W_\textnormal{KL} D_\textnormal{KL}` + +Note that the :math:`D_\textnormal{KL}` (Kullback–Leibler divergence) penalizes +deviance of the latent representation from the standard normal distribution. It +is also subject to a weight :math:`W_\textnormal{KL}`, which warms up as the +model is trained. + +Extracting associations +----------------------- + +After determining the right set of hyperparameters, associations are extracted +by perturbing the original input data and passing it through an ensemble of +trained models. 
The reason behind using an ensemble is that VAE models are +stochastic, so we need to ensure that the results we obtain are not a product +of chance. + +We perturbed categorical data by changing its value from one category to +another (e.g., drug status changed from "not received" to "received"). Then, we +compare the change between the reconstruction generated from the original data +and the perturbed data. To achieve this, we proposed two approaches: using +*t*\ -test and Bayes factors. Both are described below: + +MOVE *t*\ -test +^^^^^^^^^^^^^^^ + +#. Perturb a variable in one dataset. +#. Repeat 10 times for 4 different latent space sizes: + + #. Train VAE model with original data. + #. Obtain reconstruction of original data (baseline reconstruction). + #. Obtain 10 additional reconstructions of original data and calculate + difference from the first (baseline difference). + #. Obtain reconstruction of perturbed data (perturbed reconstruction) and + subtract from baseline reconstruction (perturbed difference). + #. Compute p-value between baseline and perturbed differences with t-test. + +#. Correct p-values using Bonferroni method. +#. Select features that are significant (p-value lower than 0.05). +#. Select significant features that overlap in at least half of the refits and + 3 out of 4 architectures. These features are associated with the + perturbed variable. + +MOVE Bayes +^^^^^^^^^^ + +#. Perturb a variable in one dataset. +#. Repeat 30 times: + + #. Train VAE model with original data. + #. Obtain reconstruction of original data (baseline reconstruction). + #. Obtain reconstruction of perturbed data (perturbed reconstruction). + #. Record difference between baseline and perturbed reconstruction. + +#. Compute probability of difference being greater than 0. +#. Compute Bayes factor from probability: :math:`K = \log p - \log (1 - p)`. +#. Sort probabilities by Bayes factor, from highest to lowest. +#. 
Compute false discovery rate (FDR) as cumulative evidence. +#. Select features whose FDR is below the desired threshold (e.g., 0.05). These + features are associated with the perturbed variable. \ No newline at end of file diff --git a/docs/source/method/fig1.svg b/docs/source/method/fig1.svg new file mode 100644 index 00000000..87654877 --- /dev/null +++ b/docs/source/method/fig1.svg @@ -0,0 +1 @@ +Test likelihoodReconstructionaccuracyStabilityQuestionnairesTrained VAE modelNaïve VAE modelBaseline modelDrug perturbed modelMulti-omics dataNon-omics dataSignificant drug ~ omicsassociationsDrugVAE hyperparameterselectionClinical Diet DrugPhysical activity01 \ No newline at end of file diff --git a/docs/source/pages/installation.rst b/docs/source/pages/installation.rst deleted file mode 100644 index 11e44375..00000000 --- a/docs/source/pages/installation.rst +++ /dev/null @@ -1,2 +0,0 @@ -Installation -============ diff --git a/docs/source/pages/tutorial.rst b/docs/source/pages/tutorial.rst deleted file mode 100644 index d51454e0..00000000 --- a/docs/source/pages/tutorial.rst +++ /dev/null @@ -1,2 +0,0 @@ -Tutorial(s) -============ \ No newline at end of file diff --git a/docs/source/tutorial/data_preparation.rst b/docs/source/tutorial/data_preparation.rst new file mode 100644 index 00000000..06690ffe --- /dev/null +++ b/docs/source/tutorial/data_preparation.rst @@ -0,0 +1,191 @@ +Data preparation +================ + +In this tutorial, we explain how to make your data compatible with the +``move-dl`` commands. + +For this tutorial we will work with a dataset taken from Walters et al. (2018) +[#]_. In their work, they report soil microbiome census data along with +environmental data (e.g., temperature and precipitation) of different cultivars +of maize. + +We will start by downloading the files corresponding to their `OTU table`_ +and `metadata`_. 
+ +Formatting omics data +--------------------- + +The ``move-dl`` pipeline requires continuous omics input to be formatted as a +TSV file with one column per feature and one row per sample. + +If we load the microbiome OTU table from the maize rhizosphere dataset, it will +look something like this: + +.. table:: Original OTU table + + ======== ====================== ====================== ====================== + otuids 11116.C02A66.1194587 11116.C06A63.1195666 11116.C08A61.1197689 + ======== ====================== ====================== ====================== + 4479944 70 8 18 + 513055 2 16 1 + 519510 22 15 12 + 810959 5 0 3 + 849092 5 2 1 + ======== ====================== ====================== ====================== + +We have columns corresponding to samples and rows corresponding to features +(OTUs), so we need to **transpose** this table for MOVE. + +.. table:: Transposed OTU table + + ==================== ========= ======== ======== ======== ======== + sampleids 4479944 513055 519510 810959 849092 + ==================== ========= ======== ======== ======== ======== + 11116.C02A66.1194587 70 2 22 5 5 + 11116.C06A63.1195666 8 16 15 0 2 + 11116.C08A61.1197689 18 1 12 3 1 + ==================== ========= ======== ======== ======== ======== + +Now, we can save our table as a TSV and we are ready to go. No need to do any +further processing. + +Formatting other continuous data +-------------------------------- + +Other non-omics continuous data is formatted in a similar way. + +For this tutorial, we are going to extract some continuous data from the maize +metadata table. Let us load the table and take a peek: + +.. 
table:: Original metadata table + + ==================== ==================== ========= =============== ============== + X.SampleID Precipitation3Days INBREDS Maize_Line Description1 + ==================== ==================== ========= =============== ============== + 11116.C02A66.1194587 0.14 Oh7B Non_Stiff_Stalk rhizosphere + 11116.C06A63.1195666 0.14 P39 Sweet_Corn rhizosphere + 11116.C08A61.1197689 0.14 CML333 Tropical rhizosphere + 11116.C08A63.1196825 0.14 CML333 Tropical rhizosphere + 11116.C12A64.1197667 0.14 Il14H Sweet_Corn rhizosphere + ==================== ==================== ========= =============== ============== + +The original metadata table contains both categorical (e.g., ``Maize_Line``) +and continuous data (e.g., ``Precipitation3Days``). We need to separate these +into different files. + +In this example, we select three columns: ``age``, ``Precipitation3Days``, and +``Temperature``. + +.. table:: Extracted continuous data + + ==================== ===== ============= ==================== + X.SampleID age Temperature Precipitation3Days + ==================== ===== ============= ==================== + 11116.C02A66.1194587 12 76 0.14 + 11116.C06A63.1195666 12 76 0.14 + 11116.C08A61.1197689 12 76 0.14 + 11116.C08A63.1196825 12 76 0.14 + 11116.C12A64.1197667 12 76 0.14 + ==================== ===== ============= ==================== + +Once again, we can save this table as a TSV, and we are ready to continue. + +Formatting categorical data +--------------------------- + +Categorical data like binary variables (e.g., with/without treatment) or +discrete categories needs to be formatted in individual files. + +The metadata table contains several discrete variables that can be useful for +classification, such as maize line, cultivar, and type of soil. For each one of +these, we need to create a separate TSV file that will look something like: + +.. 
table:: Extracted maize line data + + ==================== =============== + X.SampleID Maize_Line + ==================== =============== + 11116.C02A66.1194587 Non_Stiff_Stalk + 11116.C06A63.1195666 Sweet_Corn + 11116.C08A61.1197689 Tropical + 11116.C08A63.1196825 Tropical + 11116.C12A64.1197667 Sweet_Corn + ==================== =============== + +Creating a data config file +--------------------------- + +We are missing two components to make our data compatible with ``move-dl``. +First, we need to create an additional text file with all the sample IDs (one +ID per line, see example below). This file tells MOVE which samples to use, so +the IDs in this file must be present in all the other input files. + +.. code-block:: text + :caption: Maize sample IDs + + 11116.C02A66.1194587 + 11116.C06A63.1195666 + 11116.C08A61.1197689 + 11116.C08A63.1196825 + 11116.C12A64.1197667 + +Finally, we need to create a data config YAML file. The purpose of this file is +to tell MOVE which files to load, where to find them, and where to save any +output files. + +The data config file for this tutorial would look like this: + +.. literalinclude:: /../../tutorial/config/data/maize.yaml + :language: yaml + +Here we break down the fields of this file: + +* ``defaults`` indicates this file is a config file. It should be left intact. +* ``raw_data_path`` points to the raw data location (i.e., the files we + created in this tutorial). +* ``interim_data_path`` points to the directory where intermediary files will + be deposited. +* ``results_path`` points to the folder where results will be saved. +* ``sample_names`` is the file name of the file containing all valid sample + IDs. This file must have a ``txt`` extension. +* ``categorical_inputs`` is a list of file names containing categorical data. + Each element of the list should have a name ``name`` and may optionally have + a ``weight``. All referenced files should have a ``tsv`` extension. 
+* ``continuous_inputs`` lists the continuous data files. Same format as + ``categorical_inputs``. + +The data config file can have any name, but it must be saved in the ``config/data`` +directory. The final workspace structure should look like this:: + + tutorial/ + │ + ├── maize/ + │ └── data/ + │ ├── maize_field.tsv <- Type of soil data + │ ├── maize_ids.txt <- Sample IDs + │ ├── maize_line.tsv <- Maize line data + │ ├── maize_metadata.tsv <- Age, temperature, precipitation data + │ ├── maize_microbiome.tsv <- OTU table + │ └── maize_variety.tsv <- Maize variety data + │ + └── config/ + └── data/ + └── maize.yaml <- Data configuration file + +With your data formatted and ready, we can continue to run MOVE and explore +the associations between the different variables in your datasets. Have a look +at our :doc:`introductory tutorial </tutorial/introduction>` for more +information on this. + +References +---------- + +.. [#] Walters WA, Jin Z, Youngblut N, Wallace JG, Sutter J, Zhang W, et al. + Large-scale replicated field study of maize rhizosphere identifies heritable + microbes. `Proc Natl Acad Sci U S A`. 2018; 115: 7368–7373. + `doi:10.1073/pnas.1800918115`_ + +.. _`doi:10.1073/pnas.1800918115`: https://doi.org/10.1073/pnas.1800918115 + +.. _`OTU table`: https://github.com/jorgemf/DeepLatentMicrobiome/raw/91e384b7115978bb3cd0f61c7dd3d8ffc866efc3/Datasets/otu_table_all_80.csv +.. _`metadata`: https://github.com/jorgemf/DeepLatentMicrobiome/raw/91e384b7115978bb3cd0f61c7dd3d8ffc866efc3/Datasets/metadata_table_all_80.csv diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst new file mode 100644 index 00000000..4f2d3592 --- /dev/null +++ b/docs/source/tutorial/index.rst @@ -0,0 +1,12 @@ +Tutorials +========= + +Here, we provide several guides and tutorials to help you start working with +MOVE. + +.. 
toctree:: + :maxdepth: 1 + + introduction + data_preparation + model_tuning diff --git a/docs/source/tutorial/introduction.rst b/docs/source/tutorial/introduction.rst new file mode 100644 index 00000000..259f0266 --- /dev/null +++ b/docs/source/tutorial/introduction.rst @@ -0,0 +1,210 @@ +Introduction to MOVE +==================== + +This guide can help you get started with MOVE. + +About the MOVE pipeline +----------------------- + +MOVE has the following four steps (also called tasks): + +1. **Encoding data**: taking input data and converting it into a format MOVE + can read. +2. **Tuning the hyperparameters of the VAE model**: training multiple models to + find the set of hyperparameters that produces the best reconstructions or + most stable latent space. +3. **Analyzing the latent space**: training a model and inspecting the latent + representation it creates. +4. **Identifying associations**: using an ensemble of VAEs to find associations + between the input datasets. + +Simulated dataset +----------------- + +For this short tutorial, we provide a `simulated dataset`_ (available from our +GitHub repository). This dataset consists of pretend proteomics and +metagenomics measurements for 500 fictitious individuals. We also report +whether these individuals have taken or not 20 imaginary drugs. + +All values were randomly generated, but we have added 200 +associations between different pairs of drugs and omics features. Let us find +these associations with MOVE! + +.. 
_simulated dataset: https://download-directory.github.io/?url=https%3A%2F%2Fgithub.com%2FRasmussenLab%2FMOVE%2Ftree%2Fmain%2Ftutorial + +Workspace structure +------------------- + +First, we take a look at how to organize our data and configuration:: + + tutorial/ + │ + ├── data/ + │ ├── changes.small.txt <- Ground-truth associations (200 links) + │ ├── random.small.drugs.tsv <- Drug dataset (20 drugs) + │ ├── random.small.ids.tsv <- Sample IDs (500 samples) + │ ├── random.small.proteomics.tsv <- Proteomics dataset (200 proteins) + │ └── random.small.metagenomics.tsv <- Metagenomics dataset (1000 taxa) + │ + └── config/ <- Stores user configuration files + ├── data/ + │ └── random_small.yaml <- Configuration to read in the necessary + │ data files. + ├── experiment/ <- Configuration for experiments (e.g., + │ └── random_small__tune.yaml for tuning hyperparameters). + │ + └── task/ <- Configuration for tasks: such as + │ latent space or identify associations + │ using the t-test or Bayesian approach + ├── random_small__id_assoc_bayes.yaml + ├── random_small__id_assoc_ttest.yaml + └── random_small__latent.yaml + +The data directory +^^^^^^^^^^^^^^^^^^ + +All "raw" data files should be placed inside the same directory. These files +are TSVs (tab-separated value tables) containing discrete values (e.g., for +binary or categorical datasets) or continuous values. + +Additionally, make sure each sample has an assigned ID and provide an ID +table containing a list of all valid IDs (must appear in every dataset). + +The ``config`` directory +^^^^^^^^^^^^^^^^^^^^^^^^ + +Configuration is composed and managed by `Hydra`_. + +User-defined configuration must be stored in a ``config`` folder. This folder +can contain a ``data`` and ``task`` folder to store the configuration files for +your dataset and tasks. + +.. _`Hydra`: https://hydra.cc/ + +Data configuration +"""""""""""""""""" + +Let us take a look at the configuration for our dataset. 
It is a YAML file, +specifying: the directories to look for raw data and store intermediary and +final output files, and the list of categorical and continuous datasets we +have. + +.. literalinclude:: /../../tutorial/config/data/random_small.yaml + :language: yaml + +Note that we do not recommend changing the ``defaults`` field, otherwise the +configuration file will not be properly recognized by MOVE. + +Task configuration +"""""""""""""""""" + +Similarly, the ``task`` folder contains YAML files to configure the tasks of +MOVE. In this tutorial, we provided two examples for running the method to +identify associations using our t-test and Bayesian approach, and an example to +perform latent space analysis. + +For example, for the t-test approach (``random_small__id_assoc_ttest.yaml``), +we define the following values: batch size, number of refits, name of dataset to +perturb, target perturb value, configuration for VAE model, and configuration +for training loop. + +.. literalinclude:: /../../tutorial/config/task/random_small__id_assoc_ttest.yaml + :language: yaml + +Note that the ``random_small__id_assoc_bayes.yaml`` looks pretty similar, but +declares a different ``defaults``. This tells MOVE which algorithm to use! + +Running MOVE +------------ + +Encoding data +^^^^^^^^^^^^^ + +Make sure you are in the parent directory of the ``config`` folder (in this +example, it is the ``tutorial`` folder), and proceed to run: + +.. code-block:: bash + + >>> cd tutorial + >>> move-dl data=random_small task=encode_data + +|:arrow_up:| This command will encode the datasets. The ``random.small.drugs`` +dataset (defined in ``config/data/random_small.yaml``) will be one-hot encoded, +whereas the other two omics datasets will be standardized. Encoded data will +be placed in the intermediary folder defined in the +:ref:`data config <Data configuration>`. + +|:loud_sound:| Every ``move-dl`` command will generate a ``logs`` folder to +store log files timestamping the program's current doings. 
+ +Tuning the model's hyperparameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once the data has been encoded, we can proceed with the next step of our +pipeline: tuning the hyperparameters of our deep learning model. This process +can be time-consuming, because several models will be trained and tested. For +this short tutorial, you may choose to skip it and proceed to +:ref:`analyze the latent space <Analyzing the latent space>`. + +Analyzing the latent space +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Next, we will train a variational autoencoder and analyze how good it is at +reconstructing our input data and generating an informative latent space. Run: + +.. code-block:: bash + + >>> move-dl data=random_small task=random_small__latent + +|:arrow_up:| This command will create a ``latent_space`` directory in the +results folder defined in the :ref:`data config <Data configuration>`. This +folder will contain the following plots: + +* **Loss curve** shows the overall loss, KLD term, binary cross-entropy term, + and sum of squared errors term over number of training epochs. +* **Reconstruction metrics boxplot** shows a score (accuracy or cosine + similarity for categorical and continuous datasets, respectively) per + reconstructed dataset. +* **Latent space scatterplot** shows a reduced representation of the latent + space. To generate this visualization, the latent space is reduced to two + dimensions using TSNE (or another user-defined algorithm, e.g., UMAP). +* **Feature importance swarmplot** displays the impact perturbing a feature has + on the latent space. + +Additionally, TSV files corresponding to each plot will be generated. These can +be used, for example, to re-create the plots manually or with different +styling. + +Identifying associations +^^^^^^^^^^^^^^^^^^^^^^^^ + +The next step is to find associations between the drugs taken by each individual +and the omics features. Run: + +.. 
code-block:: bash + + >>> move-dl data=random_small task=random_small__id_assoc_ttest + +|:arrow_up:| This command will create a ``results_sig_assoc.tsv`` file, listing +each pair of associated features and the corresponding median p-value for such +association. There should be ~120 associations found. Due to the nature of the +method, this number may slightly fluctuate. + +|:warning:| Note that the value after ``task=`` matches the name of our +configuration file. We can create multiple configuration files (for example, +changing hyperparameters like learning rate) and call them by their name here. + +|:stopwatch:| This command takes approximately 45 min to run on a work laptop +(Intel Core i7-10610U @ 1.80 GHz, 32 GB RAM). You can track the progress by +checking the corresponding log file created in the ``logs`` folder. + +If you want to try the Bayesian approach instead, run: + +.. code-block:: bash + + >>> move-dl data=random_small task=random_small__id_assoc_bayes + +Again, it should generate similar results with over 120 associations found. + +Take a look at the ``changes.small.txt`` file and compare your results against +it. Did MOVE find any false positives? diff --git a/docs/source/tutorial/model_tuning.rst b/docs/source/tutorial/model_tuning.rst new file mode 100644 index 00000000..af773a5d --- /dev/null +++ b/docs/source/tutorial/model_tuning.rst @@ -0,0 +1,95 @@ +Tuning models +============= + +The second step of MOVE's pipeline consists of training multiple models with +different hyperparameters in order to determine which set is optimal, i.e., +produces models that generate the most accurate reconstructions and/or the most +stable latent representations. + +The hyperparameters can be anything from number of training epochs, to size of +samples per batch, to number and size of hidden layers in the encoder-decoder +architecture. 
+ +The ``experiment`` config +------------------------- + +To start with this step, we define an experiment configuration (please +first consult the :doc:`introductory <introduction>` and +:doc:`data preparation tutorial <data_preparation>` if you have not +set up your workspace and data). This type of config references a data +config, a task config, and the values of hyperparameters to test out. + +The first lines of our config should look like: + +.. literalinclude:: /../../tutorial/config/experiment/random_small__tune_reconstruction.yaml + :language: yaml + :lines: 1-8 + +The ``override`` directives indicate (1) the name of our data config (in this +example we reference the config of our simulated dataset, see the tutorial for +more info about this dataset) and (2) the name of the tuning task. There are +two possible values for tuning task: + +- ``tune_model_reconstruction``, which reports the reconstruction accuracy of + models trained with different hyperparameter combinations; and +- ``tune_model_stability``, which reports the stability of the latent space of + differently hyperparameterized models. + +Next, we have to define the hyperparameters that we wish to test out. An +example would be: + +.. literalinclude:: /../../tutorial/config/experiment/random_small__tune_reconstruction.yaml + :language: yaml + :lines: 15-21 + +The above config would result in 12 hyperparameter combinations (2 options of +batch size times 2 options of encoder-decoder architecture times 3 options of +training epochs). + +Any parameter of the training loop, model, and task can be swept. However, do +note that the more options you provide, the more models will be trained, +and the more resource-intensive this task will become. + +Below is a list of hyperparameters that we recommend tuning: + +.. 
list-table:: Tunable hyperparameters + :width: 100 + :widths: 40 60 + :header-rows: 1 + + * - Hyperparameter + - Description + * - ``task.batch_size`` + - Number of samples per training batch + * - ``task.model.num_hidden`` + - Architecture of the encoder network + (reversed for the decoder network) + * - ``task.model.num_latent`` + - Number of units of the latent space + * - ``task.model.beta`` + - Weight applied to the KLD term in the loss function + * - ``task.model.dropout`` + - Dropout + * - ``task.training_loop.num_epochs`` + - Number of training epochs + * - ``task.training_loop.lr`` + - Learning rate + * - ``task.training_loop.kld_warmup_steps`` + - Epochs at which KLD is warmed + * - ``task.training_loop.batch_dilation_steps`` + - Epochs at which batch size is increased + * - ``task.training_loop.early_stopping`` + - Whether early stopping is triggered + +Finally, to run the tuning: + +.. code-block:: bash + + >>> cd tutorial + >>> move-dl experiment=random_small__tune_reconstruction + +This process may take a while (depending on the number of hyperparameter +combinations that will be trained and tested), and it will produce a TSV table +in a ``{results_path}/tune_model`` directory summarizing the metrics (either +reconstruction metrics like accuracy or stability). These metrics can be +plotted to visualize and select the optimal hyperparameter combination. 
diff --git a/requirements.txt b/requirements.txt index 14f6a2d6..775f5c46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ hydra-core>=1.2.0 -numpy>=1.19.5 -pandas>=1.1.5 -torch==1.9.0 +numpy>=1.21.5 +pandas>=1.4.2 +torch>=1.11.0 +matplotlib>=3.5.2 +seaborn>=0.12.1 +scikit-learn>=1.0.2 diff --git a/setup.cfg b/setup.cfg index 27f76841..f269f4a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ install_requires = seaborn scikit-learn scipy - + package_dir = = src packages = find: diff --git a/src/move/__init__.py b/src/move/__init__.py index f13a81a5..636eb345 100644 --- a/src/move/__init__.py +++ b/src/move/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations __license__ = "MIT" -__version__ = (1, 3, 0) +__version__ = (1, 4, 0) __all__ = ["conf", "data", "models", "training_loop", "VAE"] HYDRA_VERSION_BASE = "1.2" diff --git a/src/move/__main__.py b/src/move/__main__.py index 6cd62054..485e5bac 100644 --- a/src/move/__main__.py +++ b/src/move/__main__.py @@ -34,7 +34,7 @@ def main(config: MOVEConfig) -> None: logger.info("No task specified.") elif task_type is EncodeDataConfig: move.tasks.encode_data(config.data) - elif task_type is TuneModelConfig: + elif issubclass(task_type, TuneModelConfig): move.tasks.tune_model(config) elif task_type is AnalyzeLatentConfig: move.tasks.analyze_latent(config) diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py index fac7d697..c9dee984 100644 --- a/src/move/conf/schema.py +++ b/src/move/conf/schema.py @@ -2,7 +2,8 @@ "MOVEConfig", "EncodeDataConfig", "AnalyzeLatentConfig", - "TuneModelConfig", + "TuneModelReconstructionConfig", + "TuneModelStabilityConfig", "IdentifyAssociationsConfig", "IdentifyAssociationsBayesConfig", "IdentifyAssociationsTTestConfig", @@ -104,6 +105,20 @@ class TuneModelConfig(TaskConfig): ... 
+@dataclass +class TuneModelStabilityConfig(TuneModelConfig): + """Configure the "tune model" task.""" + + num_refits: int = MISSING + + +@dataclass +class TuneModelReconstructionConfig(TuneModelConfig): + """Configure the "tune model" task.""" + + ... + + @dataclass class AnalyzeLatentConfig(TaskConfig): """Configure the "analyze latents" task. @@ -194,8 +209,14 @@ def extract_names(configs: list[InputConfig]) -> list[str]: ) cs.store( group="task", - name="tune_model_schema", - node=TuneModelConfig, + name="tune_model_reconstruction_schema", + node=TuneModelReconstructionConfig, +) + +cs.store( + group="task", + name="tune_model_stability_schema", + node=TuneModelStabilityConfig, ) cs.store( group="task", diff --git a/src/move/conf/task/tune_model.yaml b/src/move/conf/task/tune_model_reconstruction.yaml similarity index 91% rename from src/move/conf/task/tune_model.yaml rename to src/move/conf/task/tune_model_reconstruction.yaml index 7f620e52..677dc052 100644 --- a/src/move/conf/task/tune_model.yaml +++ b/src/move/conf/task/tune_model_reconstruction.yaml @@ -1,5 +1,5 @@ defaults: - - tune_model_schema + - tune_model_reconstruction_schema batch_size: 10 diff --git a/src/move/conf/task/tune_model_stability.yaml b/src/move/conf/task/tune_model_stability.yaml new file mode 100644 index 00000000..3df9f3cf --- /dev/null +++ b/src/move/conf/task/tune_model_stability.yaml @@ -0,0 +1,30 @@ +defaults: + - tune_model_stability_schema + +batch_size: 10 +num_refits: 3 + +model: + categorical_weights: ${weights:${data.categorical_inputs}} + continuous_weights: ${weights:${data.continuous_inputs}} + num_hidden: + - 1000 + num_latent: 150 + beta: 0.0001 + dropout: 0.1 + cuda: false + +training_loop: + lr: 1e-4 + num_epochs: 40 + batch_dilation_steps: + - 50 + - 100 + - 150 + kld_warmup_steps: + - 15 + - 20 + - 25 + early_stopping: false + patience: 0 + \ No newline at end of file diff --git a/src/move/models/vae.py b/src/move/models/vae.py index 801a4da3..a76e1a75 100644 --- 
a/src/move/models/vae.py +++ b/src/move/models/vae.py @@ -582,11 +582,14 @@ def _validate_batch(self, batch: tuple[torch.Tensor, torch.Tensor]) -> torch.Ten a formed batch """ cat, con = batch + cat = cat.to(self.device) + con = con.to(self.device) + if self.num_categorical is None: return con elif self.num_continuous is None: return cat - return torch.cat(batch, dim=1) + return torch.cat((cat, con), dim=1) @torch.no_grad() def project(self, dataloader: DataLoader) -> FloatArray: @@ -604,7 +607,7 @@ def project(self, dataloader: DataLoader) -> FloatArray: batch = self._validate_batch(batch) *_, mu, _ = self(batch) embedding.append(mu) - embedding = torch.cat(embedding, dim=0).numpy() + embedding = torch.cat(embedding, dim=0).cpu().numpy() return embedding @torch.no_grad() @@ -630,8 +633,8 @@ def reconstruct( for i, cat in enumerate(cat_recon): cat_recons[i].append(torch.argmax(cat, dim=1)) con_recons.append(con_recon) - cat_recons = [torch.cat(cats, dim=0).numpy() for cats in cat_recons] - con_recons = torch.cat(con_recons, dim=0).numpy() + cat_recons = [torch.cat(cats, dim=0).cpu().numpy() for cats in cat_recons] + con_recons = torch.cat(con_recons, dim=0).cpu().numpy() return cat_recons, con_recons @torch.no_grad() diff --git a/src/move/tasks/analyze_latent.py b/src/move/tasks/analyze_latent.py index 97e2d256..67399bcb 100644 --- a/src/move/tasks/analyze_latent.py +++ b/src/move/tasks/analyze_latent.py @@ -98,19 +98,24 @@ def analyze_latent(config: MOVEConfig) -> None: df_index = pd.Index(sample_names, name="sample") assert task_config.model is not None + device = torch.device("cuda" if task_config.model.cuda == True else "cpu") model: VAE = hydra.utils.instantiate( task_config.model, continuous_shapes=test_dataset.con_shapes, categorical_shapes=test_dataset.cat_shapes, ) + logger.debug(f"Model: {model}") model_path = output_path / "model.pt" if model_path.exists(): logger.debug("Re-loading model") model.load_state_dict(torch.load(model_path)) + model.to(device) 
else: logger.debug("Training model") + + model.to(device) train_dataloader = make_dataloader( cat_list, con_list, @@ -133,6 +138,7 @@ def analyze_latent(config: MOVEConfig) -> None: fig_df = pd.DataFrame(dict(zip(viz.LOSS_LABELS, losses))) fig_df.index.name = "epoch" fig_df.to_csv(output_path / "loss_curve.tsv", sep="\t") + model.eval() logger.info("Projecting into latent space") diff --git a/src/move/tasks/identify_associations.py b/src/move/tasks/identify_associations.py index 3f485117..93ea65cc 100644 --- a/src/move/tasks/identify_associations.py +++ b/src/move/tasks/identify_associations.py @@ -118,6 +118,9 @@ def identify_associations(config: MOVEConfig): feature_mask = np.all(target_dataset == target_value, axis=2) # 2D: N x P feature_mask |= np.sum(target_dataset, axis=2) == 0 + assert task_config.model is not None + device = torch.device("cuda" if task_config.model.cuda == True else "cpu") + def _bayes_approach( task_config: IdentifyAssociationsBayesConfig, ) -> tuple[IntArray, FloatArray]: @@ -141,8 +144,11 @@ def _bayes_approach( if model_path.exists(): logger.debug(f"Re-loading refit {j + 1}/{task_config.num_refits}") model.load_state_dict(torch.load(model_path)) + model.to(device) else: logger.debug(f"Training refit {j + 1}/{task_config.num_refits}") + + model.to(device) hydra.utils.call( task_config.training_loop, model=model, @@ -216,8 +222,10 @@ def _ttest_approach( if model_path.exists(): logger.debug(f"Re-loading refit {j + 1}/{task_config.num_refits}") model.load_state_dict(torch.load(model_path)) + model.to(device) else: logger.debug(f"Training refit {j + 1}/{task_config.num_refits}") + model.to(device) hydra.utils.call( task_config.training_loop, model=model, diff --git a/src/move/tasks/tune_model.py b/src/move/tasks/tune_model.py index 6861bf02..49b66826 100644 --- a/src/move/tasks/tune_model.py +++ b/src/move/tasks/tune_model.py @@ -2,28 +2,50 @@ from pathlib import Path from random import shuffle -from typing import Any, cast +from typing 
import Any, Literal, cast import hydra import numpy as np import pandas as pd +import torch from hydra.core.hydra_config import HydraConfig from hydra.types import RunMode from matplotlib.cbook import boxplot_stats +from numpy.typing import ArrayLike +from omegaconf import OmegaConf +from sklearn.metrics.pairwise import cosine_similarity from move.analysis.metrics import ( calculate_accuracy, calculate_cosine_similarity, ) -from move.conf.schema import MOVEConfig, TuneModelConfig +from move.conf.schema import ( + MOVEConfig, + TuneModelConfig, + TuneModelReconstructionConfig, + TuneModelStabilityConfig, +) from move.core.logging import get_logger from move.core.typing import BoolArray, FloatArray from move.data import io from move.data.dataloaders import MOVEDataset, make_dataloader, split_samples from move.models.vae import VAE +TaskType = Literal["reconstruction", "stability"] + + +def _get_task_type( + task_config: TuneModelConfig, +) -> TaskType: + task_type = OmegaConf.get_type(task_config) + if task_type is TuneModelReconstructionConfig: + return "reconstruction" + if task_type is TuneModelStabilityConfig: + return "stability" + raise ValueError("Unsupported type of task!") + -def _get_record(values: FloatArray, **kwargs) -> dict[str, Any]: +def _get_record(values: ArrayLike, **kwargs) -> dict[str, Any]: record = kwargs bxp_stats, *_ = boxplot_stats(values) bxp_stats.pop("fliers") @@ -34,6 +56,7 @@ def _get_record(values: FloatArray, **kwargs) -> dict[str, Any]: def tune_model(config: MOVEConfig) -> float: """Train multiple models to tune the model hyperparameters.""" hydra_config = HydraConfig.get() + if hydra_config.mode != RunMode.MULTIRUN: raise ValueError("This task must run in multirun mode.") @@ -45,9 +68,11 @@ def tune_model(config: MOVEConfig) -> float: job_num = hydra_config.job.num + 1 logger = get_logger(__name__) - logger.info(f"Beginning task: tune model {job_num}") - logger.info(f"Job name: {hydra_config.job.override_dirname}") task_config = 
cast(TuneModelConfig, config.task) + task_type = _get_task_type(task_config) + + logger.info(f"Beginning task: tune model {task_type} {job_num}") + logger.info(f"Job name: {hydra_config.job.override_dirname}") interim_path = Path(config.data.interim_data_path) output_path = Path(config.data.results_path) / "tune_model" @@ -61,86 +86,167 @@ def tune_model(config: MOVEConfig) -> float: config.data.continuous_names, ) - split_path = interim_path / "split_mask.npy" - if split_path.exists(): - split_mask: BoolArray = np.load(split_path) - else: - num_samples = cat_list[0].shape[0] if cat_list else con_list[0].shape[0] - split_mask = split_samples(num_samples, 0.9) - np.save(split_path, split_mask) - - train_dataloader = make_dataloader( - cat_list, - con_list, - split_mask, - shuffle=True, - batch_size=task_config.batch_size, - drop_last=True, - ) - train_dataset = cast(MOVEDataset, train_dataloader.dataset) - assert task_config.model is not None - model: VAE = hydra.utils.instantiate( - task_config.model, - continuous_shapes=train_dataset.con_shapes, - categorical_shapes=train_dataset.cat_shapes, - ) - logger.debug(f"Model: {model}") + device = torch.device("cuda" if task_config.model.cuda == True else "cpu") - logger.debug("Training model") - hydra.utils.call( - task_config.training_loop, - model=model, - train_dataloader=train_dataloader, - ) - model.eval() - - logger.info("Reconstructing") - logger.info("Computing reconstruction metrics") - label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")] - records = [] - splits = zip(["train", "test"], [split_mask, ~split_mask]) - for split_name, mask in splits: - dataloader = make_dataloader( + def _tune_stability( + task_config: TuneModelStabilityConfig, + ): + label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")] + + train_dataloader = make_dataloader( + cat_list, + con_list, + shuffle=True, + batch_size=task_config.batch_size, + drop_last=True, + ) + + test_dataloader = 
make_dataloader( cat_list, con_list, - mask, shuffle=False, - batch_size=np.count_nonzero(mask), + batch_size=1, + drop_last=False, ) - cat_recons, con_recons = model.reconstruct(dataloader) - con_recons = np.split(con_recons, model.continuous_shapes[:-1], axis=1) - for cat, cat_recon, dataset_name in zip( - cat_list, cat_recons, config.data.categorical_names - ): - accuracy = calculate_accuracy(cat[mask], cat_recon) - record = _get_record( - accuracy, - job_num=job_num, - **dict(label), - metric="accuracy", - dataset=dataset_name, - split=split_name, + + train_dataset = cast(MOVEDataset, train_dataloader.dataset) + + logger.info(f"Training {task_config.num_refits} refits") + + cosine_sim0 = None + cosine_sim_diffs = [] + for j in range(task_config.num_refits): + logger.debug(f"Refit: {j+1}/{task_config.num_refits}") + model: VAE = hydra.utils.instantiate( + task_config.model, + continuous_shapes=train_dataset.con_shapes, + categorical_shapes=train_dataset.cat_shapes, ) - records.append(record) - for con, con_recon, dataset_name in zip( - con_list, con_recons, config.data.continuous_names - ): - cosine_sim = calculate_cosine_similarity(con[mask], con_recon) - record = _get_record( - cosine_sim, - job_num=job_num, - **dict(label), - metric="cosine_similarity", - dataset=dataset_name, - split=split_name, + model.to(device) + + hydra.utils.call( + task_config.training_loop, + model=model, + train_dataloader=train_dataloader, ) - records.append(record) - logger.info("Writing results") - df_path = output_path / "reconstruction_stats.tsv" - header = not df_path.exists() - df = pd.DataFrame.from_records(records) - df.to_csv(df_path, sep="\t", mode="a", header=header, index=False) + model.eval() + latent, *_ = model.latent(test_dataloader, kld_weight=1) + + if cosine_sim0 is None: + cosine_sim0 = cosine_similarity(latent) + else: + cosine_sim = cosine_similarity(latent) + D = np.absolute(cosine_sim - cosine_sim0) + # removing the diagonal element (cos_sim with itself) + 
diff = D[~np.eye(D.shape[0], dtype=bool)].reshape(D.shape[0], -1) + mean_diff = np.mean(diff) + cosine_sim_diffs.append(mean_diff) + + record = _get_record( + cosine_sim_diffs, + job_num=job_num, + **dict(label), + metric="mean_diff_cosine_similarity", + num_refits=task_config.num_refits, + ) + logger.info("Writing results") + df_path = output_path / "stability_stats.tsv" + header = not df_path.exists() + df = pd.DataFrame.from_records([record]) + df.to_csv(df_path, sep="\t", mode="a", header=header, index=False) + + def _tune_reconstruction( + task_config: TuneModelReconstructionConfig, + ): + split_path = interim_path / "split_mask.npy" + if split_path.exists(): + split_mask: BoolArray = np.load(split_path) + else: + num_samples = cat_list[0].shape[0] if cat_list else con_list[0].shape[0] + split_mask = split_samples(num_samples, 0.9) + np.save(split_path, split_mask) + + train_dataloader = make_dataloader( + cat_list, + con_list, + split_mask, + shuffle=True, + batch_size=task_config.batch_size, + drop_last=True, + ) + + train_dataset = cast(MOVEDataset, train_dataloader.dataset) + + model: VAE = hydra.utils.instantiate( + task_config.model, + continuous_shapes=train_dataset.con_shapes, + categorical_shapes=train_dataset.cat_shapes, + ) + model.to(device) + logger.debug(f"Model: {model}") + + logger.debug("Training model") + hydra.utils.call( + task_config.training_loop, + model=model, + train_dataloader=train_dataloader, + ) + model.eval() + logger.info("Reconstructing") + logger.info("Computing reconstruction metrics") + label = [hp.split("=") for hp in hydra_config.job.override_dirname.split(",")] + records = [] + splits = zip(["train", "test"], [split_mask, ~split_mask]) + for split_name, mask in splits: + dataloader = make_dataloader( + cat_list, + con_list, + mask, + shuffle=False, + batch_size=np.count_nonzero(mask), + ) + cat_recons, con_recons = model.reconstruct(dataloader) + con_recons = np.split(con_recons, model.continuous_shapes[:-1], axis=1) + for 
cat, cat_recon, dataset_name in zip( + cat_list, cat_recons, config.data.categorical_names + ): + accuracy = calculate_accuracy(cat[mask], cat_recon) + record = _get_record( + accuracy, + job_num=job_num, + **dict(label), + metric="accuracy", + dataset=dataset_name, + split=split_name, + ) + records.append(record) + for con, con_recon, dataset_name in zip( + con_list, con_recons, config.data.continuous_names + ): + cosine_sim = calculate_cosine_similarity(con[mask], con_recon) + record = _get_record( + cosine_sim, + job_num=job_num, + **dict(label), + metric="cosine_similarity", + dataset=dataset_name, + split=split_name, + ) + records.append(record) + + logger.info("Writing results") + df_path = output_path / "reconstruction_stats.tsv" + header = not df_path.exists() + df = pd.DataFrame.from_records(records) + df.to_csv(df_path, sep="\t", mode="a", header=header, index=False) + + if task_type == "reconstruction": + task_config = cast(TuneModelReconstructionConfig, task_config) + _tune_reconstruction(task_config) + elif task_type == "stability": + task_config = cast(TuneModelStabilityConfig, task_config) + _tune_stability(task_config) return 0.0 diff --git a/tutorial/config/data/random_small.yaml b/tutorial/config/data/random_small.yaml index ad935b99..e1cc182d 100644 --- a/tutorial/config/data/random_small.yaml +++ b/tutorial/config/data/random_small.yaml @@ -7,14 +7,14 @@ defaults: raw_data_path: data/ # where raw data is stored interim_data_path: interim_data/ # where intermediate files will be stored -results_path: results/ # where result files will be placed +results_path: results/ # where result files will be placed -sample_names: random.small.ids # names/IDs of each sample, must appear in the - # other datasets +sample_names: random.small.ids # names/IDs of each sample, must appear in + # the other datasets -categorical_inputs: # a list of categorical datasets +categorical_inputs: # a list of categorical datasets - name: random.small.drugs 
-continuous_inputs: # a list of continuous datasets +continuous_inputs: # a list of continuous datasets - name: random.small.proteomics - name: random.small.metagenomics diff --git a/tutorial/config/experiment/random_small__tune.yaml b/tutorial/config/experiment/random_small__tune_reconstruction.yaml similarity index 92% rename from tutorial/config/experiment/random_small__tune.yaml rename to tutorial/config/experiment/random_small__tune_reconstruction.yaml index 480080c7..d83938dd 100644 --- a/tutorial/config/experiment/random_small__tune.yaml +++ b/tutorial/config/experiment/random_small__tune_reconstruction.yaml @@ -4,7 +4,7 @@ defaults: - override /data: random_small - - override /task: tune_model + - override /task: tune_model_reconstruction # Configure which hyperarameters to vary # This will run and log the metrics of 12 models (combination of 3 hyperparams diff --git a/tutorial/config/experiment/random_small__tune_stability.yaml b/tutorial/config/experiment/random_small__tune_stability.yaml new file mode 100644 index 00000000..62e87610 --- /dev/null +++ b/tutorial/config/experiment/random_small__tune_stability.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +# Define the default configuration for the data and task (model and training) + +defaults: + - override /data: random_small + - override /task: tune_model_stability + +# Define how many models to train to calculate stability + +task: + num_refits: 10 + +# Configure which hyperparameters to vary +# This will run and log the metrics of 12 models (combination of 3 hyperparams +# with 2-3 levels: 2 * 2 * 3) + +# Any field defined in the task configuration can be configured below. 
+ +hydra: + mode: MULTIRUN + sweeper: + params: + task.batch_size: 10, 50 + task.model.num_hidden: "[500],[1000]" + task.training_loop.num_epochs: 40, 60, 100 diff --git a/tutorial/notebooks/04 Latent space analysis.ipynb b/tutorial/notebooks/04 Latent space analysis.ipynb index 016e2c95..88a43db2 100644 --- a/tutorial/notebooks/04 Latent space analysis.ipynb +++ b/tutorial/notebooks/04 Latent space analysis.ipynb @@ -193,7 +193,7 @@ "import pandas as pd\n", "import seaborn as sns\n", "\n", - "results_path = Path(config.data.processed_data_path) / \"latent_space\"\n", + "results_path = Path(config.data.results_path) / \"latent_space\"\n", "\n", "print(\"Saved plot files:\")\n", "for plot_path in results_path.glob(\"*.png\"):\n", diff --git a/tutorial/notebooks/05 Identify associations.ipynb b/tutorial/notebooks/05 Identify associations.ipynb index 5b69736b..2823d841 100644 --- a/tutorial/notebooks/05 Identify associations.ipynb +++ b/tutorial/notebooks/05 Identify associations.ipynb @@ -74,7 +74,7 @@ "import pandas as pd\n", "\n", "raw_data_path = Path(config.data.raw_data_path)\n", - "results_path = Path(config.data.processed_data_path) / \"identify_associations\"\n", + "results_path = Path(config.data.results_path) / \"identify_associations\"\n", "\n", "truth = pd.read_csv(raw_data_path / \"changes.small.txt\", sep=\"\\t\", index_col=0)\n", "results = pd.read_csv(results_path / \"results_sig_assoc.tsv\", sep=\"\\t\")" @@ -120,7 +120,7 @@ "source": [ "We can also run the t-test method, and compare both results.\n", "\n", - "Note that here we override the `processed_data_path` field to prevent overwriting\n", + "Note that here we override the `results_path` field to prevent overwriting\n", "the previous results." 
] }, @@ -142,7 +142,7 @@ } ], "source": [ - "ttest_config = io.read_config(\"random_small\", \"random_small__id_assoc_ttest\", \"data.processed_data_path=results_ttest\")\n", + "ttest_config = io.read_config(\"random_small\", \"random_small__id_assoc_ttest\", \"data.results_path=results_ttest\")\n", "identify_associations(ttest_config)" ] }, @@ -162,7 +162,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_ttest_path = Path(config.data.processed_data_path) / \"identify_associations\"\n", + "results_ttest_path = Path(config.data.results_path) / \"identify_associations\"\n", "\n", "results_ttest = pd.read_csv(results_ttest_path / \"results_sig_assoc.tsv\", sep=\"\\t\")" ]