From dcd3a2e40bcb7fa87fd084bd53d44315958aec38 Mon Sep 17 00:00:00 2001 From: stefanpeidli Date: Fri, 1 Dec 2023 14:20:00 +0100 Subject: [PATCH] Add LotfollahiTheis2023 --- .../notebooks/DraegerKampmann2022.ipynb | 572 +++++++++++++ .../notebooks/LiangWang2023.ipynb | 792 ++++++++++++++++++ .../notebooks/LotfollahiTheis2023.ipynb | 214 +++++ .../notebooks/UrsuBoehm2022.ipynb | 388 +++++++++ dataset_processing/snakemake/Snakefile | 13 +- .../DraegerKampmann2022.py | 93 ++ .../DraegerKampmann2022/Snakefile | 41 + .../LiangWang2023/LiangWang2023.py | 51 ++ .../subworkflows/LiangWang2023/Snakefile | 67 ++ .../LotfollahiTheis2023.py | 47 ++ .../LotfollahiTheis2023/Snakefile | 41 + .../subworkflows/WesselsSatija2023/Snakefile | 43 +- .../archive/ChenGuo2021/ChenGuo2021.py | 93 ++ .../archive/ChenGuo2021/Snakefile | 42 + .../archive/UrsuBoehm2022/Snakefile | 40 + .../archive/UrsuBoehm2022/UrsuBoehm2022.py | 93 ++ 16 files changed, 2610 insertions(+), 20 deletions(-) create mode 100644 dataset_processing/notebooks/DraegerKampmann2022.ipynb create mode 100644 dataset_processing/notebooks/LiangWang2023.ipynb create mode 100644 dataset_processing/notebooks/LotfollahiTheis2023.ipynb create mode 100644 dataset_processing/notebooks/UrsuBoehm2022.ipynb create mode 100644 dataset_processing/snakemake/subworkflows/DraegerKampmann2022/DraegerKampmann2022.py create mode 100644 dataset_processing/snakemake/subworkflows/DraegerKampmann2022/Snakefile create mode 100644 dataset_processing/snakemake/subworkflows/LiangWang2023/LiangWang2023.py create mode 100644 dataset_processing/snakemake/subworkflows/LiangWang2023/Snakefile create mode 100644 dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/LotfollahiTheis2023.py create mode 100644 dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/Snakefile create mode 100644 dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/ChenGuo2021.py create mode 100644 
dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/Snakefile create mode 100644 dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/Snakefile create mode 100644 dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/UrsuBoehm2022.py diff --git a/dataset_processing/notebooks/DraegerKampmann2022.ipynb b/dataset_processing/notebooks/DraegerKampmann2022.ipynb new file mode 100644 index 0000000..b6d472d --- /dev/null +++ b/dataset_processing/notebooks/DraegerKampmann2022.ipynb @@ -0,0 +1,572 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "338742b7-7103-4475-adf1-0b6122dd3abb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pyopenssl 23.0.0 requires cryptography<40,>=38.0.0, but you have cryptography 41.0.7 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdf19927-f5a5-49e3-85ee-9b22bbed4b30", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import subprocess\n", + "import os\n", + "import sys\n", + "import matplotlib.backends.backend_pdf\n", + "import scanpy as sc\n", + "import matplotlib.pyplot as pl\n", + "import anndata as ad\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import scvelo as scv\n", + "scv.settings.verbosity=1\n", + "\n", + "from 
pathlib import Path\n", + "\n", + "# Jupyter stuff\n", + "from tqdm.notebook import tqdm\n", + "from IPython.display import clear_output\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Custom functions\n", + "sys.path.insert(1, '../')\n", + "from utils import *\n", + "\n", + "# scperturb package\n", + "sys.path.insert(1, '../package/src/')\n", + "from scperturb import *\n", + "\n", + "from pathlib import Path\n", + "figure_path = Path('../figures/')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1276f674-d382-4a04-a5a1-89034a5d6892", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TEMPDIR = Path('/scratch/peidli/scPerturb/')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c604eabe-3e9e-48f7-a090-3fd76bc847f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['GSM5387652_iTF_Microglia_10X_Lane1_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387652_iTF_Microglia_10X_Lane1_raw_feature_bc_matrix.h5',\n", + " 'GSM5387653_iTF_Microglia_10X_Lane2_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387653_iTF_Microglia_10X_Lane2_raw_feature_bc_matrix.h5',\n", + " 'GSM5387654_iTF_Microglia_10X_Lane3_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387654_iTF_Microglia_10X_Lane3_raw_feature_bc_matrix.h5',\n", + " 'GSM5387655_iTF_Microglia_10X_Lane4_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387655_iTF_Microglia_10X_Lane4_raw_feature_bc_matrix.h5',\n", + " 'GSM5387656_iTF_Microglia_sgRNAenrichment_Lane1_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387656_iTF_Microglia_sgRNAenrichment_Lane1_raw_feature_bc_matrix.h5',\n", + " 'GSM5387657_iTF_Microglia_sgRNAenrichment_Lane2_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387657_iTF_Microglia_sgRNAenrichment_Lane2_raw_feature_bc_matrix.h5',\n", + " 'GSM5387658_iTF_Microglia_sgRNAenrichment_Lane3_filtered_feature_bc_matrix.h5',\n", + " 
'GSM5387658_iTF_Microglia_sgRNAenrichment_Lane3_raw_feature_bc_matrix.h5',\n", + " 'GSM5387659_iTF_Microglia_sgRNAenrichment_Lane4_filtered_feature_bc_matrix.h5',\n", + " 'GSM5387659_iTF_Microglia_sgRNAenrichment_Lane4_raw_feature_bc_matrix.h5']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted([file.name for file in (TEMPDIR / 'DraegerKampmann2022/').glob('*')])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce78baa9-ac2a-47a4-9f1b-ec946a900c0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "adata = sc.read_10x_h5(TEMPDIR / 'DraegerKampmann2022/GSM5387652_iTF_Microglia_10X_Lane1_filtered_feature_bc_matrix.h5', gex_only=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c38519dc-b293-47f3-88af-88e82e3ca005", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene_idsfeature_typesgenome
MIR1302-2HGENSG00000243485Gene ExpressionGRCh38
FAM138AENSG00000237613Gene ExpressionGRCh38
OR4F5ENSG00000186092Gene ExpressionGRCh38
AL627309.1ENSG00000238009Gene ExpressionGRCh38
AL627309.3ENSG00000239945Gene ExpressionGRCh38
............
AC233755.2ENSG00000277856Gene ExpressionGRCh38
AC233755.1ENSG00000275063Gene ExpressionGRCh38
AC240274.1ENSG00000271254Gene ExpressionGRCh38
AC213203.1ENSG00000277475Gene ExpressionGRCh38
FAM231CENSG00000268674Gene ExpressionGRCh38
\n", + "

33538 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene_ids feature_types genome\n", + "MIR1302-2HG ENSG00000243485 Gene Expression GRCh38\n", + "FAM138A ENSG00000237613 Gene Expression GRCh38\n", + "OR4F5 ENSG00000186092 Gene Expression GRCh38\n", + "AL627309.1 ENSG00000238009 Gene Expression GRCh38\n", + "AL627309.3 ENSG00000239945 Gene Expression GRCh38\n", + "... ... ... ...\n", + "AC233755.2 ENSG00000277856 Gene Expression GRCh38\n", + "AC233755.1 ENSG00000275063 Gene Expression GRCh38\n", + "AC240274.1 ENSG00000271254 Gene Expression GRCh38\n", + "AC213203.1 ENSG00000277475 Gene Expression GRCh38\n", + "FAM231C ENSG00000268674 Gene Expression GRCh38\n", + "\n", + "[33538 rows x 3 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.var" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "d7232587-66fa-45c3-a77e-a7bf35942fb7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "feature_types\n", + "Gene Expression 33538\n", + "dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.var.value_counts('feature_types')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "0b4a44dc-b63e-4819-b960-76956f4e8b69", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sdata = sc.read_10x_h5(TEMPDIR / 'DraegerKampmann2022/GSM5387656_iTF_Microglia_sgRNAenrichment_Lane1_filtered_feature_bc_matrix.h5', gex_only=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7e599e57-81c4-41e9-a7a7-90dae7a420ac", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 59551 × 33538\n", + " var: 'gene_ids', 'feature_types', 'genome'" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdata" + ] + }, + { + "cell_type": "code", + 
"execution_count": 38, + "id": "14770447-745d-45dc-ac9c-d1fa19319d93", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene_idsfeature_typesgenome
MIR1302-2HGENSG00000243485Gene ExpressionGRCh38
FAM138AENSG00000237613Gene ExpressionGRCh38
OR4F5ENSG00000186092Gene ExpressionGRCh38
AL627309.1ENSG00000238009Gene ExpressionGRCh38
AL627309.3ENSG00000239945Gene ExpressionGRCh38
............
AC233755.2ENSG00000277856Gene ExpressionGRCh38
AC233755.1ENSG00000275063Gene ExpressionGRCh38
AC240274.1ENSG00000271254Gene ExpressionGRCh38
AC213203.1ENSG00000277475Gene ExpressionGRCh38
FAM231CENSG00000268674Gene ExpressionGRCh38
\n", + "

33538 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene_ids feature_types genome\n", + "MIR1302-2HG ENSG00000243485 Gene Expression GRCh38\n", + "FAM138A ENSG00000237613 Gene Expression GRCh38\n", + "OR4F5 ENSG00000186092 Gene Expression GRCh38\n", + "AL627309.1 ENSG00000238009 Gene Expression GRCh38\n", + "AL627309.3 ENSG00000239945 Gene Expression GRCh38\n", + "... ... ... ...\n", + "AC233755.2 ENSG00000277856 Gene Expression GRCh38\n", + "AC233755.1 ENSG00000275063 Gene Expression GRCh38\n", + "AC240274.1 ENSG00000271254 Gene Expression GRCh38\n", + "AC213203.1 ENSG00000277475 Gene Expression GRCh38\n", + "FAM231C ENSG00000268674 Gene Expression GRCh38\n", + "\n", + "[33538 rows x 3 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdata.var" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "067f6eb4-ed98-4bdf-a3cb-e87578c191ab", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "feature_types\n", + "Gene Expression 33538\n", + "dtype: int64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdata.var.value_counts('feature_types')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "2a044e8a-16ac-4fb5-a88a-06c4f6a019b0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AARS2', 'AARS', 'AARSD1'], dtype='object')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdata.var_names[sdata.var.index.str.startswith('AARS')]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "8e3ebcbc-a7ef-4b65-8545-586656cfb215", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['CDK12'], dtype='object')" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"sdata.var_names[sdata.var.index.str.startswith('CDK12')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fe653cd-894f-470d-bcdd-204846f72552", + "metadata": {}, + "outputs": [], + "source": [ + "# There is no sgRNA-seq info in here. I wrote to the original authors (01.12.2023)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dataset_processing/notebooks/LiangWang2023.ipynb b/dataset_processing/notebooks/LiangWang2023.ipynb new file mode 100644 index 0000000..fdb381c --- /dev/null +++ b/dataset_processing/notebooks/LiangWang2023.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "31abf236-2930-411e-a5a0-7829d7a9a00e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "pyopenssl 23.0.0 requires cryptography<40,>=38.0.0, but you have cryptography 41.0.7 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7341e27b-d1f0-48a3-ac7e-8f8b462b1f4f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import subprocess\n", + "import os\n", + "import sys\n", + "import matplotlib.backends.backend_pdf\n", + "import scanpy as sc\n", + "import matplotlib.pyplot as pl\n", + "import anndata as ad\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import scvelo as scv\n", + "scv.settings.verbosity=1\n", + "\n", + "from pathlib import Path\n", + "\n", + "# Jupyter stuff\n", + "from tqdm.notebook import tqdm\n", + "from IPython.display import clear_output\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Custom functions\n", + "sys.path.insert(1, '../')\n", + "from utils import *\n", + "\n", + "# scperturb package\n", + "sys.path.insert(1, '../package/src/')\n", + "from scperturb import *\n", + "\n", + "from pathlib import Path\n", + "figure_path = Path('../figures/')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d6eb7fa7-5773-400c-b20e-cdf922e9a2fa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TEMPDIR = Path('/scratch/peidli/scPerturb/')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "292116db-2b0a-4b83-a45e-777812097ceb", + 
"metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['OSCAR_DM_expression_matrix.csv',\n", + " 'OSCAR_EM_expression_matrix.csv',\n", + " 'OSCAR_metadata.csv']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted([file.name for file in (TEMPDIR / 'LiangWang2023/').glob('*')])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8aeb3310-2814-4e02-8959-cd5cb8e173a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tab_DM = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_DM_expression_matrix.csv', index_col=0)\n", + "tab_EM = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_EM_expression_matrix.csv', index_col=0)\n", + "meta_df = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_metadata.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f87eb4b0-9060-4f05-9f37-e8c47cd36299", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from scipy.sparse import csr_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "43cc12a1-029b-420a-afb4-1b93279c3b45", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tab = pd.concat([tab_DM, tab_EM], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "163be1af-c6f5-45ea-a805-452907a662c6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "adata = sc.AnnData(csr_matrix(tab.T.values))\n", + "adata.obs_names = tab.columns\n", + "adata.var_names = tab.index\n", + "adata.obs = meta_df.loc[adata.obs_names]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "9923031d-f6b6-4691-b62e-c07f775e4c6a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cellbarcodesgrnagenesampletype
DM2_1_AAACCTGAGAGCTGCA-1AAACCTGAGAGCTGCA-1Spib-3CTGGGTCGAGGGTCCCCATGSpibDM2_1DM
DM2_1_AAACCTGCAAGTACCT-1AAACCTGCAAGTACCT-1Spic-3CCCTTCTTCCAGCAAAAGGGSpicDM2_1DM
DM2_1_AAACCTGCAAGTTCTG-1AAACCTGCAAGTTCTG-1Onecut1-3CCACTATGCTCATCCCGGCGOnecut1DM2_1DM
DM2_1_AAACCTGCAGATTGCT-1AAACCTGCAGATTGCT-1non_target-1AAGGCGTAAACGAGTACACGNon-TargetingDM2_1DM
DM2_1_AAACCTGCATGATCCA-1AAACCTGCATGATCCA-1Lyl1-1AGGGTGGTAATGTACAGCCALyl1DM2_1DM
.....................
EM2_3_TTTGTCACACTTGGAT-1TTTGTCACACTTGGAT-1Peg3-2GAAGGGAGCGCATTTAGGGGPeg3EM2_3EM
EM2_3_TTTGTCACAGGTCCAC-1TTTGTCACAGGTCCAC-1Nfe2-2GAAGACCCAGAATCTGACTCNfe2EM2_3EM
EM2_3_TTTGTCAGTACCGCTG-1TTTGTCAGTACCGCTG-1Mxd3-2CTGGCGCGCTGAACAGTGGGMxd3EM2_3EM
EM2_3_TTTGTCAGTCTGGTCG-1TTTGTCAGTCTGGTCG-1H1fx-3GGAGACCATCCGCAAGCTGGH1fxEM2_3EM
EM2_3_TTTGTCAGTGCCTTGG-1TTTGTCAGTGCCTTGG-1Hmga2-2TCCTCGCTTCTGTGGCACCGHmga2EM2_3EM
\n", + "

41383 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " cell barcode \\\n", + "DM2_1_AAACCTGAGAGCTGCA-1 AAACCTGAGAGCTGCA-1 Spib-3 \n", + "DM2_1_AAACCTGCAAGTACCT-1 AAACCTGCAAGTACCT-1 Spic-3 \n", + "DM2_1_AAACCTGCAAGTTCTG-1 AAACCTGCAAGTTCTG-1 Onecut1-3 \n", + "DM2_1_AAACCTGCAGATTGCT-1 AAACCTGCAGATTGCT-1 non_target-1 \n", + "DM2_1_AAACCTGCATGATCCA-1 AAACCTGCATGATCCA-1 Lyl1-1 \n", + "... ... ... \n", + "EM2_3_TTTGTCACACTTGGAT-1 TTTGTCACACTTGGAT-1 Peg3-2 \n", + "EM2_3_TTTGTCACAGGTCCAC-1 TTTGTCACAGGTCCAC-1 Nfe2-2 \n", + "EM2_3_TTTGTCAGTACCGCTG-1 TTTGTCAGTACCGCTG-1 Mxd3-2 \n", + "EM2_3_TTTGTCAGTCTGGTCG-1 TTTGTCAGTCTGGTCG-1 H1fx-3 \n", + "EM2_3_TTTGTCAGTGCCTTGG-1 TTTGTCAGTGCCTTGG-1 Hmga2-2 \n", + "\n", + " sgrna gene sample type \n", + "DM2_1_AAACCTGAGAGCTGCA-1 CTGGGTCGAGGGTCCCCATG Spib DM2_1 DM \n", + "DM2_1_AAACCTGCAAGTACCT-1 CCCTTCTTCCAGCAAAAGGG Spic DM2_1 DM \n", + "DM2_1_AAACCTGCAAGTTCTG-1 CCACTATGCTCATCCCGGCG Onecut1 DM2_1 DM \n", + "DM2_1_AAACCTGCAGATTGCT-1 AAGGCGTAAACGAGTACACG Non-Targeting DM2_1 DM \n", + "DM2_1_AAACCTGCATGATCCA-1 AGGGTGGTAATGTACAGCCA Lyl1 DM2_1 DM \n", + "... ... ... ... ... 
\n", + "EM2_3_TTTGTCACACTTGGAT-1 GAAGGGAGCGCATTTAGGGG Peg3 EM2_3 EM \n", + "EM2_3_TTTGTCACAGGTCCAC-1 GAAGACCCAGAATCTGACTC Nfe2 EM2_3 EM \n", + "EM2_3_TTTGTCAGTACCGCTG-1 CTGGCGCGCTGAACAGTGGG Mxd3 EM2_3 EM \n", + "EM2_3_TTTGTCAGTCTGGTCG-1 GGAGACCATCCGCAAGCTGG H1fx EM2_3 EM \n", + "EM2_3_TTTGTCAGTGCCTTGG-1 TCCTCGCTTCTGTGGCACCG Hmga2 EM2_3 EM \n", + "\n", + "[41383 rows x 6 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f9379c6b-0b2e-4cb1-88cc-007ece5aa5c3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Harmonize Metadata\n", + "adata.obs_names = [x.split('-')[0] for x in adata.obs_names]\n", + "adata.var.index.name='gene_symbol'\n", + "adata.obs.drop(['cell'], axis=1, inplace=True)\n", + "adata.obs.index.name = 'cell_barcode'\n", + "adata.obs.rename({\n", + " 'barcode': 'guide_id',\n", + " 'sgrna': 'guide_sequence',\n", + " 'gene': 'perturbation',\n", + " 'type': 'medium'\n", + "}, axis=1, inplace=True)\n", + "adata.obs.perturbation[adata.obs.perturbation=='Non-Targeting'] = 'control'\n", + "adata.obs.medium.replace({'DM': 'Differentiation Medium', 'EM': 'Expansion Medium'}, inplace=True)\n", + "adata.obs['perturbation_type'] = 'CRISPR-cas9'\n", + "adata.obs['nperts']= 1 - adata.obs.perturbation.str.count('control')\n", + "adata.obs['organism'] = 'mouse'\n", + "adata.obs['tissue_type'] = 'organoid'\n", + "adata.obs['disease'] = 'healthy'\n", + "adata.obs['celltype'] = 'hepatocyte'\n", + "adata.obs['cancer'] = False\n", + "adata.obs['cell_line'] = 'intrahepatic cholangiocyte organoids'" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "3507a6b2-a254-45ae-8983-1c22d4cf2d2a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
guide_idguide_sequenceperturbationsamplemediumperturbation_typenpertsorganismtissue_typediseasecelltype
cell_barcode
DM2_1_AAACCTGAGAGCTGCASpib-3CTGGGTCGAGGGTCCCCATGSpibDM2_1Differentiation MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
DM2_1_AAACCTGCAAGTACCTSpic-3CCCTTCTTCCAGCAAAAGGGSpicDM2_1Differentiation MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
DM2_1_AAACCTGCAAGTTCTGOnecut1-3CCACTATGCTCATCCCGGCGOnecut1DM2_1Differentiation MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
DM2_1_AAACCTGCAGATTGCTnon_target-1AAGGCGTAAACGAGTACACGcontrolDM2_1Differentiation MediumCRISPR-cas90mouseorganoidhealthyhepatocyte
DM2_1_AAACCTGCATGATCCALyl1-1AGGGTGGTAATGTACAGCCALyl1DM2_1Differentiation MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
....................................
EM2_3_TTTGTCACACTTGGATPeg3-2GAAGGGAGCGCATTTAGGGGPeg3EM2_3Expansion MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
EM2_3_TTTGTCACAGGTCCACNfe2-2GAAGACCCAGAATCTGACTCNfe2EM2_3Expansion MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
EM2_3_TTTGTCAGTACCGCTGMxd3-2CTGGCGCGCTGAACAGTGGGMxd3EM2_3Expansion MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
EM2_3_TTTGTCAGTCTGGTCGH1fx-3GGAGACCATCCGCAAGCTGGH1fxEM2_3Expansion MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
EM2_3_TTTGTCAGTGCCTTGGHmga2-2TCCTCGCTTCTGTGGCACCGHmga2EM2_3Expansion MediumCRISPR-cas91mouseorganoidhealthyhepatocyte
\n", + "

41383 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " guide_id guide_sequence perturbation \\\n", + "cell_barcode \n", + "DM2_1_AAACCTGAGAGCTGCA Spib-3 CTGGGTCGAGGGTCCCCATG Spib \n", + "DM2_1_AAACCTGCAAGTACCT Spic-3 CCCTTCTTCCAGCAAAAGGG Spic \n", + "DM2_1_AAACCTGCAAGTTCTG Onecut1-3 CCACTATGCTCATCCCGGCG Onecut1 \n", + "DM2_1_AAACCTGCAGATTGCT non_target-1 AAGGCGTAAACGAGTACACG control \n", + "DM2_1_AAACCTGCATGATCCA Lyl1-1 AGGGTGGTAATGTACAGCCA Lyl1 \n", + "... ... ... ... \n", + "EM2_3_TTTGTCACACTTGGAT Peg3-2 GAAGGGAGCGCATTTAGGGG Peg3 \n", + "EM2_3_TTTGTCACAGGTCCAC Nfe2-2 GAAGACCCAGAATCTGACTC Nfe2 \n", + "EM2_3_TTTGTCAGTACCGCTG Mxd3-2 CTGGCGCGCTGAACAGTGGG Mxd3 \n", + "EM2_3_TTTGTCAGTCTGGTCG H1fx-3 GGAGACCATCCGCAAGCTGG H1fx \n", + "EM2_3_TTTGTCAGTGCCTTGG Hmga2-2 TCCTCGCTTCTGTGGCACCG Hmga2 \n", + "\n", + " sample medium perturbation_type \\\n", + "cell_barcode \n", + "DM2_1_AAACCTGAGAGCTGCA DM2_1 Differentiation Medium CRISPR-cas9 \n", + "DM2_1_AAACCTGCAAGTACCT DM2_1 Differentiation Medium CRISPR-cas9 \n", + "DM2_1_AAACCTGCAAGTTCTG DM2_1 Differentiation Medium CRISPR-cas9 \n", + "DM2_1_AAACCTGCAGATTGCT DM2_1 Differentiation Medium CRISPR-cas9 \n", + "DM2_1_AAACCTGCATGATCCA DM2_1 Differentiation Medium CRISPR-cas9 \n", + "... ... ... ... \n", + "EM2_3_TTTGTCACACTTGGAT EM2_3 Expansion Medium CRISPR-cas9 \n", + "EM2_3_TTTGTCACAGGTCCAC EM2_3 Expansion Medium CRISPR-cas9 \n", + "EM2_3_TTTGTCAGTACCGCTG EM2_3 Expansion Medium CRISPR-cas9 \n", + "EM2_3_TTTGTCAGTCTGGTCG EM2_3 Expansion Medium CRISPR-cas9 \n", + "EM2_3_TTTGTCAGTGCCTTGG EM2_3 Expansion Medium CRISPR-cas9 \n", + "\n", + " nperts organism tissue_type disease celltype \n", + "cell_barcode \n", + "DM2_1_AAACCTGAGAGCTGCA 1 mouse organoid healthy hepatocyte \n", + "DM2_1_AAACCTGCAAGTACCT 1 mouse organoid healthy hepatocyte \n", + "DM2_1_AAACCTGCAAGTTCTG 1 mouse organoid healthy hepatocyte \n", + "DM2_1_AAACCTGCAGATTGCT 0 mouse organoid healthy hepatocyte \n", + "DM2_1_AAACCTGCATGATCCA 1 mouse organoid healthy hepatocyte \n", + "... ... ... ... 
... ... \n", + "EM2_3_TTTGTCACACTTGGAT 1 mouse organoid healthy hepatocyte \n", + "EM2_3_TTTGTCACAGGTCCAC 1 mouse organoid healthy hepatocyte \n", + "EM2_3_TTTGTCAGTACCGCTG 1 mouse organoid healthy hepatocyte \n", + "EM2_3_TTTGTCAGTCTGGTCG 1 mouse organoid healthy hepatocyte \n", + "EM2_3_TTTGTCAGTGCCTTGG 1 mouse organoid healthy hepatocyte \n", + "\n", + "[41383 rows x 11 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a586159-ad66-428c-8740-877a7e6f7f53", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5931ad47-bbf8-49e3-9a59-0085b8f97dfc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "f71584aa-1cb5-4a92-915b-52fd535abd83", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene_symbol
Xkr4
Rp1
Sox17
Mrpl15
Lypla1
...
AC125149.3
AC168977.1
AC149090.1
CAAA01118383.1
CAAA01147332.1
\n", + "

17444 rows × 0 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: [Xkr4, Rp1, Sox17, Mrpl15, Lypla1, Gm37988, Tcea1, Rgs20, Atp6v1h, Rb1cc1, 4732440D04Rik, St18, Pcmtd1, Gm26901, Gm30414, Sntg1, Rrs1, Adhfe1, 3110035E14Rik, Mybl1, Vcpip1, 1700034P13Rik, Sgk3, Mcmdc2, Snhg6, Tcf24, Ppp1r42, Cops5, Cspp1, Arfgef1, Cpa6, Prex2, A830018L16Rik, Slco5a1, Ncoa2, Tram1, Lactb2, Xkr9, Eya1, Trpa1, Terf1, Sbspon, 4930444P10Rik, Rpl7, Rdh10, Stau2, Ube2w, Eloc, D030040B21Rik, Tmem70, Ly96, Jph1, Gdap1, Defb41, Pkhd1, Il17f, Mcm3, 6720483E21Rik, Paqr8, Efhc1, Tram2, Tmem14a, Gsta3, Gm28836, Khdc1a, Kcnq5, Rims1, Gm29107, Ogfrl1, B3gat2, Smap1, Sdhaf4, Fam135a, Lmbrd1, Gm5524, Phf3, Ptp4a1, Gm29669, 4931428L18Rik, Khdrbs2, Prim2, Rab23, Bag2, Zfp451, Bend6, Dst, Ccdc115, Imp4, Ptpn18, 4930568A12Rik, Arhgef4, Gm38336, Fam168b, Plekhb2, 1110002O04Rik, Hs6st1, Uggt1, Neurl3, Arid5a, 4930403P22Rik, ...]\n", + "\n", + "[17444 rows x 0 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.var" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49738291-fe94-4e94-900c-6ca99a75e976", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dataset_processing/notebooks/LotfollahiTheis2023.ipynb b/dataset_processing/notebooks/LotfollahiTheis2023.ipynb new file mode 100644 index 0000000..9d1de33 --- /dev/null +++ b/dataset_processing/notebooks/LotfollahiTheis2023.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 
null, + "id": "338742b7-7103-4475-adf1-0b6122dd3abb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdf19927-f5a5-49e3-85ee-9b22bbed4b30", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import subprocess\n", + "import os\n", + "import sys\n", + "import matplotlib.backends.backend_pdf\n", + "import scanpy as sc\n", + "import matplotlib.pyplot as pl\n", + "import anndata as ad\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import scvelo as scv\n", + "scv.settings.verbosity=1\n", + "\n", + "from pathlib import Path\n", + "\n", + "# Jupyter stuff\n", + "from tqdm.notebook import tqdm\n", + "from IPython.display import clear_output\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Custom functions\n", + "sys.path.insert(1, '../')\n", + "from utils import *\n", + "\n", + "# scperturb package\n", + "sys.path.insert(1, '../package/src/')\n", + "from scperturb import *\n", + "\n", + "from pathlib import Path\n", + "figure_path = Path('../figures/')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1276f674-d382-4a04-a5a1-89034a5d6892", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TEMPDIR = Path('/scratch/peidli/scPerturb/')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c604eabe-3e9e-48f7-a090-3fd76bc847f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['GSE206741_cell_metadata.tsv',\n", + " 
'GSE206741_count_matrix.mtx',\n", + " 'GSE206741_gene_metadata.tsv']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted([file.name for file in (TEMPDIR / 'LotfollahiTheis2023/').glob('*')])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "027e6124-715a-453c-81a2-e74b7166bfcf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from scipy.io import mmread\n", + "from scipy.sparse import csr_matrix\n", + "obs = pd.read_csv(TEMPDIR / f'LotfollahiTheis2023/GSE206741_cell_metadata.tsv', index_col=0, sep='\\t')\n", + "var = pd.read_csv(TEMPDIR / f'LotfollahiTheis2023/GSE206741_gene_metadata.tsv', index_col=0, sep='\\t')\n", + "X = csr_matrix(mmread(TEMPDIR / f'LotfollahiTheis2023/GSE206741_count_matrix.mtx'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "1697c169-08b2-4714-9d8d-f4c18980d494", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "adata = sc.AnnData(X.T, obs, var)\n", + "\n", + "adata.var.set_index('gene_short_name', inplace=True, drop=False)\n", + "adata.var.columns = ['ensembl_id']\n", + "adata.var.index.name = 'gene_symbol'\n", + "\n", + "adata.obs['perturbation'] = ['_'.join(np.sort([d1,d2])).replace('\\xa0','') for d1, d2 in zip(adata.obs.Drug1, adata.obs.Drug2)]\n", + "adata.obs['perturbation'] = [x.replace('DMSO_', '').replace('_DMSO', '').replace('DMSO', 'control') for x in adata.obs.perturbation]\n", + "adata.obs.index.name = 'cell_barcode'\n", + "adata.obs.rename({\n", + " 'n.umi': 'ncounts', \n", + "}, axis=1, inplace=True)\n", + "adata.obs.drop(['sample', 'Drug1', 'Drug2'], axis=1, inplace=True)\n", + "adata.obs = adata.obs[['perturbation', 'Size_Factor', 'ncounts', 'RT_well', 'Well']]\n", + "adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation]\n", + "adata.obs['perturbation_type'] = 'drug'\n", + "adata.obs['disease'] = \"lung 
adenocarcinoma\"\n", + "adata.obs['cancer'] = True\n", + "adata.obs['tissue_type']=\"cell_line\"\n", + "adata.obs[\"cell_line\"] = \"A549\"\n", + "adata.obs[\"celltype\"] = 'lung epthelial cells'\n", + "adata.obs['organism'] = 'human'\n", + "#annotate_qc(adata, species='human')\n", + "#assert_annotations(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88c75250-16de-427c-b0a1-02b5a4cd75db", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd6705a5-e938-41d9-af0b-c08651210975", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ad720e-25ee-42b5-a4b1-b5e6ea5091d3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a27157d-43c2-4017-85c6-1333e05dd182", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dataset_processing/notebooks/UrsuBoehm2022.ipynb b/dataset_processing/notebooks/UrsuBoehm2022.ipynb new file mode 100644 index 0000000..d53ad6b --- /dev/null +++ b/dataset_processing/notebooks/UrsuBoehm2022.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "id": "338742b7-7103-4475-adf1-0b6122dd3abb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath 
git+https://github.com/saezlab/pypath.git --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bdf19927-f5a5-49e3-85ee-9b22bbed4b30", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import subprocess\n", + "import os\n", + "import sys\n", + "import matplotlib.backends.backend_pdf\n", + "import scanpy as sc\n", + "import matplotlib.pyplot as pl\n", + "import anndata as ad\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import scvelo as scv\n", + "scv.settings.verbosity=1\n", + "\n", + "from pathlib import Path\n", + "\n", + "# Jupyter stuff\n", + "from tqdm.notebook import tqdm\n", + "from IPython.display import clear_output\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Custom functions\n", + "sys.path.insert(1, '../')\n", + "from utils import *\n", + "\n", + "# scperturb package\n", + "sys.path.insert(1, '../package/src/')\n", + "from scperturb import *\n", + "\n", + "from pathlib import Path\n", + "figure_path = Path('../figures/')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1276f674-d382-4a04-a5a1-89034a5d6892", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TEMPDIR = Path('/scratch/peidli/scPerturb/')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c604eabe-3e9e-48f7-a090-3fd76bc847f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['GSE161824_A549_KRAS.processed.cells.csv',\n", + " 'GSE161824_A549_KRAS.processed.cells.metadata.csv',\n", + " 'GSE161824_A549_KRAS.processed.genes.csv',\n", + " 'GSE161824_A549_KRAS.processed.genes.metadata.csv',\n", + " 'GSE161824_A549_KRAS.processed.matrix.mtx',\n", + " 'GSE161824_A549_KRAS.rawcounts.cells.csv',\n", + " 
'GSE161824_A549_KRAS.rawcounts.genes.csv',\n", + " 'GSE161824_A549_KRAS.rawcounts.matrix.mtx',\n", + " 'GSE161824_A549_KRAS.variants2cell.csv',\n", + " 'GSE161824_A549_TP53.processed.cells.csv',\n", + " 'GSE161824_A549_TP53.processed.cells.metadata.csv',\n", + " 'GSE161824_A549_TP53.processed.genes.csv',\n", + " 'GSE161824_A549_TP53.processed.genes.metadata.csv',\n", + " 'GSE161824_A549_TP53.processed.matrix.mtx',\n", + " 'GSE161824_A549_TP53.rawcounts.cells.csv',\n", + " 'GSE161824_A549_TP53.rawcounts.genes.csv',\n", + " 'GSE161824_A549_TP53.rawcounts.matrix.mtx',\n", + " 'GSE161824_A549_TP53.variants2cell.csv',\n", + " 'GSE161824_RAW.tar',\n", + " 'GSE161824_SCEVIP.README.pdf',\n", + " 'filelist.txt']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted([file.name for file in (TEMPDIR / 'UrsuBoehm2022/').glob('*')])" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "ce78baa9-ac2a-47a4-9f1b-ec946a900c0e", + "metadata": {}, + "outputs": [], + "source": [ + "# Read Data\n", + "from scipy.io import mmread\n", + "from scipy.sparse import csr_matrix\n", + "keys = ['KRAS', 'TP53']\n", + "adatas = {}\n", + "for key in tqdm(keys):\n", + " var = pd.read_csv(TEMPDIR / f'UrsuBoehm2022/GSE161824_A549_{key}.rawcounts.genes.csv', index_col=0, names=['gene_symbol'])\n", + " X = csr_matrix(mmread(TEMPDIR / f'UrsuBoehm2022/GSE161824_A549_{key}.rawcounts.matrix.mtx'))\n", + " \n", + " obs = pd.read_csv(TEMPDIR / f'UrsuBoehm2022/GSE161824_A549_{key}.rawcounts.cells.csv', index_col=0, names=['cell_barcode'])\n", + " variants = pd.read_csv(TEMPDIR / f'UrsuBoehm2022/GSE161824_A549_{key}.variants2cell.csv', sep='\\t')\n", + " variants.set_index('cell', inplace=True)\n", + " assert np.sum(obs.index!=variants.index)==0\n", + " var_counts = variants.iloc[:,2:-2]\n", + " variants.drop(var_counts.columns, axis=1, inplace=True)\n", + " obs = pd.concat([obs, variants], axis=1)\n", + "\n", + " adata = 
sc.AnnData(X, obs, var)\n", + " adata.obsm['Variant_Counts'] = var_counts\n", + " adatas[key] = adata" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "fd6705a5-e938-41d9-af0b-c08651210975", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "adata = adatas['KRAS']" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "08ad720e-25ee-42b5-a4b1-b5e6ea5091d3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
batchn_countsvariantvariant.detailed_multi
AAACCTGCAACGCACC-1-0025694.0unassignedunassigned
AAACCTGCAATGGTCT-1-0034868.0T50TT50T
AAACCTGCAGGACGTA-1-0025170.0unassignedunassigned
AAACCTGCAGTAAGAT-1-0031500.0unassignedunassigned
AAACGGGAGACAGAGA-1-0022654.0T127TT127T
...............
TTTGTCAAGAGGGCTT-1-313115471.0M170LM170L
TTTGTCAAGGGATACC-1-313121986.0Q99EQ99E
TTTGTCACACATGACT-1-313117064.0G13RG13R
TTTGTCACAGAAGCAC-1-313123769.0Y166HY166H
TTTGTCATCCGAATGT-1-313124162.0T20RT20R
\n", + "

150044 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " batch n_counts variant variant.detailed_multi\n", + "AAACCTGCAACGCACC-1-0 0 25694.0 unassigned unassigned\n", + "AAACCTGCAATGGTCT-1-0 0 34868.0 T50T T50T\n", + "AAACCTGCAGGACGTA-1-0 0 25170.0 unassigned unassigned\n", + "AAACCTGCAGTAAGAT-1-0 0 31500.0 unassigned unassigned\n", + "AAACGGGAGACAGAGA-1-0 0 22654.0 T127T T127T\n", + "... ... ... ... ...\n", + "TTTGTCAAGAGGGCTT-1-31 31 15471.0 M170L M170L\n", + "TTTGTCAAGGGATACC-1-31 31 21986.0 Q99E Q99E\n", + "TTTGTCACACATGACT-1-31 31 17064.0 G13R G13R\n", + "TTTGTCACAGAAGCAC-1-31 31 23769.0 Y166H Y166H\n", + "TTTGTCATCCGAATGT-1-31 31 24162.0 T20R T20R\n", + "\n", + "[150044 rows x 4 columns]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a27157d-43c2-4017-85c6-1333e05dd182", + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs.rename({'n_counts': 'ncounts'}, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4112a80d-56c2-4b54-af99-f32e1029a161", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['unassigned', 'T50T', 'T127T', 'G13V', 'Q22K', 'T20M', 'Q61P',\n", + " 'S17S', 'Q99E', 'G77A', 'M111L', 'Q61A', 'D30D', 'I36M', 'A146V',\n", + " 'E31K', 'G12A', 'G60D', 'R68S', 'P34R', 'E63K', 'multiple',\n", + " 'F141L', 'A66A', 'K178K', 'T74A', 'D119G', 'WT', 'G13C', 'G75A',\n", + " 'K88K', 'L159S', 'V112I', 'S136N', 'I163S', 'T158A', 'A146P',\n", + " 'D173D', 'R135T', 'A146T', 'A59G', 'R164Q', 'G12F', 'Q61R',\n", + " 'K117R', 'T74T', 'K117N', 'C118S', 'AG59GV', 'T144T', 'N26Y',\n", + " 'K169K', 'A155G', 'G13R', 'K176Q', 'A130V', 'D57N', 'T50I', 'L52F',\n", + " 'T144P', 'R149K', 'G12I', 'G13E', 'P110S', 'T50P', 'V8V', 'P34L',\n", + " 'G60S', 'K147N', 'V14L', 'G60V', 'T20R', 'L19F', 'G12S', 'L79I',\n", + " 'K147T', 'Q25H', 'G12Y', 'G12C', 'M170L', 'K179R', 
'G12D', 'Q61L',\n", + " 'T20T', 'V14I', 'C185Y', 'Q61H', 'T158T', 'G12R', 'K5E', 'A59E',\n", + " 'Q22H', 'Y166H', 'R41K', 'A59T', 'Q61K', 'G12V', 'T58I', 'AG11TD',\n", + " 'D33E'], dtype=object)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.variant.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f5a5e1d-a594-42e4-8e98-b476f4fc1b81", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dataset_processing/snakemake/Snakefile b/dataset_processing/snakemake/Snakefile index c3a95b0..ffa5735 100644 --- a/dataset_processing/snakemake/Snakefile +++ b/dataset_processing/snakemake/Snakefile @@ -25,6 +25,9 @@ include: "subworkflows/SantinhaPlatt2023/Snakefile" include: "subworkflows/LaraAstiasoHuntly2023/Snakefile" include: "subworkflows/SunshineHein2023/Snakefile" include: "subworkflows/WesselsSatija2023/Snakefile" +include: "subworkflows/LiangWang2023/Snakefile" +include: "subworkflows/UrsuBoehm2022/Snakefile" +include: "subworkflows/LotfollahiTheis2023/Snakefile" ### RULES ### @@ -35,11 +38,17 @@ rule all: rules.YaoCleary2023.output, rules.QinTape2023.output, # only scRNA-seq for now, CyTOF maybe later rules.SantinhaPlatt2023.output, - rules.McFalineTrapnell2023.output, # screen 2 too big. OOF??? 
rules.XuCao2023.output, rules.LaraAstiasoHuntly2023.output, rules.SunshineHein2023.output, - rules.WesselsSatija2023.output + rules.WesselsSatija2023.output, + rules.LiangWang2023.output, + rules.LotfollahiTheis2023.output + # dysfunct + # WIP rules.McFalineTrapnell2023.output, # screen 2 too big. OOF??? # rules.WuBassett2023.output, # obtained from original authors, maybe a copy is still left on my charité mac? + + # deprecated + # rules.UrsuBoehm2022.output, # does not have a control diff --git a/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/DraegerKampmann2022.py b/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/DraegerKampmann2022.py new file mode 100644 index 0000000..3921fe9 --- /dev/null +++ b/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/DraegerKampmann2022.py @@ -0,0 +1,93 @@ +import pandas as pd +import scanpy as sc +import numpy as np +import sys + +from scipy.io import mmread +from scipy.sparse import csr_matrix +from pathlib import Path + +# Custom functions +sys.path.insert(1, '../../') +from utils import annotate_qc, assert_annotations + +TEMPDIR = Path(snakemake.config['TEMPDIR']) + +# def read_count_matrix(key): +# X = csr_matrix(mmread(TEMPDIR / f'WesselsSatija2023/{key}.matrix.mtx').T) +# obs = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.barcodes.tsv', sep='\t', index_col=0, names=['cell_barcode']) +# var = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.features.tsv', sep='\t', index_col=1, names=['ENSEMBL_ID', 'gene_symbol', 'feature_type']) +# return X, obs, var + +# adatas = [] +# for j in ['1', '2', '3', '4']: +# X, obs, var = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.GEXGDO{j}') +# adata = sc.AnnData(X, obs=obs, var=var) +# adata.obs.index = [x[:-2] for x in obs.index] +# adata.obs.index = [f'L{j}_{x}' for x in adata.obs.index] + +# # Add ADT +# X_ADT, obs_ADT, var_ADT = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.ADT{j}') +# var_ADT = 
var_ADT.set_index('ENSEMBL_ID') +# var_ADT.index.name='ADT_id' +# var_ADT.feature_type = 'Surface Protein' +# obs_ADT.index = [f'L{j}_{x}' for x in obs_ADT.index] +# ADT = pd.DataFrame(X_ADT.A, index=obs_ADT.index, columns=var_ADT.index) +# diff = adata.obs.index[~adata.obs.index.isin(ADT.index)] # 2 idxs... +# for d in diff: +# ADT.loc[d]=0 +# adata.obsm['Surface_protein']=ADT.loc[adata.obs_names] + +# # Add HTO +# X_HTO, obs_HTO, var_HTO = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.HTO{j}') +# var_HTO = var_HTO.set_index('ENSEMBL_ID') +# var_HTO.index.name='HTO_id' +# var_HTO.feature_type = 'Hashtag Oligo' +# obs_HTO.index = [f'L{j}_{x}' for x in obs_HTO.index] +# HTO = pd.DataFrame(X_HTO.A, index=obs_HTO.index, columns=var_HTO.index) +# diff = adata.obs.index[~adata.obs.index.isin(HTO.index)] # 2 idxs... +# for d in diff: +# HTO.loc[d]=0 +# adata.obsm['Sample_Tags']=HTO.loc[adata.obs_names] + +# adata.var_names_make_unique() # Otherwise error +# adatas.append(adata) + +# # merge and add metadata +# adata = sc.concat(adatas) +# tab = pd.read_csv(TEMPDIR / 'WesselsSatija2023/GSE213957_THP1-CaRPool-seq.metadata.tsv', sep='\t') +# tab.index = [x[:-2] for x in tab.index] +# adata = adata[tab.index].copy() +# adata.obs = tab + +# # Harmonize +# adata.obs.drop(['S.Score', 'G2M.Score'], axis=1, inplace=True) +# adata.obs.rename({ +# 'nCount_RNA': 'ncounts', +# 'nFeature_RNA': 'ngenes', +# 'percent.mt': 'percent_mito', +# 'nCount_HTO': 'ncounts_HTO', +# 'nCount_ADT': 'ncounts_ADT', +# 'TenX.Lane': '10X_lane', +# 'CRISPR.Array': 'CRISPRcas13_Array', +# 'GenePair': 'perturbation', +# 'Guides': 'guides', +# 'Phase': 'Cell_cycle_phase' +# }, axis=1, inplace=True) +# adata.obs.perturbation[adata.obs.perturbation=='NT_NT'] = 'control' +# adata.obs.perturbation = [x.replace('NT_','').replace('_NT', '') for x in adata.obs.perturbation] +# adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation] +# 
adata.obs['perturbation_type'] = 'CRISPR-cas13' +# adata.obs['disease'] = "leukemia" +# adata.obs['cancer'] = True +# adata.obs['tissue_type']="cell_line" +# adata.obs["cell_line"] = "THP-1" +# adata.obs["celltype"] = 'monocytes' +# adata.obs['organism'] = 'human' +# annotate_qc(adata, species='human') +# adata.obs.index.name = 'cell_barcode' +# assert_annotations(adata) + +adata.write(snakemake.output[0], compression='gzip') + + diff --git a/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/Snakefile b/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/Snakefile new file mode 100644 index 0000000..1a1279d --- /dev/null +++ b/dataset_processing/snakemake/subworkflows/DraegerKampmann2022/Snakefile @@ -0,0 +1,41 @@ +""" +Author: Stefan Peidli +Date: 23.11.2023 +Run: snakemake +""" + +from pathlib import Path +configfile: "../../configuration/config.yaml" + +### PATHS ### +DATADIR = Path(config['DOWNDIR']) # place to store data +TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files) + +# ### RULES ### +rule DraegerKampmann2022_download: + output: + TEMPDIR / 'DraegerKampmann2022/download.flag' + resources: + partititon='short', + time='01:00:00', + mem_mb=8000, + disk_mb=8000 + shell: + """ + cd {TEMPDIR}/DraegerKampmann2022 + rm -rf * + wget -O GSE178317_RAW.tar 'http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE178317&format=file' + tar -xvf GSE178317_RAW.tar + rm GSE178317_RAW.tar + """ + +rule DraegerKampmann2022: + input: + rules.DraegerKampmann2022_download.output + output: DATADIR / 'DraegerKampmann2022.h5ad' + resources: + partititon='short', + time='04:00:00', + mem_mb=128000, + disk_mb=128000 + script: 'DraegerKampmann2022.py' diff --git a/dataset_processing/snakemake/subworkflows/LiangWang2023/LiangWang2023.py b/dataset_processing/snakemake/subworkflows/LiangWang2023/LiangWang2023.py new file mode 100644 index 0000000..42d6b1d --- /dev/null +++ 
b/dataset_processing/snakemake/subworkflows/LiangWang2023/LiangWang2023.py @@ -0,0 +1,51 @@ +import pandas as pd +import scanpy as sc +import numpy as np +import sys + +from scipy.io import mmread +from scipy.sparse import csr_matrix +from pathlib import Path + +# Custom functions +sys.path.insert(1, '../../') +from utils import annotate_qc, assert_annotations + +TEMPDIR = Path(snakemake.config['TEMPDIR']) + +tab_DM = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_DM_expression_matrix.csv', index_col=0) +tab_EM = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_EM_expression_matrix.csv', index_col=0) +meta_df = pd.read_csv(TEMPDIR / f'LiangWang2023/OSCAR_metadata.csv', index_col=0) + +tab = pd.concat([tab_DM, tab_EM], axis=1) + +adata = sc.AnnData(csr_matrix(tab.T.values)) +adata.obs_names = tab.columns +adata.var_names = tab.index +adata.obs = meta_df.loc[adata.obs_names] + +# Harmonize Metadata +adata.obs_names = [x.split('-')[0] for x in adata.obs_names] +adata.var.index.name='gene_symbol' +adata.obs.drop(['cell'], axis=1, inplace=True) +adata.obs.index.name = 'cell_barcode' +adata.obs.rename({ + 'barcode': 'guide_id', + 'sgrna': 'guide_sequence', + 'gene': 'perturbation', + 'type': 'medium' +}, axis=1, inplace=True) +adata.obs.perturbation[adata.obs.perturbation=='Non-Targeting'] = 'control' +adata.obs.medium.replace({'DM': 'Differentiation Medium', 'EM': 'Expansion Medium'}, inplace=True) +adata.obs['perturbation_type'] = 'CRISPR-cas9' +adata.obs['nperts']= 1 - adata.obs.perturbation.str.count('control') +adata.obs['organism'] = 'mouse' +adata.obs['tissue_type'] = 'organoid' +adata.obs['disease'] = 'healthy' +adata.obs['celltype'] = 'hepatocyte' +adata.obs['cancer'] = False +adata.obs['cell_line'] = 'intrahepatic cholangiocyte organoids' +annotate_qc(adata) +assert_annotations(adata) + +adata.write(snakemake.output[0], compression='gzip') diff --git a/dataset_processing/snakemake/subworkflows/LiangWang2023/Snakefile 
b/dataset_processing/snakemake/subworkflows/LiangWang2023/Snakefile new file mode 100644 index 0000000..25fe177 --- /dev/null +++ b/dataset_processing/snakemake/subworkflows/LiangWang2023/Snakefile @@ -0,0 +1,67 @@ +""" +Author: Stefan Peidli +Date: 23.11.2023 +Run: snakemake +""" + +from pathlib import Path +configfile: "../../configuration/config.yaml" + +### PATHS ### +DATADIR = Path(config['DOWNDIR']) # place to store data +TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files) + +# ### RULES ### +rule LiangWang2023_download: + output: + temp(directory(TEMPDIR / 'LiangWang2023/Wangxiaoyue-lab-OSCAR-d5880d4/')), + TEMPDIR / 'LiangWang2023/OSCAR_metadata.csv' + resources: + partititon='short', + time='01:00:00', + mem_mb=8000, + disk_mb=8000 + shell: + """ + cd {TEMPDIR}/LiangWang2023 + rm -rf * + wget https://zenodo.org/records/8385065/files/Wangxiaoyue-lab/OSCAR-Source_code.zip + unzip OSCAR-Source_code.zip + rm OSCAR-Source_code.zip + mv Wangxiaoyue-lab-OSCAR-d5880d4/data/2\ OSCAR/OSCAR_metadata.csv {TEMPDIR}/LiangWang2023/OSCAR_metadata.csv + """ + +rule LiangWang2023_extract: + input: + rules.LiangWang2023_download.output + output: + TEMPDIR / 'LiangWang2023/OSCAR_DM_expression_matrix.csv', + TEMPDIR / 'LiangWang2023/OSCAR_EM_expression_matrix.csv' + resources: + partititon='short', + time='01:00:00', + mem_mb=8000, + disk_mb=8000 + shell: + """ + cd {TEMPDIR}/LiangWang2023/Wangxiaoyue-lab-OSCAR-d5880d4/data/2\ OSCAR/ + cat OSCAR_DM_expression_matrix.*.0* > OSCAR_DM_expression_matrix.csv.gz + cat OSCAR_EM_expression_matrix.*.0* > OSCAR_EM_expression_matrix.csv.gz + gunzip OSCAR_DM_expression_matrix.csv.gz + gunzip OSCAR_EM_expression_matrix.csv.gz + mv OSCAR_DM_expression_matrix.csv {TEMPDIR}/LiangWang2023/OSCAR_DM_expression_matrix.csv + mv OSCAR_EM_expression_matrix.csv {TEMPDIR}/LiangWang2023/OSCAR_EM_expression_matrix.csv + """ + +rule LiangWang2023: + input: + TEMPDIR / 'LiangWang2023/OSCAR_DM_expression_matrix.csv', + TEMPDIR / 
'LiangWang2023/OSCAR_EM_expression_matrix.csv', + TEMPDIR / 'LiangWang2023/OSCAR_metadata.csv' + output: DATADIR / 'LiangWang2023.h5ad' + resources: + partititon='short', + time='04:00:00', + mem_mb=128000, + disk_mb=128000 + script: 'LiangWang2023.py' diff --git a/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/LotfollahiTheis2023.py b/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/LotfollahiTheis2023.py new file mode 100644 index 0000000..6d98320 --- /dev/null +++ b/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/LotfollahiTheis2023.py @@ -0,0 +1,47 @@ +import pandas as pd +import scanpy as sc +import numpy as np +import sys + +from scipy.io import mmread +from scipy.sparse import csr_matrix +from pathlib import Path + +# Custom functions +sys.path.insert(1, '../../') +from utils import annotate_qc, assert_annotations + +TEMPDIR = Path(snakemake.config['TEMPDIR']) + +obs = pd.read_csv(TEMPDIR / f'LotfollahiTheis2023/GSE206741_cell_metadata.tsv', index_col=0, sep='\t') +var = pd.read_csv(TEMPDIR / f'LotfollahiTheis2023/GSE206741_gene_metadata.tsv', index_col=0, sep='\t') +X = csr_matrix(mmread(TEMPDIR / f'LotfollahiTheis2023/GSE206741_count_matrix.mtx')) + +adata = sc.AnnData(X.T, obs, var) + +adata.var.set_index('gene_short_name', inplace=True, drop=False) +adata.var.columns = ['ensembl_id'] +adata.var.index.name = 'gene_symbol' + +adata.obs['perturbation'] = ['_'.join(np.sort([d1,d2])).replace('\xa0','') for d1, d2 in zip(adata.obs.Drug1, adata.obs.Drug2)] +adata.obs['perturbation'] = [x.replace('DMSO_', '').replace('_DMSO', '').replace('DMSO', 'control') for x in adata.obs.perturbation] +adata.obs.index.name = 'cell_barcode' +adata.obs.rename({ + 'n.umi': 'ncounts', +}, axis=1, inplace=True) +adata.obs.drop(['sample', 'Drug1', 'Drug2'], axis=1, inplace=True) +adata.obs = adata.obs[['perturbation', 'Size_Factor', 'ncounts', 'RT_well', 'Well']] +adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str 
else 0 for p in adata.obs.perturbation] +adata.obs['perturbation_type'] = 'drug' +adata.obs['disease'] = "lung adenocarcinoma" +adata.obs['cancer'] = True +adata.obs['tissue_type']="cell_line" +adata.obs["cell_line"] = "A549" +adata.obs["celltype"] = 'lung epthelial cells' +adata.obs['organism'] = 'human' +annotate_qc(adata, species='human') +assert_annotations(adata) + +adata.write(snakemake.output[0], compression='gzip') + + diff --git a/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/Snakefile b/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/Snakefile new file mode 100644 index 0000000..71d4170 --- /dev/null +++ b/dataset_processing/snakemake/subworkflows/LotfollahiTheis2023/Snakefile @@ -0,0 +1,41 @@ +""" +Author: Stefan Peidli +Date: 23.11.2023 +Run: snakemake +""" + +from pathlib import Path +configfile: "../../configuration/config.yaml" + +### PATHS ### +DATADIR = Path(config['DOWNDIR']) # place to store data +TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files) + +# ### RULES ### +rule LotfollahiTheis2023_download: + output: + TEMPDIR / 'LotfollahiTheis2023/download.flag' + resources: + partititon='short', + time='01:00:00', + mem_mb=8000, + disk_mb=8000 + shell: + """ + cd {TEMPDIR}/LotfollahiTheis2023 + rm -rf * + wget --recursive --no-parent -nd -R "index.html*" ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE206nnn/GSE206741/suppl/ + gunzip *.gz + touch {output} + """ + +rule LotfollahiTheis2023: + input: + rules.LotfollahiTheis2023_download.output + output: DATADIR / 'LotfollahiTheis2023.h5ad' + resources: + partititon='short', + time='04:00:00', + mem_mb=128000, + disk_mb=128000 + script: 'LotfollahiTheis2023.py' diff --git a/dataset_processing/snakemake/subworkflows/WesselsSatija2023/Snakefile b/dataset_processing/snakemake/subworkflows/WesselsSatija2023/Snakefile index 38f1bd5..4e85ba6 100644 --- a/dataset_processing/snakemake/subworkflows/WesselsSatija2023/Snakefile +++ 
b/dataset_processing/snakemake/subworkflows/WesselsSatija2023/Snakefile
@@ -12,20 +12,14 @@ DATADIR = Path(config['DOWNDIR']) # place to store data
 TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files)
 
 # ### RULES ###
-rule WesselsSatija2023:
-    input:
-        TEMPDIR / 'WesselsSatija2023/download.flag'
-    output: DATADIR / 'WesselsSatija2023.h5ad'
-    resources:
-        partititon='short',
-        time='04:00:00',
-        mem_mb=128000,
-        disk_mb=128000
-    script: 'WesselsSatija2023.py'
-
 rule WesselsSatija2023_download:
     output:
-        TEMPDIR / 'WesselsSatija2023/download.flag'
+        TEMPDIR / 'WesselsSatija2023/GSE213957_THP1-CaRPool-seq.metadata.tsv',
+        expand(TEMPDIR / 'WesselsSatija2023/THP1-CaRPool-seq_and_HEK293FTstabRNA.{types}{N}.{feature}',
+            feature=['barcodes.tsv', 'features.tsv', 'matrix.mtx'],
+            types=['ADT', 'GEXGDO', 'HTO'],
+            N=[1, 2, 3, 4]
+        )
     resources:
         partititon='short',
         time='01:00:00',
@@ -34,11 +28,24 @@ rule WesselsSatija2023_download:
     shell:
         """
         cd {TEMPDIR}/WesselsSatija2023
-        wget --recursive --no-parent -nd -R "index.html*" ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213957/suppl/
-        tar -xzvf GSE213957_HEX293FT_and_SpeciesMixing_CaRPool-seq.tar.gz
+        rm -rf *
+        # wget --recursive --no-parent -nd -R "index.html*" ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213957/suppl/
+        wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213957/suppl/GSE213957%5FTHP1%2DCaRPool%2Dseq.metadata.tsv.gz
+        wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213957/suppl/GSE213957%5FTHP1%2DCaRPool%2Dseq%5Fand%5FGSE213957%5FHEX293FTstabRNA.tar.gz
         tar -xzvf GSE213957_THP1-CaRPool-seq_and_GSE213957_HEX293FTstabRNA.tar.gz
-        tar -xzvf GSE213957_Perturb-seq.tar.gz
-        rm *.tar.gz
+        rm GSE213957_THP1-CaRPool-seq_and_GSE213957_HEX293FTstabRNA.tar.gz
+        rm *.feature_ref.*
         gunzip *.gz
-        touch {output}
-        """
\ No newline at end of file
+        """
+
+
+rule WesselsSatija2023:
+    input:
+        rules.WesselsSatija2023_download.output
+    output: DATADIR / 'WesselsSatija2023.h5ad'
+    resources:
+        partititon='short',
+        time='04:00:00',
+        mem_mb=128000,
+        disk_mb=128000
+    script: 'WesselsSatija2023.py'
diff --git a/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/ChenGuo2021.py b/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/ChenGuo2021.py
new file mode 100644
index 0000000..3921fe9
--- /dev/null
+++ b/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/ChenGuo2021.py
@@ -0,0 +1,95 @@
+import pandas as pd
+import scanpy as sc
+import numpy as np
+import sys
+
+from scipy.io import mmread
+from scipy.sparse import csr_matrix
+from pathlib import Path
+
+# Custom functions
+sys.path.insert(1, '../../')
+from utils import annotate_qc, assert_annotations
+
+TEMPDIR = Path(snakemake.config['TEMPDIR'])
+
+# def read_count_matrix(key):
+#     X = csr_matrix(mmread(TEMPDIR / f'WesselsSatija2023/{key}.matrix.mtx').T)
+#     obs = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.barcodes.tsv', sep='\t', index_col=0, names=['cell_barcode'])
+#     var = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.features.tsv', sep='\t', index_col=1, names=['ENSEMBL_ID', 'gene_symbol', 'feature_type'])
+#     return X, obs, var
+
+# adatas = []
+# for j in ['1', '2', '3', '4']:
+#     X, obs, var = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.GEXGDO{j}')
+#     adata = sc.AnnData(X, obs=obs, var=var)
+#     adata.obs.index = [x[:-2] for x in obs.index]
+#     adata.obs.index = [f'L{j}_{x}' for x in adata.obs.index]
+
+#     # Add ADT
+#     X_ADT, obs_ADT, var_ADT = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.ADT{j}')
+#     var_ADT = var_ADT.set_index('ENSEMBL_ID')
+#     var_ADT.index.name='ADT_id'
+#     var_ADT.feature_type = 'Surface Protein'
+#     obs_ADT.index = [f'L{j}_{x}' for x in obs_ADT.index]
+#     ADT = pd.DataFrame(X_ADT.A, index=obs_ADT.index, columns=var_ADT.index)
+#     diff = adata.obs.index[~adata.obs.index.isin(ADT.index)] # 2 idxs...
+#     for d in diff:
+#         ADT.loc[d]=0
+#     adata.obsm['Surface_protein']=ADT.loc[adata.obs_names]
+
+#     # Add HTO
+#     X_HTO, obs_HTO, var_HTO = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.HTO{j}')
+#     var_HTO = var_HTO.set_index('ENSEMBL_ID')
+#     var_HTO.index.name='HTO_id'
+#     var_HTO.feature_type = 'Hashtag Oligo'
+#     obs_HTO.index = [f'L{j}_{x}' for x in obs_HTO.index]
+#     HTO = pd.DataFrame(X_HTO.A, index=obs_HTO.index, columns=var_HTO.index)
+#     diff = adata.obs.index[~adata.obs.index.isin(HTO.index)] # 2 idxs...
+#     for d in diff:
+#         HTO.loc[d]=0
+#     adata.obsm['Sample_Tags']=HTO.loc[adata.obs_names]
+
+#     adata.var_names_make_unique() # Otherwise error
+#     adatas.append(adata)
+
+# # merge and add metadata
+# adata = sc.concat(adatas)
+# tab = pd.read_csv(TEMPDIR / 'WesselsSatija2023/GSE213957_THP1-CaRPool-seq.metadata.tsv', sep='\t')
+# tab.index = [x[:-2] for x in tab.index]
+# adata = adata[tab.index].copy()
+# adata.obs = tab
+
+# # Harmonize
+# adata.obs.drop(['S.Score', 'G2M.Score'], axis=1, inplace=True)
+# adata.obs.rename({
+#     'nCount_RNA': 'ncounts',
+#     'nFeature_RNA': 'ngenes',
+#     'percent.mt': 'percent_mito',
+#     'nCount_HTO': 'ncounts_HTO',
+#     'nCount_ADT': 'ncounts_ADT',
+#     'TenX.Lane': '10X_lane',
+#     'CRISPR.Array': 'CRISPRcas13_Array',
+#     'GenePair': 'perturbation',
+#     'Guides': 'guides',
+#     'Phase': 'Cell_cycle_phase'
+# }, axis=1, inplace=True)
+# adata.obs.perturbation[adata.obs.perturbation=='NT_NT'] = 'control'
+# adata.obs.perturbation = [x.replace('NT_','').replace('_NT', '') for x in adata.obs.perturbation]
+# adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation]
+# adata.obs['perturbation_type'] = 'CRISPR-cas13'
+# adata.obs['disease'] = "leukemia"
+# adata.obs['cancer'] = True
+# adata.obs['tissue_type']="cell_line"
+# adata.obs["cell_line"] = "THP-1"
+# adata.obs["celltype"] = 'monocytes'
+# adata.obs['organism'] = 'human'
+# annotate_qc(adata, species='human')
+# adata.obs.index.name = 'cell_barcode'
+# assert_annotations(adata)
+
+# NOTE: the template above still reads WesselsSatija2023 files and is fully commented out,
+# so no `adata` object exists here. Fail explicitly instead of with a NameError on `adata`.
+raise NotImplementedError('ChenGuo2021 processing is not implemented yet (archived placeholder script).')
+
+
diff --git a/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/Snakefile b/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/Snakefile
new file mode 100644
index 0000000..3d902f5
--- /dev/null
+++ b/dataset_processing/snakemake/subworkflows/archive/ChenGuo2021/Snakefile
@@ -0,0 +1,42 @@
+"""
+Author: Stefan Peidli
+Date: 23.11.2023
+Run: snakemake
+"""
+
+from pathlib import Path
+configfile: "../../configuration/config.yaml"
+
+### PATHS ###
+DATADIR = Path(config['DOWNDIR']) # place to store data
+TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files)
+
+# ### RULES ###
+rule ChenGuo2021_download:
+    output:
+        TEMPDIR / 'ChenGuo2021/download.flag'
+    resources:
+        partititon='short',
+        time='01:00:00',
+        mem_mb=8000,
+        disk_mb=8000
+    shell:
+        """
+        cd {TEMPDIR}/ChenGuo2021
+        rm -rf *
+        wget --recursive --no-parent -nd -R "index.html*" ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE175nnn/GSE175413/suppl/
+        tar -xvf GSE175413_RAW.tar
+        rm GSE175413_RAW.tar
+        touch {output}
+        """
+
+rule ChenGuo2021:
+    input:
+        rules.ChenGuo2021_download.output
+    output: DATADIR / 'ChenGuo2021.h5ad'
+    resources:
+        partititon='short',
+        time='04:00:00',
+        mem_mb=128000,
+        disk_mb=128000
+    script: 'ChenGuo2021.py'
diff --git a/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/Snakefile b/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/Snakefile
new file mode 100644
index 0000000..25ede20
--- /dev/null
+++ b/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/Snakefile
@@ -0,0 +1,42 @@
+"""
+Author: Stefan Peidli
+Date: 23.11.2023
+Run: snakemake
+"""
+
+from pathlib import Path
+configfile: "../../configuration/config.yaml"
+
+### PATHS ###
+DATADIR = Path(config['DOWNDIR']) # place to store data
+TEMPDIR = Path(config['TEMPDIR']) # place to store temporary files (huge files)
+
+# ### RULES ###
+rule UrsuBoehm2022_download:
+    output:
+        TEMPDIR / 'UrsuBoehm2022/download.flag'
+    resources:
+        partititon='short',
+        time='01:00:00',
+        mem_mb=8000,
+        disk_mb=8000
+    shell:
+        """
+        cd {TEMPDIR}/UrsuBoehm2022
+        rm -rf *
+        wget --recursive --no-parent -nd -R "index.html*" ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161824/suppl/
+        gunzip *.gz
+        # Create the declared flag output; otherwise Snakemake reports the output as missing.
+        touch {output}
+        """
+
+rule UrsuBoehm2022:
+    input:
+        rules.UrsuBoehm2022_download.output
+    output: DATADIR / 'UrsuBoehm2022.h5ad'
+    resources:
+        partititon='short',
+        time='04:00:00',
+        mem_mb=128000,
+        disk_mb=128000
+    script: 'UrsuBoehm2022.py'
diff --git a/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/UrsuBoehm2022.py b/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/UrsuBoehm2022.py
new file mode 100644
index 0000000..3921fe9
--- /dev/null
+++ b/dataset_processing/snakemake/subworkflows/archive/UrsuBoehm2022/UrsuBoehm2022.py
@@ -0,0 +1,95 @@
+import pandas as pd
+import scanpy as sc
+import numpy as np
+import sys
+
+from scipy.io import mmread
+from scipy.sparse import csr_matrix
+from pathlib import Path
+
+# Custom functions
+sys.path.insert(1, '../../')
+from utils import annotate_qc, assert_annotations
+
+TEMPDIR = Path(snakemake.config['TEMPDIR'])
+
+# def read_count_matrix(key):
+#     X = csr_matrix(mmread(TEMPDIR / f'WesselsSatija2023/{key}.matrix.mtx').T)
+#     obs = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.barcodes.tsv', sep='\t', index_col=0, names=['cell_barcode'])
+#     var = pd.read_csv(TEMPDIR / f'WesselsSatija2023/{key}.features.tsv', sep='\t', index_col=1, names=['ENSEMBL_ID', 'gene_symbol', 'feature_type'])
+#     return X, obs, var
+
+# adatas = []
+# for j in ['1', '2', '3', '4']:
+#     X, obs, var = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.GEXGDO{j}')
+#     adata = sc.AnnData(X, obs=obs, var=var)
+#     adata.obs.index = [x[:-2] for x in obs.index]
+#     adata.obs.index = [f'L{j}_{x}' for x in adata.obs.index]
+
+#     # Add ADT
+#     X_ADT, obs_ADT, var_ADT = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.ADT{j}')
+#     var_ADT = var_ADT.set_index('ENSEMBL_ID')
+#     var_ADT.index.name='ADT_id'
+#     var_ADT.feature_type = 'Surface Protein'
+#     obs_ADT.index = [f'L{j}_{x}' for x in obs_ADT.index]
+#     ADT = pd.DataFrame(X_ADT.A, index=obs_ADT.index, columns=var_ADT.index)
+#     diff = adata.obs.index[~adata.obs.index.isin(ADT.index)] # 2 idxs...
+#     for d in diff:
+#         ADT.loc[d]=0
+#     adata.obsm['Surface_protein']=ADT.loc[adata.obs_names]
+
+#     # Add HTO
+#     X_HTO, obs_HTO, var_HTO = read_count_matrix(f'THP1-CaRPool-seq_and_HEK293FTstabRNA.HTO{j}')
+#     var_HTO = var_HTO.set_index('ENSEMBL_ID')
+#     var_HTO.index.name='HTO_id'
+#     var_HTO.feature_type = 'Hashtag Oligo'
+#     obs_HTO.index = [f'L{j}_{x}' for x in obs_HTO.index]
+#     HTO = pd.DataFrame(X_HTO.A, index=obs_HTO.index, columns=var_HTO.index)
+#     diff = adata.obs.index[~adata.obs.index.isin(HTO.index)] # 2 idxs...
+#     for d in diff:
+#         HTO.loc[d]=0
+#     adata.obsm['Sample_Tags']=HTO.loc[adata.obs_names]
+
+#     adata.var_names_make_unique() # Otherwise error
+#     adatas.append(adata)
+
+# # merge and add metadata
+# adata = sc.concat(adatas)
+# tab = pd.read_csv(TEMPDIR / 'WesselsSatija2023/GSE213957_THP1-CaRPool-seq.metadata.tsv', sep='\t')
+# tab.index = [x[:-2] for x in tab.index]
+# adata = adata[tab.index].copy()
+# adata.obs = tab
+
+# # Harmonize
+# adata.obs.drop(['S.Score', 'G2M.Score'], axis=1, inplace=True)
+# adata.obs.rename({
+#     'nCount_RNA': 'ncounts',
+#     'nFeature_RNA': 'ngenes',
+#     'percent.mt': 'percent_mito',
+#     'nCount_HTO': 'ncounts_HTO',
+#     'nCount_ADT': 'ncounts_ADT',
+#     'TenX.Lane': '10X_lane',
+#     'CRISPR.Array': 'CRISPRcas13_Array',
+#     'GenePair': 'perturbation',
+#     'Guides': 'guides',
+#     'Phase': 'Cell_cycle_phase'
+# }, axis=1, inplace=True)
+# adata.obs.perturbation[adata.obs.perturbation=='NT_NT'] = 'control'
+# adata.obs.perturbation = [x.replace('NT_','').replace('_NT', '') for x in adata.obs.perturbation]
+# adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation]
+# adata.obs['perturbation_type'] = 'CRISPR-cas13'
+# adata.obs['disease'] = "leukemia"
+# adata.obs['cancer'] = True
+# adata.obs['tissue_type']="cell_line"
+# adata.obs["cell_line"] = "THP-1"
+# adata.obs["celltype"] = 'monocytes'
+# adata.obs['organism'] = 'human'
+# annotate_qc(adata, species='human')
+# adata.obs.index.name = 'cell_barcode'
+# assert_annotations(adata)
+
+# NOTE: the template above still reads WesselsSatija2023 files and is fully commented out,
+# so no `adata` object exists here. Fail explicitly instead of with a NameError on `adata`.
+raise NotImplementedError('UrsuBoehm2022 processing is not implemented yet (archived placeholder script).')
+
+