diff --git a/scripts/notebooks/Batching.ipynb b/scripts/notebooks/Batching.ipynb new file mode 100644 index 000000000..4bf62dddc --- /dev/null +++ b/scripts/notebooks/Batching.ipynb @@ -0,0 +1,1315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this notebook is to conduct batching for samples in preparation for the batched steps of the GATK-SV pipeline. \n", + "\n", + "This notebook performs hierarchical batching as described in the [batching documentation](https://broadinstitute.github.io/gatk-sv/docs/modules/eqc#batching). Users can customize the batch generation process based on various metrics analyzed in the sample QC process. At the end, generated batches are created automatically in the workspace, which can be used directly in the pipeline.\n", + "\n", + "**Suggested VM Specifications**:\n", + "* Application Configuration: Default\n", + "* CPUs: 2\n", + "* Memory: 13 GB\n", + "* Persistent Disk: 100 GB\n", + "\n", + "**Prerequisites**: Sample QC Notebook.\n", + "\n", + "**Next Steps**: TrainGCNV.\n", + "\n", + "**Legend**:\n", + "
Green Boxes for User Inputs: Users may edit the inputs provided in the code cell directly below to customize the batching parameters. The editable inputs are defined in all capitals (e.g., INCLUDE_METRICS), and their descriptions can be found in the cells above them. Reasonable defaults are provided where possible.</div>
\n", + "\n", + "**Execution Tips:**\n", + "* To quickly run all the cells containing helper functions, constants, and imports, skip to the first cell of *Data Ingestion*, click \"Cell\" in the toolbar at the top of the notebook, and select \"Run All Above.\" Then, starting from *Data Ingestion*, proceed step-by-step through the notebook.\n", + "* The keyboard shortcut to run a cell is `Shift`+`Return`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Imports\n", + "This section defines all imports required by this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [], + "hidden": true + }, + "outputs": [], + "source": [ + "# Package imports\n", + "import os\n", + "import re\n", + "import math\n", + "import subprocess\n", + "from firecloud import api as fapi\n", + "from pprint import pprint\n", + "\n", + "# Aliased imports\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Plotting imports\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.image as mpimg\n", + "import matplotlib_inline.backend_inline\n", + "from matplotlib.colors import TABLEAU_COLORS\n", + "\n", + "# Plotting settings\n", + "matplotlib_inline.backend_inline.set_matplotlib_formats('svg')\n", + "default_dpi = mpl.rcParams['figure.dpi']\n", + "mpl.rcParams['figure.dpi'] = 200" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Constants\n", + "This section declares constants used throughout the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "# Workspace-level\n", + "TLD_PATH = 'evidence_qc'\n", + "PROJECT = os.environ['GOOGLE_PROJECT']\n", + "WORKSPACE = os.environ['WORKSPACE_NAME']\n", + "WS_BUCKET = os.environ['WORKSPACE_BUCKET']\n", + "NAMESPACE = os.environ['WORKSPACE_NAMESPACE']\n", + "\n", + "# PED file validation\n", + "ID_TYPE_SAMPLE = \"sample\"\n", + "ID_TYPE_FAMILY = \"family\"\n", + "ID_TYPE_PARENT = \"parent\"\n", + "FIELD_NUMBER_ID = 1\n", + "FIELD_NUMBER_SEX = 4\n", + "ILLEGAL_ID_SUBSTRINGS = [\"chr\", \"name\", \"DEL\", \"DUP\", \"CPX\", \"CHROM\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper Functions\n", + "This section instantiates helper functions used throughout the notebook." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## File System" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def generate_file_path(tld_path, file_type, file_name):\n", + " \"\"\"\n", + " Calculate the file path of a file to save in either the file system or Google Cloud Storage.\n", + "\n", + " Args:\n", + " tld_path (str): Top-level directory path.\n", + " file_type (str): Enables generation of the specific sub-directory which a file should live in.\n", + " file_name (str): File name to chain at the end of the path.\n", + "\n", + " Returns:\n", + " str: Path to file as it should be saved, per file system outline.\n", + " \"\"\"\n", + " return tld_path + '/' + file_type + '/' + file_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def save_df(ws_bucket, file_path, df):\n", + " \"\"\"\n", + " Save a dataframe to the specified file path, creating directories if they don't exist.\n", + " \n", + " Args:\n", + " ws_bucket (str): The bucket in Google Cloud Storage where the file should be saved.\n", + " file_path (str): The path where the figure should be saved.\n", + " df (pandas.DataFrame): The dataframe to save.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " dir_path = os.path.dirname(file_path)\n", + " \n", + " if dir_path:\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + " df.to_csv(file_path, sep='\\t', index=False)\n", + " \n", + " gcs_path = f\"{ws_bucket}/{file_path}\"\n", + " subprocess.run(\n", + " [\"gsutil\", \"cp\", \"-r\", file_path, gcs_path], \n", + " stdout=subprocess.DEVNULL,\n", + " stderr=subprocess.DEVNULL\n", + " )\n", + " \n", + " print(f\"File saved to: {file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def save_figure(ws_bucket, file_path, fig=None):\n", + " \"\"\"\n", + " Save a figure to the specified file path, creating directories if they don't exist.\n", + " \n", + " Args:\n", + " ws_bucket (str): The bucket in Google Cloud Storage where the file should be saved.\n", + " file_path (str): The path where the figure should be saved.\n", + " fig (matplotlib.figure.Figure, optional): The figure to save. 
If None, uses the current figure.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " dir_path = os.path.dirname(file_path)\n", + " \n", + " if dir_path:\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + " if fig is None:\n", + " plt.savefig(file_path)\n", + " else:\n", + " fig.savefig(file_path)\n", + " \n", + " gcs_path = f\"{ws_bucket}/{file_path}\"\n", + " subprocess.run(\n", + " [\"gsutil\", \"cp\", \"-r\", file_path, gcs_path], \n", + " stdout=subprocess.DEVNULL,\n", + " stderr=subprocess.DEVNULL\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def write_batch_assignments(batches):\n", + " \"\"\"\n", + " Creates a two-column DataFrame with batch assignments and writes it to a file.\n", + "\n", + " Args:\n", + " batches (dict): Dictionary containing batch assignment information.\n", + "\n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Create batches dataframe\n", + " batch_assignments = []\n", + " for batch_name in sorted(batches.keys()):\n", + " sample_list = batches[batch_name]\n", + " batch_df = pd.DataFrame({\n", + " 'batch': [batch_name] * len(sample_list),\n", + " 'sample': sample_list\n", + " })\n", + " batch_assignments.append(batch_df)\n", + " \n", + " # Create and write the dataframe\n", + " batch_assignment_df = pd.concat(batch_assignments, ignore_index=True)\n", + " file_path = generate_file_path(TLD_PATH, \"batching\", \"batch_assignments.tsv\")\n", + " save_df(WS_BUCKET, file_path, batch_assignment_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def upload_sample_sets(batches, dry_run=False):\n", + " \"\"\"\n", + " Uploads sample sets matching generated batches to the Terra workspace.\n", + " \n", + " Args:\n", + " batches (dict): Dictionary containing batch assignment information.\n", + " dry_run (bool): Indicates whether to actually create the sample sets in workspace data.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " for set_name, sample_list in batches.items():\n", + " sample_set_df = pd.DataFrame({\n", + " \"membership:sample_set_id\": [set_name]*len(sample_list),\n", + " \"sample\" : sample_list\n", + " })\n", + " \n", + " file_name = f\"{set_name}_sample_set_membership.tsv\"\n", + " file_path = generate_file_path(TLD_PATH, \"batching\", file_name)\n", + " save_df(WS_BUCKET, file_path, sample_set_df)\n", + "\n", + " if not dry_run:\n", + " response = fapi.upload_entities_tsv(NAMESPACE, WORKSPACE, file_path, model=\"flexible\")\n", + " if response.status_code != 200:\n", + " print(\"ERROR\")\n", + " print(response)\n", + " pprint(response.text)\n", + " exit(1)\n", + " else:\n", + " print(f\"Successfully created new sample set: {set_name}.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_numeric_inputs(input_vals, log=True):\n", + " \"\"\"\n", + " Validates user input to check whether it is all numeric or not. 
\n", + " \n", + " Args:\n", + " input_vals (list): List of user input to validate.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " for i in input_vals:\n", + " if not isinstance(i, (int, float)) or not i:\n", + " raise Exception('Value input must be numeric.')\n", + " \n", + " if log:\n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_string_inputs(input_vals, log=True):\n", + " \"\"\"\n", + " Validates user input to check whether it is all strings or not. \n", + " \n", + " Args:\n", + " input_vals (list): List of user input to validate.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " for i in input_vals:\n", + " if not isinstance(i, str):\n", + " raise Exception('Value input must be a string.')\n", + " \n", + " if log:\n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_include_metrics(include_metrics, include_cols):\n", + " \"\"\"\n", + " Validates user input to check whether they are valid metrics to include in batching. \n", + " \n", + " Args:\n", + " include_metrics (list): List of metrics to validate.\n", + " include_cols (list): List of available metrics.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " validate_string_inputs(include_metrics, log=False)\n", + " \n", + " if (not len(include_metrics) > 0):\n", + " raise Exception(f\"Length of INCLUDE_METRICS must be at least 1.\")\n", + " \n", + " for i in include_metrics:\n", + " if not i in include_cols:\n", + " raise Exception(f\"Metric '{i}' not available - please only use metrics from above.\")\n", + " \n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_include_bins(include_metrics, include_bins):\n", + " \"\"\"\n", + " Validates user input to check whether they are valid bins to use when batching. \n", + " \n", + " Args:\n", + " include_metrics (list): List of metrics to validate.\n", + " include_bins (list): List of metric bins to validate.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " validate_numeric_inputs(include_bins, log=False)\n", + " \n", + " if (len(include_metrics) != len(include_bins) + 1):\n", + " raise Exception(f\"Length of INCLUDE_BINS must be 1 less than the length of INCLUDE_METRICS.\")\n", + " \n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_batch_sizes(df, target_batch_size, min_batch_size, max_batch_size):\n", + " \"\"\"\n", + " Validates user input to check whether they are valid bins to use when batching. 
\n", + " \n", + " Args:\n", + " df (pd.DataFrame): Dataframe with sample data.\n", + " target_batch_size (int): Target size of each batch.\n", + " min_batch_size (int): Minimum size of each batch.\n", + " max_batch_size (int): Maximum size of each batch.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " validate_numeric_inputs([target_batch_size, min_batch_size, max_batch_size], log=False)\n", + " \n", + " if (target_batch_size < min_batch_size):\n", + " raise Exception(\"TARGET_BATCH_SIZE must exceed MIN_BATCH_SIZE.\")\n", + " \n", + " if (max_batch_size < target_batch_size):\n", + " raise Exception(\"MAX_BATCH_SIZE must exceed TARGET_BATCH_SIZE.\")\n", + " \n", + " if (len(df) < min_batch_size):\n", + " raise Exception(\"MIN_BATCH_SIZE must exceed the number of samples.\")\n", + " \n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_id(identifier, id_type, source_file):\n", + " \"\"\"\n", + " Validates sample IDs provided based on a source file of samples.\n", + " \n", + " Args:\n", + " identifier (str): ID for a given sample.\n", + " id_type (str): Type of ID provided.\n", + " source_file (str): File that contains all samples.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Check for empty IDs\n", + " if identifier is None or identifier == \"\":\n", + " raise ValueError(f\"Empty {id_type} ID in {source_file}.\")\n", + "\n", + " # Check all characters are alphanumeric or underscore\n", + " if not re.match(r'^\\w+$', identifier):\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: '{identifier}'.\" + \n", + " \"IDs should only contain alphanumeric and underscore characters.\")\n", + "\n", + " # Check for all-numeric IDs, besides maternal & paternal ID (can be 0) and all-numeric family IDs\n", + " if id_type != ID_TYPE_FAMILY and not (id_type == ID_TYPE_PARENT and identifier == \"0\") and identifier.isdigit():\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: {identifier}. \" +\n", + " \"IDs should not contain only numeric characters.\")\n", + "\n", + " # Check for illegal substrings\n", + " for sub in ILLEGAL_ID_SUBSTRINGS:\n", + " if sub in identifier:\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: {identifier}. 
\" +\n", + " f\"IDs cannot contain the following substrings: {', '.join(ILLEGAL_ID_SUBSTRINGS)}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_ped(ped_file, samples):\n", + " \"\"\"\n", + " Validates structure and data within PED file based on series of samples provided.\n", + " Works with both local and GCS files.\n", + " \n", + " Args:\n", + " ped_file (str): Path to PED file (local or GCS path starting with 'gs://').\n", + " samples (set): Set of sample IDs to validate against.\n", + " \n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " seen_sex_1 = False\n", + " seen_sex_2 = False\n", + " samples_found = set()\n", + "\n", + " # Read PED file\n", + " try:\n", + " df = pd.read_table(ped_file, dtype=str, header=None, comment='#', names=[\n", + " 'Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype'\n", + " ])\n", + " except Exception as e:\n", + " raise ValueError(f\"Error reading PED file: {str(e)}\")\n", + " \n", + " # Ensure column count\n", + " if len(df.columns) != 6:\n", + " raise ValueError(\"PED file must have 6 columns: Family_ID, Sample_ID, \" +\n", + " \"Paternal_ID, Maternal_ID, Sex, Phenotype.\")\n", + "\n", + " # Iteratively validate each row\n", + " for _, row in df.iterrows():\n", + " # Validate ID\n", + " for identifier, id_type in zip(row[:FIELD_NUMBER_SEX],\n", + " [ID_TYPE_FAMILY, ID_TYPE_SAMPLE, ID_TYPE_PARENT, ID_TYPE_PARENT]):\n", + " validate_id(identifier, id_type, \"PED file\")\n", + "\n", + " # Assign main information to variables\n", + " sample_id = row['Sample_ID']\n", + " sex = int(row['Sex'])\n", + "\n", + " # Check for appearance of each sex\n", + " if sex == 1:\n", + " seen_sex_1 = True\n", + " elif sex == 2:\n", + " seen_sex_2 = True\n", + " elif sex != 0:\n", + " raise ValueError(f\"Sample {sample_id} has an invalid value for sex: {sex}. \" +\n", + " \"PED file must use the following values for sex: \" + \n", + " \"1 for Male, 2 for Female, 0 for Unknown/Other.\")\n", + "\n", + " # Verify no duplications\n", + " if sample_id in samples_found:\n", + " raise ValueError(f\"Duplicate entries for sample {sample_id}.\")\n", + " elif sample_id in samples:\n", + " samples_found.add(sample_id)\n", + "\n", + " # Check if all samples in the sample list are present in PED file\n", + " if len(samples_found) < len(samples):\n", + " missing = samples - samples_found\n", + " raise ValueError(f\"PED file is missing sample(s): {','.join(missing)}.\")\n", + "\n", + " # Raise error if at least one of either sex is not found\n", + " if not (seen_sex_2 and seen_sex_1):\n", + " raise ValueError(\"Did not find existence of multiple sexes in file. 
\" +\n", + " \"PED file must use the following values for sex: \" + \n", + " \"1 for Male, 2 for Female, 0 for Unknown/Other.\")\n", + " \n", + " print(\"PED file is valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Batching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_batch_metrics(df, batches, include_metrics, display=False, prefix=''):\n", + " \"\"\"\n", + " Plots graphs per batch regarding the distribution of included metrics.\n", + " \n", + " Args:\n", + " df (pandas.DataFrame): Dataframe containing sample data.\n", + " batches (list): Contains names of batches generated.\n", + " include_metrics (list): Contains metrics included in batching process.\n", + " prefix (str): Prefix to use when naming output files.\n", + " \n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " fig, ax = plt.subplots(nrows=len(batches), ncols=len(include_metrics), \n", + " figsize=(5*len(include_metrics), 5*len(batches)), sharex='col', sharey='col')\n", + " titlesize = 18\n", + " labelsize = 16\n", + " \n", + " # Determine global min and max for each metric using df\n", + " global_min = df[include_metrics].min()\n", + " global_max = df[include_metrics].max()\n", + " \n", + " for i, (batch_name, batch_samples) in enumerate(batches.items()):\n", + " batch_samples = set(batch_samples)\n", + " batch_meta = df[df.sample_id.isin(batch_samples)]\n", + " \n", + " for j, metric in enumerate(include_metrics):\n", + " data = batch_meta[metric].values\n", + " bins = np.linspace(global_min[metric], global_max[metric], 30)\n", + " \n", + " if len(batches) == 1:\n", + " current_ax = ax[j]\n", + " else:\n", + " current_ax = ax[i][j]\n", + " \n", + " current_ax.hist(data, bins=bins, color='gray', edgecolor='black')\n", + " current_ax.set_xlabel(metric, fontsize=labelsize)\n", + " current_ax.set_ylabel(\"Sample count\", fontsize=labelsize)\n", + " current_ax.set_xlim(global_min[metric], global_max[metric])\n", + " current_ax.xaxis.set_tick_params(labelbottom=True, size=labelsize)\n", + " \n", + " if len(batches) == 1:\n", + " ax[0].set_title(batch_name, fontsize=titlesize)\n", + " else:\n", + " ax[i][0].set_title(batch_name, fontsize=titlesize)\n", + " \n", + " plt.tight_layout()\n", + " \n", + " # Save the plot as an image\n", + " file_name = f\"{prefix}_distribution_{len(batches)}_batches.png\"\n", + " file_path = generate_file_path(TLD_PATH, \"batching/metric_plots\", file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " plt.close()\n", + " \n", + " # Supplementary Plot\n", + " plot_cluster_distribution(df, include_metrics, batches, prefix)\n", + " \n", + " # Supplementary Plot\n", + " plot_panelled_cluster_distribution(df, include_metrics, batches, prefix)\n", + " \n", + " # Show figure if display = True\n", + " if display:\n", + " img = mpimg.imread(file_path)\n", + " plt.figure(figsize=(10, 20))\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_cluster_distribution(df, include_metrics, batches, prefix=''):\n", + " \"\"\"\n", + " Create a plot showing the batch distribution based on the first two metrics in include_metrics.\n", + " \n", + " Args:\n", + " df (pd.DataFrame): DataFrame containing sample data.\n", + " include_metrics (list): List of metric 
names considered for batching.\n", + " batches (dict): Dictionary of batch names and lists of sample IDs.\n", + " prefix (str): Prefix to use when naming output files.\n", + " \"\"\"\n", + " x_metric, y_metric = include_metrics[:2]\n", + " \n", + " # Create a color map for batches\n", + " color_list = list(TABLEAU_COLORS.values())\n", + " batch_colors = {batch: color_list[i % len(color_list)] for i, batch in enumerate(batches.keys())}\n", + " \n", + " # Original combined plot\n", + " plt.figure(figsize=(10, 8))\n", + " \n", + " for batch_name, sample_ids in batches.items():\n", + " batch_data = df[df['sample_id'].isin(sample_ids)]\n", + " plt.scatter(batch_data[x_metric], batch_data[y_metric], label=batch_name, \n", + " color=batch_colors[batch_name], alpha=0.7)\n", + " \n", + " plt.xlabel(x_metric)\n", + " plt.ylabel(y_metric)\n", + " plt.title(f\"Batch Distribution: {x_metric} vs {y_metric}\")\n", + " plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " plt.tight_layout()\n", + " \n", + " # Save the combined plot as an image\n", + " file_name = f\"{prefix}_distribution_{len(batches)}_batches.png\"\n", + " file_path = generate_file_path(TLD_PATH, \"batching/cluster_plots\", file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_panelled_cluster_distribution(df, include_metrics, batches, prefix=''):\n", + " \"\"\"\n", + " Create a pannelled plot showing the batch distribution based on the first two metrics in include_metrics.\n", + " \n", + " Args:\n", + " df (pd.DataFrame): DataFrame containing sample data.\n", + " include_metrics (list): List of metric names considered for batching.\n", + " batches (dict): Dictionary of batch names and lists of sample IDs.\n", + " prefix (str): Prefix to use when naming output files.\n", + " \"\"\"\n", + " x_metric, y_metric = include_metrics[:2]\n", + " \n", + " # Create a color map for batches\n", + " color_list = list(TABLEAU_COLORS.values())\n", + " batch_colors = {batch: color_list[i % len(color_list)] for i, batch in enumerate(batches.keys())}\n", + " \n", + " # New panel plot\n", + " plt.figure(figsize=(10, 8))\n", + " n_batches = len(batches)\n", + " n_cols = min(3, n_batches) # Maximum 3 columns\n", + " n_rows = math.ceil(n_batches / n_cols)\n", + " \n", + " fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows))\n", + " fig.suptitle(f\"Individual Batch Distributions: {x_metric} vs {y_metric}\", fontsize=16)\n", + " \n", + " axes = axes.flatten() if n_batches > 1 else [axes]\n", + " \n", + " # Find global min and max for consistent axis limits\n", + " x_min, x_max = df[x_metric].min(), df[x_metric].max()\n", + " y_min, y_max = df[y_metric].min(), df[y_metric].max()\n", + " \n", + " for i, (batch_name, sample_ids) in enumerate(batches.items()):\n", + " batch_data = df[df['sample_id'].isin(sample_ids)]\n", + " axes[i].scatter(batch_data[x_metric], batch_data[y_metric], \n", + " color=batch_colors[batch_name], alpha=0.7)\n", + " axes[i].set_title(batch_name)\n", + " axes[i].set_xlabel(x_metric)\n", + " axes[i].set_ylabel(y_metric)\n", + " axes[i].set_xlim(x_min, x_max)\n", + " axes[i].set_ylim(y_min, y_max)\n", + " \n", + " # Remove any unused subplots\n", + " for j in range(i+1, len(axes)):\n", + " fig.delaxes(axes[j])\n", + " \n", + " plt.tight_layout()\n", + " \n", + " # Save the panel plot as an image\n", + " file_name = 
f\"{prefix}_distribution_{len(batches)}_batches.png\"\n", + " file_path = generate_file_path(TLD_PATH, \"batching/pannelled_cluster_plots\", file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def group_related_samples(df, ped_df):\n", + " \"\"\"\n", + " Groups samples that are related to other samples within a cohort.\n", + " \n", + " Args:\n", + " df (pandas.DataFrame): Dataframe that contains sample data.\n", + " ped_df (pandas.DataFrame): Dataframe that contains family structure data derived from the PED file.\n", + " \n", + " Returns:\n", + " Dictionary that maps samples to family members that are related to them.\n", + " \"\"\"\n", + " if ped_df is None:\n", + " return {sample_id: {sample_id} for sample_id in df.sample_id}\n", + " \n", + " related_groups = {}\n", + " for _, row in ped_df.iterrows():\n", + " sample_id = row['Sample_ID']\n", + " if sample_id not in df.sample_id.values:\n", + " continue\n", + " related_ids = {sample_id, row['Paternal_ID'], row['Maternal_ID']}\n", + " related_ids = {id for id in related_ids if id in df.sample_id.values}\n", + " \n", + " for id in related_ids:\n", + " if id in related_groups:\n", + " related_groups[id].update(related_ids)\n", + " else:\n", + " related_groups[id] = related_ids\n", + " \n", + " merged_groups = []\n", + " for group in related_groups.values():\n", + " for merged in merged_groups:\n", + " if group & merged:\n", + " merged.update(group)\n", + " break\n", + " else:\n", + " merged_groups.append(group)\n", + " \n", + " return {sample_id: next(group for group in merged_groups if sample_id in group) for sample_id in df.sample_id}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def select_family_representatives(ped_df, related_groups):\n", + " \"\"\"\n", + " Selects a set of representatives for each family group part of a cohort.\n", + " \n", + " Args:\n", + " ped_df (pandas.DataFrame): Dataframe that contains family structure data.\n", + " related_groups (dict): Dictionary mapping samples to their families.\n", + " \n", + " Returns:\n", + " Set of representatives to use as a proxy for unique families when batching.\n", + " \"\"\"\n", + " representatives = set()\n", + " parents = set(ped_df['Paternal_ID']).union(set(ped_df['Maternal_ID']))\n", + " parents.discard('0')\n", + " \n", + " for group in related_groups.values():\n", + " lowest_descendants = [member for member in group if member not in parents]\n", + " if lowest_descendants:\n", + " group_df = ped_df[ped_df['Sample_ID'].isin(lowest_descendants)]\n", + " phenotype_2 = group_df[group_df['Phenotype'] == 2]\n", + " rep = group_df['Sample_ID'].iloc[0] if phenotype_2.empty else phenotype_2['Sample_ID'].iloc[0]\n", + " else:\n", + " rep = min(group, key=lambda x: df[df.sample_id == x].index[0])\n", + " representatives.add(rep)\n", + " \n", + " return representatives" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def split_dataframe(df, metrics, bins, batches_per_level):\n", + " \"\"\"\n", + " Splits samples hierarchically.\n", + " \n", + " Args:\n", + " df (pandas.DataFrame): Dataframe that contains sample data.\n", + " metrics (list): Metrics to split on.\n", + " bins (list): Number of bins to split into for each level except the last.\n", + " 
batches_per_level (list): Number of batches for each level.\n", + " \n", + " Returns:\n", + " List of split dataframes.\n", + " \"\"\"\n", + " if not metrics:\n", + " return [df]\n", + "\n", + " metric = metrics[0]\n", + " if len(metrics) == 1:\n", + " return np.array_split(df.sort_values(by=metric), batches_per_level[-1])\n", + "\n", + " n_bins = bins[0]\n", + " splits = np.array_split(df.sort_values(by=metric), n_bins)\n", + " return [subsplit for split in splits for subsplit in split_dataframe(split, metrics[1:], bins[1:], batches_per_level[1:])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def generate_hierarchical_batches(df, include_metrics, include_bins, target_batch_size, min_batch_size, max_batch_size, batch_prefix, batch_suffix, reference_ped=None):\n", + " \"\"\"\n", + " Generate hierarchical batches based on specified metrics and parameters, with family-based batching.\n", + " \n", + " Args:\n", + " df (pandas.DataFrame): Dataframe that contains sample data.\n", + " include_metrics (list): Metrics to consider for batching.\n", + " include_bins (list): List of number of bins for each metric, except the last.\n", + " target_batch_size (int): Target size for each batch.\n", + " min_batch_size (int): Minimum allowable batch size.\n", + " max_batch_size (int): Maximum allowable batch size.\n", + " batch_prefix (str): Prefix for batch names.\n", + " batch_suffix (str): Suffix for batch names.\n", + " reference_ped (pandas.DataFrame): Dataframe that contains family structure data.\n", + " \n", + " Returns:\n", + " Tuple of batches dictionary and batches metadata dictionary.\n", + " \"\"\"\n", + " # Validation\n", + " estimated_samples_per_split = len(df)\n", + " for bins in include_bins:\n", + " estimated_samples_per_split /= bins\n", + " if estimated_samples_per_split < min_batch_size:\n", + " raise Exception(f\"Based on INCLUDE_BINS, expected samples per split ({estimated_samples_per_split}) \" +\n", + " \"is less than MIN_BATCH_SIZE ({min_batch_size}). 
Please adjust parameters accordingly.\")\n", + " \n", + " # Group related samples and select family representatives\n", + " related_groups = group_related_samples(df, reference_ped)\n", + " family_representatives = select_family_representatives(reference_ped, related_groups) if reference_ped is not None else set(df['sample_id'])\n", + " df_unrelated = df[df['sample_id'].isin(family_representatives)]\n", + " \n", + " # Split by gender\n", + " isfemale = (df_unrelated.chrX_CopyNumber_rounded >= 2)\n", + " male_df = df_unrelated[~isfemale]\n", + " female_df = df_unrelated[isfemale]\n", + " \n", + " # Calculate batches_per_level\n", + " n_samples = len(df_unrelated)\n", + " batches_per_level = [max(1, int(np.round(n_samples / target_batch_size)))]\n", + " for bins in include_bins:\n", + " n_samples = int(np.round(n_samples / bins))\n", + " batches_per_level.append(max(1, int(np.round(n_samples / target_batch_size))))\n", + " \n", + " # Create hierarchical splits\n", + " male_splits = split_dataframe(male_df, include_metrics, include_bins, batches_per_level)\n", + " female_splits = split_dataframe(female_df, include_metrics, include_bins, batches_per_level)\n", + " combined_splits = [pd.concat([m, f]) for m, f in zip(male_splits, female_splits)]\n", + " \n", + " # Create batches\n", + " batches = {}\n", + " batches_meta = {}\n", + " batch_num = 0\n", + " for split_index, split in enumerate(combined_splits):\n", + " batch_samples = split['sample_id'].tolist()\n", + " while batch_samples:\n", + " batch_num += 1\n", + " batch_name = f\"{batch_prefix}{batch_num}{batch_suffix}\"\n", + " batch_size = min(max(min_batch_size, len(batch_samples)), max_batch_size)\n", + " batches[batch_name] = batch_samples[:batch_size]\n", + " batch_samples = batch_samples[batch_size:]\n", + " batches_meta[batch_name] = tuple(\n", + " [split_index // (len(combined_splits) // bins) + 1 for bins in include_bins] \n", + " + [split_index % batches_per_level[-1] + 1]\n", + " )\n", + " \n", + " # Add related individuals to proband's batch\n", + " unbatched_samples = set()\n", + " for batch_name, sample_ids in batches.items():\n", + " for sample_id in sample_ids.copy():\n", + " if sample_id in related_groups:\n", + " related_samples = related_groups[sample_id] - set(sample_ids)\n", + " space_left = max_batch_size - len(batches[batch_name])\n", + " batches[batch_name].extend(list(related_samples)[:space_left])\n", + " unbatched_samples.update(list(related_samples)[space_left:])\n", + " \n", + " # Rebalance batches if they exceed max_batch_size\n", + " while unbatched_samples:\n", + " eligible_batches = [b for b in batches if len(batches[b]) < max_batch_size]\n", + " if not eligible_batches:\n", + " batch_num += 1\n", + " new_batch_name = f\"{batch_prefix}{batch_num}{batch_suffix}\"\n", + " batches[new_batch_name] = []\n", + " eligible_batches = [new_batch_name]\n", + " smallest_batch = min(eligible_batches, key=lambda x: len(batches[x]))\n", + " sample_to_add = unbatched_samples.pop()\n", + " batches[smallest_batch].append(sample_to_add)\n", + " \n", + " # Final adjustment to ensure within batch size bounds\n", + " while True:\n", + " batches_to_adjust = [b for b in batches if len(batches[b]) < min_batch_size or len(batches[b]) > max_batch_size]\n", + " if not batches_to_adjust:\n", + " break\n", + " \n", + " sorted_batches = sorted(batches.items(), key=lambda x: len(x[1]))\n", + " for batch_name in batches_to_adjust:\n", + " if len(batches[batch_name]) < min_batch_size:\n", + " samples_needed = min_batch_size - 
len(batches[batch_name])\n", + " for large_batch_name, large_batch_samples in reversed(sorted_batches):\n", + " if len(large_batch_samples) > min_batch_size:\n", + " samples_to_move = min(samples_needed, len(large_batch_samples) - min_batch_size)\n", + " batches[batch_name].extend(large_batch_samples[-samples_to_move:])\n", + " batches[large_batch_name] = large_batch_samples[:-samples_to_move]\n", + " samples_needed -= samples_to_move\n", + " if samples_needed == 0:\n", + " break\n", + " \n", + " elif len(batches[batch_name]) > max_batch_size:\n", + " overflow = batches[batch_name][max_batch_size:]\n", + " batches[batch_name] = batches[batch_name][:max_batch_size]\n", + " for small_batch_name, small_batch_samples in sorted_batches:\n", + " if len(small_batch_samples) < max_batch_size:\n", + " space_available = max_batch_size - len(small_batch_samples)\n", + " samples_to_move = min(space_available, len(overflow))\n", + " batches[small_batch_name].extend(overflow[:samples_to_move])\n", + " overflow = overflow[samples_to_move:]\n", + " if not overflow:\n", + " break\n", + " if overflow:\n", + " batch_num += 1\n", + " new_batch_name = f\"{batch_prefix}{batch_num}{batch_suffix}\"\n", + " batches[new_batch_name] = overflow\n", + " sorted_batches = sorted(batches.items(), key=lambda x: len(x[1]))\n", + " \n", + " return batches, batches_meta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Ingestion\n", + "This section fetches the sample data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "# Display metadata table, which was generated as part of EvidenceQC\n", + "PASS_METADATA = os.path.join(WS_BUCKET, \"evidence_qc/filtering/passing_samples_metadata.tsv\")\n", + "pass_df = pd.read_table(PASS_METADATA)\n", + "\n", + "pass_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check for any duplicates\n", + "id_counts = pass_df['sample_id'].value_counts()\n", + "\n", + "duplicates_dict = id_counts[id_counts > 1].to_dict()\n", + "\n", + "if (len(duplicates_dict) > 0):\n", + " print(f\"{len(duplicates_dict)} duplicate 'sample_id' e xist in the dataset.\")\n", + " for sample_id, count in duplicates_dict.items():\n", + " print(f\"Sample ID: {sample_id}, Count: {count}\")\n", + " raise Exception(\"Batching requires unique 'sample_id' - please resolve this before proceeding.\")\n", + "\n", + "print(\"No duplicates found - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Batching\n", + "This section conducts the batch generation process." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display columns available to batch on\n", + "print(\"Columns available to batch on:\")\n", + "print(\"------------------------------\")\n", + "\n", + "include_cols = [col for col in pass_df.columns if col not in ('sample_id', 'chrX_CopyNumber_rounded')]\n", + "for col in include_cols:\n", + " if (not pass_df[col].isnull().all()):\n", + " print(f\"{col}:\")\n", + " print(f\" - Mean: {round(pass_df[col].mean(), 3)}\")\n", + " print(f\" - Median: {round(pass_df[col].median(), 3)}\")\n", + " print(f\" - MAD: {round(np.median(np.abs(pass_df[col] - pass_df[col].median())), 3)}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Based on the columns listed above, input the list of metrics you would like to batch on. At a minimum, we recommend including median_coverage and wgd_score.

You may also wish to batch samples based on other characteristics that could impact SV calling for your data, such as mean insert size or PCR status. If these additional metrics do not appear above, you may add them to the metadata file and reload it.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "INCLUDE_METRICS = []\n", + "\n", + "validate_include_metrics(INCLUDE_METRICS, include_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the list of bin counts you'd like to divide samples into, where INCLUDE_BINS[i] corresponds to the number of bins for INCLUDE_METRICS[i]. Note that the number of bins for the final metric in INCLUDE_METRICS is determined from the other batching parameters, so INCLUDE_BINS should contain one fewer element than INCLUDE_METRICS. For example, if INCLUDE_METRICS contains two metrics, INCLUDE_BINS should contain a single value: the number of bins for the first metric.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "INCLUDE_BINS = []\n", + "\n", + "validate_include_bins(INCLUDE_METRICS, INCLUDE_BINS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the target batch size you'd like to use, as well as the minimum and maximum allowable batch sizes. The minimum may not exceed the target, the target may not exceed the maximum, and the minimum may not exceed the total number of samples.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TARGET_BATCH_SIZE = None\n", + "MINIMUM_BATCH_SIZE = None\n", + "MAXIMUM_BATCH_SIZE = None\n", + "\n", + "validate_batch_sizes(pass_df, TARGET_BATCH_SIZE, MINIMUM_BATCH_SIZE, MAXIMUM_BATCH_SIZE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the batch name prefix and suffix you'd like to use. Remember to adhere to the naming requirements.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BATCH_PREFIX = \"batch\"\n", + "BATCH_SUFFIX = \"\"\n", + "\n", + "validate_string_inputs([BATCH_PREFIX, BATCH_SUFFIX])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
If you wish to group related samples in the same batch, set PED_FILE to True; otherwise, leave it as is. Note that batching related samples together may result in less homogeneous batches.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PED_FILE = False\n", + "\n", + "reference_ped = None\n", + "default_ped_path = os.path.join(WS_BUCKET, \"evidence_qc/sex_analysis/sample_qc.ped\")\n", + "if PED_FILE:\n", + " validate_ped(default_ped_path, set(pass_df['sample_id']))\n", + " reference_ped = pd.read_table(default_ped_path, dtype=str, header=None, names = [\n", + " \"Family_ID\", \"Sample_ID\", \"Paternal_ID\", \"Maternal_ID\", \"Sex\", \"Phenotype\"\n", + " ])\n", + " reference_ped = reference_ped[reference_ped['Sample_ID'].isin(pass_df['sample_id'])]\n", + "else: \n", + " print(\"PED file is not provided - skipping this step.\")\n", + "\n", + "reference_ped" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Perform hierarchical batching and print summary of batch size and sex balance\n", + "batches, batches_meta = generate_hierarchical_batches(pass_df, INCLUDE_METRICS, INCLUDE_BINS, TARGET_BATCH_SIZE, \n", + " MINIMUM_BATCH_SIZE, MAXIMUM_BATCH_SIZE, BATCH_PREFIX, \n", + " BATCH_SUFFIX, reference_ped)\n", + "\n", + "for batch_name, sample_ids in batches.items():\n", + " batch_df = pass_df[pass_df['sample_id'].isin(sample_ids)]\n", + " female_count = (batch_df.chrX_CopyNumber_rounded >= 2).sum()\n", + " male_count = (batch_df.chrX_CopyNumber_rounded < 2).sum()\n", + " print(f\"{batch_name}: {len(sample_ids)} samples ({male_count} male, {female_count} female)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize distributions of batching metrics per batch to assess within-batch homogeneity\n", + "plot_batch_metrics(pass_df, batches, INCLUDE_METRICS, display=True, prefix=\"fig_hierarchical\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metadata to file\n", + "tsv_meta = pd.DataFrame.from_dict(batches_meta, orient='index', columns=INCLUDE_METRICS)\n", + "tsv_meta['n_samples'] = tsv_meta.index.map(lambda x: len(batches[x]))\n", + "tsv_meta = tsv_meta.reset_index().rename(columns={'index': 'batch'})\n", + "\n", + "file_path = generate_file_path(TLD_PATH, 'batching', \"batching_metadata.tsv\")\n", + "save_df(WS_BUCKET, file_path, tsv_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write batch assignments file\n", + "write_batch_assignments(batches)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample set corresponding to batches\n", + "upload_sample_sets(batches)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "447.995px", + "width": "361.102px" + }, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "470px", + "left": 
"867px", + "top": "195.141px", + "width": "239px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scripts/notebooks/SampleQC.ipynb b/scripts/notebooks/SampleQC.ipynb new file mode 100644 index 000000000..8901311f1 --- /dev/null +++ b/scripts/notebooks/SampleQC.ipynb @@ -0,0 +1,2837 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this notebook is to perform preliminary sample quality control (QC) in order to identify and/or remove outlier samples in preparation for the batched steps of the GATK-SV pipeline.\n", + "\n", + "This notebook allows the user to analyze sample QC metrics produced by the *EvidenceQC* workflow, such as median coverage and whole genome dosage (WGD). After visualizing the distributions of these metrics, the user can select outlier cutoffs to filter or flag outlier samples. At the end, a file of sample QC metrics for passing samples is produced, which can be used for batching.\n", + "\n", + "**Suggested VM Specifications**:\n", + "* Application Configuration: Default\n", + "* CPUs: 2\n", + "* Memory: 13 GB\n", + "* Persistent Disk: 100 GB\n", + "\n", + "**Prerequisites**: GatherSampleEvidence, EvidenceQC.\n", + "\n", + "**Next Steps**: Batching, TrainGCNV.\n", + "\n", + "**Legend**:\n", + "
Blue Boxes for One-Time Runs: Uncomment and run the code cell directly below just once, then re-comment the cell to skip it next time. These cells typically save intermediate outputs locally so that the notebook can be paused and picked back up without a significant delay.
\n", + "
Green Boxes for User Inputs: Edit the inputs provided in the code cell directly below. The inputs that are editable are defined in all capitals, and their descriptions can be found in the section headers.
\n", + "\n", + "**Execution Tips**:\n", + "* The first time you start this notebook (one time only), you will need to uncomment and run the package installation cell under *Imports*.\n", + "* Once the packages are installed, to quickly run all the cells containing helper functions, constants, and imports, skip to the first cell of *Data Ingestion*, click \"Cell\" in the toolbar at the top of the notebook, and select \"Run All Above.\" Then, starting from *Data Ingestion*, proceed step-by-step through the notebook.\n", + "* The keyboard shortcut to run a cell is `Shift`+`Return`.\n", + "* The keyboard shortcut to comment or uncomment an enitre cell is `Command`+`/` or `Control`+`/`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Imports\n", + "This section defines all imports required by this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "
Uncomment and run once. It is not necessary to reinstall these packages each time you restart your cloud environment.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [], + "hidden": true, + "tags": [ + "uncooment" + ] + }, + "outputs": [], + "source": [ + "# ! pip install upsetplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "# Package imports\n", + "import os\n", + "import io\n", + "import re\n", + "import logging\n", + "import shutil\n", + "import subprocess\n", + "import zipfile\n", + "from collections import defaultdict\n", + "from logging import INFO\n", + "\n", + "# Aliased imports\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import firecloud.api as fapi\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.image as mpimg\n", + "import matplotlib_inline.backend_inline\n", + "\n", + "# Plotting imports\n", + "from upsetplot import UpSet\n", + "from upsetplot import from_memberships\n", + "from IPython.display import IFrame\n", + "from matplotlib.lines import Line2D\n", + "from matplotlib.collections import LineCollection\n", + "import matplotlib.ticker\n", + "from PIL import Image\n", + "\n", + "# Plotting settings \n", + "default_dpi = plt.rcParams['figure.dpi'] \n", + "plt.rcParams['figure.dpi'] = 200\n", + "matplotlib_inline.backend_inline.set_matplotlib_formats('svg')\n", + "\n", + "# Logger settings\n", + "logger = logging.getLogger()\n", + "logger.setLevel(INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Constants\n", + "This section declares constants that are used throughout the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "# Workspace-level\n", + "TLD_PATH = 'evidence_qc'\n", + "PROJECT = os.environ['GOOGLE_PROJECT']\n", + "WORKSPACE = os.environ['WORKSPACE_NAME']\n", + "WS_BUCKET = os.environ['WORKSPACE_BUCKET']\n", + "NAMESPACE = os.environ['WORKSPACE_NAMESPACE']\n", + "\n", + "# Autosomal copy number\n", + "cols_autosome = [\n", + " 'sample_id', 'chr1_CopyNumber', 'chr2_CopyNumber', \n", + " 'chr3_CopyNumber', 'chr4_CopyNumber', 'chr5_CopyNumber', \n", + " 'chr6_CopyNumber', 'chr7_CopyNumber', 'chr8_CopyNumber', \n", + " 'chr9_CopyNumber', 'chr10_CopyNumber', 'chr11_CopyNumber', \n", + " 'chr12_CopyNumber', 'chr13_CopyNumber', 'chr14_CopyNumber', \n", + " 'chr15_CopyNumber', 'chr16_CopyNumber', 'chr17_CopyNumber', \n", + " 'chr18_CopyNumber', 'chr19_CopyNumber', 'chr20_CopyNumber', \n", + " 'chr21_CopyNumber', 'chr22_CopyNumber'\n", + "]\n", + "\n", + "# PED file validation\n", + "ID_TYPE_SAMPLE = \"sample\"\n", + "ID_TYPE_FAMILY = \"family\"\n", + "ID_TYPE_PARENT = \"parent\"\n", + "FIELD_NUMBER_ID = 1\n", + "FIELD_NUMBER_SEX = 4\n", + "ILLEGAL_ID_SUBSTRINGS = [\"chr\", \"name\", \"DEL\", \"DUP\", \"CPX\", \"CHROM\"]\n", + "\n", + "# Sex assignment to numerical entry in PED file\n", + "SEX_CODES = {\n", + " \"MALE\": 1, \"FEMALE\": 2, \"MOSAIC\": 0, \"TURNER\": 0, \n", + " \"TRIPLE X\": 0, \"KLINEFELTER\": 0, \"JACOBS\": 0, \"OTHER\": 0\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper Functions\n", + "This section instantiates helper functions used throughout the notebook." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## File System Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def copy_local_file_to_gs(ws_bucket, file_path, print_path=True):\n", + " \"\"\"\n", + " Copies a file saved locally to Google Cloud Storage\n", + " \n", + " Args:\n", + " ws_bucket (str): The bucket in Google Cloud Storage where the file should be saved.\n", + " file_path (str): The path where the file should be saved.\n", + " print_path (bool): Determines whether to print the Google Cloud Storage path of the file created.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " gcs_path = f\"{ws_bucket}/{file_path}\"\n", + " \n", + " try:\n", + " subprocess.run(\n", + " [\"gsutil\", \"cp\", \"-r\", file_path, gcs_path],\n", + " stdout=subprocess.DEVNULL,\n", + " stderr=subprocess.DEVNULL,\n", + " check=True\n", + " )\n", + " except subprocess.CalledProcessError as e:\n", + " raise Exception(f\"Error copying file to GCS: {e}\")\n", + " \n", + " if (print_path):\n", + " print(f\"GCS file path: {gcs_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def generate_file_path(tld_path, file_type, file_name):\n", + " \"\"\"\n", + " Calculate the file path of a file to save in either the file system or Google Cloud Storage.\n", + "\n", + " Args:\n", + " tld_path (str): Top-level directory path.\n", + " file_type (str): Enables generation of the specific sub-directory which a file should live in.\n", + " file_name (str): File name to chain at the end of the path.\n", + "\n", + " Returns:\n", + " str: Path to file as it should be saved, per file system outline.\n", + " \"\"\"\n", + " if 'outlier' in file_type:\n", + " full_path = os.path.join(tld_path, 'raw_caller_outliers', file_type, file_name)\n", + " else:\n", + " full_path = os.path.join(tld_path, file_type, file_name)\n", + " \n", + " return full_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def save_df(ws_bucket, file_path, df, print_path=True, header=True):\n", + " \"\"\"\n", + " Save a dataframe to the specified file path, creating directories if they don't exist.\n", + " \n", + " Args:\n", + " ws_bucket (str): The bucket in Google Cloud Storage where the file should be saved.\n", + " file_path (str): The path where the figure should be saved.\n", + " df (pandas.DataFrame): The dataframe to save.\n", + " print_path (bool): Determines whether to print the Google Cloud Storage path of the file created.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " dir_path = os.path.dirname(file_path)\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + " df.to_csv(file_path, sep='\\t', index=False, header=header)\n", + " copy_local_file_to_gs(ws_bucket, file_path, print_path)\n", + " \n", + " print(f\"Dataframe saved to: {file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def save_figure(ws_bucket, file_path):\n", + " \"\"\"\n", + " Saves the current figure to the specified file path, creating directories if they don't exist.\n", + " \n", + " Args:\n", + " ws_bucket (str): The bucket in Google Cloud Storage where the file should be saved.\n", + " file_path (str): The path where the 
figure should be saved. \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " dir_path = os.path.dirname(file_path)\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + " plt.savefig(file_path)\n", + " copy_local_file_to_gs(ws_bucket, file_path)\n", + " \n", + " print(f\"Figure saved to: {file_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Processing Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def get_batch_for_sample_id(sample_id):\n", + " \"\"\"\n", + " Function to retrieve the batch associated with a given sample ID.\n", + "\n", + " Args:\n", + " sample_id (str): The sample ID to get the batch for.\n", + " \n", + " Returns:\n", + " str: Batch ID corresponding to searched sample.\n", + " \"\"\"\n", + " for batch, samples in zip(sample_set_tbl['entity:sample_set_id'].values, sample_set_tbl.samples.values):\n", + " if sample_id in samples:\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "# Dictionary to store samples and their associated reasons for removal\n", + "samples_to_remove = {} \n", + " \n", + "def remove_samples(sample_ids, filter_type):\n", + " \"\"\"\n", + " Function to remove samples based on given standard sample IDs and reasons.\n", + "\n", + " Args:\n", + " sample_ids (list or iterable): A list of standard sample IDs to be removed.\n", + " filter_type (str): The type of filter that caused a sample to be removed.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Reset the filter if it has already been applied\n", + " samples_to_remove[filter_type] = set()\n", + " \n", + " # Add each standard sample ID to exclusions from the given filter\n", + " for sample_id in sample_ids:\n", + " samples_to_remove[filter_type].add(sample_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def invert_removed_samples(samples_to_remove):\n", + " \"\"\"\n", + " Inverts the key-value orientation of a sample dictionary.\n", + "\n", + " Args:\n", + " samples_to_remove (dict): A dictionary with reasons as keys and sets of samples as values.\n", + "\n", + " Returns:\n", + " dict: A dictionary with samples as keys and sets of reasons as values.\n", + " \"\"\" \n", + " inv_samples_to_remove = {}\n", + " \n", + " for reason, samples in samples_to_remove.items():\n", + " for sample in samples:\n", + " if sample not in inv_samples_to_remove:\n", + " inv_samples_to_remove[sample] = set()\n", + " inv_samples_to_remove[sample].add(reason)\n", + " \n", + " return inv_samples_to_remove" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def filter_and_save_metadata(samples_qc_table, samples_to_remove, file_path):\n", + " \"\"\"\n", + " Filter a metadata table based on a list of samples to remove, and then save the filtered table.\n", + "\n", + " Args:\n", + " samples_qc_table (pandas.DataFrame): DataFrame containing sample metadata.\n", + " samples_to_remove (dict): Dictionary with samples to be removed as keys.\n", + " file_path (str): File path to save metadata at.\n", + "\n", + " Returns:\n", + " str: Full path to the saved file.\n", + " \"\"\"\n", + " # Convert the keys of 'samples_to_remove' to a set to remove any duplicates, and then 
back to a list\n", + " final_list_to_remove = list(set(samples_to_remove.keys()))\n", + " \n", + " # Select relevant columns from the 'samples_qc_table' DataFrame\n", + " all_meta = samples_qc_table[[\n", + " 'sample_id', 'mean_insert_size','wgd_score', \n", + " 'median_coverage', 'chrX_CopyNumber_rounded'\n", + " ]]\n", + "\n", + " # Remove rows corresponding to filtered samples\n", + " pass_meta = all_meta[~all_meta.sample_id.isin(final_list_to_remove)]\n", + " \n", + " # Save metadata file\n", + " save_df(WS_BUCKET, file_path, pass_meta, print_path=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Validation Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_numeric_input(*args, log=True):\n", + " \"\"\"\n", + " Validates user input to check whether it is numeric or not. \n", + " \n", + " Args:\n", + " *args (numeric): Any number of positional arguments to validate.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " for arg in args:\n", + " if not isinstance(arg, (int, float)):\n", + " raise Exception('Value input must be numeric.')\n", + " \n", + " if log:\n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_table_filter(table, filter_type, log=True):\n", + " \"\"\"\n", + " Validates user input to check a particular filter is valid for a given table. \n", + " \n", + " Args:\n", + " table (pd.DataFrame): Sample table to check.\n", + " filter_type: \n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " if (filter_type not in table):\n", + " raise Exception(f\"'{filter_type}' is not a valid column in the provided dataframe - skip this QC step.\")\n", + " \n", + " if (table[filter_type].notna().sum() <= 0 or table[filter_type].empty):\n", + " raise Exception(f\"'{filter_type}' has no non-null values in the provided dataframe - skip this QC step.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_qc_inputs(table, filter_type, line_deviations=None, line_styles=None, method=None, \n", + " lower_cutoff=None, upper_cutoff=None, mad_cutoff=None, caller=None, \n", + " caller_type=None, log_scale=None):\n", + " \"\"\"\n", + " Validates user inputs for QC tests.\n", + " \n", + " Args:\n", + " table (pd.DataFrame): Sample table.\n", + " filter_type (str): QC filter being checked.\n", + " line_deviations (list): List of line deviations as input by user.\n", + " line_styles (list): List of line styles as input by user.\n", + " method (str): Method used for filtering as input by user.\n", + " lower_cutoff (numeric): Lower cutoff threshold as input by user.\n", + " upper_cutoff (numeric): Upper cutoff threshold as input by user.\n", + " mad_cutoff (numeric): MAD cutoff threshold as input by user.\n", + " caller (str): Type of caller used as input by user.\n", + " caller_type (str): Determines whether to look for high or low outliers as input by user.\n", + " log_scale (bool): Determines whether to log-scale the plot as input by user.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " validate_table_filter(table, filter_type, log=False)\n", + " \n", + " if log_scale:\n", + " if (not isinstance(log_scale, bool)):\n", + " raise 
Exception('LOG_SCALE must be a boolean value.')\n", + " \n", + " if line_deviations:\n", + " if (not isinstance(line_deviations, list)):\n", + " raise Exception('LINE_DEVIATIONS must be a list.')\n", + " \n", + " for ld in line_deviations:\n", + " validate_numeric_input(ld, log=False)\n", + " \n", + " if line_styles:\n", + " if (not isinstance(line_styles, list)):\n", + " raise Exception('LINE_STYLES must be a list of strings.')\n", + " \n", + " for ls in line_styles:\n", + " if (ls not in ['solid', 'dotted', 'dashed', 'dashdot']):\n", + " raise Exception(f'The value {ls} is invalid - it must be one of \"solid\", \"dotted\", \"dashed\" or \"dashdot\".')\n", + " \n", + " if line_deviations and line_styles:\n", + " if (len(line_deviations) != len(line_styles)):\n", + " raise Exception('The number of cutoffs provided should match the number of line styles input.')\n", + " \n", + " if method:\n", + " if (method not in ['MAD', 'hard']):\n", + " raise Exception('The value for the filter method must be one of \"MAD\" or \"hard\".')\n", + " \n", + " if (method == 'hard'):\n", + " if (lower_cutoff and not isinstance(lower_cutoff, (int, float))):\n", + " raise Exception('Given that the chosen method is \"hard\", the value for the lower cutoff should be numeric.')\n", + "\n", + " if (upper_cutoff and not isinstance(upper_cutoff, (int, float))):\n", + " raise Exception('Given that the chosen method is \"hard\", the value for the upper cutoff should be numeric.')\n", + " \n", + " if (not lower_cutoff):\n", + " print('[WARNING] Setting LOWER_CUTOFF to None results in no lower cutoff being applied.')\n", + " \n", + " if (not upper_cutoff):\n", + " print('[WARNING] Setting UPPER_CUTOFF to None results in no upper cutoff being applied.')\n", + " \n", + " if (method == 'MAD'):\n", + " if (mad_cutoff and not isinstance(mad_cutoff, (int, float))):\n", + " raise Exception('The value for the MAD cutoff should be numeric.')\n", + " \n", + " if (not mad_cutoff):\n", + " print('[WARNING] Setting MAD_CUTOFF to None results in no MAD cutoff being applied.')\n", + "\n", + " if (caller and caller not in ['overall', 'manta', 'melt', 'scramble', 'wham']):\n", + " raise Exception(f'The value {caller} for category is invalid - it must be one of \"overall\", \"manta\", \"melt\", \"scramble\" or \"wham\".')\n", + "\n", + " if (caller_type and caller_type not in ['high', 'low']):\n", + " raise Exception(f'The value {caller_type} for caller type is invalid - it must be one of \"high\" or \"low\".')\n", + " \n", + " \n", + " print(\"Inputs are valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_unique_samples(samples_df):\n", + " \"\"\"\n", + " Validates that all sample IDs in the provided dataframe are unique. 
\n", + " \n", + " Args:\n", + " samples_df (pandas.DataFrame): Contains sample data\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " id_counts = samples_df['sample_id'].value_counts()\n", + " duplicates_dict = id_counts[id_counts > 1].to_dict()\n", + "\n", + " if (len(duplicates_dict) > 0):\n", + " print(f\"{len(duplicates_dict)} duplicate samples exist in the dataset.\")\n", + " for sample_id, count in duplicates_dict.items():\n", + " print(f\"Sample ID: {sample_id}, Count: {count}\")\n", + " raise Exception(\"QC requires unique samples - please resolve duplicates before proceeding.\")\n", + "\n", + " print(\"No duplicates found - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_id(identifier, id_type, source_file):\n", + " \"\"\"\n", + " Validates sample IDs provided based on a source file of samples.\n", + " \n", + " Args:\n", + " identifier (str): ID for a given sample.\n", + " id_type (str): Type of ID provided.\n", + " source_file (str): File that contains all samples.\n", + " \n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Check for empty IDs\n", + " if identifier is None or identifier == \"\":\n", + " raise ValueError(f\"Empty {id_type} ID in {source_file}.\")\n", + "\n", + " # Check all characters are alphanumeric or underscore\n", + " if not re.match(r'^\\w+$', identifier):\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: '{identifier}'.\" + \n", + " \"IDs should only contain alphanumeric and underscore characters.\")\n", + "\n", + " # Check for all-numeric IDs, besides maternal & paternal ID (can be 0) and all-numeric family IDs\n", + " if id_type != ID_TYPE_FAMILY and not (id_type == ID_TYPE_PARENT and identifier == \"0\") and identifier.isdigit():\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: {identifier}. \" +\n", + " \"IDs should not contain only numeric characters.\")\n", + "\n", + " # Check for illegal substrings\n", + " for sub in ILLEGAL_ID_SUBSTRINGS:\n", + " if sub in identifier:\n", + " raise ValueError(f\"Invalid {id_type} ID in {source_file}: {identifier}. 
\" +\n", + " f\"IDs cannot contain the following substrings: {', '.join(ILLEGAL_ID_SUBSTRINGS)}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def validate_ped(ped_file, samples):\n", + " \"\"\"\n", + " Validates structure and data within PED file based on series of samples provided.\n", + " Works with both local and GCS files.\n", + " \n", + " Args:\n", + " ped_file (str): Path to PED file (local or GCS path starting with 'gs://').\n", + " samples (set): Set of sample IDs to validate against.\n", + " \n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " seen_sex_1 = False\n", + " seen_sex_2 = False\n", + " samples_found = set()\n", + "\n", + " # Read PED file\n", + " try:\n", + " df = pd.read_table(ped_file, dtype=str, header=None, comment='#', names=[\n", + " 'Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype'\n", + " ])\n", + " except Exception as e:\n", + " raise ValueError(f\"Error reading PED file: {str(e)}\")\n", + " \n", + " # Ensure column count\n", + " if len(df.columns) != 6:\n", + " raise ValueError(\"PED file must have 6 columns: Family_ID, Sample_ID, \" +\n", + " \"Paternal_ID, Maternal_ID, Sex, Phenotype.\")\n", + "\n", + " # Iteratively validate each row\n", + " for _, row in df.iterrows():\n", + " # Validate ID\n", + " for identifier, id_type in zip(row[:FIELD_NUMBER_SEX],\n", + " [ID_TYPE_FAMILY, ID_TYPE_SAMPLE, ID_TYPE_PARENT, ID_TYPE_PARENT]):\n", + " validate_id(identifier, id_type, \"PED file\")\n", + "\n", + " # Assign main information to variables\n", + " sample_id = row['Sample_ID']\n", + " sex = int(row['Sex'])\n", + "\n", + " # Check for appearance of each sex\n", + " if sex == 1:\n", + " seen_sex_1 = True\n", + " elif sex == 2:\n", + " seen_sex_2 = True\n", + " elif sex != 0:\n", + " raise ValueError(f\"Sample {sample_id} has an invalid value for sex: {sex}. \" +\n", + " \"PED file must use the following values for sex: \" + \n", + " \"1 for Male, 2 for Female, 0 for Unknown/Other.\")\n", + "\n", + " # Verify no duplications\n", + " if sample_id in samples_found:\n", + " raise ValueError(f\"Duplicate entries for sample {sample_id}.\")\n", + " elif sample_id in samples:\n", + " samples_found.add(sample_id)\n", + "\n", + " # Check if all samples in the sample list are present in PED file\n", + " if len(samples_found) < len(samples):\n", + " missing = samples - samples_found\n", + " raise ValueError(f\"PED file is missing sample(s): {','.join(missing)}.\")\n", + "\n", + " # Raise error if at least one of either sex is not found\n", + " if not (seen_sex_2 and seen_sex_1):\n", + " raise ValueError(\"Did not find existence of multiple sexes in file. 
\" +\n", + " \"PED file must use the following values for sex: \" + \n", + " \"1 for Male, 2 for Female, 0 for Unknown/Other.\")\n", + " \n", + " print(\"PED file is valid - please proceed to the next cell.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## QC Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "### General Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def calculate_mad_filter_bounds(filter_type_data, filter_type_cut_off, filter_type):\n", + " \"\"\"\n", + " Calculate the lower and upper bounds for filtering based on the Median Absolute Deviation (MAD) method.\n", + "\n", + " Args:\n", + " filter_type_data (numpy.array): Data for which MAD filter bounds will be calculated.\n", + " filter_type_cut_off (float): The cutoff value for the filter. It determines how many MADs away \n", + " from the median to set the filter bounds.\n", + " filter_type (str): The type of the filter or data for which MAD bounds are calculated.\n", + "\n", + " Returns:\n", + " tuple: Contains the MAD value, median value, lower filter bound, and upper filter bound.\n", + " \"\"\"\n", + " # Check if the cut_off value is less than 0\n", + " if filter_type_cut_off < 0:\n", + " raise Exception(\"Invalid cutoff - please ensure that the cutoff is greater than or equal to 0.\")\n", + " \n", + " # Calculate the median and the Median Absolute Deviation (MAD) for the filter type data\n", + " median_filter_type = np.median(filter_type_data)\n", + " mad_filter_type = np.median(np.absolute(filter_type_data - median_filter_type))\n", + "\n", + " # Calculate the lower and upper filter bounds based on the MAD and the cutoff value\n", + " filter_type_lower_cff = median_filter_type - float(filter_type_cut_off) * mad_filter_type\n", + " filter_type_upper_cff = median_filter_type + float(filter_type_cut_off) * mad_filter_type\n", + "\n", + " # Return the MAD value, median value, and filter bounds as a tuple\n", + " return mad_filter_type, median_filter_type, filter_type_lower_cff, filter_type_upper_cff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_histogram(filter_type_data, filter_type, filter_name, cutoffs=None, line_styles=None, log_scale=False, **kwargs):\n", + " \"\"\"\n", + " Plots a histogram for a given data array and saves the plot as an image.\n", + "\n", + " Args:\n", + " filter_type_data (np.array): The data to plot the histogram for.\n", + " filter_type (str): The type of data being plotted (e.g., \"age\", \"height\", etc).\n", + " filter_name (str): The name of the metric being plotted.\n", + " cutoffs (list): List of cutoff values for which to plot vertical lines.\n", + " line_styles (list): List of line styles corresponding to each cutoff.\n", + " log_scale (bool): Defines whether to log-scale the plot.\n", + " **kwargs: Additional plotting parameters for matplotlib.\n", + "\n", + " Returns:\n", + " str: Path to saved file containing plot.\n", + " \"\"\" \n", + " # Validation\n", + " if len(filter_type_data) == 0:\n", + " print(f\"The {filter_type} data is empty. 
No plot will be generated.\")\n", + " return None\n", + "\n", + " # Calculate the median and MAD\n", + " median_filter_type = np.median(filter_type_data)\n", + " mad_filter_type = np.median(np.absolute(filter_type_data - median_filter_type))\n", + "\n", + " # Print summary values\n", + " print(f'Median: {median_filter_type:.5f}')\n", + " print(f'MAD: {mad_filter_type:.5f}')\n", + " print(f'Minimum: {min(filter_type_data):.5f}')\n", + " print(f'Maximum: {max(filter_type_data):.5f}')\n", + " print()\n", + "\n", + " # Create new figure\n", + " fig = plt.figure(figsize=(8, 6))\n", + " fig.patch.set_facecolor('white')\n", + "\n", + " # Plot a histogram\n", + " bins = kwargs.pop('bins', 50)\n", + " plt.hist(filter_type_data, edgecolor='black', bins=bins, **kwargs)\n", + " if log_scale:\n", + " plt.gca().set_yscale('log')\n", + " plt.gca().yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", + " plt.gca().yaxis.get_major_formatter().set_scientific(False)\n", + " plt.gca().yaxis.set_minor_formatter(matplotlib.ticker.NullFormatter())\n", + " plt.gca().set_facecolor('white')\n", + " plt.grid(False)\n", + " plt.gca().yaxis.grid(True, zorder=0, color='lightgrey')\n", + "\n", + " # Set the title and axes\n", + " plt.title(f\"{filter_name} - {len(filter_type_data)} Samples (Filtered)\", fontsize=16)\n", + " plt.xlabel(f\"{filter_name}\", fontsize=12)\n", + " plt.ylabel(\"Sample Count (Log Scale)\" if log_scale else \"Sample Count\", fontsize=12)\n", + "\n", + " # Additional plots\n", + " plt.axvline(x=np.median(filter_type_data), color='black', linestyle='--')\n", + " legend_handles = [Line2D([0], [0], alpha=1, color='black', linestyle='--', label='Median')]\n", + " if cutoffs is not None and line_styles is not None:\n", + " for cutoff, line_style in zip(cutoffs, line_styles):\n", + " plt.axvline(x=median_filter_type + cutoff * mad_filter_type, linestyle=line_style, color='grey', label=f'Median + {cutoff}*MAD')\n", + " plt.axvline(x=median_filter_type - cutoff * mad_filter_type, linestyle=line_style, color='grey', label=f'Median - {cutoff}*MAD')\n", + " legend_handles.append(Line2D([0], [0], alpha=1, linestyle=line_style, color='grey', label=f'Median +- {cutoff}*MAD'))\n", + " plt.legend(handles=legend_handles, framealpha=1, labelspacing=0.5)\n", + "\n", + " # Save and close figure\n", + " file_name = f\"filtered_histogram_{len(filter_type_data)}_samples.png\"\n", + " if cutoffs is not None:\n", + " file_name = f\"analysis_histogram_{len(filter_type_data)}_samples.png\"\n", + " file_path = generate_file_path(TLD_PATH, filter_type, file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " plt.close()\n", + "\n", + " return file_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def run_analysis(table, filter_type, cutoffs=None, line_styles=None, read_length=None, log_scale=False, **kwargs):\n", + " \"\"\"\n", + " Analyse data based on a series of parameters provided.\n", + "\n", + " Args:\n", + " table (pandas.DataFrame): Pandas dataframe from which data is extracted.\n", + " filter_type (str): Type of the filter being plotted.\n", + " cutoffs (list): List of cutoff values for which to plot vertical lines.\n", + " line_styles (list): List of line styles corresponding to each cutoff.\n", + " read_length (numeric): Read length, from which coverage coefficient can be calculated.\n", + " log_scale (bool): Defines whether to log-scale the histogram produced.\n", + " **kwargs: Additional plotting 
parameters to be passed to the plotting function.\n", + "\n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Get data for specific QC metric\n", + " filter_data = table[filter_type].values\n", + "\n", + " # Multiply median_coverage by coefficient for median coverage metric\n", + " if filter_type == 'median_coverage':\n", + " if not isinstance(read_length, (int, float)) or read_length is None:\n", + " raise Exception(f\"A numeric value must be provided for READ_LENGTH.\")\n", + " filter_data = filter_data * (read_length / 100)\n", + "\n", + " # Remove samples that are NaN for outlier metrics\n", + " if 'outlier' in filter_type:\n", + " filter_data = np.nan_to_num(filter_data, nan=0.0)\n", + "\n", + " # Plot figure\n", + " filter_name = ' '.join(word.capitalize() for word in filter_type.split('_'))\n", + " filter_name = filter_name.replace('Wgd', 'WGD').replace('Nondiploid', 'Non-Diploid')\n", + " file_png = plot_histogram(filter_data, filter_type, filter_name, log_scale=log_scale, cutoffs=cutoffs, line_styles=line_styles, **kwargs)\n", + "\n", + " # Display plot\n", + " if display:\n", + " plt.figure(figsize=(8, 8))\n", + " img = mpimg.imread(file_png)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def run_filtering(table, filter_type, filter_method, lower_cutoff=float('-inf'), upper_cutoff=float('inf'), mad_cutoff=None, read_length=None, log_scale=False, **kwargs):\n", + " \"\"\"\n", + " Filter data based on a given filter type within specified lower and upper limits.\n", + "\n", + " Args:\n", + " table (pandas.DataFrame): A DataFrame containing the data to be filtered.\n", + " filter_type (str): The type of the filter or data column to be filtered.\n", + " filter_method (str): The filter method to use to filter.\n", + " lower_cutoff (numeric): The lower cutoff for the filter.\n", + " upper_cutoff (numeric): The upper cutoff for the filter.\n", + " mad_cutoff (int): The MAD cutoff for the filter.\n", + " read_length (numeric): Read length, from which coverage coefficient can be calculated.\n", + " log_scale (bool): Defines whether to log-scale the histogram produced.\n", + " **kwargs: Additional plotting parameters to be passed to the plotting function.\n", + "\n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Set defaults if None passed\n", + " if not lower_cutoff:\n", + " lower_cutoff = float('-inf')\n", + " \n", + " if not upper_cutoff:\n", + " upper_cutoff = float('inf')\n", + " \n", + " if not mad_cutoff:\n", + " mad_cutoff = float('inf')\n", + " \n", + " # Validation\n", + " if filter_method == 'hard' and lower_cutoff and upper_cutoff and not lower_cutoff <= upper_cutoff:\n", + " raise Exception(\"Invalid cutoff - please ensure that the lower cutoff is less than or equal to the upper cutoff.\")\n", + " \n", + " # Capture specified data being filtered on\n", + " filter_data = table[filter_type].values\n", + " \n", + " # Multiply median_coverage by coefficient for median coverage metric\n", + " if (filter_type == 'median_coverage'):\n", + " if (not isinstance(read_length, (int, float)) or read_length is None):\n", + " raise Exception(f\"A numeric value must be provided for READ_LENGTH.\")\n", + " filter_data = filter_data * (read_length / 100)\n", + " \n", + " # Calculate filter bounds if MAD selected as cutoff method\n", + " if (filter_method == 'MAD'):\n", + " filter_mad, filter_median, lower_cutoff, upper_cutoff = 
calculate_mad_filter_bounds(filter_data, mad_cutoff, filter_type)\n", + "\n", + " # Filter the data based on the lower and upper limits\n", + " filter_pass = table[((filter_data >= lower_cutoff) & (filter_data <= upper_cutoff))]\n", + " failed_sample_ids = table[((filter_data < lower_cutoff) | (filter_data > upper_cutoff))].sample_id.values\n", + " \n", + " # Log intermediete outputs\n", + " print(f'Passing samples: {len(filter_pass)}')\n", + " print(f'Failing samples: {len(failed_sample_ids)}')\n", + " print()\n", + " \n", + " # Remove failed samples\n", + " remove_samples(failed_sample_ids, filter_type)\n", + "\n", + " # Validation\n", + " if (len(filter_pass) <= 0):\n", + " print(f'No samples passed this QC step, so a figure cannot be displayed.')\n", + " return\n", + " \n", + " # Create figure name\n", + " filter_name = ' '.join(word.capitalize() for word in filter_type.split('_'))\n", + " filter_name = filter_name.replace('Wgd', 'WGD')\n", + " filter_name = filter_name.replace('Nondiploid', 'Non-Diploid')\n", + " \n", + " # Create plot\n", + " filter_pass_data = filter_pass[filter_type]\n", + " if (filter_type == 'median_coverage'):\n", + " filter_pass_data = filter_pass_data * (read_length / 100)\n", + " file_png = plot_histogram(filter_pass_data, filter_type, filter_name, log_scale=log_scale, **kwargs)\n", + " \n", + " # Display plot\n", + " if display:\n", + " plt.figure(figsize=(8, 8))\n", + " img = mpimg.imread(file_png)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def create_upset_plot(filter_df, num_samples_to_be_removed):\n", + " \"\"\"\n", + " Create an upset plot based on the provided DataFrame and save it as a .png file.\n", + "\n", + " Args:\n", + " filter_df (pandas.DataFrame): A DataFrame containing the filtered samples, with columns \n", + " 'sample_id' and 'filters_applied'.\n", + " num_samples_to_be_removed (int): The total number of samples to be removed.\n", + "\n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Catch failing case - no samples to drop\n", + " if (len(filter_df) == 0):\n", + " print(\"UpSet plot cannot be generated - no samples are dropped.\")\n", + " return\n", + " \n", + " # Group by 'filters_applied' and get the count for each group\n", + " grouped_new = filter_df.groupby(\"filters_applied\").count()\n", + " \n", + " # Plot an upset or bar plot based on number of failing filters\n", + " if len(grouped_new) == 1:\n", + " print(\"UpSet plot cannot be generated with a single unique filter - generating histogram instead.\")\n", + " \n", + " # Create the histogram\n", + " filter_counts = filter_df['filters_applied'].explode().value_counts()\n", + " filter_counts.plot(kind='bar')\n", + " \n", + " # Set the title and axes\n", + " failing_filter = grouped_new.index[0]\n", + " plt.title(f\"Histogram - {num_samples_to_be_removed} Failing Samples\")\n", + " plt.ylabel(\"Count\")\n", + " plt.xticks(rotation=45)\n", + " \n", + " # Set file name\n", + " file_name = f\"single_filter_plot_{str(num_samples_to_be_removed)}_filtered_samples.png\"\n", + " else:\n", + " # Create a new DataFrame with an additional 'count' column\n", + " filter_df_new = filter_df.assign(count=filter_df[\"filters_applied\"].map(grouped_new['sample_id']))\n", + "\n", + " # Create the upset plot\n", + " removed_by_filter = from_memberships(filter_df_new.filters_applied.str.split(','), data=filter_df_new)\n", + " upset = 
UpSet(removed_by_filter, subset_size='auto', show_percentages=True, show_counts='%d', element_size=60, orientation='horizontal')\n", + " upset.plot()\n", + "\n", + " # Set the title and margins\n", + " plt.suptitle(f\"Upset Plot - {str(num_samples_to_be_removed)} Failing Samples\")\n", + " plt.margins(x =0.1, y=0.25)\n", + " \n", + " # Set file name\n", + " file_name = f\"upset_plot_{str(num_samples_to_be_removed)}_filtered_samples.png\"\n", + " \n", + " # Save the output file\n", + " file_path = generate_file_path(TLD_PATH, 'filtering', file_name)\n", + " save_figure(WS_BUCKET, file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "### Autosomal Copy Number Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def process_ploidy_data(sample_set_tbl, column_name, dest_path):\n", + " \"\"\"\n", + " Localize files, extract tarballs, and create directories for ploidy data.\n", + " \n", + " Args:\n", + " sample_set_tbl (pandas.DataFrame): The sample set table.\n", + " column_name (str): The name of the column containing file paths.\n", + " dest_path (str): The destination path for localized files.\n", + " \n", + " Returns:\n", + " list: Paths to the created ploidy directories.\n", + " \"\"\"\n", + " os.makedirs(dest_path, exist_ok=True)\n", + " ploidy_dirs = []\n", + " \n", + " for i, (batch, file) in enumerate(zip(sample_set_tbl['entity:sample_set_id'], sample_set_tbl[column_name])):\n", + " # Localize file\n", + " local_file = os.path.join(dest_path, os.path.basename(file))\n", + " subprocess.run([\"gsutil\", \"cp\", file, local_file], check=True)\n", + " \n", + " # Create and extract to subdirectory\n", + " subdir = f\"{batch}_ploidy\"\n", + " if os.path.exists(subdir):\n", + " shutil.rmtree(subdir)\n", + " os.mkdir(subdir)\n", + " \n", + " # Extract contents of tarball into new directory\n", + " os.system(f'tar -xf {local_file} -C {subdir}')\n", + " ploidy_dirs.append(subdir)\n", + " \n", + " return ploidy_dirs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def find_samples_outside_threshold(cn_data, upper_cutoff=None, lower_cutoff=None):\n", + " \"\"\"\n", + " Function to find samples with copy numbers outside the specified thresholds for each chromosome.\n", + "\n", + " Args:\n", + " cn_data (pandas.DataFrame): Contains the copy number data.\n", + " upper_cutoff (float): The upper threshold value. \n", + " lower_cutoff (float): The lower threshold value.\n", + "\n", + " Returns:\n", + " A two-element tuple containing:\n", + " 1. pandas.Series: Contains the count of samples outside the threshold(s) for each chromosome.\n", + " 2. 
list: Contains the sample IDs with at least one chromosome outside the threshold(s).\n", + " \"\"\"\n", + " chr_columns = [col for col in cn_data.columns if 'chr' in col.lower()]\n", + " \n", + " # Create boolean mask for values outside threshold(s)\n", + " if upper_cutoff is not None and lower_cutoff is not None:\n", + " mask = (cn_data[chr_columns] > upper_cutoff) | (cn_data[chr_columns] < lower_cutoff)\n", + " elif upper_cutoff is not None:\n", + " mask = cn_data[chr_columns] > upper_cutoff\n", + " else:\n", + " mask = cn_data[chr_columns] < lower_cutoff\n", + " \n", + " # Create data structures to return\n", + " cn_filter_pairs = mask.sum()\n", + " cn_filter_ids = cn_data.loc[mask.any(axis=1), cn_data.columns[0]].tolist()\n", + " \n", + " # Print summary statistics\n", + " print(f\"Sample-chromosome pairs outside threshold(s): {mask.sum().sum()}\")\n", + " print(f\"Samples outside threshold(s): {len(cn_filter_ids)}\")\n", + " \n", + " return cn_filter_pairs, cn_filter_ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def display_aneuploidy_outside_threshold(cn_data, chr_number, upper_cutoff=None, lower_cutoff=None, display=False):\n", + " \"\"\"\n", + " Display aneuploidy for batches with samples outside the estimated copy number threshold for a given chromosome.\n", + "\n", + " Args:\n", + " cn_data (pandas.DataFrame): Contains the copy number data.\n", + " chr_number (int): The chromosome number to plot for.\n", + " upper_cutoff (float): The upper threshold value.\n", + " lower_cutoff (float): The lower threshold value.\n", + " display (bool): Defines whether to display the plot directly.\n", + "\n", + " Returns:\n", + " dict: Dictionary of batch ID to image file paths for the plot per batch.\n", + " \"\"\"\n", + " # Validation\n", + " if (upper_cutoff is None and lower_cutoff is None):\n", + " raise Exception(\"Either one of 'upper_cutoff' or 'lower_cutoff' must be provided.\") \n", + " \n", + " # Find sample IDs with copy number outside the threshold for the given chromosome\n", + " chromosome = f\"chr{chr_number}_CopyNumber\"\n", + " if (upper_cutoff):\n", + " failed_aneu = cn_data[cn_data[chromosome] > upper_cutoff]['sample_id'].values\n", + " else:\n", + " failed_aneu = cn_data[cn_data[chromosome] < lower_cutoff]['sample_id'].values\n", + " \n", + " # Exit if no samples fall outside the threshold\n", + " if (len(failed_aneu) <= 0):\n", + " print(f\"No samples failed this threshold - please modify the cutoff accordingly.\")\n", + " return\n", + " \n", + " # Find all batches failing with samples below this threshold\n", + " failed_batches = defaultdict(list)\n", + " for sample_id in failed_aneu:\n", + " batch_id = get_batch_for_sample_id(sample_id)\n", + " failed_batches[batch_id].append(sample_id)\n", + " \n", + " # Log batch-specific information\n", + " image_paths = {}\n", + " for batch_id, sample_ids in failed_batches.items():\n", + " image_file_path = str(batch_id) + \"_ploidy/ploidy_est/estimated_CN_per_bin.all_samples.\" + str(chromosome).split(\"_\")[0] + \".png\"\n", + " image_paths[batch_id] = image_file_path\n", + " \n", + " if (display):\n", + " for batch_id, image_path in image_paths.items():\n", + " img = Image.open(image_path)\n", + " plt.figure(figsize=(12, 5))\n", + " plt.imshow(img)\n", + " plt.title(f'Batch: {batch_id} - {len(failed_batches[batch_id])} Failing Sample(s)')\n", + " plt.axis('off')\n", + " plt.show()\n", + " return\n", + " return image_paths" + ] + }, + { + 
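"cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal sketch of how `find_samples_outside_threshold` is expected to behave on a toy copy-number table (the sample IDs, copy-number values, and cutoffs below are hypothetical, for illustration only):\n", + "```python\n", + "toy_cn = pd.DataFrame({\n", + " 'sample_id': ['s1', 's2', 's3'],\n", + " 'chr1_CopyNumber': [2.0, 2.1, 2.9],\n", + " 'chr2_CopyNumber': [2.0, 1.1, 2.0],\n", + "})\n", + "pairs, ids = find_samples_outside_threshold(toy_cn, upper_cutoff=2.5, lower_cutoff=1.5)\n", + "# pairs -> one failing sample-chromosome pair each for chr1 and chr2; ids -> ['s2', 's3']\n", + "```" + ] + }, + { + 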
"cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_copy_number_per_autosome(cn_data, display=False):\n", + " \"\"\"\n", + " Plot the copy number per autosome for samples.\n", + "\n", + " Args:\n", + " cn_data (pandas.DataFrame): Contains the copy number data.\n", + " display (bool): Defines whether to display the plot directly.\n", + "\n", + " Returns:\n", + " str: Path to saved file containing plot.\n", + " \"\"\"\n", + " # Extract contigs by removing \"_CopyNumber\" from column names (1 to 22) of 'all_cn' DataFrame\n", + " contigs = [x.replace(\"_CopyNumber\", \"\") for x in cn_data.columns[1:23]]\n", + " num_samples = len(cn_data['sample_id'])\n", + "\n", + " # Create a box plot of copy number data for contigs (columns 1 to 22) in 'all_cn' DataFrame\n", + " plt.figure(figsize=(10, 3.5))\n", + " bplot = plt.boxplot(\n", + " cn_data.iloc[:, 1:23],\n", + " sym='.',\n", + " labels=contigs,\n", + " whis=6,\n", + " patch_artist=True,\n", + " showfliers=True,\n", + " medianprops=dict(color=\"xkcd:steel blue\")\n", + " )\n", + " plt.tick_params(axis='both', which='major', labelsize=12)\n", + " plt.tick_params(axis='x', rotation=45)\n", + " plt.title(f\"Copy Number Per Autosome - {len(cn_data)} Samples\", fontsize=15)\n", + " plt.grid(True, zorder=0)\n", + " \n", + " # Save the plot as an image \n", + " file_name = f\"cn_per_autosome_{num_samples}_samples.png\"\n", + " file_path = generate_file_path(TLD_PATH, 'autosomal_copy_number', file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " \n", + " # Conditionally display the plot\n", + " if (display == True):\n", + " plt.show()\n", + " return\n", + " plot.close()\n", + " return cn_per_autosome_plot" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "### Sex Analysis Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def compute_sex_assignments(samples_qc_table, lower_cutoff_chrX=1.2, upper_cutoff_chrX=1.7, \n", + " lower_cutoff_chrY=0.1, upper_cutoff_chrY=0.8):\n", + " \"\"\"\n", + " Update sex assignments based on copy number data and sex assignment information from EvidenceQC.\n", + "\n", + " Parameters:\n", + " samples_qc_table (pandas.DataFrame): DataFrame containing sex information with columns 'sample_id' and 'Assignment'.\n", + " lower_cutoff_chrX (numeric): Lower chrX copy number cutoff for mosaic assignments.\n", + " upper_cutoff_chrX (numeric): Upper chrX copy number cutoff for mosaic assignments.\n", + " lower_cutoff_chrY (numeric): Lower chrY copy number cutoff for mosaic assignments.\n", + " upper_cutoff_chrY (numeric): Upper chrY copy number cutoff for mosaic assignments.\n", + "\n", + " Returns:\n", + " dict: Updated sex assignments\n", + " \"\"\"\n", + " # Validate\n", + " for cutoff in [lower_cutoff_chrX, upper_cutoff_chrX, lower_cutoff_chrY, upper_cutoff_chrY]:\n", + " validate_numeric_input(cutoff, log=False)\n", + " \n", + " # Create two dictionaries to store updated sex assignments for figures and pedigree\n", + " updated_sex = dict(zip(samples_qc_table['sample_id'], samples_qc_table['sex_assignment']))\n", + "\n", + " # Use copy number metrics to determine sex assignments\n", + " for sample, cnY, cnX in zip(samples_qc_table['sample_id'], samples_qc_table.chrY_CopyNumber, \n", + " samples_qc_table.chrX_CopyNumber):\n", + " if updated_sex[sample] in (\"MALE\", \"TURNER\") and (cnY > 
lower_cutoff_chrY and cnY < upper_cutoff_chrY):\n", + " updated_sex[sample] = 'MALE'\n", + " elif cnY <= lower_cutoff_chrY and cnX > lower_cutoff_chrX and cnX < upper_cutoff_chrX:\n", + " updated_sex[sample] = \"MOSAIC\"\n", + " \n", + " # Return two dictionaries containing updated sex assignments for figures and pedigree\n", + " return updated_sex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def process_reference_ped(file_path, table):\n", + " \"\"\"\n", + " Loads and formats a reference sex assignment dataframe from a specified PED file.\n", + "\n", + " Args:\n", + " file_path (str): The path to the file in Google Cloud Storage containing the reference PED file.\n", + " table (pandas.DataFrame): Contains sample information for comparison.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: Contains information from the raw sex assignments file.\n", + " \"\"\"\n", + " # Load the data from the file at 'file_path' into a pandas DataFrame. The 'dtype=str'\n", + " reference_ped = pd.read_table(file_path, dtype=str, header=None, comment=\"#\", names=[\n", + " \"Family_ID\", \"Sample_ID\", \"Paternal_ID\",\"Maternal_ID\", \"Sex\",\"Phenotype\"\n", + " ])\n", + " \n", + " # Get PED information for only the batches you are working with if the PED file contains more samples\n", + " reference_ped = reference_ped[reference_ped['Sample_ID'].isin(table['sample_id'])]\n", + " \n", + " return reference_ped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def create_ped(sex_assignments, reference_ped, file_path):\n", + " \"\"\"\n", + " Create a PED file based on the sex assignments provided.\n", + "\n", + " Args:\n", + " sex_assignments (dict): Maps sample IDs to computed sex assignments.\n", + " reference_ped (pandas.DataFrame): Represents the reference PED file.\n", + " file_path (str): File path to the output PED file.\n", + "\n", + " Returns:\n", + " None.\n", + " \"\"\"\n", + " # Ensure valid sex assignments\n", + " valid_sex_assignments = set(SEX_CODES.keys())\n", + " invalid_assignments = set(sex_assignments.values()) - valid_sex_assignments\n", + " if invalid_assignments:\n", + " raise ValueError(f\"Invalid sex assignments: {', '.join(invalid_assignments)}\")\n", + " \n", + " # Create directories in file path if necessary\n", + " dir_path = os.path.dirname(file_path)\n", + " os.makedirs(dir_path, exist_ok=True)\n", + " \n", + " # Account for case where reference PED file does not exist\n", + " if (reference_ped.empty):\n", + " reference_ped = pd.DataFrame({'Sample_ID': sex_assignments.keys()})\n", + " reference_ped[\"Family_ID\"] = reference_ped[\"Sample_ID\"]\n", + " reference_ped[\"Paternal_ID\"] = \"0\"\n", + " reference_ped[\"Maternal_ID\"] = \"0\"\n", + " reference_ped[\"Sex\"] = \"0\"\n", + " reference_ped[\"Phenotype\"] = \"0\"\n", + " \n", + " # Assign sexes based on sex assignment dictionary\n", + " computed_ped = reference_ped[['Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Phenotype']]\n", + " computed_ped['Sex'] = computed_ped['Sample_ID'].map(sex_assignments).map(SEX_CODES)\n", + " computed_ped = computed_ped[['Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype']]\n", + " computed_ped = computed_ped[computed_ped['Sample_ID'].isin(sex_assignments.keys())]\n", + " \n", + " # Save dataframe\n", + " save_df(WS_BUCKET, file_path, computed_ped, print_path=True, header=False)\n", + " 
\n", + " # Update cohort_ped_file workspace attribute\n", + " workspace_ped_path = os.path.join(WS_BUCKET, file_path)\n", + " attrs = [fapi._attr_set(\"cohort_ped_file\", workspace_ped_path)]\n", + " r = fapi.update_workspace_attributes(NAMESPACE, WORKSPACE, attrs)\n", + " if r.status_code != 200:\n", + " raise Exception(f\"Unable to update workspace attributes: {r}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def create_ped_differences(reference_ped, computed_sex, file_path):\n", + " \"\"\"\n", + " Creates the a PED file with information from the reference and computed_ped assignment files for samples whose\n", + " sex assignment differs across the two.\n", + " \n", + " Args:\n", + " reference_ped (pandas.DataFrame): Contains the information from the reference PED file.\n", + " computed_sex (dict): Maps sample IDs to their computed sexes.\n", + " file_path (str): The file path to the output PED file. \n", + " \n", + " Returns:\n", + " differences(pandas.DataFrame): Contains the samples whose sex information was modified through the QC process.\n", + " \"\"\"\n", + " # Create column for computed sex values\n", + " computed_sex = {k: SEX_CODES.get(v, 0) for k, v in computed_sex.items()}\n", + " reference_ped['Sex'] = reference_ped['Sex'].astype(int)\n", + " reference_ped['Computed_Sex'] = reference_ped['Sample_ID'].map(computed_sex)\n", + " \n", + " # Keep only samples with differing sex assignments\n", + " differences = reference_ped[reference_ped['Sex'] != reference_ped['Computed_Sex']]\n", + " differences = differences[['Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype']]\n", + " \n", + " # Save differences file\n", + " differences = differences.reset_index(drop=True)\n", + " save_df(WS_BUCKET, file_path, differences)\n", + " \n", + " return differences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "def plot_sex_chromosome_ploidy(all_cn, sex_for_fig, display=False):\n", + " \"\"\"\n", + " Function to plot sex chromosome ploidy.\n", + "\n", + " Args:\n", + " all_cn (pandas.DataFrame): Data to be plotted. It should contain columns 'sample_id', 'chrX_CopyNumber' and 'chrY_CopyNumber'.\n", + " sex_for_fig (dict): Maps sample ID to its sex category. 
The keys should be present in `all_cn['sample_id']`.\n", + " display (bool): Defines whether to display the plot directly.\n", + "\n", + " Returns:\n", + " str: Path to saved file containing plot.\n", + "\n", + " \"\"\"\n", + " # Dictionary for colors to be used for different sex categories\n", + " color_dict = {\"MALE\": 'deepskyblue', \"FEMALE\": 'tab:pink', \"TURNER\": 'tab:red', \n", + " \"TRIPLE X\": 'darkviolet', \"KLINEFELTER\": 'darkorange', \"JACOBS\": 'g', \n", + " \"MOSAIC\": 'aquamarine', \"OTHER\": 'maroon'}\n", + "\n", + " # Dictionary for chromosomal representation for different sex categories\n", + " xy_rep = {\"MALE\": \" (XY)\", \"FEMALE\": \" (XX)\", \"TURNER\": \" (X)\", \"TRIPLE X\": \" (XXX)\", \n", + " \"KLINEFELTER\": \" (XXY)\", \"JACOBS\": \" (XYY)\", \"MOSAIC\": \"\", \"OTHER\": \"\"}\n", + "\n", + " # Creating a list of colors corresponding to sex categories for each sample in the data\n", + " color_sex_list = [color_dict[sex_for_fig[sample]] for sample in all_cn['sample_id']]\n", + "\n", + " # List for the x and y axis ticks\n", + " cn_list = [0,1,2,3]\n", + " \n", + " # Creating the figure and setting its size\n", + " fig = plt.figure(figsize=(7,7))\n", + " \n", + " # Adding a grid to the figure\n", + " plt.grid(True, zorder=0)\n", + " \n", + " # Adding a scatter plot to the figure with data points colored according to sex category\n", + " plt.scatter(all_cn.chrX_CopyNumber, all_cn.chrY_CopyNumber, alpha=1, s=10,\n", + " c=color_sex_list, zorder=2)\n", + " \n", + " # Setting the x and y axis limits\n", + " plt.xlim([-0.1,3.15])\n", + " plt.ylim([-0.1,3.1])\n", + " \n", + " # Setting the x and y axis ticks\n", + " plt.xticks(cn_list)\n", + " plt.yticks(cn_list)\n", + "\n", + " # Adding text to the plot representing different combinations of X and Y chromosomes\n", + " for nx in cn_list:\n", + " for ny in cn_list:\n", + " if nx + ny >=6:\n", + " continue\n", + " plt.text(nx, ny, nx*\"X\" + ny*\"Y\", color='lightgray', ha='center', va='center', zorder=1, weight=\"bold\")\n", + " \n", + " # Adding a legend to the plot\n", + " plt.legend(handles=[Line2D(\n", + " [0],[0], alpha=1, marker='o', color='w', \n", + " markerfacecolor=color_dict[label], \n", + " label=label + xy_rep[label]\n", + " ) for label in color_dict.keys()], framealpha=1, labelspacing=0.5)\n", + " \n", + " # Adding a title and labels to the plot\n", + " plt.title(f\"Sex Chromosome Ploidy - {len(sex_for_fig)} Samples\", fontsize=15)\n", + " plt.xlabel(\"chrX Copy Number\", fontsize=13)\n", + " plt.ylabel(\"chrY Copy Number\", fontsize=13)\n", + " \n", + " # Save the plot as an image \n", + " file_name = f\"sex_chromosome_ploidy_{len(sex_for_fig)}_samples.png\"\n", + " file_path = generate_file_path(TLD_PATH, 'sex_analysis', file_name)\n", + " save_figure(WS_BUCKET, file_path)\n", + " \n", + " # Show figure if display = True, else close to free up memory and return the file path\n", + " if (display == True):\n", + " plt.show()\n", + " return\n", + " plt.close()\n", + " return file_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Ingestion\n", + "This section fetches the sample QC data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1. Sample Sets\n", + "This step loads the sample_set data table to find the QC data file paths. You can exclude unnecessary sample_sets at this stage if needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Uncomment and run once. Once this step has run, if you need to load the table again, use the following cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "# sample_set_response = fapi.get_entities_tsv(\n", + "# NAMESPACE, WORKSPACE, \"sample_set\", \n", + "# attrs=[\"ploidy_plots\", \"qc_table\"], model=\"flexible\"\n", + "# )\n", + "\n", + "# with open('sample_set.zip', 'wb') as f:\n", + "# f.write(sample_set_response.content)\n", + " \n", + "# with zipfile.ZipFile('sample_set.zip', 'r') as zip_ref:\n", + "# # Extract sample set data\n", + "# with zip_ref.open('sample_set_entity.tsv') as file:\n", + "# tsv_file = io.StringIO(file.read().decode('utf-8'))\n", + "# sample_set_tbl = pd.read_csv(tsv_file, sep='\\t')\n", + "# sample_set_tbl = sample_set_tbl[sample_set_tbl['ploidy_plots'].notnull() & sample_set_tbl['qc_table'].notnull()]\n", + "# sample_set_tbl = sample_set_tbl.reset_index(drop=True)\n", + " \n", + "# # Extract sample membership data\n", + "# with zip_ref.open('sample_set_membership.tsv') as membership_file:\n", + "# membership_tsv = io.StringIO(membership_file.read().decode('utf-8'))\n", + "# membership_df = pd.read_csv(membership_tsv, sep='\\t')\n", + " \n", + "# # Add list of samples to corresponding sample set\n", + "# sample_groups = membership_df.groupby('membership:sample_set_id')['sample'].unique().apply(list)\n", + "# sample_set_tbl['samples'] = sample_set_tbl['entity:sample_set_id'].map(sample_groups)\n", + "# sample_set_tbl['samples'] = sample_set_tbl['samples'].apply(lambda x: x if isinstance(x, list) else [])\n", + "\n", + "# file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n", + "# save_df(WS_BUCKET, file_path, sample_set_tbl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use this block to reload the data saved in the previous cell\n", + "file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n", + "\n", + "sample_set_tbl = pd.read_table(file_path, sep='\\t', dtype={'sample_id': str})\n", + "\n", + "sample_set_tbl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Optional: If you wish to only include a subset of batches out of the ones listed above, you can filter sample_set_tbl to only include them using the cell below. Ensure that all batches you expect are included.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optionally filter to only include a subset of batches from sample_set_tbl and update the saved file. For example:\n", + "sample_set_tbl = sample_set_tbl[sample_set_tbl['entity:sample_set_id'].str.contains('KJ_EvidenceQC_Updates')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n", + "save_df(WS_BUCKET, file_path, sample_set_tbl)\n", + "\n", + "# Output batch information\n", + "print(f\"Sample Set DataFrame Dimensions: {sample_set_tbl.shape}\")\n", + "print(f\"Batch Count: {len(sample_set_tbl)}\\n\")\n", + "print(\"Batches:\")\n", + "print(*list(sample_set_tbl['entity:sample_set_id'].values), sep='\\n')\n", + "\n", + "sample_set_tbl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2. Samples\n", + "This step aggregates sample QC data across *sample_sets*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Uncomment and run once. Once this step has run, if you need to load the table again, use the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# samples_qc_table = pd.concat([pd.read_csv(f, sep='\\t') for f in sample_set_tbl['qc_table']], ignore_index = True)\n", + "\n", + "# print(f\"Sample DataFrame Dimensions: {samples_qc_table.shape}\")\n", + "# print(f\"Sample Count: {len(samples_qc_table)}\\n\")\n", + "\n", + "# validate_unique_samples(samples_qc_table)\n", + "\n", + "# file_path = generate_file_path(TLD_PATH, 'artifacts', 'samples_qc.tsv')\n", + "# save_df(WS_BUCKET, file_path, samples_qc_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use this block to reload the data saved in the previous cell\n", + "file_path = generate_file_path(TLD_PATH, 'artifacts', 'samples_qc.tsv')\n", + "\n", + "samples_qc_table = pd.read_table(file_path, sep='\\t', dtype={'sample_id': str})\n", + "\n", + "samples_qc_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3. Ploidy Data\n", + "This step localizes files with additional data related to copy number." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Uncomment and run once. Once this step has run, if you need to load the table again, use the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "# dir_path = os.path.join(TLD_PATH, \"ploidy\")\n", + "# ploidy_dirs = process_ploidy_data(sample_set_tbl, 'ploidy_plots', dir_path)\n", + "\n", + "# # Write the directory names to a file\n", + "# file_path = os.path.join(TLD_PATH, \"ploidy\", \"ploidy_dirs.list\")\n", + "# with open(file_path, 'w') as dirs_file:\n", + "# for ploidy_dir in ploidy_dirs:\n", + "# dirs_file.write(ploidy_dir + '\\n')\n", + "\n", + "# # Get binwise copy number files\n", + "# binwise_cn_files = [os.path.join(ploidy_dir, \"ploidy_est\", \"binwise_estimated_copy_numbers.bed.gz\") for ploidy_dir in ploidy_dirs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use this block to reload the data saved in the previous cell\n", + "file_path = generate_file_path(TLD_PATH, \"ploidy\", \"ploidy_dirs.list\")\n", + "\n", + "ploidy_dirs = [line.strip() for line in open(file_path)]\n", + "\n", + "binwise_cn_files = [ploidy_dir + \"/ploidy_est/binwise_estimated_copy_numbers.bed.gz\" for ploidy_dir in ploidy_dirs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# QC\n", + "This section involves analyzing and filtering samples based on a series of sample quality metrics. \n", + "\n", + "**Usage**:\n", + "\n", + "1. Each step should be executed by first using the **Analysis** section, which plots figures to assist with determining appropriate filtering cutoff values. This will involve modifying the following parameters:\n", + " - `LOG_SCALE`: Determines whether to log-scale the resulting plot.\n", + " - `LINE_DEVIATIONS`: A list of integers that defines the cutoff lines to draw on each histogram plot. This is a list of multipliers *x* for the median absolute deviation (MAD) such that lines are drawn at Median ± x * MAD. It has been initialized with default values, but set this to `[]` or `None` if you don't wish to include any cutoffs.\n", + " - `LINE_STYLES`: A list of strings that defines the line styles that each cutoff line should use in each histogram plot displayed. It has been initialized with default values, but set this to `[]` or `None` if you don't wish to include any cutoffs.\n", + " - `**kwargs`: Additional arguments passed in to the plotting function (e.g. `bins = 50`).\n", + "\n", + "2. After this is done and suitable cutoffs have been established, the **Filtering** section enables dropping samples using these cutoffs. This will involve modifying the following parameters:\n", + " - `LOG_SCALE`: Determines whether to log-scale the resulting plot.\n", + " - `METHOD`: Must be set to one of `'hard'` or `'MAD'`. It is initialized to the method used for the [All of Us (AoU) CDRv7 off-cycle SV dataset](https://support.researchallofus.org/hc/en-us/articles/27496716922900-All-of-Us-Short-Read-Structural-Variant-Quality-Report).\n", + " - If set to `'hard'`, the filtering will be based on strict thresholds set by `LOWER_CUTOFF` and `UPPER_CUTOFF`. \n", + " - `LOWER_CUTOFF`: This defines the lower threshold when `METHOD = 'hard'` - any samples whose value for a given metric is less than this will be dropped. It is initialized to `None`, which corresponds to a lower cutoff of negative infinity, but should be set to a numerical value if `METHOD = hard`. 
For some metrics, a lower cutoff is not relevant, so the parameter is not set and defaults to negative infinity.\n", + " - `UPPER_CUTOFF`: The upper threshold when `METHOD = 'hard'` - any samples whose value for a given metric is greater than this will be dropped. It is initialized to `None`, which corresponds to an upper cutoff of infinity, but should be set to a numerical value if `METHOD = 'hard'`. For some metrics, an upper cutoff is not relevant, so the parameter is not set and defaults to infinity.\n", + " - If set to `'MAD'`, the filtering will be based on MAD-based thresholds set by `MAD_CUTOFF`.\n", + " - `MAD_CUTOFF`: The MAD threshold when `METHOD = 'MAD'` - any samples whose value for a given metric is outside this number of MAD from the median will be dropped. It is initialized to `None`, which corresponds to an infinite cutoff, but should be set to a numerical value if `METHOD = 'MAD'`. A short worked example of the MAD calculation is included under **Median Coverage** below.\n", + " - `**kwargs`: Additional arguments passed in to the plotting function (e.g. `bins = 50`).\n", + " \n", + " \n", + "**Outputs**:\n", + "\n", + "Outputs generated in the sections below are saved to the virtual machine's file system (within the analyses edit directory) and Google Cloud Storage (in the bucket attached to your workspace) under the top-level directory `evidence_qc/`. The file system structure within the `evidence_qc/` directory is mirrored across both file systems, and can be seen as follows:\n", + "- `median_coverage/`: Contains outputs generated within the **Median Coverage** section.\n", + "- `mean_insert_size/`: Contains outputs generated within the **Mean Insert Size** section.\n", + "- `wgd_score/`: Contains outputs generated within the **WGD Score** section.\n", + "- `nondiploid_bins/`: Contains outputs generated within the **Non-Diploid Bins** section.\n", + "- `contamination/`: Contains outputs generated within the **Contamination** section.\n", + "- `raw_caller_outliers/`: Contains outputs generated within the **Raw Caller Outliers** section.\n", + "- `autosomal_copy_number/`: Contains outputs generated within the **Autosomal Copy Number** section.\n", + "- `sex_analysis/`: Contains outputs generated within the **Sex Analysis** section.\n", + "- `filtering/`: Contains outputs generated within the **Sample Filtering** section.\n", + "- `artifacts/`: Contains temporary artifacts generated throughout the notebook.\n", + "\n", + " \n", + "**Additional Notes**:\n", + "\n", + "- For guidance, cutoffs from the [AoU CDRv7 off-cycle SV dataset](https://support.researchallofus.org/hc/en-us/articles/27496716922900-All-of-Us-Short-Read-Structural-Variant-Quality-Report) will be noted in each section, but users should exercise their judgment to select cutoffs appropriate for their data.\n", + "- Some steps dynamically set the value of `filter_type`, which defines the column name within the metadata table that corresponds to the metric that a given step is based on. The code for this should not have to be changed in any cell.\n", + "- If you wish to see the samples that have been filtered out at any particular point, run the **Sample Filtering** section - you can always come back and apply more filters at any point." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Median Coverage\n", + "The median coverage measures the typical sequencing depth in a sample. We recommend a minimum coverage of about 30x. Coverage is also important for batching in order to set appropriate genotyping cutoffs among homogeneous samples in a batch."
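, + "\n", + "For reference, the short example below illustrates how the MAD-based bounds used when `METHOD = 'MAD'` are derived (the coverage values and the cutoff of 6 are hypothetical; the `calculate_mad_filter_bounds` helper above performs the same computation on the real data):\n", + "```python\n", + "import numpy as np\n", + "\n", + "coverage = np.array([28.0, 31.0, 33.0, 35.0, 60.0]) # toy read-length-adjusted coverages\n", + "median = np.median(coverage) # 33.0\n", + "mad = np.median(np.abs(coverage - median)) # 2.0\n", + "lower, upper = median - 6 * mad, median + 6 * mad # 21.0 and 45.0 - the 60x sample would fail\n", + "```"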
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
The read length is used to convert the median binned coverage to median base coverage for easier interpretation. The parameter READ_LENGTH is set to 151 by default, which is typical for Illumina data; please edit this input if your data has a different read length.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "READ_LENGTH = 151\n", + "\n", + "validate_numeric_input(READ_LENGTH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'median_coverage', line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example use of **kwargs to pass in parameter `bins` to the histogram plotting function\n", + "run_analysis(samples_qc_table, 'median_coverage', LINE_DEVIATIONS, LINE_STYLES, READ_LENGTH, \n", + " log_scale=LOG_SCALE, bins=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the lower cutoff was 30.
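For reference, a hard lower cutoff amounts to a simple row filter on the QC table. The sketch below only illustrates the logic and does not record anything - `run_filtering` in the next cells applies and tracks the filter - and it assumes the `median_coverage` column is already in the same units as the cutoff:

```python
# Illustration of a hard lower cutoff: identify samples below the threshold.
# Assumes the median_coverage column is in the same units as the cutoff;
# in the notebook, run_filtering handles the read-length conversion and bookkeeping.
example_lower_cutoff = 30
too_low = samples_qc_table[samples_qc_table['median_coverage'] < example_lower_cutoff]
print(f"{len(too_low)} sample(s) fall below {example_lower_cutoff}x median coverage")
```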
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "LOWER_CUTOFF = None # Numeric value that defines the lower threshold if METHOD = 'hard'\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'median_coverage', method=METHOD, lower_cutoff=LOWER_CUTOFF, \n", + " upper_cutoff=UPPER_CUTOFF, mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Example use of **kwargs to pass in parameter `bins` to the histogram plotting function\n", + "run_filtering(samples_qc_table, 'median_coverage', METHOD, lower_cutoff=LOWER_CUTOFF, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, read_length=READ_LENGTH, log_scale=LOG_SCALE, bins=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mean Insert Size\n", + "The insert size is the length of the DNA fragment being sequenced, excluding adapters. We estimate insert size for each read pair as the distance between the mapped positions of the mates. The mean insert size is the average insert size for the reads within a sample; refer to the [Picard documentation](https://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics) for details of how outliers are excluded. Insert size is important for SV calling because discordant read pairs are one type of SV evidence in short read data; this metric is particularly useful for batching to ensure paired end evidence cutoffs for genotyping are well-calibrated within each batch.\n", + "\n", + "Samples processed with Scramble instead of MELT will have missing values for mean insert size. If mean insert size is not available for your data, you can skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
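If you ever want to spot-check this metric yourself, each properly paired read's fragment length is available from the alignment record's template length (TLEN). The sketch below is an illustration only - it uses `pysam`, which is not among this notebook's imports, and a hypothetical file path, and it is not how the pipeline computes mean insert size:

```python
# Rough, illustrative estimate of mean insert size from a coordinate-sorted BAM/CRAM.
# Requires pysam (not part of this notebook's environment); the path below is hypothetical.
import numpy as np
import pysam

def rough_mean_insert_size(alignment_path, max_reads=1_000_000):
    sizes = []
    with pysam.AlignmentFile(alignment_path) as alignments:
        for i, read in enumerate(alignments):
            if i >= max_reads:
                break
            # Count each properly paired, primary fragment once (positive TLEN mate only).
            if (read.is_proper_pair and not read.is_secondary
                    and not read.is_supplementary and read.template_length > 0):
                sizes.append(read.template_length)
    return float(np.mean(sizes)) if sizes else float("nan")

# Example call (hypothetical local path):
# rough_mean_insert_size("sample1.bam")
```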
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'mean_insert_size', line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_analysis(samples_qc_table, 'mean_insert_size', LINE_DEVIATIONS, LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the lower cutoff was 320 and the upper cutoff was 700.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "LOWER_CUTOFF = None # Numeric value that defines the lower threshold if METHOD = 'hard'\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'mean_insert_size', method=METHOD, lower_cutoff=LOWER_CUTOFF, \n", + " upper_cutoff=UPPER_CUTOFF, mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_filtering(samples_qc_table, 'mean_insert_size', METHOD, lower_cutoff=LOWER_CUTOFF, \n", + " upper_cutoff=UPPER_CUTOFF, mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WGD Score\n", + "The whole genome dosage (WGD), or dosage bias, score quantifies the non-uniformity of sequencing coverage within a sample ([Collins et al., Nature 2020](https://www.nature.com/articles/s41586-020-2287-8)). The distribution of WGD scores should be centered just below 0 for PCR-free genomes and just above 0 for PCR+ genomes. Greater magnitude of WGD indicates more variable coverage, which can result in poor read depth-based CNV calling, so we recommend removing samples which are outliers for WGD. WGD is also useful for creating batches with similar coverage biases, which improves batch-level model training for read depth-based CNV calling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = [2, 4, 6] # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = ['solid', 'dashed', 'dashdot'] # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'wgd_score', line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_analysis(samples_qc_table, 'wgd_score', LINE_DEVIATIONS, LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the MAD cutoff was 6, which represented a range of approximately [-0.162, 0.136].
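For intuition, a MAD cutoff of k keeps samples whose metric lies within `median ± k x MAD`. The sketch below shows that arithmetic for the WGD score (illustration only - whether `run_filtering` uses the raw or consistency-scaled MAD is an assumption here, so the exact bounds it reports may differ slightly):

```python
# Illustration: translate a MAD cutoff into an approximate value range for a metric.
# Uses the raw (unscaled) median absolute deviation.
import numpy as np

values = samples_qc_table['wgd_score'].dropna().to_numpy()
metric_median = np.median(values)
metric_mad = np.median(np.abs(values - metric_median))

example_mad_cutoff = 6
lower = metric_median - example_mad_cutoff * metric_mad
upper = metric_median + example_mad_cutoff * metric_mad
print(f"A MAD cutoff of {example_mad_cutoff} keeps wgd_score within [{lower:.3f}, {upper:.3f}]")
```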
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'MAD' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "LOWER_CUTOFF = None # Numeric value that defines the lower threshold if METHOD = 'hard'\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'wgd_score', method=METHOD, lower_cutoff=LOWER_CUTOFF, \n", + " upper_cutoff=UPPER_CUTOFF, mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_filtering(samples_qc_table, 'wgd_score', METHOD, lower_cutoff=LOWER_CUTOFF, \n", + " upper_cutoff=UPPER_CUTOFF, mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Non-Diploid Bins\n", + "The number of non-diploid bins for each sample was calculated by counting the number of 1Mb bins whose sequencing coverage significantly deviated from the expectation for that sample. Similar to WGD, samples with a very high number of non-diploid bins may perform poorly in read depth-based CNV calling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
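As a conceptual illustration of how such a count can be produced (this is not the pipeline's implementation - `nondiploid_bins` is computed upstream, and the per-bin copy-number input and the tolerance below are assumptions for the sketch):

```python
# Conceptual sketch: count bins whose estimated copy number deviates from the diploid expectation.
# `binwise_copy_number` is a hypothetical per-sample array of copy-number estimates, one per 1Mb bin.
import numpy as np

def count_nondiploid_bins(binwise_copy_number, tolerance=0.5):
    copy_numbers = np.asarray(binwise_copy_number, dtype=float)
    return int(np.sum(np.abs(copy_numbers - 2.0) > tolerance))

print(count_nondiploid_bins([2.0, 2.1, 1.2, 2.9, 2.05]))  # -> 2 deviant bins in this toy example
```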
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'nondiploid_bins', line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_analysis(samples_qc_table, 'nondiploid_bins', LINE_DEVIATIONS, LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the upper cutoff was 500.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'nondiploid_bins', method=METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_filtering(samples_qc_table, 'nondiploid_bins', METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Contamination\n", + "If you wish to analyze any additional metrics, you may add them to the metrics table and do so here. One additional useful metric, shown here, is cross-sample contamination - this is the fraction of reads in the sample that come from another individual. High rates of contamination can cause artifacts in genotyping. If you do not have access to contamination metrics, you may skip this section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
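Log-scaling is useful here because most samples have contamination near zero and the outliers of interest sit in a long right tail. A minimal matplotlib sketch of the idea (the notebook's `run_analysis` already does this when `LOG_SCALE = True`; this assumes the contamination column has been added to the table as described above):

```python
# Illustration: a log-scaled count axis makes rare, high-contamination samples visible.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(samples_qc_table['contamination'].dropna(), bins=50)
ax.set_yscale('log')  # sample counts span orders of magnitude across bins
ax.set_xlabel('contamination')
ax.set_ylabel('number of samples (log scale)')
plt.show()
```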
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = True # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'contamination', line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_analysis(samples_qc_table, 'contamination', LINE_DEVIATIONS, LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the upper cutoff was 0.01.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = True # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "\n", + "validate_qc_inputs(samples_qc_table, 'contamination', method=METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_filtering(samples_qc_table, 'contamination', METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Raw Caller Outliers\n", + "This series of metrics looks for samples with an abnormally high or low number of raw SV calls from the three initial algorithms: Manta, Wham, and Scramble (or MELT). Higher than typical SV counts may indicate technical artifacts, while extremely low SV counts may indicate that an algorithm failed to complete. The values represent the number of times the sample was an outlier for SV counts across categories defined by algorithm, SV type, and chromosome. \n", + "\n", + "**Note**: \n", + "In the sections below, there are two additional parameters that have not yet been covered.\n", + "- `CALLER`: The caller for which to analyze results. This must be one of `['overall', 'manta', 'melt', 'scramble', 'wham']`, where 'overall' corresponds to the sum of outlier occurrences across the individual callers.\n", + "- `TYPE`: The type of outliers for which to analyze results. This must be one of `['high', 'low']`, where 'high' indicates the number of cases in which the sample had more SVs than typical, while 'low' indicates the number of cases in which the sample had fewer SVs than typical. \n", + "\n", + "We recommend checking the overall high and low outliers (i.e. `CALLER = 'overall'` and `TYPE = 'high'/'low'`), but you may also examine results for individual algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters.
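For orientation, the metric column examined in the cells below is assembled from these two inputs as `f"{CALLER}_{TYPE}_outlier"`, so the candidate column names follow a simple pattern. A small sketch (which combinations actually exist in your table depends on the callers run upstream):

```python
# Illustration: the outlier metric columns follow the pattern "<caller>_<type>_outlier".
callers = ['overall', 'manta', 'melt', 'scramble', 'wham']  # values listed in the note above
outlier_types = ['high', 'low']

candidate_columns = [f"{caller}_{outlier_type}_outlier"
                     for caller in callers for outlier_type in outlier_types]
available_columns = [column for column in candidate_columns if column in samples_qc_table.columns]
print(available_columns)
```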
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n", + "LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n", + "\n", + "CALLER = 'overall' # String value that defines the caller - either 'overall', 'manta', 'melt', 'wham' or 'dragen'\n", + "TYPE = 'high' # String value that defines the outlier direction - either 'high' or 'low'\n", + "\n", + "validate_qc_inputs(samples_qc_table, f\"{CALLER}_{TYPE}_outlier\", line_deviations=LINE_DEVIATIONS, \n", + " line_styles=LINE_STYLES, caller=CALLER, caller_type=TYPE, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_analysis(samples_qc_table, f\"{CALLER}_{TYPE}_outlier\", LINE_DEVIATIONS, LINE_STYLES, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters. For the AoU SV data, the upper cutoff was 30 across all callers - i.e. CALLER = 'overall' and TYPE = 'high'.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n", + "METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n", + "\n", + "CALLER = 'overall' # String value that defines the caller - either 'overall', 'manta', 'melt', 'wham' or 'dragen'\n", + "TYPE = 'high' # String value that defines the outlier direction - either 'high' or 'low'\n", + "\n", + "UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n", + "MAD_CUTOFF = None # Numeric value that defines the MAD deviation threshold if METHOD = 'MAD'\n", + "\n", + "validate_qc_inputs(samples_qc_table, f\"{CALLER}_{TYPE}_outlier\", method=METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, caller=CALLER, caller_type=TYPE, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_filtering(samples_qc_table, f\"{CALLER}_{TYPE}_outlier\", METHOD, upper_cutoff=UPPER_CUTOFF, \n", + " mad_cutoff=MAD_CUTOFF, log_scale=LOG_SCALE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Autosomal Copy Number\n", + "Samples that are outliers for normalized copy number on one or more autosomes should be filtered to preserve the quality of SV calls for other samples. Explore the plots of copy ratio across chromosomes and binned copy number per chromosome to set an appropriate cutoff. These plots can be used to identify potential germline or mosaic aneuploidies as well.\n", + "\n", + "**Note**: The parameter `CHR_NUMBER` in this step is a numeric value that allows you to select a specific chromosome to view an analysis for." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize All Autosomes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [], + "scrolled": true + }, + "outputs": [], + "source": [ + "# Display copy ratio across autosomes for all samples\n", + "plot_copy_number_per_autosome(samples_qc_table[cols_autosome], display=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Below Threshold" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Adjust the cutoff parameter to get the counts of samples with likely copy number aberrations on each autosome. The copy ratio is the observed depth divided by the expected depth per chromosome copy, so a copy ratio of 2.0 corresponds to a normal diploid chromosome. A recommended range for the lower cutoff is 1.2-1.8, but the choice of cutoff will depend on dataset quality.
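For intuition, flagging a sample amounts to scanning its per-autosome copy ratios for values below the cutoff. A self-contained toy example follows (the column names and values are hypothetical and do not mirror `cols_autosome` exactly; `find_samples_outside_threshold` implements the notebook's actual logic):

```python
# Toy example: find (sample, chromosome) pairs whose copy ratio falls below a lower cutoff.
import pandas as pd

toy_copy_ratios = pd.DataFrame({
    'sample_id': ['sample_1', 'sample_2', 'sample_3'],
    'chr1_copy_ratio': [2.01, 1.98, 2.03],
    'chr21_copy_ratio': [1.99, 1.45, 2.02],  # sample_2 looks like a chr21 loss
}).set_index('sample_id')

example_lower_cutoff = 1.8
flattened = toy_copy_ratios.stack()
print(flattened[flattened < example_lower_cutoff])  # -> (sample_2, chr21_copy_ratio)    1.45
```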
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOWER_CUTOFF = 1.8\n", + "\n", + "validate_numeric_input(LOWER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cn_filter_pairs, cn_filter_ids = find_samples_outside_threshold(samples_qc_table[cols_autosome], \n", + " lower_cutoff=LOWER_CUTOFF)\n", + "\n", + "cn_filter_pairs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters to plot the aneuploidies of samples that fall below the threshold for a specific chromosome.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CHR_NUMBER = 1\n", + "LOWER_CUTOFF = 1.8\n", + "\n", + "validate_numeric_input(CHR_NUMBER, LOWER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display copy number plots for batches with samples that don't meet threshold\n", + "display_aneuploidy_outside_threshold(samples_qc_table[cols_autosome], CHR_NUMBER, lower_cutoff=LOWER_CUTOFF, \n", + " display=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters to filter out the samples whose estimated copy number falls below the threshold for at least one chromosome. For the AoU SV data, the lower cutoff was 1.8.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOWER_CUTOFF = None\n", + "\n", + "validate_numeric_input(LOWER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cn_filter_pairs, cn_filter_ids = find_samples_outside_threshold(samples_qc_table[cols_autosome], \n", + " lower_cutoff=LOWER_CUTOFF)\n", + "\n", + "remove_samples(cn_filter_ids, 'low_copy_number_outlier')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Above Threshold" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters to get the counts of samples with a copy number exceeding the threshold on each autosome. A recommended range is 2.2-2.8, but the choice of cutoff will depend on dataset quality.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "UPPER_CUTOFF = 2.3\n", + "\n", + "validate_numeric_input(UPPER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cn_filter_pairs, cn_filter_ids = find_samples_outside_threshold(samples_qc_table[cols_autosome], upper_cutoff=UPPER_CUTOFF)\n", + "\n", + "cn_filter_pairs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the analysis parameters to plot the aneuploidies of samples that exceed the threshold for a specific chromosome.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CHR_NUMBER = 1\n", + "UPPER_CUTOFF = 2.3\n", + "\n", + "validate_numeric_input(CHR_NUMBER, UPPER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display copy number plots for batches with samples that don't meet threshold\n", + "display_aneuploidy_outside_threshold(samples_qc_table[cols_autosome], CHR_NUMBER, upper_cutoff=UPPER_CUTOFF, display=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input the filtering parameters to filter out the samples whose estimated copy number exceeds the threshold for at least one chromosome. For the AoU SV data, the upper cutoff was 2.3.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "UPPER_CUTOFF = None\n", + "\n", + "validate_numeric_input(UPPER_CUTOFF)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cn_filter_pairs, cn_filter_ids = find_samples_outside_threshold(samples_qc_table[cols_autosome], upper_cutoff=UPPER_CUTOFF)\n", + "\n", + "remove_samples(cn_filter_ids, 'high_copy_number_outlier')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sex Analysis\n", + "In this section, you can examine the sex chromosome ploidy and computed sex labels. You can generate a PED file with the computed sex information, or if you already have a PED file, you can compare and update the sex information as needed. Importantly, samples with sex chromosome ploidy other than XX or XY should have sex set to 0 in the PED file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Input cutoffs for determining mosaic sex assignments. These cutoffs update the computed sex to account for mosaic loss of allosomes. Males with mosaic loss of chromosome Y (chrY ploidy between the lower and upper chrY cutoffs) will be set to MALE. Females with mosaic loss of chromosome X will be set to MOSAIC, and SV calls will not be made on the allosomes for these samples. Please adjust the cutoffs as needed based on the plot of sex chromosome ploidy below.
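For orientation, the sketch below shows one way cutoffs like these can map estimated chrX/chrY ploidies to sex labels. It is an illustration of the idea only - `compute_sex_assignments`, defined with the helper functions above, implements the notebook's actual rules and may differ in detail:

```python
# Sketch only: one possible mapping from allosome ploidy estimates to sex labels.
# The notebook's compute_sex_assignments defines the actual behavior.
def illustrative_sex_assignment(chrx_ploidy, chry_ploidy,
                                lower_x=1.2, upper_x=1.7, lower_y=0.1, upper_y=0.8):
    if lower_x <= chrx_ploidy <= upper_x and chry_ploidy < lower_y:
        return 'MOSAIC'   # female-like sample with partial loss of chrX
    if chrx_ploidy > upper_x and chry_ploidy < lower_y:
        return 'FEMALE'   # ~XX
    if chrx_ploidy < lower_x and chry_ploidy >= lower_y:
        return 'MALE'     # ~XY, including mosaic loss of chrY between the chrY cutoffs
    return 'UNKNOWN'      # other ploidies (e.g. XXY, X0) get sex 0 in the PED file

print(illustrative_sex_assignment(1.99, 0.02))  # -> FEMALE
print(illustrative_sex_assignment(1.01, 0.45))  # -> MALE (mosaic chrY loss)
print(illustrative_sex_assignment(1.50, 0.03))  # -> MOSAIC
```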
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LOWER_CUTOFF_CHRX = 1.2\n", + "UPPER_CUTOFF_CHRX = 1.7\n", + "LOWER_CUTOFF_CHRY = 0.1\n", + "UPPER_CUTOFF_CHRY = 0.8\n", + "\n", + "validate_numeric_input(LOWER_CUTOFF_CHRX, UPPER_CUTOFF_CHRX, LOWER_CUTOFF_CHRY, UPPER_CUTOFF_CHRY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create dictionary of sex assignments\n", + "updated_sex = compute_sex_assignments(samples_qc_table, \n", + " lower_cutoff_chrX=LOWER_CUTOFF_CHRX,\n", + " upper_cutoff_chrX=UPPER_CUTOFF_CHRX, \n", + " lower_cutoff_chrY=LOWER_CUTOFF_CHRY,\n", + " upper_cutoff_chrY=UPPER_CUTOFF_CHRY)\n", + "\n", + "# Calculate sex counts and frequencies\n", + "sex_ped_counts = pd.DataFrame.from_dict({'ID': list(updated_sex.keys()), 'SEX': list(updated_sex.values())})['SEX'].value_counts()\n", + "sex_ped_counts_normalized = pd.DataFrame.from_dict({'ID': list(updated_sex.keys()), 'SEX': list(updated_sex.values())})['SEX'].value_counts(normalize=True)\n", + "\n", + "# Display sex counts and frequencies\n", + "print(f\"Count of each unique value in column: {sex_ped_counts}\\n\")\n", + "print(f\"Proportion of each unique value in column: {sex_ped_counts_normalized}\\n\")\n", + "\n", + "plot_sex_chromosome_ploidy(samples_qc_table, updated_sex, display=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
If available: Provide the Google Cloud Storage path to your cohort's PED file via REFERENCE_ASSIGNMENTS_PED.
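For reference, the file should follow the standard six-column PED layout used when the generated PED is read back below (Family_ID, Sample_ID, Paternal_ID, Maternal_ID, Sex, Phenotype, with Sex coded 1 = male, 2 = female, 0 = unknown/other). A minimal sketch of loading and sanity-checking such a file (illustration only - `validate_ped` performs the notebook's actual checks):

```python
# Illustration: read a 6-column PED file and run lightweight sanity checks.
# The notebook's validate_ped is the authoritative check; this is only a sketch.
import pandas as pd

def sketch_check_ped(ped_path, expected_sample_ids):
    columns = ['Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype']
    ped = pd.read_table(ped_path, names=columns, dtype=str)
    missing = set(expected_sample_ids) - set(ped['Sample_ID'])
    unexpected_sex = ped[~ped['Sex'].isin(['0', '1', '2'])]
    print(f"samples missing from the PED file: {len(missing)}")
    print(f"rows with unexpected Sex values: {len(unexpected_sex)}")
    return ped

# Example call (hypothetical path):
# sketch_check_ped('cohort.ped', set(samples_qc_table['sample_id']))
```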
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REFERENCE_ASSIGNMENTS_PED = None\n", + "\n", + "if REFERENCE_ASSIGNMENTS_PED:\n", + " validate_ped(REFERENCE_ASSIGNMENTS_PED, set(samples_qc_table['sample_id']))\n", + "else: \n", + " print(\"Reference PED file is not provided - skipping this step.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reference_ped = pd.DataFrame()\n", + "\n", + "if REFERENCE_ASSIGNMENTS_PED:\n", + " reference_ped = process_reference_ped(REFERENCE_ASSIGNMENTS_PED, samples_qc_table)\n", + "else:\n", + " print(\"Reference PED file is not provided - skipping this step.\")\n", + " \n", + "reference_ped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create and display differences files using reference and computed sex assignments\n", + "differences_ped = pd.DataFrame()\n", + "\n", + "if REFERENCE_ASSIGNMENTS_PED:\n", + " file_path = generate_file_path(TLD_PATH, 'sex_analysis', \"differing_sex_assignments.tsv\")\n", + " differences_ped = create_ped_differences(reference_ped, updated_sex, file_path)\n", + "else:\n", + " print(\"Reference PED file is not provided - skipping this step.\")\n", + " \n", + "differences_ped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save and display generated PED file, and update 'cohort_ped' workspace variable to this\n", + "file_path = generate_file_path(TLD_PATH, 'sex_analysis', 'sample_qc.ped')\n", + "create_ped(updated_sex, reference_ped, file_path)\n", + "\n", + "created_ped = pd.read_table(file_path, names=['Family_ID', 'Sample_ID', 'Paternal_ID',\n", + " 'Maternal_ID', 'Sex', 'Phenotype'])\n", + "created_ped" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample Filtering\n", + "This section filters samples based on the results of the series of QC steps completed above, creating metadata files for both passing and failed samples.\n", + "\n", + "**Note**: If you ever wish to test see the samples currently being filtered at any point in the notebook's execution, simply run all cells in this section." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print summary filtering statistics\n", + "inv_samples_to_remove = invert_removed_samples(samples_to_remove)\n", + "sample_ids_to_remove = list(inv_samples_to_remove.keys())\n", + "sample_ids = samples_qc_table['sample_id'].tolist()\n", + "\n", + "print(f'Total samples: {len(sample_ids)}')\n", + "print(f'Passing samples: {len(sample_ids) - len(sample_ids_to_remove)}')\n", + "print(f'Failing samples: {len(sample_ids_to_remove)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save and display table of filtered samples\n", + "filter_df = pd.DataFrame({\n", + " 'sample_id': list(inv_samples_to_remove.keys()),\n", + " 'filters_applied': [','.join(filters) for filters in inv_samples_to_remove.values()]\n", + "})\n", + "\n", + "file_path = generate_file_path(TLD_PATH, 'filtering', 'filtered_samples.tsv')\n", + "save_df(WS_BUCKET, file_path, filter_df)\n", + "\n", + "filter_df = pd.read_table(file_path, sep='\\t')\n", + "filter_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Save and display upset plot of filtered samples\n", + "create_upset_plot(filter_df, len(inv_samples_to_remove))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save and display metadata table of passing samples\n", + "file_path = generate_file_path(TLD_PATH, 'filtering', 'passing_samples_metadata.tsv')\n", + "filter_and_save_metadata(samples_qc_table, inv_samples_to_remove, file_path)\n", + "\n", + "tsv = pd.read_csv(file_path, sep='\\t')\n", + "tsv" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "447.995px", + "width": "361.102px" + }, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "470px", + "left": "867px", + "top": "195.141px", + "width": "278px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}