Integrate DRAGEN-SV into pipeline #749

Draft: wants to merge 68 commits into base: main.

Commits (68)
e823d20  Initial commit to test dockstore sync (kjaisingh, Nov 14, 2024)
6127ef1  Initial work - WIP (kjaisingh, Nov 14, 2024)
c3df02c  Merge branch 'main' into kj_dragensv_benchmarking (kjaisingh, Nov 19, 2024)
4f24d0f  Initial implementation of DragenStandardizer (kjaisingh, Nov 20, 2024)
9a7953d  Added automated sync (kjaisingh, Nov 20, 2024)
c05f593  Circumvented linting errors (kjaisingh, Nov 20, 2024)
5792480  Initialized new std_dragen file (kjaisingh, Nov 21, 2024)
06abb34  Updated WDL & standardizer to output std_dragen_vcf (kjaisingh, Nov 22, 2024)
40b7fb5  Resolved linting errors (kjaisingh, Nov 22, 2024)
9c457d1  Modified WDLs across workflows to integrate dragen (kjaisingh, Nov 25, 2024)
cce70ae  Updated WDL input params (kjaisingh, Nov 25, 2024)
9897ed0  Modified dragen_std to print (kjaisingh, Nov 26, 2024)
2fb4b7d  Modified standardizer to align with manta (kjaisingh, Nov 26, 2024)
37a3bc7  Python linting errors (kjaisingh, Nov 26, 2024)
dd0d1a5  Added MATEID indexing to drop paired mates (kjaisingh, Nov 27, 2024)
3d3133f  Added indexing for vcf's without it (kjaisingh, Nov 27, 2024)
6b4c425  Modified vapor wdl to remove unnecessary ref inputs (kjaisingh, Dec 2, 2024)
284f18f  Initial commit for PreprocessDragenVcf (kjaisingh, Dec 3, 2024)
53d0c67  Removed irrelevant inputs from vapor WDLs (kjaisingh, Dec 3, 2024)
311048a  Added OAUTH_TOKEN to localize files (kjaisingh, Dec 3, 2024)
4fa2ba2  Initial commit for CombineVcfs (kjaisingh, Dec 3, 2024)
d7f6ada  Minor differences (kjaisingh, Dec 3, 2024)
8134dc6  Modified passing of arguments to SVCluster (kjaisingh, Dec 3, 2024)
16c6641  Further formatting & naming changes (kjaisingh, Dec 3, 2024)
0327cc8  Removed /src/ from script path (kjaisingh, Dec 3, 2024)
e595de0  Modified to take in fai as well (kjaisingh, Dec 3, 2024)
5876c27  Added ref_dict to SVCluster call (kjaisingh, Dec 3, 2024)
9135045  Added index files (kjaisingh, Dec 3, 2024)
0d2c9fb  Updated combinevcf WDL syntactically (kjaisingh, Dec 4, 2024)
1ff0ad2  Added index file to output of combinevcfs (kjaisingh, Dec 4, 2024)
f2f1da4  Updated bgzip operation to explicitly use bgzip (kjaisingh, Dec 4, 2024)
9d6cbd4  Reverted previous commit - found root of problem (kjaisingh, Dec 4, 2024)
30e1cb6  Removed tabix from svconcordance (kjaisingh, Dec 4, 2024)
4a27320  Reverted changes to vapor inputs (kjaisingh, Dec 4, 2024)
06d5f1c  Removed tabix step from CombineVcfs (kjaisingh, Dec 6, 2024)
d6a13a2  Modified pesr vcfs to dynamically use all defined but depth (kjaisingh, Dec 6, 2024)
5c778d4  Updated batchsamples workflow to choose pesr vcfs dynamically (kjaisingh, Dec 6, 2024)
c6cdf40  Reverted to old version - doesn't support list comprehension (kjaisingh, Dec 6, 2024)
ed60fc0  Merge branch 'main' into kj_dragensv_benchmarking (kjaisingh, Jan 6, 2025)
aba4ba4  Minor formatting update (kjaisingh, Jan 6, 2025)
a61a342  Added inversion detection (kjaisingh, Jan 7, 2025)
9079a3b  Linting errors (kjaisingh, Jan 7, 2025)
395d519  Updated standardizer to mark inversions (kjaisingh, Jan 9, 2025)
416d459  Added INV to dragen metrics (kjaisingh, Jan 13, 2025)
ba12de1  Updated dragen standardizer (kjaisingh, Jan 13, 2025)
266734e  Temp updates - WIP (kjaisingh, Jan 15, 2025)
f4b502f  Final dragen standardizer cleanup (kjaisingh, Jan 21, 2025)
ab505fd  Fixed linting issues (kjaisingh, Jan 21, 2025)
3fb7a1b  Minor update to SVCluster to use variant prefix (kjaisingh, Jan 21, 2025)
3b77ee1  Removed sorting from preprocessing wdl (kjaisingh, Jan 24, 2025)
55bb2b6  Init commit of preprocess vcf for makegq wdl (kjaisingh, Jan 27, 2025)
6fb1393  Updated output fields in preprocessformakegq (kjaisingh, Jan 27, 2025)
06e3a6c  Minor update to file path (kjaisingh, Jan 27, 2025)
52ef8a1  Made localization an optional input (kjaisingh, Jan 28, 2025)
ea59707  Registered branch to dockstore (kjaisingh, Jan 28, 2025)
902b4e9  Additional commit for dockstore.yml (kjaisingh, Jan 28, 2025)
5ee1d60  Resolved syntax issues (kjaisingh, Jan 28, 2025)
7d4266b  Added branch to dockstore (kjaisingh, Jan 28, 2025)
fc933ac  Updated the preprocessformakegq wdl (kjaisingh, Jan 28, 2025)
1ec076b  Updated vapor wdl to localize crams and ignore readfilters (kjaisingh, Jan 28, 2025)
16e89fb  Updated standardizer (kjaisingh, Jan 28, 2025)
727634a  Removed .bam from local_bai file (kjaisingh, Jan 28, 2025)
f5ddfdd  Removed variant prefix from standardization WDL (kjaisingh, Jan 31, 2025)
9c598ad  Resolved merge conflicts (kjaisingh, Feb 4, 2025)
bdb3bcc  Removed redundant WDLs (kjaisingh, Feb 4, 2025)
5a48f28  Readding preprocess vcf for vapor (kjaisingh, Feb 4, 2025)
1959295  Added project ID for vapor WDL (kjaisingh, Feb 4, 2025)
d004742  Modified other files that reference manta to include dragen (kjaisingh, Feb 5, 2025)
8 changes: 8 additions & 0 deletions .github/.dockstore.yml
@@ -15,6 +15,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -33,6 +34,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -42,6 +44,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -51,6 +54,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -60,6 +64,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -78,6 +83,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -159,6 +165,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

@@ -204,6 +211,7 @@ workflows:
filters:
branches:
- main
- kj_dragensv_benchmarking
tags:
- /.*/

12 changes: 6 additions & 6 deletions scripts/notebooks/SampleQC.ipynb
@@ -581,8 +581,8 @@
" if (not mad_cutoff):\n",
" print('[WARNING] Setting MAD_CUTOFF to None results in no lower cutoff being applied.')\n",
"\n",
" if (caller and caller not in ['overall', 'manta', 'melt', 'scramble', 'scramble', 'wham']):\n",
" raise Exception(f'The value {caller} for category is invalid - it must be one of \"overall\", \"manta\", \"melt\", \"scramble\" or \"wham\".')\n",
" if (caller and caller not in ['overall', 'dragen', 'manta', 'melt', 'scramble', 'wham']):\n",
" raise Exception(f'The value {caller} for category is invalid - it must be one of \"overall\", \"dragen\", \"manta\", \"melt\", \"scramble\" or \"wham\".')\n",
"\n",
" if (caller_type and caller_type not in ['high', 'low']):\n",
" raise Exception(f'The value {caller_type} for caller type is invalid - it must be one of \"high\" or \"low\".')\n",
@@ -2281,11 +2281,11 @@
"metadata": {},
"source": [
"## Raw Caller Outliers\n",
"This series of metrics look for samples with an abnormally high or low number of raw SV calls from the three initial algorithms: Manta, Wham, and Scramble (or MELT). Higher than typical SV counts may indicate technical artifacts, while extremely low SV counts may indicate that an algorithm failed to complete. The values represent the number of times the sample was an outlier for SV counts across categories defined by algorithm, SV type, and chromosome. \n",
"This series of metrics looks for samples with an abnormally high or low number of raw SV calls from the initial callers: Dragen, Manta, Wham, and Scramble (or MELT). Higher than typical SV counts may indicate technical artifacts, while extremely low SV counts may indicate that an algorithm failed to complete. The values represent the number of times the sample was an outlier for SV counts across categories defined by algorithm, SV type, and chromosome. \n",
"\n",
"**Note**: \n",
"In the sections below, there are two additional parameters that have not yet been covered.\n",
"- `CALLER`: The caller for which to analyze results. This must be one of `['overall', 'manta', 'melt', 'scramble', 'wham']`, where 'overall' corresponds to the sum of outlier occurrences across the individual callers.\n",
"- `CALLER`: The caller for which to analyze results. This must be one of `['overall', 'dragen', 'manta', 'melt', 'scramble', 'wham']`, where 'overall' corresponds to the sum of outlier occurrences across the individual callers.\n",
"- `TYPE`: The type of outliers for which to analyze results. This must be one of `['high', 'low']`, where 'high' indicates the number of cases in which the sample had more SVs than typical, while 'low' indicates the number of cases in which the sample had fewer SVs than typical. \n",
"\n",
"We recommend checking the overall high and low outliers (i.e. `CALLER = 'overall'` and `TYPE = 'high'/'low'`), but you may also examine results for individual algorithms."
@@ -2315,7 +2315,7 @@
"LINE_DEVIATIONS = None # List of integers that defines the MAD cutoff lines to draw on each histogram plot\n",
"LINE_STYLES = None # List of strings that defines the line styles of each MAD cutoff line passed above\n",
"\n",
"CALLER = 'overall' # String value that defines the caller - either 'overall', 'manta', 'melt', 'wham' or 'dragen'\n",
"CALLER = 'overall' # String value that defines the caller - either 'overall', 'dragen', 'manta', 'melt', 'scramble' or 'wham'\n",
"TYPE = 'high' # String value that defines the outlier direction - either 'high' or 'low'\n",
"\n",
"validate_qc_inputs(samples_qc_table, f\"{CALLER}_{TYPE}_outlier\", line_deviations=LINE_DEVIATIONS, \n",
@@ -2354,7 +2354,7 @@
"LOG_SCALE = False # Boolean value that defines whether to log-scale the plot\n",
"METHOD = 'hard' # String value that defines the cutoff method to use - either 'MAD' or 'hard'\n",
"\n",
"CALLER = 'overall' # String value that defines the caller - either 'overall', 'manta', 'melt', 'wham' or 'dragen'\n",
"CALLER = 'overall' # String value that defines the caller - either 'overall', 'dragen', 'manta', 'melt', 'scramble' or 'wham'\n",
"TYPE = 'high' # String value that defines the outlier direction - either 'high' or 'low'\n",
"\n",
"UPPER_CUTOFF = None # Numeric value that defines the upper threshold if METHOD = 'hard'\n",
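The MAD-based high/low outlier counting that these notebook cells parameterize (`MAD_CUTOFF`, `TYPE = 'high'/'low'`) can be sketched as below. This is an illustrative sketch only, not the notebook's actual implementation; the function name, sample IDs, and counts are all hypothetical.

```python
import statistics

def flag_outliers(sv_counts, mad_cutoff=5.0):
    """Illustrative MAD-based outlier flagging for per-sample SV counts.

    Returns (high, low): samples whose count deviates from the cohort
    median by more than `mad_cutoff` median absolute deviations (MAD).
    """
    values = list(sv_counts.values())
    med = statistics.median(values)
    mad = statistics.median(abs(v - med) for v in values)
    high = [s for s, v in sv_counts.items() if v > med + mad_cutoff * mad]
    low = [s for s, v in sv_counts.items() if v < med - mad_cutoff * mad]
    return high, low

# Hypothetical per-sample raw SV counts for one caller/SVTYPE/chromosome bin
counts = {"s1": 100, "s2": 105, "s3": 98, "s4": 102, "s5": 400, "s6": 101}
high, low = flag_outliers(counts)  # s5 is far above the cohort median
```

In the notebook, each time a sample lands in a high or low list for some algorithm/SV type/chromosome category, its per-caller outlier count is incremented.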
1 change: 1 addition & 0 deletions src/denovo/denovo_svs.py
@@ -772,6 +772,7 @@ def main():
print("Took %f seconds to process" % delta)

# Filter out INS that are manta or melt only and are SR only, have GQ=0, and FILTER contains 'HIGH_SR_BACKGROUND'
# TODO: Do I also update this to reference Dragen?
verbose_print('Filtering out INS that are manta or melt only and SR only, with GQ=0 and FILTER contains HIGH_SR_BACKGROUND', verbose)
start = time.time()
remove_ins = bed_child[(bed_child['SVTYPE'] == 'INS') & ((bed_child['ALGORITHMS'] == 'manta') | (bed_child['ALGORITHMS'] == 'melt')) & (bed_child['EVIDENCE_FIX'] == 'SR') & ((bed_child['GQ'] == '0') | (bed_child.FILTER.str.contains('HIGH_SR_BACKGROUND')))]['name_famid'].to_list()
@@ -64,6 +64,7 @@ def __init__(self, record):
self.length = record.info['SVLEN']
self.cnv_gt_5kbp = (record.info['SVTYPE'] == 'DEL' or record.info['SVTYPE'] == 'DUP') and self.length >= 5000
self.gt_50bp = self.length >= 50
self.is_dragen = 'dragen' in record.info['ALGORITHMS']
self.is_melt = 'melt' in record.info['ALGORITHMS']
self.is_scramble = 'scramble' in record.info['ALGORITHMS']
self.is_manta = 'manta' in record.info['ALGORITHMS']
@@ -164,10 +165,10 @@ def __str__(self):
if len(sample_intersection) < 0.50 * max_freq:
continue
# Determine which to filter
# Special case if one is a Manta insertion and the other is MEI, keep the MEI
if first.is_manta and first.svtype == "INS" and second.is_mei:
# Special case if one is a Dragen/Manta insertion and the other is MEI, keep the MEI
if (first.is_dragen or first.is_manta) and first.svtype == "INS" and second.is_mei:
sorted_data_list = [second, first]
elif second.is_manta and second.svtype == "INS" and first.is_mei:
elif (second.is_dragen or second.is_manta) and second.svtype == "INS" and first.is_mei:
sorted_data_list = [first, second]
else:
# Otherwise use sorting spec
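The Dragen/Manta-INS-versus-MEI preference added in this hunk can be sketched with plain stand-in objects. The class and function names here are illustrative, not the pipeline's actual record wrapper or sorting spec.

```python
from dataclasses import dataclass

@dataclass
class Call:
    # Minimal stand-in for a clustered SV record's caller/type flags
    svtype: str
    is_dragen: bool = False
    is_manta: bool = False
    is_mei: bool = False

def order_duplicates(first, second):
    """Return [keep, drop]: prefer the MEI call over a Dragen/Manta INS."""
    if (first.is_dragen or first.is_manta) and first.svtype == "INS" and second.is_mei:
        return [second, first]
    if (second.is_dragen or second.is_manta) and second.svtype == "INS" and first.is_mei:
        return [first, second]
    return None  # neither special case applies; fall back to the sorting spec

ins_call = Call(svtype="INS", is_dragen=True)
mei_call = Call(svtype="INS", is_mei=True)
kept, dropped = order_duplicates(ins_call, mei_call)  # MEI is kept
```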
@@ -65,7 +65,7 @@ def overlap_pass(phase1, pilot, fout, dist=300, frac=0.1, prefix="SSC_merged"):
sources = get_sources(fout.header)

# Helper for testing if SVRecord has pe/sr support
pesr_sources = set('delly lumpy manta wham'.split())
pesr_sources = set('delly dragen lumpy manta wham'.split())

def _has_pesr(record):
sources = set(record.record.info['SOURCES'])
43 changes: 33 additions & 10 deletions src/sv-pipeline/scripts/make_evidence_qc_table.py
@@ -130,13 +130,15 @@ def read_outlier(filename: str, outlier_col_label: str) -> pd.DataFrame:
return outlier_df


def read_all_outlier(outlier_manta_df: pd.DataFrame, outlier_melt_df: pd.DataFrame, outlier_wham_df: pd.DataFrame, outlier_scramble_df: pd.DataFrame, outlier_type: str) -> pd.DataFrame:
def read_all_outlier(outlier_manta_df: pd.DataFrame, outlier_melt_df: pd.DataFrame, outlier_wham_df: pd.DataFrame,
outlier_scramble_df: pd.DataFrame, outlier_dragen_df: pd.DataFrame, outlier_type: str) -> pd.DataFrame:
"""
Args:
outlier_manta_df: Outliers determined in EvidenceQC for Manta.
outlier_melt_df: Outliers determined in EvidenceQC for MELT.
outlier_wham_df: Outliers determined in EvidenceQC for Wham.
outlier_scramble_df: Outliers determined in EvidenceQC for Scramble
outlier_scramble_df: Outliers determined in EvidenceQC for Scramble.
outlier_dragen_df: Outliers determined in EvidenceQC for Dragen.
outlier_type: high or low. Determined in EvidenceQC for each caller.
Returns:
The total number of times that a sample appears as an outlier
@@ -158,8 +160,12 @@ def read_all_outlier(outlier_manta_df: pd.DataFrame, outlier_melt_df: pd.DataFra
col_name = get_col_name("scramble", outlier_type)
dict_scramble = dict(zip(outlier_scramble_df[ID_COL], outlier_scramble_df[col_name]))

# Dragen:
col_name = get_col_name("dragen", outlier_type)
dict_dragen = dict(zip(outlier_dragen_df[ID_COL], outlier_dragen_df[col_name]))

# merging all the dictionaries
outlier_dicts = [dict_manta, dict_melt, dict_wham, dict_scramble]
outlier_dicts = [dict_manta, dict_melt, dict_wham, dict_scramble, dict_dragen]
merged_dicts = Counter()
for counted in outlier_dicts:
merged_dicts.update(counted)
@@ -182,10 +188,12 @@ def merge_evidence_qc_table(
filename_high_melt: str,
filename_high_wham: str,
filename_high_scramble: str,
filename_high_dragen: str,
filename_low_manta: str,
filename_low_melt: str,
filename_low_wham: str,
filename_low_scramble: str,
filename_low_dragen: str,
filename_melt_insert_size: str,
output_prefix: str) -> None:
"""
@@ -201,23 +209,28 @@
df_melt_high_outlier = read_outlier(filename_high_melt, get_col_name("melt", "high"))
df_wham_high_outlier = read_outlier(filename_high_wham, get_col_name("wham", "high"))
df_scramble_high_outlier = read_outlier(filename_high_scramble, get_col_name("scramble", "high"))
df_total_high_outliers = read_all_outlier(df_manta_high_outlier, df_melt_high_outlier, df_wham_high_outlier, df_scramble_high_outlier, "high")
df_dragen_high_outlier = read_outlier(filename_high_dragen, get_col_name("dragen", "high"))
df_total_high_outliers = read_all_outlier(df_manta_high_outlier, df_melt_high_outlier, df_wham_high_outlier,
df_scramble_high_outlier, df_dragen_high_outlier, "high")
df_manta_low_outlier = read_outlier(filename_low_manta, get_col_name("manta", "low"))
df_melt_low_outlier = read_outlier(filename_low_melt, get_col_name("melt", "low"))
df_wham_low_outlier = read_outlier(filename_low_wham, get_col_name("wham", "low"))
df_scramble_low_outlier = read_outlier(filename_low_scramble, get_col_name("scramble", "low"))
df_total_low_outliers = read_all_outlier(df_manta_low_outlier, df_melt_low_outlier, df_wham_low_outlier, df_scramble_low_outlier, "low")
df_dragen_low_outlier = read_outlier(filename_low_dragen, get_col_name("dragen", "low"))
df_total_low_outliers = read_all_outlier(df_manta_low_outlier, df_melt_low_outlier, df_wham_low_outlier,
df_scramble_low_outlier, df_dragen_low_outlier, "low")
df_melt_insert_size = read_melt_insert_size(filename_melt_insert_size)

# outlier column names
callers = ["wham", "melt", "manta", "scramble", "overall"]
callers = ["wham", "melt", "manta", "scramble", "dragen", "overall"]
types = ["high", "low"]
outlier_cols = [get_col_name(caller, type) for caller in callers for type in types]

# all data frames
dfs = [df_ploidy, df_sex_assignments, df_bincov_median, df_wgd_scores, df_non_diploid,
df_manta_high_outlier, df_melt_high_outlier, df_wham_high_outlier, df_scramble_high_outlier, df_total_high_outliers,
df_manta_low_outlier, df_melt_low_outlier, df_wham_low_outlier, df_scramble_low_outlier, df_total_low_outliers,
df_manta_high_outlier, df_melt_high_outlier, df_wham_high_outlier, df_scramble_high_outlier,
df_dragen_high_outlier, df_total_high_outliers, df_manta_low_outlier, df_melt_low_outlier,
df_wham_low_outlier, df_scramble_low_outlier, df_dragen_low_outlier, df_total_low_outliers,
df_melt_insert_size]
for df in dfs:
df[ID_COL] = df[ID_COL].astype(object)
@@ -263,6 +276,14 @@ def main():
"-w", "--wham-qc-outlier-high-filename",
help="Sets the filename containing Wham QC outlier high.")

parser.add_argument(
"-t", "--scramble-qc-outlier-high-filename",
help="Sets the filename containing Scramble QC outlier high.")

parser.add_argument(
"-i", "--dragen-qc-outlier-high-filename",
help="Sets the filename containing Dragen QC outlier high.")

parser.add_argument(
"-a", "--manta-qc-outlier-low-filename",
help="Sets the filename containing Manta QC outlier low.")
@@ -280,8 +301,8 @@
help="Sets the filename containing Scramble QC outlier low.")

parser.add_argument(
"-t", "--scramble-qc-outlier-high-filename",
help="Sets the filename containing Scramble QC outlier high.")
"-j", "--dragen-qc-outlier-low-filename",
help="Sets the filename containing Dragen QC outlier low.")

parser.add_argument(
"-m", "--melt-insert-size-filename",
@@ -307,10 +328,12 @@
args.melt_qc_outlier_high_filename,
args.wham_qc_outlier_high_filename,
args.scramble_qc_outlier_high_filename,
args.dragen_qc_outlier_high_filename,
args.manta_qc_outlier_low_filename,
args.melt_qc_outlier_low_filename,
args.wham_qc_outlier_low_filename,
args.scramble_qc_outlier_low_filename,
args.dragen_qc_outlier_low_filename,
args.melt_insert_size_filename,
args.output_prefix)

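The "overall" outlier counts in `read_all_outlier` are built by summing the per-caller dictionaries with `collections.Counter`, as the hunk above shows. A minimal sketch of that merge, with made-up sample IDs and counts:

```python
from collections import Counter

# Hypothetical per-caller outlier counts keyed by sample ID
dict_manta = {"sample_A": 2, "sample_B": 1}
dict_wham = {"sample_A": 1}
dict_dragen = {"sample_A": 3, "sample_C": 2}

# Counter.update adds values for keys shared across dictionaries,
# so the merged counter is the per-sample sum across callers.
merged = Counter()
for per_caller in (dict_manta, dict_wham, dict_dragen):
    merged.update(per_caller)

# sample_A was an outlier 6 times across the callers
```

Samples missing from a caller's dictionary simply contribute nothing for that caller, which is why `Counter` is a good fit here.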
1 change: 1 addition & 0 deletions src/sv-pipeline/scripts/make_scramble_vcf.py
@@ -493,6 +493,7 @@ def main(argv: Optional[List[Text]] = None):
l1_size=arguments.l1_size)
logging.info("Loading MEI bed...")
mei_trees = create_trees_from_bed_records(arguments.mei_bed, padding=arguments.mei_padding)
# TODO: Do I also update this to reference Dragen?
logging.info("Loading Manta deletions...")
with pysam.VariantFile(arguments.manta_vcf) as f_manta:
del_filter_trees = dict()
2 changes: 1 addition & 1 deletion src/svtk/svtk/cli/standardize_vcf.py
@@ -30,7 +30,7 @@ def main(argv):
parser.add_argument('vcf', help='Raw VCF.')
parser.add_argument('fout', help='Standardized VCF.')
parser.add_argument('source', help='Source algorithm. '
'[delly,lumpy,manta,wham,melt,scramble]')
'[delly,lumpy,manta,wham,melt,scramble,dragen]')
parser.add_argument('-p', '--prefix', help='If provided, variant names '
'will be overwritten with this prefix.')
parser.add_argument('--include-reference-sites', action='store_true',
Expand Down
1 change: 1 addition & 0 deletions src/svtk/svtk/standardize/__init__.py
@@ -5,4 +5,5 @@
from .std_manta import MantaStandardizer
from .std_melt import MeltStandardizer
from .std_scramble import ScrambleStandardizer
from .std_dragen import DragenStandardizer
from .std_smoove import SmooveStandardizer