From c19804a7417f04b4cd11cf15372d059d84a3e3d6 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Tue, 5 Mar 2024 17:33:20 -0500 Subject: [PATCH 01/11] add wdl and script for NCR, ref artifact, and zero carrier site filtering --- .../apply_ncr_and_ref_artifact_filters.py | 171 ++++++++++++++++++ wdl/ApplyNCRAndRefArtifactFilters.wdl | 33 ++++ ...ApplyNCRAndRefArtifactFiltersPerContig.wdl | 118 ++++++++++++ 3 files changed, 322 insertions(+) create mode 100644 src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py create mode 100644 wdl/ApplyNCRAndRefArtifactFilters.wdl create mode 100644 wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py new file mode 100644 index 000000000..6a93f72f8 --- /dev/null +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -0,0 +1,171 @@ +#!/bin/env python + +import argparse +import sys +import pysam +import math +from typing import List, Text, Dict, Optional + + +_HIGH_NCR_FILTER = "HIGH_NCR" +_REFERENCE_ARTIFACT_FILTER = "REFERENCE_ARTIFACT" +_REFERENCE_ARTIFACT_THRESHOLD = 0.99 + + +def _is_no_call(gt): + s = _gt_no_call_map.get(gt, None) + if s is None: + s = all([a is None for a in gt]) + _gt_no_call_map[gt] = s + return s + + +def _is_hom_ref(gt): + s = _gt_ref_map.get(gt, None) + if s is None: + s = all([a is not None and a == 0 for a in gt]) + _gt_ref_map[gt] = s + return s + + +def _is_hom_var(gt): + s = _gt_hom_var_map.get(gt, None) + if s is None: + s = all([a is not None and a > 0 for a in gt]) + _gt_hom_var_map[gt] = s + return s + + +def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifacts, remove_zero_carrier_sites): + if record.info['SVTYPE'] == 'CNV': + return True # skip checks and annotations for mCNVs due to lack of GT + + n_samples = 0 + n_no_call = 0 + n_hom_var = 0 + has_carrier = False + for s, gt in record.samples.items(): + # Count every sample where ploidy > 0 + if ploidy_dict[s][record.chrom] == 0: + continue + n_samples += 1 + # Count no-calls and hom vars + if _is_no_call(gt['GT']): + n_no_call += 1 + elif _is_hom_var(gt['GT']): + n_hom_var += 1 + has_carrier = True + elif not _is_hom_ref(gt['GT']): + has_carrier = True + + # hard filter sites with no carriers + if remove_zero_carrier_sites and not has_carrier: + return False + + # Annotate metrics and apply filters + record.info['NCN'] = n_no_call + record.info['NCR'] = n_no_call / n_samples if n_samples > 0 else None + if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: + record.filter.add(_HIGH_NCR_FILTER) + + if filter_reference_artifacts and n_hom_var / n_samples > _REFERENCE_ARTIFACT_THRESHOLD: + record.filter.add(_REFERENCE_ARTIFACT_FILTER) + + # Clean out AF metrics since they're no longer valid + for field in ['AC', 'AF']: + if field in record.info: + del record.info[field] + return True + + +def process(vcf, fout, ploidy_dict, thresholds, args): + n_samples = float(len(fout.header.samples)) + if n_samples == 0: + raise ValueError("This is a sites-only vcf") + for record in vcf: + keep = _apply_filters(record=record, + ploidy_dict=ploidy_dict, + ncr_threshold=args.ncr_threshold, + filter_reference_artifacts=args.filter_reference_artifacts, + remove_zero_carrier_sites=args.remove_zero_carrier_sites) + if keep: + fout.write(record) + + +def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: + """ + Parses tsv of sample ploidy values. + + Parameters + ---------- + path: Text + table path + + Returns + ------- + header: Dict[Text, Dict[Text, int]] + map of sample to contig to ploidy, i.e. Dict[sample][contig] = ploidy + """ + ploidy_dict = dict() + with open(path, 'r') as f: + header = f.readline().strip().split('\t') + for line in f: + tokens = line.strip().split('\t') + sample = tokens[0] + ploidy_dict[sample] = {header[i]: int(tokens[i]) for i in range(1, len(header))} + return ploidy_dict + + +def _parse_arguments(argv: List[Text]) -> argparse.Namespace: + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Apply class-specific genotype filtering using SL scores and annotates NCR", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('--vcf', type=str, help='Input vcf (defaults to stdin)') + parser.add_argument('--out', type=str, help='Output file (defaults to stdout)') + parser.add_argument('--ploidy-table', type=str, required=True, help='Ploidy table tsv') + parser.add_argument("--ncr-threshold", type=float, + help=f"If provided, adds {_HIGH_NCR_FILTER} filter to records with no-call rates equal to or " + f"exceeding this threshold") + parser.add_argument("--filter-reference-artifacts", type=bool, action='store_true', default=False, + help=f"If provided, adds {_REFERENCE_ARTIFACT_FILTER} filter to records that are hom alt in " + f">{_REFERENCE_ARTIFACT_THRESHOLD*100}% of samples>") + parser.add_argument("--remove-zero-carrier-sites", type=bool, action='store_true', default=False, + help=f"If provided, hard filters sites with zero carriers>") + if len(argv) <= 1: + parser.parse_args(["--help"]) + sys.exit(0) + parsed_arguments = parser.parse_args(argv[1:]) + return parsed_arguments + + +def main(argv: Optional[List[Text]] = None): + if argv is None: + argv = sys.argv + args = _parse_arguments(argv) + + if args.vcf is None: + vcf = pysam.VariantFile(sys.stdin) + else: + vcf = pysam.VariantFile(args.vcf) + + header = vcf.header + header.add_line('##INFO=') + header.add_line('##INFO=') + if args.ncr_threshold is not None: + header.add_line(f"##FILTER=") + if args.filter_reference_artifacts: + header.add_line(f"##FILTER=") + if args.out is None: + fout = pysam.VariantFile(sys.stdout, 'w', header=header) + else: + fout = pysam.VariantFile(args.out, 'w', header=header) + + ploidy_dict = _parse_ploidy_table(args.ploidy_table) + process(vcf, fout, ploidy_dict, args) + fout.close() + + +if __name__ == "__main__": + main() diff --git a/wdl/ApplyNCRAndRefArtifactFilters.wdl b/wdl/ApplyNCRAndRefArtifactFilters.wdl new file mode 100644 index 000000000..8264778f2 --- /dev/null +++ b/wdl/ApplyNCRAndRefArtifactFilters.wdl @@ -0,0 +1,33 @@ +version 1.0 + +import "ApplyNCRAndRefArtifactFiltersPerContig.wdl" as per_contig + +workflow ApplyNCRAndRefArtifactFilters { + input { + Array[File] vcfs + String label = "ncr_and_refartifact" + + File? apply_filters_script + + String sv_pipeline_docker + String sv_base_mini_docker + } + + scatter (vcf in vcfs) { + String base = basename(vcf, ".vcf.gz") + call per_contig.ApplyNCRAndRefArtifactFiltersPerContig { + input: + vcf = vcf, + prefix = "~{base}.~{label}", + apply_filters_script = apply_filters_script, + sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker + } + } + + + output { + Array[File] filtered_vcfs = ApplyNCRAndRefArtifactFiltersPerContig.filtered_vcf + Array[File] filtered_vcf_indexes = ApplyNCRAndRefArtifactFiltersPerContig.filtered_vcf_index + } +} \ No newline at end of file diff --git a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl new file mode 100644 index 000000000..ff4096d89 --- /dev/null +++ b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl @@ -0,0 +1,118 @@ +version 1.0 + +import "TasksMakeCohortVcf.wdl" as tasks +import "Structs.wdl" + +workflow ApplyNCRAndRefArtifactFiltersPerContig { + input { + File vcf + String prefix + + File ploidy_table + Int records_per_shard = 20000 + + Float? no_call_rate_cutoff + Boolean filter_reference_artifacts + Boolean remove_zero_carrier_sites + + File? apply_filters_script + + String sv_pipeline_docker + String sv_base_mini_docker + + RuntimeAttr? runtime_attr_scatter_vcf + RuntimeAttr? runtime_attr_apply_filters + RuntimeAttr? runtime_attr_concat_vcfs + } + + call tasks.ScatterVcf { + input: + vcf = vcf, + records_per_shard = records_per_shard, + prefix = "~{prefix}.scatter", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_scatter_vcf + } + + scatter (i in range(length(ScatterVcf.shards))) { + call ApplyFilters { + input: + vcf = ScatterVcf.shards[i], + prefix = "~{prefix}.shard_~{i}", + ploidy_table = ploidy_table, + no_call_rate_cutoff = no_call_rate_cutoff, + filter_reference_artifacts = filter_reference_artifacts, + remove_zero_carrier_sites = remove_zero_carrier_sites, + apply_filters_script = apply_filters_script, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_apply_filters + } + } + + call tasks.ConcatVcfs { + input: + vcfs = ApplyFilters.filtered_vcf, + naive = true, + outfile_prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_vcfs + } + + output { + File filtered_vcf = ConcatVcfs.concat_vcf + File filtered_vcf_index = ConcatVcfs.concat_vcf_idx + } +} + + +task ApplyFilters { + input { + File vcf + String prefix + File ploidy_table + Float? no_call_rate_cutoff + Boolean filter_reference_artifacts + Boolean remove_zero_carrier_sites + String? apply_filters_script + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: ceil(10.0 + 2 * size(vcf, "GiB")), + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + python ~{select_first([apply_filters_script, "/opt/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py"])} \ + --vcf ~{vcf} \ + --out ~{prefix}.vcf.gz \ + --ploidy-table ~{ploidy_table} \ + --ncr-threshold ~{no_call_rate_cutoff} \ + ~{if (filter_reference_artifacts) then "--filter-reference-artifacts" else ""} \ + ~{if (remove_zero_carrier_sites) then "--remove-zero-carrier-sites" else ""} + + >>> + + output { + File filtered_vcf = "~{prefix}.vcf.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + From 1827c4db9d651960c85d95d624e78e91fa7f7fbc Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Tue, 5 Mar 2024 18:10:37 -0500 Subject: [PATCH 02/11] add test json, fix script --- .../ApplyNCRAndRefArtifactFilters.json.tmpl | 12 ++++++++++++ .../scripts/apply_ncr_and_ref_artifact_filters.py | 10 ++++++---- wdl/ApplyNCRAndRefArtifactFilters.wdl | 2 ++ wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl | 4 ++-- 4 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl diff --git a/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl b/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl new file mode 100644 index 000000000..1b80037c0 --- /dev/null +++ b/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl @@ -0,0 +1,12 @@ +{ + "ApplyNCRAndRefArtifactFilters.vcfs": [ {{ test_batch.genotype_filtered_vcf | tojson }} ], + "ApplyNCRAndRefArtifactFilters.apply_filters_script": "gs://broad-dsde-methods-eph/apply_ncr_and_ref_artifact_filters.py", + "ApplyNCRAndRefArtifactFilters.ploidy_table": {{ test_batch.ploidy_table | tojson }}, + + "ApplyNCRAndRefArtifactFilters.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, + "ApplyNCRAndRefArtifactFilters.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + + "ApplyNCRAndRefArtifactFilters.ApplyNCRAndRefArtifactFiltersPerContig.no_call_rate_cutoff": 0.05, + "ApplyNCRAndRefArtifactFilters.ApplyNCRAndRefArtifactFiltersPerContig.filter_reference_artifacts": "true", + "ApplyNCRAndRefArtifactFilters.ApplyNCRAndRefArtifactFiltersPerContig.remove_zero_carrier_sites": "true" +} diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index 6a93f72f8..24d752d08 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -6,7 +6,9 @@ import math from typing import List, Text, Dict, Optional - +_gt_no_call_map = dict() +_gt_hom_var_map = dict() +_gt_ref_map = dict() _HIGH_NCR_FILTER = "HIGH_NCR" _REFERENCE_ARTIFACT_FILTER = "REFERENCE_ARTIFACT" _REFERENCE_ARTIFACT_THRESHOLD = 0.99 @@ -78,7 +80,7 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact return True -def process(vcf, fout, ploidy_dict, thresholds, args): +def process(vcf, fout, ploidy_dict, args): n_samples = float(len(fout.header.samples)) if n_samples == 0: raise ValueError("This is a sites-only vcf") @@ -128,10 +130,10 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace: parser.add_argument("--ncr-threshold", type=float, help=f"If provided, adds {_HIGH_NCR_FILTER} filter to records with no-call rates equal to or " f"exceeding this threshold") - parser.add_argument("--filter-reference-artifacts", type=bool, action='store_true', default=False, + parser.add_argument("--filter-reference-artifacts", action='store_true', default=False, help=f"If provided, adds {_REFERENCE_ARTIFACT_FILTER} filter to records that are hom alt in " f">{_REFERENCE_ARTIFACT_THRESHOLD*100}% of samples>") - parser.add_argument("--remove-zero-carrier-sites", type=bool, action='store_true', default=False, + parser.add_argument("--remove-zero-carrier-sites", action='store_true', default=False, help=f"If provided, hard filters sites with zero carriers>") if len(argv) <= 1: parser.parse_args(["--help"]) diff --git a/wdl/ApplyNCRAndRefArtifactFilters.wdl b/wdl/ApplyNCRAndRefArtifactFilters.wdl index 8264778f2..7cd602d5b 100644 --- a/wdl/ApplyNCRAndRefArtifactFilters.wdl +++ b/wdl/ApplyNCRAndRefArtifactFilters.wdl @@ -6,6 +6,7 @@ workflow ApplyNCRAndRefArtifactFilters { input { Array[File] vcfs String label = "ncr_and_refartifact" + File ploidy_table File? apply_filters_script @@ -19,6 +20,7 @@ workflow ApplyNCRAndRefArtifactFilters { input: vcf = vcf, prefix = "~{base}.~{label}", + ploidy_table = ploidy_table, apply_filters_script = apply_filters_script, sv_pipeline_docker = sv_pipeline_docker, sv_base_mini_docker = sv_base_mini_docker diff --git a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl index ff4096d89..2ef5bef3a 100644 --- a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl +++ b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl @@ -12,8 +12,8 @@ workflow ApplyNCRAndRefArtifactFiltersPerContig { Int records_per_shard = 20000 Float? no_call_rate_cutoff - Boolean filter_reference_artifacts - Boolean remove_zero_carrier_sites + Boolean filter_reference_artifacts = true + Boolean remove_zero_carrier_sites = true File? apply_filters_script From 913093df52d2f4b8680e220f4dc6a4f336656e58 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 7 Mar 2024 13:47:25 -0500 Subject: [PATCH 03/11] script should be a file not a string --- wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl index 2ef5bef3a..3c426d59c 100644 --- a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl +++ b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl @@ -73,7 +73,7 @@ task ApplyFilters { Float? no_call_rate_cutoff Boolean filter_reference_artifacts Boolean remove_zero_carrier_sites - String? apply_filters_script + File? apply_filters_script String sv_pipeline_docker RuntimeAttr? runtime_attr_override } From 00d88cfb04b94972c3255e90cd9dbc09d86f63e3 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 8 Mar 2024 13:49:09 -0500 Subject: [PATCH 04/11] when applying filter remove PASS --- src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index 24d752d08..4029ee446 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -69,9 +69,13 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact record.info['NCR'] = n_no_call / n_samples if n_samples > 0 else None if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: record.filter.add(_HIGH_NCR_FILTER) + if 'PASS' in record.filter: + record.filter.pop('PASS') if filter_reference_artifacts and n_hom_var / n_samples > _REFERENCE_ARTIFACT_THRESHOLD: record.filter.add(_REFERENCE_ARTIFACT_FILTER) + if 'PASS' in record.filter: + record.filter.pop('PASS') # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: From de7327bc6dab6222d5f80dfcdb466ea5512085e1 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 8 Mar 2024 14:13:32 -0500 Subject: [PATCH 05/11] linting --- .../scripts/apply_ncr_and_ref_artifact_filters.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index 4029ee446..05f539b75 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -3,7 +3,6 @@ import argparse import sys import pysam -import math from typing import List, Text, Dict, Optional _gt_no_call_map = dict() @@ -40,7 +39,7 @@ def _is_hom_var(gt): def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifacts, remove_zero_carrier_sites): if record.info['SVTYPE'] == 'CNV': - return True # skip checks and annotations for mCNVs due to lack of GT + return True # skip checks and annotations for mCNVs due to lack of GT n_samples = 0 n_no_call = 0 @@ -90,10 +89,10 @@ def process(vcf, fout, ploidy_dict, args): raise ValueError("This is a sites-only vcf") for record in vcf: keep = _apply_filters(record=record, - ploidy_dict=ploidy_dict, - ncr_threshold=args.ncr_threshold, - filter_reference_artifacts=args.filter_reference_artifacts, - remove_zero_carrier_sites=args.remove_zero_carrier_sites) + ploidy_dict=ploidy_dict, + ncr_threshold=args.ncr_threshold, + filter_reference_artifacts=args.filter_reference_artifacts, + remove_zero_carrier_sites=args.remove_zero_carrier_sites) if keep: fout.write(record) @@ -162,7 +161,8 @@ def main(argv: Optional[List[Text]] = None): if args.ncr_threshold is not None: header.add_line(f"##FILTER=") if args.filter_reference_artifacts: - header.add_line(f"##FILTER=") + header.add_line(f"##FILTER=") if args.out is None: fout = pysam.VariantFile(sys.stdout, 'w', header=header) else: From 211f6ff2cf174da14d254dd264d27dfe3b6dd2e7 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 8 Mar 2024 14:15:50 -0500 Subject: [PATCH 06/11] linting --- src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index 05f539b75..b1ed80434 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -137,7 +137,7 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace: help=f"If provided, adds {_REFERENCE_ARTIFACT_FILTER} filter to records that are hom alt in " f">{_REFERENCE_ARTIFACT_THRESHOLD*100}% of samples>") parser.add_argument("--remove-zero-carrier-sites", action='store_true', default=False, - help=f"If provided, hard filters sites with zero carriers>") + help="If provided, hard filters sites with zero carriers>") if len(argv) <= 1: parser.parse_args(["--help"]) sys.exit(0) From 36de33a6c8661fbe04ab6359809324388c0280c1 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Tue, 23 Apr 2024 11:27:42 -0400 Subject: [PATCH 07/11] rename variant ids, remove existing ncr annots, diploid GTs, ME DEL SVLEN --- .../ApplyNCRAndRefArtifactFilters.json.tmpl | 2 + .../apply_ncr_and_ref_artifact_filters.py | 54 +++++++++++++++++-- wdl/ApplyNCRAndRefArtifactFilters.wdl | 12 +++-- ...ApplyNCRAndRefArtifactFiltersPerContig.wdl | 7 +++ 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl b/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl index 1b80037c0..834cd5a73 100644 --- a/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl +++ b/inputs/templates/test/ApplyNCRAndRefArtifactFilters/ApplyNCRAndRefArtifactFilters.json.tmpl @@ -2,6 +2,8 @@ "ApplyNCRAndRefArtifactFilters.vcfs": [ {{ test_batch.genotype_filtered_vcf | tojson }} ], "ApplyNCRAndRefArtifactFilters.apply_filters_script": "gs://broad-dsde-methods-eph/apply_ncr_and_ref_artifact_filters.py", "ApplyNCRAndRefArtifactFilters.ploidy_table": {{ test_batch.ploidy_table | tojson }}, + "ApplyNCRAndRefArtifactFilters.primary_contigs_list": {{ reference_resources.primary_contigs_list | tojson }}, + "ApplyNCRAndRefArtifactFilters.cohort_id": {{ test_batch.name | tojson }}, "ApplyNCRAndRefArtifactFilters.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "ApplyNCRAndRefArtifactFilters.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index b1ed80434..bfb5124f4 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -11,6 +11,8 @@ _HIGH_NCR_FILTER = "HIGH_NCR" _REFERENCE_ARTIFACT_FILTER = "REFERENCE_ARTIFACT" _REFERENCE_ARTIFACT_THRESHOLD = 0.99 +SVTYPES = ["DEL", "DUP", "INS", "CNV", "INV", "CPX", "CTX", "BND"] +GT_UPDATE = {(None,):(None,None), (0,):(0,0), (1,):(0,1)} def _is_no_call(gt): @@ -38,6 +40,13 @@ def _is_hom_var(gt): def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifacts, remove_zero_carrier_sites): + # remove previous NCR annotations and filters prior to recalculating + if _HIGH_NCR_FILTER in record.filter: + del record.filter[_HIGH_NCR_FILTER] + for field in ['NCR', 'NCN']: + if field in record.info: + record.info.pop(field) + if record.info['SVTYPE'] == 'CNV': return True # skip checks and annotations for mCNVs due to lack of GT @@ -46,6 +55,10 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact n_hom_var = 0 has_carrier = False for s, gt in record.samples.items(): + # set all GTs for biallelic SVs back to diploid + if len(gt['GT']) == 1: + orig_gt = gt['GT'] + gt['GT'] = GT_UPDATE[orig_gt] # Count every sample where ploidy > 0 if ploidy_dict[s][record.chrom] == 0: continue @@ -69,12 +82,12 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: record.filter.add(_HIGH_NCR_FILTER) if 'PASS' in record.filter: - record.filter.pop('PASS') + del record.filter['PASS'] if filter_reference_artifacts and n_hom_var / n_samples > _REFERENCE_ARTIFACT_THRESHOLD: record.filter.add(_REFERENCE_ARTIFACT_FILTER) if 'PASS' in record.filter: - record.filter.pop('PASS') + del record.filter['PASS'] # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: @@ -82,8 +95,26 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact del record.info[field] return True +def _increment_counter(record, counter, prev_chrom): + if prev_chrom is not None and prev_chrom != record.chrom: + counter = {svtype: 0 for svtype in SVTYPES} + counter[record.info['SVTYPE']] += 1 + return counter, record.chrom + + +def _new_variant_id(record, counter, cohort_id, shard_str): + svtype = record.info['SVTYPE'] + return f"{cohort_id}.{svtype}_{record.chrom}_{shard_str}{str(counter[svtype])}" + def process(vcf, fout, ploidy_dict, args): + if args.cohort_id is not None: + id_map = open(f"{args.cohort_id}.vid_map.tsv", 'w') + counter = {svtype: 0 for svtype in SVTYPES} + prev_chrom = None + shard_str = "" + if args.shard_index is not None: + shard_str = f"shard{shard_index}_" n_samples = float(len(fout.header.samples)) if n_samples == 0: raise ValueError("This is a sites-only vcf") @@ -94,8 +125,21 @@ def process(vcf, fout, ploidy_dict, args): filter_reference_artifacts=args.filter_reference_artifacts, remove_zero_carrier_sites=args.remove_zero_carrier_sites) if keep: + # add SVLEN to ME DELs + if record.info['SVTYPE'] == 'DEL' and '_BND_' in record.id: + record.info['SVLEN'] = record.info['END2'] - record.start + record.stop = record.info['END2'] + record.info.pop('END2') + record.info.pop('CHR2') + # re-set all variant IDs if cohort ID is provided to be part of variant IDs + if args.cohort_id is not None: + counter, prev_chrom = _increment_counter(record, counter, prev_chrom) + new_id = _new_variant_id(record, counter, args.cohort_id, shard_str) + id_map.write(f"{record.id}\t{new_id}\n") + record.id = new_id fout.write(record) - + if args.cohort_id is not None: + id_map.close() def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: """ @@ -138,6 +182,10 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace: f">{_REFERENCE_ARTIFACT_THRESHOLD*100}% of samples>") parser.add_argument("--remove-zero-carrier-sites", action='store_true', default=False, help="If provided, hard filters sites with zero carriers>") + parser.add_argument("--cohort-id", type=str, required=False, + help="If provided, rename all variant IDs, using cohort-id as base") + parser.add_argument("--shard-index", type=str, required=False, + help="Provide if sharding within chromosomes for unique variant IDs") if len(argv) <= 1: parser.parse_args(["--help"]) sys.exit(0) diff --git a/wdl/ApplyNCRAndRefArtifactFilters.wdl b/wdl/ApplyNCRAndRefArtifactFilters.wdl index 7cd602d5b..2a5ddf52b 100644 --- a/wdl/ApplyNCRAndRefArtifactFilters.wdl +++ b/wdl/ApplyNCRAndRefArtifactFilters.wdl @@ -5,6 +5,8 @@ import "ApplyNCRAndRefArtifactFiltersPerContig.wdl" as per_contig workflow ApplyNCRAndRefArtifactFilters { input { Array[File] vcfs + File primary_contigs_list + String cohort_id String label = "ncr_and_refartifact" File ploidy_table @@ -14,12 +16,14 @@ workflow ApplyNCRAndRefArtifactFilters { String sv_base_mini_docker } - scatter (vcf in vcfs) { - String base = basename(vcf, ".vcf.gz") + Array[String] contigs = read_lines(primary_contigs_list) + + scatter (i in range(length(vcfs))) { call per_contig.ApplyNCRAndRefArtifactFiltersPerContig { input: - vcf = vcf, - prefix = "~{base}.~{label}", + vcf = vcfs[i], + prefix = "~{cohort_id}.~{label}.~{contigs[i]}", + cohort_id = cohort_id, ploidy_table = ploidy_table, apply_filters_script = apply_filters_script, sv_pipeline_docker = sv_pipeline_docker, diff --git a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl index 3c426d59c..48bc79d25 100644 --- a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl +++ b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl @@ -7,6 +7,7 @@ workflow ApplyNCRAndRefArtifactFiltersPerContig { input { File vcf String prefix + String cohort_id File ploidy_table Int records_per_shard = 20000 @@ -39,6 +40,8 @@ workflow ApplyNCRAndRefArtifactFiltersPerContig { input: vcf = ScatterVcf.shards[i], prefix = "~{prefix}.shard_~{i}", + cohort_id = cohort_id, + shard_index = i, ploidy_table = ploidy_table, no_call_rate_cutoff = no_call_rate_cutoff, filter_reference_artifacts = filter_reference_artifacts, @@ -70,6 +73,8 @@ task ApplyFilters { File vcf String prefix File ploidy_table + String cohort_id + Int shard_index Float? no_call_rate_cutoff Boolean filter_reference_artifacts Boolean remove_zero_carrier_sites @@ -96,6 +101,8 @@ task ApplyFilters { --out ~{prefix}.vcf.gz \ --ploidy-table ~{ploidy_table} \ --ncr-threshold ~{no_call_rate_cutoff} \ + --cohort-id ~{cohort_id} \ + --shard-index ~{shard_index} \ ~{if (filter_reference_artifacts) then "--filter-reference-artifacts" else ""} \ ~{if (remove_zero_carrier_sites) then "--remove-zero-carrier-sites" else ""} From aa0dc1f5983cc1093a794eabc3ad6098028ad83a Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Tue, 23 Apr 2024 11:41:15 -0400 Subject: [PATCH 08/11] linting --- .../scripts/apply_ncr_and_ref_artifact_filters.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index bfb5124f4..0a82d8492 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -12,7 +12,7 @@ _REFERENCE_ARTIFACT_FILTER = "REFERENCE_ARTIFACT" _REFERENCE_ARTIFACT_THRESHOLD = 0.99 SVTYPES = ["DEL", "DUP", "INS", "CNV", "INV", "CPX", "CTX", "BND"] -GT_UPDATE = {(None,):(None,None), (0,):(0,0), (1,):(0,1)} +GT_UPDATE = {(None,): (None, None), (0,): (0, 0), (1,): (0, 1)} def _is_no_call(gt): @@ -95,6 +95,7 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact del record.info[field] return True + def _increment_counter(record, counter, prev_chrom): if prev_chrom is not None and prev_chrom != record.chrom: counter = {svtype: 0 for svtype in SVTYPES} @@ -114,7 +115,7 @@ def process(vcf, fout, ploidy_dict, args): prev_chrom = None shard_str = "" if args.shard_index is not None: - shard_str = f"shard{shard_index}_" + shard_str = f"shard{args.shard_index}_" n_samples = float(len(fout.header.samples)) if n_samples == 0: raise ValueError("This is a sites-only vcf") @@ -141,6 +142,7 @@ def process(vcf, fout, ploidy_dict, args): if args.cohort_id is not None: id_map.close() + def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: """ Parses tsv of sample ploidy values. From 631d41955d90553e196b76450fe9585269948f79 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 26 Apr 2024 17:13:29 -0400 Subject: [PATCH 09/11] don't fix ME dels - done during additional filtering --- .../scripts/apply_ncr_and_ref_artifact_filters.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index 0a82d8492..a5461c892 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -126,12 +126,6 @@ def process(vcf, fout, ploidy_dict, args): filter_reference_artifacts=args.filter_reference_artifacts, remove_zero_carrier_sites=args.remove_zero_carrier_sites) if keep: - # add SVLEN to ME DELs - if record.info['SVTYPE'] == 'DEL' and '_BND_' in record.id: - record.info['SVLEN'] = record.info['END2'] - record.start - record.stop = record.info['END2'] - record.info.pop('END2') - record.info.pop('CHR2') # re-set all variant IDs if cohort ID is provided to be part of variant IDs if args.cohort_id is not None: counter, prev_chrom = _increment_counter(record, counter, prev_chrom) From 803451ffe3ef4e23534c7179557aca6dfbf3a5cb Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 26 Apr 2024 17:19:30 -0400 Subject: [PATCH 10/11] set filter to PASS if empty --- src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py index a5461c892..e23cfd018 100644 --- a/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py +++ b/src/sv-pipeline/scripts/apply_ncr_and_ref_artifact_filters.py @@ -89,6 +89,10 @@ def _apply_filters(record, ploidy_dict, ncr_threshold, filter_reference_artifact if 'PASS' in record.filter: del record.filter['PASS'] + # if filter is . set to PASS + if len(record.filter) == 0: + record.filter.add('PASS') + # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: if field in record.info: From 96d585f9e95c20c25c760ab7fd958674345b32f1 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Sat, 27 Apr 2024 17:10:22 -0400 Subject: [PATCH 11/11] output id rename map --- wdl/ApplyNCRAndRefArtifactFilters.wdl | 1 + wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/wdl/ApplyNCRAndRefArtifactFilters.wdl b/wdl/ApplyNCRAndRefArtifactFilters.wdl index 2a5ddf52b..b2cd62ea2 100644 --- a/wdl/ApplyNCRAndRefArtifactFilters.wdl +++ b/wdl/ApplyNCRAndRefArtifactFilters.wdl @@ -35,5 +35,6 @@ workflow ApplyNCRAndRefArtifactFilters { output { Array[File] filtered_vcfs = ApplyNCRAndRefArtifactFiltersPerContig.filtered_vcf Array[File] filtered_vcf_indexes = ApplyNCRAndRefArtifactFiltersPerContig.filtered_vcf_index + Array[File] id_rename_maps = ApplyNCRAndRefArtifactFiltersPerContig.id_rename_map } } \ No newline at end of file diff --git a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl index 48bc79d25..f7b8aa752 100644 --- a/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl +++ b/wdl/ApplyNCRAndRefArtifactFiltersPerContig.wdl @@ -61,9 +61,17 @@ workflow ApplyNCRAndRefArtifactFiltersPerContig { runtime_attr_override = runtime_attr_concat_vcfs } + call tasks.CatUncompressedFiles { + input: + shards=ApplyFilters.id_rename_map, + outfile_name="~{prefix}.id_rename_map.tsv", + sv_base_mini_docker=sv_base_mini_docker + } + output { File filtered_vcf = ConcatVcfs.concat_vcf File filtered_vcf_index = ConcatVcfs.concat_vcf_idx + File id_rename_map = CatUncompressedFiles.outfile } } @@ -110,6 +118,7 @@ task ApplyFilters { output { File filtered_vcf = "~{prefix}.vcf.gz" + File id_rename_map = "~{cohort_id}.vid_map.tsv" } runtime {