Remove outliers across per-contig VCFs #654

Draft: wants to merge 7 commits into main

@@ -0,0 +1,15 @@
{
"FilterOutlierSamplesFinal.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
"FilterOutlierSamplesFinal.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},

"FilterOutlierSamplesFinal.N_IQR_cutoff": "1",

"FilterOutlierSamplesFinal.autosomes_list": {{ reference_resources.autosome_file | tojson }},
"FilterOutlierSamplesFinal.contigs_list": {{ reference_resources.primary_contigs_list | tojson }},

"FilterOutlierSamplesFinal.vcfs" : {{ test_batch.combined_vcfs | tojson }},
"FilterOutlierSamplesFinal.prefix": {{ test_batch.name | tojson }},
"FilterOutlierSamplesFinal.additional_samples_to_remove": "gs://broad-dsde-methods-eph/ref_panel_1kg.to_remove.txt",
"FilterOutlierSamplesFinal.plot_counts": "true",
"FilterOutlierSamplesFinal.filter_vcf": "true"
}
167 changes: 167 additions & 0 deletions wdl/FilterOutlierSamplesFinal.wdl
@@ -0,0 +1,167 @@
version 1.0

import "Utils.wdl" as util
import "ShardedCountSVs.wdl" as count
import "PlotSVCountsPerSample.wdl" as plot

workflow FilterOutlierSamplesFinal {
input {
String prefix
Array[File] vcfs # sharded by contig, in order, allosomes last
File additional_samples_to_remove # can be empty file if no additional

Int N_IQR_cutoff
Boolean filter_vcf
Boolean plot_counts
String? bcftools_preprocessing_options # for preprocessing prior to SV counting. will not affect output VCFs

File? sv_counts_in # SV counts file - if not provided, will create
File? all_samples_to_remove # if not provided, will determine outliers to remove. if provided, will just filter vcfs

File autosomes_list
File contigs_list

String sv_pipeline_docker
String sv_base_mini_docker

}

Array[Array[String]] autosomes = read_tsv(autosomes_list)
Array[String] contigs = read_lines(contigs_list)

if (!defined(sv_counts_in)) {
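# Count SVs per sample on each autosome, sharded by VCF records, then sum the per-contig counts into a single table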
scatter ( i in range(length(autosomes)) ) {
call count.ShardedCountSVs {
input:
vcf = vcfs[i],
bcftools_preprocessing_options = bcftools_preprocessing_options,
prefix = "~{prefix}.~{autosomes[i][0]}",
sv_pipeline_docker = sv_pipeline_docker,
sv_base_mini_docker = sv_base_mini_docker
}
}

call count.SumSVCounts {
input:
counts = ShardedCountSVs.sv_counts,
prefix = "~{prefix}.all_autosomes",
sv_pipeline_docker = sv_pipeline_docker
}
}

File sv_counts_ = select_first([sv_counts_in, SumSVCounts.sv_counts_summed])

if (!defined(all_samples_to_remove) || plot_counts ) {
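# Plot per-sample SV count distributions and flag outlier samples beyond the N_IQR_cutoff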
call plot.PlotSVCountsWithCutoff {
input:
svcounts = sv_counts_,
n_iqr_cutoff = N_IQR_cutoff,
prefix = prefix,
sv_pipeline_docker = sv_pipeline_docker
}
}

if (!defined(all_samples_to_remove)) {
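# Merge the flagged outliers with additional_samples_to_remove and tally the outlier, removal, and remaining sample counts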
call util.GetSampleIdsFromVcf {
input:
vcf = vcfs[0],
sv_base_mini_docker = sv_base_mini_docker
}

call GetOutliersListAndCount {
input:
outlier_samples_with_reason = select_first([PlotSVCountsWithCutoff.outliers_list]),
cohort_samples = GetSampleIdsFromVcf.out_file,
additional_samples_to_remove = additional_samples_to_remove,
prefix = prefix,
sv_base_mini_docker = sv_base_mini_docker
}
}

File to_remove_ = select_first([all_samples_to_remove, GetOutliersListAndCount.samples_to_remove])

if (filter_vcf) {
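# Remove the selected samples from every per-contig VCF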
scatter (i in range(length(contigs))) {
call util.SubsetVcfBySamplesList {
input:
vcf = vcfs[i],
list_of_samples = to_remove_,
outfile_name = "~{prefix}.~{contigs[i]}.outliers_removed.vcf.gz",
remove_samples = true,
sv_base_mini_docker = sv_base_mini_docker
}
}
}


output {
File sv_counts = sv_counts_
File? sv_count_plots = PlotSVCountsWithCutoff.svcount_distrib_plots
File? outlier_samples_with_reason = PlotSVCountsWithCutoff.outliers_list
File? outlier_samples = GetOutliersListAndCount.outliers_list
Int? num_outliers = GetOutliersListAndCount.num_outliers
File samples_to_remove = to_remove_
Int? num_to_remove = GetOutliersListAndCount.num_to_remove
File? updated_sample_list = GetOutliersListAndCount.updated_sample_list
Int? new_sample_count = GetOutliersListAndCount.new_sample_count
Array[File]? vcfs_samples_removed = SubsetVcfBySamplesList.vcf_subset
Array[File]? vcf_indexes_samples_removed = SubsetVcfBySamplesList.vcf_subset_index
}
}


task GetOutliersListAndCount {
input {
File outlier_samples_with_reason
File cohort_samples
File additional_samples_to_remove
String prefix
String sv_base_mini_docker
RuntimeAttr? runtime_attr_override
}

RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10 + ceil(size([outlier_samples_with_reason, cohort_samples, additional_samples_to_remove], "GiB")),
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

command <<<
set -euo pipefail

# extract sample id column, remove header, deduplicate outlier samples list
cut -f1 ~{outlier_samples_with_reason} | sed '1d' | sort | uniq > ~{prefix}.SV_count_outlier_samples.txt
wc -l < ~{prefix}.SV_count_outlier_samples.txt > outlier_count.txt

# merge outliers & additional samples to remove and deduplicate
cat ~{prefix}.SV_count_outlier_samples.txt ~{additional_samples_to_remove} | sort | uniq > ~{prefix}.outliers_plus_additional_to_remove.txt
wc -l < ~{prefix}.outliers_plus_additional_to_remove.txt > removed_count.txt

# filter cohort sample list
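# note: fgrep -v exits 1 when it selects no lines (e.g. if every cohort sample is removed), so || true prevents a spurious task failure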
fgrep -wvf ~{prefix}.outliers_plus_additional_to_remove.txt ~{cohort_samples} > ~{prefix}.sample_list.filtered.txt || true
wc -l < ~{prefix}.sample_list.filtered.txt > new_count.txt
>>>

output {
Int num_outliers = read_int("outlier_count.txt")
File outliers_list = "~{prefix}.SV_count_outlier_samples.txt"
File samples_to_remove = "~{prefix}.outliers_plus_additional_to_remove.txt"
Int num_to_remove = read_int("removed_count.txt")
File updated_sample_list = "~{prefix}.sample_list.filtered.txt"
Int new_sample_count = read_int("new_count.txt")
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_base_mini_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
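
Note: PlotSVCountsWithCutoff, GetSampleIdsFromVcf, and SubsetVcfBySamplesList are imported from existing WDLs and are not part of this diff. For orientation only, the hypothetical Python below sketches the N-IQR rule that the N_IQR_cutoff input is assumed to drive: for each SV type, a sample is flagged when its count falls outside [Q1 - N*IQR, Q3 + N*IQR] of the cohort distribution.

# Hypothetical sketch, not the implementation in PlotSVCountsPerSample.wdl
import statistics

def iqr_outliers(counts, n_iqr):
    # counts: {sample: {svtype: count}}; n_iqr: the N_IQR_cutoff workflow input
    outliers = set()
    svtypes = {svtype for per_sample in counts.values() for svtype in per_sample}
    for svtype in svtypes:
        values = sorted(per_sample.get(svtype, 0) for per_sample in counts.values())
        q1, _, q3 = statistics.quantiles(values, n=4)  # cohort quartiles for this SV type
        iqr = q3 - q1
        low, high = q1 - n_iqr * iqr, q3 + n_iqr * iqr
        outliers.update(sample for sample, per_sample in counts.items()
                        if not low <= per_sample.get(svtype, 0) <= high)
    return sorted(outliers)

Smaller N_IQR_cutoff values (such as the 1 used in the test inputs above) flag more samples; larger values are more permissive.
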
117 changes: 117 additions & 0 deletions wdl/ShardedCountSVs.wdl
@@ -0,0 +1,117 @@
version 1.0

import "PlotSVCountsPerSample.wdl" as plot
import "CollectQcVcfWide.wdl" as collect
import "TasksMakeCohortVcf.wdl" as tasks

workflow ShardedCountSVs {
input {
File vcf
String prefix

String? bcftools_preprocessing_options
Int records_per_shard = 20000

String sv_pipeline_docker
String sv_base_mini_docker

}

call tasks.ScatterVcf {
input:
vcf = vcf,
records_per_shard = records_per_shard,
prefix = "~{prefix}.scatter",
sv_pipeline_docker = sv_pipeline_docker
}

scatter (i in range(length(ScatterVcf.shards))) {
if (defined(bcftools_preprocessing_options)) {
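# Optionally preprocess each shard with bcftools before counting; preprocessed shards are not emitted as outputs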
call collect.PreprocessVcf {
input:
vcf = ScatterVcf.shards[i],
bcftools_preprocessing_options=select_first([bcftools_preprocessing_options]),
prefix = "~{prefix}.shard_~{i}.preprocessed",
sv_base_mini_docker = sv_base_mini_docker
}
}

call plot.CountSVsPerSamplePerType {
input:
vcf = select_first([PreprocessVcf.outvcf, ScatterVcf.shards[i]]),
prefix = "~{prefix}.shard_~{i}",
sv_pipeline_docker = sv_pipeline_docker
}
}

call SumSVCounts {
input:
counts = CountSVsPerSamplePerType.sv_counts,
prefix = prefix,
sv_pipeline_docker = sv_pipeline_docker
}

output {
File sv_counts = SumSVCounts.sv_counts_summed
}
}

task SumSVCounts {
input {
Array[File] counts
String prefix
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}

RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10 + ceil(size(counts, "GiB")),
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

command <<<
set -euo pipefail
python <<CODE
# accumulate per-sample, per-SV-type counts across all shard count files
counts = dict()
for file_path in ["~{sep='", "' counts}"]:
    with open(file_path, 'r') as inp:
        inp.readline()  # skip header
        for line in inp:
            sample, svtype, count = line.strip().split('\t')
            sample_counts = counts.setdefault(sample, dict())
            sample_counts[svtype] = sample_counts.get(svtype, 0) + int(count)
with open("~{prefix}.sv_counts_summed.tsv", 'w') as out:
    out.write("sample\tsvtype\tcount\n")
    for sample in counts:
        for svtype in counts[sample]:
            out.write(f"{sample}\t{svtype}\t{counts[sample][svtype]}\n")
CODE

>>>

output {
File sv_counts_summed = "~{prefix}.sv_counts_summed.tsv"
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}