diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 035d1cc84..6c105e73c 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -198,6 +198,15 @@ workflows: tags: - /.*/ + - subclass: WDL + name: Vapor + primaryDescriptorPath: /wdl/Vapor.wdl + filters: + branches: + - main + tags: + - /.*/ + - subclass: WDL name: VisualizeCnvs primaryDescriptorPath: /wdl/VisualizeCnvs.wdl diff --git a/inputs/templates/test/Vapor/Vapor.json.tmpl b/inputs/templates/test/Vapor/Vapor.json.tmpl index b8ebfd0d6..ad343f302 100644 --- a/inputs/templates/test/Vapor/Vapor.json.tmpl +++ b/inputs/templates/test/Vapor/Vapor.json.tmpl @@ -8,7 +8,6 @@ "Vapor.ref_dict": {{ reference_resources.reference_dict | tojson }}, "Vapor.save_plots": "false", - "Vapor.prefix": {{ test_batch.example_pacbio_sample_id | tojson }}, "Vapor.sample_id": {{ test_batch.example_pacbio_sample_id | tojson }}, "Vapor.bam_or_cram_file": {{ test_batch.example_pacbio_cram | tojson }}, "Vapor.bam_or_cram_index": {{ test_batch.example_pacbio_cram_index | tojson }}, diff --git a/src/sv-pipeline/scripts/preprocess_bed_for_vapor.py b/src/sv-pipeline/scripts/preprocess_bed_for_vapor.py index 89df62fe3..d6d948148 100644 --- a/src/sv-pipeline/scripts/preprocess_bed_for_vapor.py +++ b/src/sv-pipeline/scripts/preprocess_bed_for_vapor.py @@ -33,13 +33,13 @@ def handle_header(line, columns, fields, default_num_columns, sample_to_extract) for i, name in enumerate(fields[default_num_columns:]): columns[name] = default_num_columns + i if "SVLEN" not in columns: - logging.warning("SVLEN column not found. Will not be able to add SVLEN info to INS events") + raise ValueError("SVLEN column not found in header") else: - logging.warning("Header not found. Will not be able to add SVLEN info to INS events") + raise ValueError("Header not found. Header must exist and start with #") if len(fields) >= default_num_columns: columns['samples'] = default_num_columns # if no header but extra fields, assume samples is next column if sample_to_extract is not None and "samples" not in columns: - logging.warning("Sample to extract provided but no samples column found") + raise ValueError("Sample to extract provided but no samples column found") def reformat(bed_in, bed_out, contig, sample_to_extract): diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index 9592a23ea..70a39436a 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -783,6 +783,72 @@ task SubsetVcfBySamplesList { } } +# Subset a VCF to a specific sample +task SubsetVcfToSample { + input { + File vcf + File? vcf_idx + String sample + String? outfile_name + Boolean remove_sample = false # If false (default), keep the sample If true, remove it. + Boolean remove_private_sites = true # If true (default), remove sites that are private to excluded samples. If false, keep sites even if no remaining samples are non-ref. + Boolean keep_af = true # If true (default), do not recalculate allele frequencies (AC/AF/AN) + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + String vcf_subset_filename = select_first([outfile_name, basename(vcf, ".vcf.gz") + ".subset.vcf.gz"]) + String vcf_subset_idx_filename = vcf_subset_filename + ".tbi" + + String remove_private_sites_flag = if remove_private_sites then " | bcftools view -e 'SVTYPE!=\"CNV\" && COUNT(GT=\"alt\")==0' " else "" + String keep_af_flag = if keep_af then "--no-update" else "" + String complement_flag = if remove_sample then "^" else "" + + # Disk must be scaled proportionally to the size of the VCF + Float input_size = size(vcf, "GiB") + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + (input_size * 2)), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -euo pipefail + + bcftools view \ + -s ~{complement_flag}~{sample} \ + --force-samples \ + ~{keep_af_flag} \ + ~{vcf} \ + ~{remove_private_sites_flag} \ + -O z \ + -o ~{vcf_subset_filename} + + tabix -f -p vcf ~{vcf_subset_filename} + + >>> + + output { + File vcf_subset = vcf_subset_filename + File vcf_subset_index = vcf_subset_idx_filename + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + task VcfToBed { input { diff --git a/wdl/Vapor.wdl b/wdl/Vapor.wdl index 386e8d607..22973d834 100644 --- a/wdl/Vapor.wdl +++ b/wdl/Vapor.wdl @@ -1,14 +1,17 @@ version 1.0 +import "Utils.wdl" as utils import "Structs.wdl" workflow Vapor { input { - String prefix + String sample_id File bam_or_cram_file File bam_or_cram_index - File bed_file - String sample_id + + # One of the following must be specified. May be single- or multi-sample. + File? bed_file + File? vcf_file Boolean save_plots # Control whether plots are final output @@ -21,31 +24,53 @@ workflow Vapor { String sv_base_mini_docker String sv_pipeline_docker + RuntimeAttr? runtime_attr_subset_sample + RuntimeAttr? runtime_attr_vcf_to_bed RuntimeAttr? runtime_attr_vapor - RuntimeAttr? runtime_attr_bcf2vcf - RuntimeAttr? runtime_attr_vcf2bed RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_concat_beds - RuntimeAttr? runtime_attr_LocalizeCram File? NONE_FILE_ # Create a null file - do not use this input } + # Convert vcf to bed if provided + if (defined(vcf_file) && !defined(bed_file)) { + + call utils.SubsetVcfToSample { + input: + vcf=select_first([vcf_file]), + vcf_idx=select_first([vcf_file]) + ".tbi", + sample=sample_id, + outfile_name=sample_id, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override = runtime_attr_subset_sample + } + + call utils.VcfToBed { + input: + vcf_file = SubsetVcfToSample.vcf_subset, + args = "-i SVLEN", + variant_interpretation_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_vcf_to_bed + } + + } + scatter (contig in read_lines(contigs)) { call PreprocessBedForVapor { input: - prefix = "~{prefix}.~{contig}.preprocess", + prefix = "~{sample_id}.~{contig}.preprocess", contig = contig, sample_to_extract = sample_id, - bed_file = bed_file, + bed_file = select_first([bed_file, VcfToBed.bed_output]), sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_split_vcf } call RunVaporWithCram { input: - prefix = "~{prefix}.~{contig}", + prefix = "~{sample_id}.~{contig}", contig = contig, bam_or_cram_file = bam_or_cram_file, bam_or_cram_index = bam_or_cram_index, @@ -62,7 +87,7 @@ workflow Vapor { input: shard_bed_files = RunVaporWithCram.vapor, shard_plots = RunVaporWithCram.vapor_plot, - prefix = prefix, + prefix = sample_id, sv_base_mini_docker = sv_base_mini_docker, runtime_attr_override = runtime_attr_concat_beds } @@ -101,9 +126,16 @@ task PreprocessBedForVapor { command <<< set -euo pipefail + + if [[ ~{bed_file} == *.gz ]]; then + gunzip -c ~{bed_file} > in.bed + else + cp ~{bed_file} in.bed + fi + python /opt/sv-pipeline/scripts/preprocess_bed_for_vapor.py \ --contig ~{contig} \ - --bed-in ~{bed_file} \ + --bed-in in.bed \ --bed-out ~{prefix}.bed \ ~{"-s " + sample_to_extract} >>> diff --git a/wdl/VaporBatch.wdl b/wdl/VaporBatch.wdl index 3b3389e2b..5bafd335a 100644 --- a/wdl/VaporBatch.wdl +++ b/wdl/VaporBatch.wdl @@ -21,9 +21,9 @@ workflow VaporBatch { Boolean save_plots + RuntimeAttr? runtime_attr_subset_sample + RuntimeAttr? runtime_attr_vcf_to_bed RuntimeAttr? runtime_attr_vapor - RuntimeAttr? runtime_attr_bcf2vcf - RuntimeAttr? runtime_attr_vcf2bed RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_concat_beds } @@ -31,7 +31,6 @@ workflow VaporBatch { scatter (i in range(length(bam_or_cram_files))) { call vapor_bed.Vapor { input: - prefix = samples[i], bam_or_cram_file = bam_or_cram_files[i], bam_or_cram_index = bam_or_cram_indexes[i], bed_file = bed_file, @@ -45,8 +44,8 @@ workflow VaporBatch { sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_vapor = runtime_attr_vapor, - runtime_attr_bcf2vcf = runtime_attr_bcf2vcf, - runtime_attr_vcf2bed = runtime_attr_vcf2bed, + runtime_attr_subset_sample = runtime_attr_subset_sample, + runtime_attr_vcf_to_bed = runtime_attr_vcf_to_bed, runtime_attr_split_vcf = runtime_attr_split_vcf, runtime_attr_concat_beds = runtime_attr_concat_beds }