Commit 5a48f28: Readding preprocess vcf for vapor

kjaisingh committed Feb 4, 2025
1 parent bdb3bcc · commit 5a48f28
Showing 2 changed files with 95 additions and 3 deletions.
@@ -64,6 +64,7 @@ def __init__(self, record):
 self.length = record.info['SVLEN']
 self.cnv_gt_5kbp = (record.info['SVTYPE'] == 'DEL' or record.info['SVTYPE'] == 'DUP') and self.length >= 5000
 self.gt_50bp = self.length >= 50
+self.is_dragen = 'dragen' in record.info['ALGORITHMS']
 self.is_melt = 'melt' in record.info['ALGORITHMS']
 self.is_scramble = 'scramble' in record.info['ALGORITHMS']
 self.is_manta = 'manta' in record.info['ALGORITHMS']
@@ -164,10 +165,10 @@ def __str__(self):
 if len(sample_intersection) < 0.50 * max_freq:
     continue
 # Determine which to filter
-# Special case if one is a Manta insertion and the other is MEI, keep the MEI
-if first.is_manta and first.svtype == "INS" and second.is_mei:
+# Special case if one is a Dragen/Manta insertion and the other is MEI, keep the MEI
+if (first.is_dragen or first.is_manta) and first.svtype == "INS" and second.is_mei:
     sorted_data_list = [second, first]
-elif second.is_manta and second.svtype == "INS" and first.is_mei:
+elif (second.is_dragen or second.is_manta) and second.svtype == "INS" and first.is_mei:
     sorted_data_list = [first, second]
 else:
     # Otherwise use sorting spec
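For reference, a minimal standalone sketch of the ordering rule in the hunk above, using a hypothetical Candidate stand-in rather than the pipeline's record wrapper: when a Dragen or Manta insertion is paired with an MEI call, the pair is ordered so the MEI is kept and the insertion is filtered.

from collections import namedtuple

# Hypothetical stand-in for the pipeline's record wrapper; only the fields the rule reads.
Candidate = namedtuple("Candidate", ["name", "svtype", "is_dragen", "is_manta", "is_mei"])

def order_pair(first, second):
    """Return [keep, filter] for a duplicate pair, mirroring the special case above."""
    if (first.is_dragen or first.is_manta) and first.svtype == "INS" and second.is_mei:
        return [second, first]  # keep the MEI, filter the Dragen/Manta insertion
    elif (second.is_dragen or second.is_manta) and second.svtype == "INS" and first.is_mei:
        return [first, second]
    # Otherwise the pipeline falls back to its sorting spec (not reproduced here).
    return [first, second]

dragen_ins = Candidate("dragen_INS_1", "INS", True, False, False)
mei_call = Candidate("melt_MEI_1", "INS", False, False, True)
keep, filtered = order_pair(dragen_ins, mei_call)
print(keep.name, filtered.name)  # melt_MEI_1 dragen_INS_1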
91 changes: 91 additions & 0 deletions wdl/PreprocessVcfForVapor.wdl
@@ -0,0 +1,91 @@
version 1.0

workflow PreprocessVcfForVapor {
    input {
        String sample_id            # Sample identifier
        File vcf_path               # Path to the input VCF file
        File contigs_fai            # Path to the contigs file
        Int min_size                # Minimum size for standardization
        String sv_pipeline_docker   # Docker image path for GATK-SV
    }

    call StandardizeVcf {
        input:
            sample_id = sample_id,
            vcf_path = vcf_path,
            contigs_fai = contigs_fai,
            min_size = min_size,
            sv_pipeline_docker = sv_pipeline_docker
    }

    call Vcf2Bed {
        input:
            sample_id = sample_id,
            vcf_path = StandardizeVcf.standardized_vcf,
            sv_pipeline_docker = sv_pipeline_docker
    }

    output {
        File dragen_sr_bed = Vcf2Bed.vcf2bed_vapor
    }
}

task StandardizeVcf {
    input {
        String sample_id
        File vcf_path
        File contigs_fai
        Int min_size
        String sv_pipeline_docker
    }

    command <<<
        set -eu -o pipefail

        svtk standardize \
            --sample-names ~{sample_id} \
            --contigs ~{contigs_fai} \
            --min-size ~{min_size} \
            ~{vcf_path} \
            ~{sample_id}.std_dragen.vcf.gz \
            dragen
    >>>

    output {
        File standardized_vcf = "~{sample_id}.std_dragen.vcf.gz"
    }

    runtime {
        cpu: 1
        memory: "2 GiB"
        disks: "local-disk 2 HDD"
        docker: sv_pipeline_docker
    }
}

task Vcf2Bed {
    input {
        String sample_id
        File vcf_path
        String sv_pipeline_docker
    }

    command <<<
        set -eu -o pipefail

        svtk vcf2bed --info SVTYPE --info SVLEN ~{vcf_path} - | awk '$7 != "BND"' > ~{sample_id}.bed
    >>>

    output {
        File vcf2bed_vapor = "~{sample_id}.bed"
    }

    runtime {
        cpu: 1
        memory: "2 GiB"
        disks: "local-disk 2 HDD"
        docker: sv_pipeline_docker
    }
}
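
For running the new workflow on its own, here is a hedged sketch of an inputs JSON keyed by the workflow's declared inputs; every value below is a placeholder, not a path or image from this repository.

import json

# All values are hypothetical placeholders; substitute real files and the GATK-SV docker image.
inputs = {
    "PreprocessVcfForVapor.sample_id": "sample_001",
    "PreprocessVcfForVapor.vcf_path": "gs://my-bucket/sample_001.dragen_sv.vcf.gz",
    "PreprocessVcfForVapor.contigs_fai": "gs://my-bucket/reference/primary_contigs.fai",
    "PreprocessVcfForVapor.min_size": 50,
    "PreprocessVcfForVapor.sv_pipeline_docker": "<sv-pipeline-docker-image>",
}

with open("PreprocessVcfForVapor.inputs.json", "w") as fh:
    json.dump(inputs, fh, indent=2)

The resulting file could then be passed to a WDL runner, for example: java -jar cromwell.jar run wdl/PreprocessVcfForVapor.wdl --inputs PreprocessVcfForVapor.inputs.json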
