Reviewer comments

broadinstitute · Oct 21, 2024 · 0024d24 · 0024d24
1 parent 767e8a2
commit 0024d24
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 28 deletions.
diff --git a/inputs/templates/test/ApplyManualVariantFilter/ApplyManualVariantFilter.json.tmpl b/inputs/templates/test/ApplyManualVariantFilter/ApplyManualVariantFilter.json.tmpl
@@ -3,5 +3,5 @@
   "ApplyManualVariantFilter.prefix" : {{ test_batch.name | tojson }},
   "ApplyManualVariantFilter.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},
   "ApplyManualVariantFilter.bcftools_filter": "(SVTYPE==\"DEL\" && COUNT(ALGORITHMS)==1 && ALGORITHMS==\"wham\") || (ALT==\"<INS:ME:SVA>\" && COUNT(ALGORITHMS)==1 && ALGORITHMS==\"scramble\" && HIGH_SR_BACKGROUND==1)",
-  "ApplyManualVariantFilter.filter_name": "hard_filter"
+  "ApplyManualVariantFilter.filter_name": "high_algorithm_fp_rate"
 }
diff --git a/src/sv-pipeline/scripts/make_scramble_vcf.py b/src/sv-pipeline/scripts/make_scramble_vcf.py
@@ -5,7 +5,7 @@
 """
 
 import argparse
-from collections import defaultdict
+from collections import defaultdict, deque
 import gzip
 import logging
 import os
@@ -101,6 +101,7 @@ def read_table(file_lines, ref_path, cluster_distance, alu_size, sva_size, l1_si
         "alu": alu_size
     }
 
+    # See discussion for description: https://github.com/broadinstitute/gatk-sv/discussions/734
     def _calculate_svlen_one_sided(r):
         if r['Clipped_Side'] == 'right':
             if r['Insertion_Direction'] == 'Minus':
@@ -113,6 +114,7 @@ def _calculate_svlen_one_sided(r):
             else:
                 return r['Stop_In_MEI']
 
+    # See discussion for description: https://github.com/broadinstitute/gatk-sv/discussions/734
     def _calculate_svlen_two_sided(r1, r2):
         if r1['Clipped_Side'] == 'right':
             r_right = r1
@@ -141,8 +143,8 @@ def _record_sort_key(record, ref_sequences_index_dict):
         ref_sequences_index_dict = {seq: i for i, seq in enumerate(f.references)}
 
     # Buffer of active items within cluster_distance
-    buffer = list()
-    data = list()
+    buffer = deque()
+    data = deque()
     columns = None
     for line in file_lines:
         if not line:
@@ -158,39 +160,43 @@ def _record_sort_key(record, ref_sequences_index_dict):
         row['sides'] = 1
         _cast_numeric(row, FLOAT_COLUMNS, float)
         _cast_numeric(row, INT_COLUMNS, int)
-        new_buffer = list()
+        new_buffer = deque()
         found_match = False
         for item in buffer:
             if found_match:
                 new_buffer.append(item)
             elif item['chrom'] != row['chrom'] \
                     or abs(int(item['pos']) - int(row['pos'])) > cluster_distance:
                 item['svlen'] = _calculate_svlen_one_sided(item)
-                data.insert(0, item)
+                data.appendleft(item)
             elif item['Clipped_Side'] != row['Clipped_Side'] \
                     and item['Insertion_Direction'] == row['Insertion_Direction']:
                 row['pos'] = item['pos']
                 row['svlen'] = _calculate_svlen_two_sided(item, row)
                 data.append(row)
                 found_match = True
             else:
-                new_buffer.insert(0, item)
+                new_buffer.appendleft(item)
         if not found_match:
-            new_buffer.insert(0, row)
+            new_buffer.appendleft(row)
         buffer = new_buffer
     for item in buffer:
         item['svlen'] = _calculate_svlen_one_sided(item)
         data.append(item)
     # Sort records in memory
-    data = sorted(data, key=lambda x: _record_sort_key(record=x, ref_sequences_index_dict=ref_sequences_index_dict))
+    data = sorted(list(data), key=lambda x: _record_sort_key(record=x, ref_sequences_index_dict=ref_sequences_index_dict))
     return pd.DataFrame(data=data)
 
 
 def fails_indel_filter(record, samfile, mei_trees, indel_window, min_reads, min_ins_size, max_ins_size,
                        min_del_size, max_del_size):
     """
     Checks if a given record lies within a reference MEI and is sufficiently close to a nearby indel,
-    based on local read alignments, which is a common error mode for Scramble.
+    based on local read alignments, which is a common error mode for Scramble. The check for indels is to count
+    the number of insertions and deletions occurring anywhere in the local vicinity of the breakpoint and determine
+    if either count exceeds the specified threshold. Note that indel read counts are tallied across the entire window,
+    as identifying individual indels within simple repeats is very difficult (CIGAR indels tend to be scattered). Also,
+    the indels are tallied over all reads overlapping the window, so the full search window is indel_window + read_length.
     Args:
         record: VariantRecord
             Variant in question
@@ -250,8 +256,8 @@ def _get_num_indels(reads, min_reads, min_ins_size, max_ins_size, min_del_size,
     return num_indels >= min_reads
 
 
-def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_filter_trees,
-              indel_window, indel_reads, min_ins_size, max_ins_size, min_del_size, max_del_size):
+def filter_and_write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_filter_trees,
+                         indel_window, indel_reads, min_ins_size, max_ins_size, min_del_size, max_del_size):
     """
     Iterates through calls in the given DataFrame, creating VCF variant records. Applies indel and SV deletion filters.
     Writes resulting records to the VCF. All variants are genotyped as diploid/heterozygous.
@@ -269,7 +275,7 @@ def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_f
         mei_trees: dict of IntervalTree
             Reference MEI intervals
         del_filter_trees: dict of IntervalTree
-            Manta/Wham deletion calls, used for filtering
+            Manta deletion calls, used for filtering
         indel_window: int
             Window in bp to look for indels around the insertion site
         indel_reads: int
@@ -320,8 +326,7 @@ def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_f
 
 def create_trees_from_bed_records(path, padding):
     """
-    Iterates through calls in the given DataFrame, creating variant records. Applies indel and SV deletion filters.
-    Writes resulting records to the VCF. All variants are genotyped as diploid/heterozygous.
+    Creates a set of interval trees (one per contig) from a bed file at the given path. Adds padding to each interval.
     Args:
         path: str
             Path to reference MEI bed file
@@ -386,7 +391,7 @@ def gzip_open_read_text(path):
         path: str
             File path
     Returns:
-        Method to open the file
+        File handle
     """
     return gzip.open(path, "rt")
 
@@ -479,31 +484,31 @@ def main(argv: Optional[List[Text]] = None):
 
     header = make_header(reference_path=arguments.reference, sample_id=arguments.sample)
     logging.info("Loading Scramble table...")
-    sorted_bed_lines = preprocess_scramble_table(path=arguments.table)
-    df = read_table(file_lines=sorted_bed_lines,
+    bed_lines = preprocess_scramble_table(path=arguments.table)
+    df = read_table(file_lines=bed_lines,
                     ref_path=arguments.reference,
                     cluster_distance=arguments.cluster_distance,
                     alu_size=arguments.alu_size,
                     sva_size=arguments.sva_size,
                     l1_size=arguments.l1_size)
     logging.info("Loading MEI bed...")
     mei_trees = create_trees_from_bed_records(arguments.mei_bed, padding=arguments.mei_padding)
-    logging.info("Loading Manta and Wham deletions...")
+    logging.info("Loading Manta deletions...")
     with pysam.VariantFile(arguments.manta_vcf) as f_manta:
         del_filter_trees = dict()
         add_del_ends_to_trees(vcf=f_manta, trees=del_filter_trees, padding=arguments.del_filter_window)
     logging.info("Writing vcf...")
     with pysam.VariantFile(arguments.out, "w", header=header) as vcf, \
             pysam.AlignmentFile(arguments.alignments_file, reference_filename=arguments.reference) as samfile:
-        write_vcf(vcf=vcf, samfile=samfile, df=df, sample_id=arguments.sample,
-                  enable_indel_filter=not arguments.disable_indel_filter,
-                  mei_trees=mei_trees,
-                  del_filter_trees=del_filter_trees,
-                  indel_window=arguments.indel_window, indel_reads=arguments.indel_reads,
-                  min_ins_size=arguments.small_ins_min_size,
-                  max_ins_size=arguments.small_ins_max_size,
-                  min_del_size=arguments.small_del_min_size,
-                  max_del_size=arguments.small_del_max_size)
+        filter_and_write_vcf(vcf=vcf, samfile=samfile, df=df, sample_id=arguments.sample,
+                             enable_indel_filter=not arguments.disable_indel_filter,
+                             mei_trees=mei_trees,
+                             del_filter_trees=del_filter_trees,
+                             indel_window=arguments.indel_window, indel_reads=arguments.indel_reads,
+                             min_ins_size=arguments.small_ins_min_size,
+                             max_ins_size=arguments.small_ins_max_size,
+                             min_del_size=arguments.small_del_min_size,
+                             max_del_size=arguments.small_del_max_size)
     pysam.tabix_index(arguments.out, preset="vcf", force=True)