Skip to content

Commit

Permalink
Reviewer comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 committed Oct 21, 2024
1 parent 767e8a2 commit 0024d24
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
"ApplyManualVariantFilter.prefix" : {{ test_batch.name | tojson }},
"ApplyManualVariantFilter.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},
"ApplyManualVariantFilter.bcftools_filter": "(SVTYPE==\"DEL\" && COUNT(ALGORITHMS)==1 && ALGORITHMS==\"wham\") || (ALT==\"<INS:ME:SVA>\" && COUNT(ALGORITHMS)==1 && ALGORITHMS==\"scramble\" && HIGH_SR_BACKGROUND==1)",
"ApplyManualVariantFilter.filter_name": "hard_filter"
"ApplyManualVariantFilter.filter_name": "high_algorithm_fp_rate"
}
59 changes: 32 additions & 27 deletions src/sv-pipeline/scripts/make_scramble_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import argparse
from collections import defaultdict
from collections import defaultdict, deque
import gzip
import logging
import os
Expand Down Expand Up @@ -101,6 +101,7 @@ def read_table(file_lines, ref_path, cluster_distance, alu_size, sva_size, l1_si
"alu": alu_size
}

# See discussion for description: https://github.com/broadinstitute/gatk-sv/discussions/734
def _calculate_svlen_one_sided(r):
if r['Clipped_Side'] == 'right':
if r['Insertion_Direction'] == 'Minus':
Expand All @@ -113,6 +114,7 @@ def _calculate_svlen_one_sided(r):
else:
return r['Stop_In_MEI']

# See discussion for description: https://github.com/broadinstitute/gatk-sv/discussions/734
def _calculate_svlen_two_sided(r1, r2):
if r1['Clipped_Side'] == 'right':
r_right = r1
Expand Down Expand Up @@ -141,8 +143,8 @@ def _record_sort_key(record, ref_sequences_index_dict):
ref_sequences_index_dict = {seq: i for i, seq in enumerate(f.references)}

# Buffer of active items within cluster_distance
buffer = list()
data = list()
buffer = deque()
data = deque()
columns = None
for line in file_lines:
if not line:
Expand All @@ -158,39 +160,43 @@ def _record_sort_key(record, ref_sequences_index_dict):
row['sides'] = 1
_cast_numeric(row, FLOAT_COLUMNS, float)
_cast_numeric(row, INT_COLUMNS, int)
new_buffer = list()
new_buffer = deque()
found_match = False
for item in buffer:
if found_match:
new_buffer.append(item)
elif item['chrom'] != row['chrom'] \
or abs(int(item['pos']) - int(row['pos'])) > cluster_distance:
item['svlen'] = _calculate_svlen_one_sided(item)
data.insert(0, item)
data.appendleft(item)
elif item['Clipped_Side'] != row['Clipped_Side'] \
and item['Insertion_Direction'] == row['Insertion_Direction']:
row['pos'] = item['pos']
row['svlen'] = _calculate_svlen_two_sided(item, row)
data.append(row)
found_match = True
else:
new_buffer.insert(0, item)
new_buffer.appendleft(item)
if not found_match:
new_buffer.insert(0, row)
new_buffer.appendleft(row)
buffer = new_buffer
for item in buffer:
item['svlen'] = _calculate_svlen_one_sided(item)
data.append(item)
# Sort records in memory
data = sorted(data, key=lambda x: _record_sort_key(record=x, ref_sequences_index_dict=ref_sequences_index_dict))
data = sorted(list(data), key=lambda x: _record_sort_key(record=x, ref_sequences_index_dict=ref_sequences_index_dict))
return pd.DataFrame(data=data)


def fails_indel_filter(record, samfile, mei_trees, indel_window, min_reads, min_ins_size, max_ins_size,
min_del_size, max_del_size):
"""
Checks if a given record lies within a reference MEI and is sufficiently close to a nearby indel,
based on local read alignments, which is a common error mode for Scramble.
based on local read alignments, which is a common error mode for Scramble. The check for indels is to count
the number of insertions and deletions occurring anywhere in the local vicinity of the breakpoint and determine
if either count exceeds the specified threshold. Note that indel read counts are tallied across the entire window,
as identifying individual indels within simple repeats is very difficult (CIGAR indels tend to be scattered). Also,
the indels are tallied over all reads overlapping the window, so the full search window is indel_window + read_length.
Args:
record: VariantRecord
Variant in question
Expand Down Expand Up @@ -250,8 +256,8 @@ def _get_num_indels(reads, min_reads, min_ins_size, max_ins_size, min_del_size,
return num_indels >= min_reads


def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_filter_trees,
indel_window, indel_reads, min_ins_size, max_ins_size, min_del_size, max_del_size):
def filter_and_write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_filter_trees,
indel_window, indel_reads, min_ins_size, max_ins_size, min_del_size, max_del_size):
"""
Iterates through calls in the given DataFrame, creating VCF variant records. Applies indel and SV deletion filters.
Writes resulting records to the VCF. All variants are genotyped as diploid/heterozygous.
Expand All @@ -269,7 +275,7 @@ def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_f
mei_trees: dict of IntervalTree
Reference MEI intervals
del_filter_trees: dict of IntervalTree
Manta/Wham deletion calls, used for filtering
Manta deletion calls, used for filtering
indel_window: int
Window in bp to look for indels around the insertion site
indel_reads: int
Expand Down Expand Up @@ -320,8 +326,7 @@ def write_vcf(vcf, samfile, df, sample_id, enable_indel_filter, mei_trees, del_f

def create_trees_from_bed_records(path, padding):
"""
Iterates through calls in the given DataFrame, creating variant records. Applies indel and SV deletion filters.
Writes resulting records to the VCF. All variants are genotyped as diploid/heterozygous.
Creates a set of interval trees (one per contig) from a bed file at the given path. Adds padding to each interval.
Args:
path: str
Path to reference MEI bed file
Expand Down Expand Up @@ -386,7 +391,7 @@ def gzip_open_read_text(path):
path: str
File path
Returns:
Method to open the file
File handle
"""
return gzip.open(path, "rt")

Expand Down Expand Up @@ -479,31 +484,31 @@ def main(argv: Optional[List[Text]] = None):

header = make_header(reference_path=arguments.reference, sample_id=arguments.sample)
logging.info("Loading Scramble table...")
sorted_bed_lines = preprocess_scramble_table(path=arguments.table)
df = read_table(file_lines=sorted_bed_lines,
bed_lines = preprocess_scramble_table(path=arguments.table)
df = read_table(file_lines=bed_lines,
ref_path=arguments.reference,
cluster_distance=arguments.cluster_distance,
alu_size=arguments.alu_size,
sva_size=arguments.sva_size,
l1_size=arguments.l1_size)
logging.info("Loading MEI bed...")
mei_trees = create_trees_from_bed_records(arguments.mei_bed, padding=arguments.mei_padding)
logging.info("Loading Manta and Wham deletions...")
logging.info("Loading Manta deletions...")
with pysam.VariantFile(arguments.manta_vcf) as f_manta:
del_filter_trees = dict()
add_del_ends_to_trees(vcf=f_manta, trees=del_filter_trees, padding=arguments.del_filter_window)
logging.info("Writing vcf...")
with pysam.VariantFile(arguments.out, "w", header=header) as vcf, \
pysam.AlignmentFile(arguments.alignments_file, reference_filename=arguments.reference) as samfile:
write_vcf(vcf=vcf, samfile=samfile, df=df, sample_id=arguments.sample,
enable_indel_filter=not arguments.disable_indel_filter,
mei_trees=mei_trees,
del_filter_trees=del_filter_trees,
indel_window=arguments.indel_window, indel_reads=arguments.indel_reads,
min_ins_size=arguments.small_ins_min_size,
max_ins_size=arguments.small_ins_max_size,
min_del_size=arguments.small_del_min_size,
max_del_size=arguments.small_del_max_size)
filter_and_write_vcf(vcf=vcf, samfile=samfile, df=df, sample_id=arguments.sample,
enable_indel_filter=not arguments.disable_indel_filter,
mei_trees=mei_trees,
del_filter_trees=del_filter_trees,
indel_window=arguments.indel_window, indel_reads=arguments.indel_reads,
min_ins_size=arguments.small_ins_min_size,
max_ins_size=arguments.small_ins_max_size,
min_del_size=arguments.small_del_min_size,
max_del_size=arguments.small_del_max_size)
pysam.tabix_index(arguments.out, preset="vcf", force=True)


Expand Down

0 comments on commit 0024d24

Please sign in to comment.