Skip to content

Commit

Permalink
Fix file naming in GenotypeBatch.SplitVariants (#712)
Browse files Browse the repository at this point in the history
  • Loading branch information
epiercehoffman authored Aug 19, 2024
1 parent c05620a commit 84a0627
Showing 1 changed file with 22 additions and 26 deletions.
48 changes: 22 additions & 26 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,74 +3,70 @@
import logging


def _write_split(prefix, index, digits, records):
    """Write one batch of records to its numbered split file and log the name.

    Args:
        prefix: Category name used as the first component of the file name.
        index: Zero-based number of this file within its category.
        digits: Width of the zero-padded numeric suffix.
        records: Tab-joined record strings to write, one per line.

    Raises:
        ValueError: Propagated from get_file_name when ``index`` needs more
            than ``digits`` characters.
    """
    output_file = get_file_name(prefix, index, digits)
    with open(output_file, 'w') as outfile:
        # Records are newline-joined, so the file has no trailing newline.
        outfile.write('\n'.join(records))
    logging.info("File '%s' written.", output_file)


def process_bed_file(input_bed, n_per_split, bca=True, digits=9):
    """Split a tab-separated BED file into per-category, fixed-size files.

    Each input line has columns 5 and 6 swapped, is then classified, and is
    appended to the pending batch of the category it matches:

    * ``gt5kb``: SV type DEL or DUP with ``end - start >= 5000``
    * ``lt5kb``: SV type DEL or DUP with ``end - start < 5000``
    * ``bca``: only if ``bca`` is true; SV type is neither DEL/DUP nor
      starts with ``INS``
    * ``ins``: only if ``bca`` is true; SV type starts with ``INS``

    Whenever a category's batch reaches ``n_per_split`` records it is written
    to ``<category>.<zero-padded index>.bed`` (relative path, so in the
    current working directory). Leftover records are written at the end.

    Args:
        input_bed: Path to the input BED file.
        n_per_split: Maximum number of records per output file; must be >= 1.
        bca: Whether to emit the ``bca`` and ``ins`` categories.
        digits: Width of the zero-padded numeric suffix in file names.

    Raises:
        ValueError: If ``n_per_split`` is less than 1, or (from
            get_file_name) if a file index no longer fits in ``digits``.
    """
    if n_per_split < 1:
        # With a non-positive value the "batch is full" test below could never
        # be true, silently disabling splitting.
        raise ValueError(f"n_per_split must be a positive integer, got {n_per_split}")

    # Column indices, valid AFTER the column swap performed below.
    SVTYPE_FIELD = 5
    END_FIELD = 2
    START_FIELD = 1

    def is_del_or_dup(fields):
        return fields[SVTYPE_FIELD] in ('DEL', 'DUP')

    def span(fields):
        return int(fields[END_FIELD]) - int(fields[START_FIELD])

    # Conditions for each category of variants. Insertion order is also the
    # order in which leftover batches are flushed at the end.
    categories = {
        'gt5kb': lambda fields: is_del_or_dup(fields) and span(fields) >= 5000,
        'lt5kb': lambda fields: is_del_or_dup(fields) and span(fields) < 5000,
        'bca': lambda fields: (bca and not is_del_or_dup(fields)
                               and not fields[SVTYPE_FIELD].startswith('INS')),
        'ins': lambda fields: bca and fields[SVTYPE_FIELD].startswith('INS'),
    }

    # Per-category state: records not yet written, and the next file index.
    pending = {prefix: [] for prefix in categories}
    next_index = {prefix: 0 for prefix in categories}

    with open(input_bed, 'r') as infile:
        for raw_line in infile:
            raw_line = raw_line.strip('\n')
            if not raw_line.strip():
                # Blank line (e.g. trailing newline at EOF): nothing to classify.
                continue
            fields = raw_line.split('\t')
            # Swap columns 5 and 6 so the sample names are in the fifth column
            # and the SV type in the sixth.
            fields[4], fields[5] = fields[5], fields[4]
            record = '\t'.join(fields)
            for prefix, matches in categories.items():
                if not matches(fields):
                    continue
                pending[prefix].append(record)
                # Once a category holds the requested number of records,
                # write them out and start a new batch with the next index.
                if len(pending[prefix]) == n_per_split:
                    _write_split(prefix, next_index[prefix], digits, pending[prefix])
                    pending[prefix] = []
                    next_index[prefix] += 1

    # Handle the remaining records (partial final batch of each category).
    for prefix, records in pending.items():
        if records:
            _write_split(prefix, next_index[prefix], digits, records)


def get_file_name(prefix, suffix, digits):
    """Return the output file name ``<prefix>.<zero-padded suffix>.bed``.

    Args:
        prefix: Category name used as the first component of the file name.
        suffix: Integer file index; it is converted with ``str`` and
            left-padded with zeros to ``digits`` characters.
        digits: Width of the numeric suffix.

    Returns:
        The file name as a string, e.g. ``gt5kb.000000003.bed``.

    Raises:
        ValueError: If ``suffix`` needs more than ``digits`` characters, i.e.
            the naming scheme has run out of distinct file names.
    """
    suffix_text = str(suffix)
    if len(suffix_text) > digits:
        raise ValueError('No more files can be generated with the current naming scheme. '
                         'Increase the digits parameter or the n parameter to proceed.')
    return f"{prefix}.{suffix_text.zfill(digits)}.bed"


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--bed", help="Path to input bed file", required=True)
parser.add_argument("--n", help="number of variants per file", required=True, type=int)
parser.add_argument("--n", help="number of variants per output file", required=True, type=int)
parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs",
action='store_true')
parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix")
parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
args = parser.parse_args()

Expand All @@ -79,7 +75,7 @@ def main():
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % log_level)
logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
process_bed_file(args.bed, args.n, args.bca)
process_bed_file(args.bed, args.n, args.bca, args.digits)


if __name__ == '__main__':
Expand Down

0 comments on commit 84a0627

Please sign in to comment.