From 84a0627d25ea04645283fd181d4cb7ef9ec496c9 Mon Sep 17 00:00:00 2001
From: epiercehoffman
Date: Mon, 19 Aug 2024 11:05:56 -0400
Subject: [PATCH] Fix file naming in GenotypeBatch.SplitVariants (#712)

---
 .../scripts/split_variants.py | 48 +++++++++----------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index ec0418459..0fabae4d0 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -3,15 +3,17 @@
 import logging
 
 
-def process_bed_file(input_bed, n_per_split, bca=True):
+def process_bed_file(input_bed, n_per_split, bca=True, digits=9):
     SVTYPE_FIELD = 5
     END_FIELD = 2
     START_FIELD = 1
 
-    # Check the conditions to generate prefixes for the output files
+    # Conditions for each category of variants
     condition_prefixes = {
-        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
-        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
+        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                  (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
+        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                  (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
         'bca': {'condition': lambda line: bca and line[SVTYPE_FIELD] not in ['DEL', 'DUP'] and not line[SVTYPE_FIELD].startswith('INS')},
         'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD].startswith('INS')}
     }
@@ -19,7 +21,7 @@ def process_bed_file(input_bed, n_per_split, bca=True):
     # Create trackers for the current file information
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
     current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
-    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 0 for prefix in condition_prefixes.keys()}
 
     with open(input_bed, 'r') as infile:
         for line in infile:
@@ -27,50 +29,44 @@
             # This line swaps the last two columns so the sample names are in the fifth column and SV type in the last
             line[4], line[5] = line[5], line[4]
             for prefix, conditions in condition_prefixes.items():
-                # If a line matches a condition add it to the appropriate file
+                # If a line matches a condition add it to the appropriate category
                 if conditions['condition'](line):
                     current_lines[prefix].append('\t'.join(line))
                     current_counts[prefix] += 1
-                    # If a file has met the number of records per file create a new file with the next suffix and write
-                    # the current line to that new file
+                    # If a category has the specified number of records, create a new file and write the current records
                     if current_counts[prefix] == n_per_split:
-                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
-                        output_file = f"{prefix}.{output_suffix}.bed"
+                        output_file = get_file_name(prefix, current_suffixes[prefix], digits)
                         with open(output_file, 'w') as outfile:
                             outfile.write('\n'.join(current_lines[prefix]))
-                        # Keep track of which files have been written after reaching the max number of files
+                        # Log the file name that was created
                         logging.info(f"File '{output_file}' written.")
                         # Update the tracking information
                         current_lines[prefix] = []
                         current_counts[prefix] = 0
-                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
-    # Handle the samples after files with the given number of lines per file have been written
+                        current_suffixes[prefix] = current_suffixes[prefix] + 1
+    # Handle the remaining records
     for prefix, lines in current_lines.items():
         if lines:
-            output_suffix = current_suffixes[prefix].rjust(6, 'a')
-            output_file = f"{prefix}.{output_suffix}.bed"
+            output_file = get_file_name(prefix, current_suffixes[prefix], digits)
             with open(output_file, 'w') as outfile:
                 outfile.write('\n'.join(lines))
             logging.info(f"File '{output_file}' written.")
 
 
-# Create a function to appropriately add a suffix to each corresponding file
-def increment_suffix(suffix):
-    alphabet = 'abcdefghijklmnopqrstuvwxyz'
-    if suffix == 'z' * 6:
-        raise ValueError('All possible files generated.')
-    else:
-        index = alphabet.index(suffix[0])
-        next_char = alphabet[(index + 1) % 26]
-        return next_char + suffix[1:]
+def get_file_name(prefix, suffix, digits):
+    if len(str(suffix)) > digits:
+        raise ValueError('No more files can be generated with the current naming scheme. '
+                         'Increase the digits parameter or the n parameter to proceed.')
+    return f"{prefix}.{str(suffix).zfill(digits)}.bed"
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--bed", help="Path to input bed file", required=True)
-    parser.add_argument("--n", help="number of variants per file", required=True, type=int)
+    parser.add_argument("--n", help="number of variants per output file", required=True, type=int)
     parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
+    parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix")
     parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
     args = parser.parse_args()
 
@@ -79,7 +75,7 @@ def main():
     if not isinstance(numeric_level, int):
         raise ValueError('Invalid log level: %s' % log_level)
     logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
-    process_bed_file(args.bed, args.n, args.bca)
+    process_bed_file(args.bed, args.n, args.bca, args.digits)
 
 
 if __name__ == '__main__':
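
Illustration (not part of the patch): the change above replaces the alphabetical shard suffixes produced by increment_suffix (e.g. 'lt5kb.aaaaaa.bed') with zero-padded numeric suffixes produced by get_file_name, whose width is set by the new --digits argument. A minimal sketch of the new naming behavior; the helper body is copied from the diff, while the example calls and printed names below are illustrative assumptions only:

def get_file_name(prefix, suffix, digits):
    # Refuse to emit a name once the counter no longer fits in the configured width
    if len(str(suffix)) > digits:
        raise ValueError('No more files can be generated with the current naming scheme. '
                         'Increase the digits parameter or the n parameter to proceed.')
    # Zero-pad the numeric suffix, e.g. 12 -> '000000012' when digits=9
    return f"{prefix}.{str(suffix).zfill(digits)}.bed"


# Hypothetical usage, not part of the patch:
print(get_file_name("lt5kb", 0, 9))   # lt5kb.000000000.bed
print(get_file_name("gt5kb", 12, 9))  # gt5kb.000000012.bed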