From 84a0627d25ea04645283fd181d4cb7ef9ec496c9 Mon Sep 17 00:00:00 2001
From: epiercehoffman
Date: Mon, 19 Aug 2024 11:05:56 -0400
Subject: [PATCH] Fix file naming in GenotypeBatch.SplitVariants (#712)

---
 .../scripts/split_variants.py | 48 +++++++++----------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index ec0418459..0fabae4d0 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -3,15 +3,17 @@
 import logging
 
 
-def process_bed_file(input_bed, n_per_split, bca=True):
+def process_bed_file(input_bed, n_per_split, bca=True, digits=9):
     SVTYPE_FIELD = 5
     END_FIELD = 2
     START_FIELD = 1
 
-    # Check the conditions to generate prefixes for the output files
+    # Conditions for each category of variants
     condition_prefixes = {
-        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
-        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
+        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                  (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
+        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                  (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
         'bca': {'condition': lambda line: bca and line[SVTYPE_FIELD] not in ['DEL', 'DUP'] and not line[SVTYPE_FIELD].startswith('INS')},
         'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD].startswith('INS')}
     }
@@ -19,7 +21,7 @@ def process_bed_file(input_bed, n_per_split, bca=True):
     # Create trackers for the current file information
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
     current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
-    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 0 for prefix in condition_prefixes.keys()}
 
     with open(input_bed, 'r') as infile:
         for line in infile:
@@ -27,50 +29,44 @@
             # This line swaps the last two columns so the sample names are in the fifth column and SV type in the last
             line[4], line[5] = line[5], line[4]
             for prefix, conditions in condition_prefixes.items():
-                # If a line matches a condition add it to the appropriate file
+                # If a line matches a condition add it to the appropriate category
                 if conditions['condition'](line):
                     current_lines[prefix].append('\t'.join(line))
                     current_counts[prefix] += 1
-                    # If a file has met the number of records per file create a new file with the next suffix and write
-                    # the current line to that new file
+                    # If a category has the specified number of records, create a new file and write the current records
                     if current_counts[prefix] == n_per_split:
-                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
-                        output_file = f"{prefix}.{output_suffix}.bed"
+                        output_file = get_file_name(prefix, current_suffixes[prefix], digits)
                         with open(output_file, 'w') as outfile:
                             outfile.write('\n'.join(current_lines[prefix]))
-                        # Keep track of which files have been written after reaching the max number of files
+                        # Log the file name that was created
                         logging.info(f"File '{output_file}' written.")
                         # Update the tracking information
                         current_lines[prefix] = []
                         current_counts[prefix] = 0
-                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
-    # Handle the samples after files with the given number of lines per file have been written
+                        current_suffixes[prefix] = current_suffixes[prefix] + 1
+    # Handle the remaining records
     for prefix, lines in current_lines.items():
         if lines:
-            output_suffix = current_suffixes[prefix].rjust(6, 'a')
-            output_file = f"{prefix}.{output_suffix}.bed"
+            output_file = get_file_name(prefix, current_suffixes[prefix], digits)
             with open(output_file, 'w') as outfile:
                 outfile.write('\n'.join(lines))
             logging.info(f"File '{output_file}' written.")
 
 
-# Create a function to appropriately add a suffix to each corresponding file
-def increment_suffix(suffix):
-    alphabet = 'abcdefghijklmnopqrstuvwxyz'
-    if suffix == 'z' * 6:
-        raise ValueError('All possible files generated.')
-    else:
-        index = alphabet.index(suffix[0])
-        next_char = alphabet[(index + 1) % 26]
-        return next_char + suffix[1:]
+def get_file_name(prefix, suffix, digits):
+    if len(str(suffix)) > digits:
+        raise ValueError('No more files can be generated with the current naming scheme. '
+                         'Increase the digits parameter or the n parameter to proceed.')
+    return f"{prefix}.{str(suffix).zfill(digits)}.bed"
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--bed", help="Path to input bed file", required=True)
-    parser.add_argument("--n", help="number of variants per file", required=True, type=int)
+    parser.add_argument("--n", help="number of variants per output file", required=True, type=int)
     parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
+    parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix")
     parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
     args = parser.parse_args()
 
@@ -79,7 +75,7 @@ def main():
     if not isinstance(numeric_level, int):
         raise ValueError('Invalid log level: %s' % log_level)
     logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
-    process_bed_file(args.bed, args.n, args.bca)
+    process_bed_file(args.bed, args.n, args.bca, args.digits)
 
 
 if __name__ == '__main__':
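
Illustration (not part of the patch): the change above replaces the alphabetical shard suffixes produced by increment_suffix (e.g. 'lt5kb.aaaaaa.bed') with zero-padded numeric suffixes produced by get_file_name, whose width is set by the new --digits argument. A minimal sketch of the new naming behavior; the helper body is copied from the diff, while the example calls and printed names below are illustrative assumptions only:

def get_file_name(prefix, suffix, digits):
    # Refuse to emit a name once the counter no longer fits in the configured width
    if len(str(suffix)) > digits:
        raise ValueError('No more files can be generated with the current naming scheme. '
                         'Increase the digits parameter or the n parameter to proceed.')
    # Zero-pad the numeric suffix, e.g. 12 -> '000000012' when digits=9
    return f"{prefix}.{str(suffix).zfill(digits)}.bed"


# Hypothetical usage, not part of the patch:
print(get_file_name("lt5kb", 0, 9))   # lt5kb.000000000.bed
print(get_file_name("gt5kb", 12, 9))  # gt5kb.000000012.bed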