Fix file naming in GenotypeBatch.SplitVariants #712

Merged (6 commits) on Aug 19, 2024
48 changes: 22 additions & 26 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -3,74 +3,70 @@
 import logging
 
 
-def process_bed_file(input_bed, n_per_split, bca=True):
+def process_bed_file(input_bed, n_per_split, bca=True, digits=9):
     SVTYPE_FIELD = 5
     END_FIELD = 2
     START_FIELD = 1
 
-    # Check the conditions to generate prefixes for the output files
+    # Conditions for each category of variants
     condition_prefixes = {
-        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
-        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
+        'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                                            (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
+        'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
+                                            (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
        'bca': {'condition': lambda line: bca and line[SVTYPE_FIELD] not in ['DEL', 'DUP'] and not line[SVTYPE_FIELD].startswith('INS')},
        'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD].startswith('INS')}
    }
 
     # Create trackers for the current file information
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
     current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
-    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 0 for prefix in condition_prefixes.keys()}
 
     with open(input_bed, 'r') as infile:
         for line in infile:
             line = line.strip('\n').split('\t')
             # This line swaps the last two columns so the sample names are in the fifth column and SV type in the last
             line[4], line[5] = line[5], line[4]
             for prefix, conditions in condition_prefixes.items():
-                # If a line matches a condition add it to the appropriate file
+                # If a line matches a condition add it to the appropriate category
                 if conditions['condition'](line):
                     current_lines[prefix].append('\t'.join(line))
                     current_counts[prefix] += 1
-                    # If a file has met the number of records per file create a new file with the next suffix and write
-                    # the current line to that new file
+                    # If a category has the specified number of records, create a new file and write the current records
                     if current_counts[prefix] == n_per_split:
-                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
-                        output_file = f"{prefix}.{output_suffix}.bed"
+                        output_file = get_file_name(prefix, current_suffixes[prefix], digits)
                         with open(output_file, 'w') as outfile:
                             outfile.write('\n'.join(current_lines[prefix]))
-                        # Keep track of which files have been written after reaching the max number of files
+                        # Log the file name that was created
                         logging.info(f"File '{output_file}' written.")
                         # Update the tracking information
                         current_lines[prefix] = []
                         current_counts[prefix] = 0
-                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
-    # Handle the samples after files with the given number of lines per file have been written
+                        current_suffixes[prefix] = current_suffixes[prefix] + 1
+    # Handle the remaining records
     for prefix, lines in current_lines.items():
         if lines:
-            output_suffix = current_suffixes[prefix].rjust(6, 'a')
-            output_file = f"{prefix}.{output_suffix}.bed"
+            output_file = get_file_name(prefix, current_suffixes[prefix], digits)
             with open(output_file, 'w') as outfile:
                 outfile.write('\n'.join(lines))
             logging.info(f"File '{output_file}' written.")
 
 
-# Create a function to appropriately add a suffix to each corresponding file
-def increment_suffix(suffix):
-    alphabet = 'abcdefghijklmnopqrstuvwxyz'
-    if suffix == 'z' * 6:
-        raise ValueError('All possible files generated.')
-    else:
-        index = alphabet.index(suffix[0])
-        next_char = alphabet[(index + 1) % 26]
-        return next_char + suffix[1:]
+def get_file_name(prefix, suffix, digits):
+    if len(str(suffix)) > digits:
+        raise ValueError('No more files can be generated with the current naming scheme. '
+                         'Increase the digits parameter or the n parameter to proceed.')
+    return f"{prefix}.{str(suffix).zfill(digits)}.bed"
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--bed", help="Path to input bed file", required=True)
-    parser.add_argument("--n", help="number of variants per file", required=True, type=int)
+    parser.add_argument("--n", help="number of variants per output file", required=True, type=int)
     parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs",
                         action='store_true')
+    parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix")
     parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
     args = parser.parse_args()
 
@@ -79,7 +75,7 @@ def main():
     if not isinstance(numeric_level, int):
         raise ValueError('Invalid log level: %s' % log_level)
     logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
-    process_bed_file(args.bed, args.n, args.bca)
+    process_bed_file(args.bed, args.n, args.bca, args.digits)
 
 
 if __name__ == '__main__':
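For quick reference, here is a minimal, self-contained sketch (not part of the PR) of the file names the new scheme produces. `get_file_name` is copied from the diff above; the prefixes come from the script's categories, and the suffix values are hypothetical examples.

```python
# Sketch only: get_file_name is copied from the diff above; the suffix values
# below are hypothetical and chosen purely for illustration.
def get_file_name(prefix, suffix, digits):
    if len(str(suffix)) > digits:
        raise ValueError('No more files can be generated with the current naming scheme. '
                         'Increase the digits parameter or the n parameter to proceed.')
    return f"{prefix}.{str(suffix).zfill(digits)}.bed"


print(get_file_name("gt5kb", 0, 9))   # -> gt5kb.000000000.bed
print(get_file_name("lt5kb", 1, 9))   # -> lt5kb.000000001.bed
print(get_file_name("ins", 42, 9))    # -> ins.000000042.bed
```

With the default `--digits 9`, suffixes 0 through 999999999 are accepted, so up to 10**9 split files can be written per prefix before the ValueError is raised.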