Commit

Merge branch 'main' into kj/709_deprecate_cloud_env_json
kjaisingh committed Sep 10, 2024
2 parents: 1d0147b + 16c9fdb · commit d089f45
Showing 20 changed files with 86 additions and 90 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -90,7 +90,7 @@ Sample IDs should not:

The same requirements apply to family IDs in the PED file, as well as batch IDs and the cohort ID provided as workflow inputs.

Sample IDs are provided to [GatherSampleEvidence](#gather-sample-evidence) directly and need not match sample names from the BAM/CRAM headers. `GetSampleID.wdl` can be used to fetch BAM sample IDs and also generates a set of alternate IDs that are considered safe for this pipeline; alternatively, [this script](https://github.com/talkowski-lab/gnomad_sv_v3/blob/master/sample_id/convert_sample_ids.py) transforms a list of sample IDs to fit these requirements. Currently, sample IDs can be replaced again in [GatherBatchEvidence](#gather-batch-evidence).
Sample IDs are provided to [GatherSampleEvidence](#gather-sample-evidence) directly and need not match sample names from the BAM/CRAM headers. `GetSampleID.wdl` can be used to fetch BAM sample IDs and also generates a set of alternate IDs that are considered safe for this pipeline; alternatively, [this script](https://github.com/talkowski-lab/gnomad_sv_v3/blob/master/sample_id/convert_sample_ids.py) transforms a list of sample IDs to fit these requirements. Currently, sample IDs can be replaced again in [GatherBatchEvidence](#gather-batch-evidence) - to do so, set the parameter `rename_samples = True` and provide updated sample IDs via the `samples` parameter.
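For example, a Cromwell-style inputs fragment enabling the rename might look like the following (the workflow namespace and the sample IDs are illustrative placeholders, not values from this repository; consult the workflow's input template for the exact keys):

```json
{
  "GatherBatchEvidence.rename_samples": true,
  "GatherBatchEvidence.samples": ["sample_001_safe", "sample_002_safe", "sample_003_safe"]
}
```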

The following inputs will need to be updated with the transformed sample IDs:
* Sample ID list for [GatherSampleEvidence](#gather-sample-evidence) or [GatherBatchEvidence](#gather-batch-evidence)
6 changes: 3 additions & 3 deletions dockerfiles/sv-base/Dockerfile
@@ -1,7 +1,7 @@
# This is the base dockerfile for the GATK SV pipeline that adds R, a few R packages, and GATK
ARG SAMTOOLS_CLOUD_IMAGE=samtools-cloud:latest
ARG VIRTUAL_ENV_IMAGE=sv-base-virtual-env:latest
ARG GATK_COMMIT="a33bf19dd3188af0af1bd17bce015eb20ba73227"
ARG GATK_COMMIT="64348bc9750ebf6cc473ecb8c1ced3fc66f05488"
ARG GATK_JAR="/opt/gatk.jar"
ARG R_INSTALL_PATH=/opt/R

@@ -14,8 +14,8 @@ FROM $SAMTOOLS_CLOUD_IMAGE as samtools_cloud
FROM $VIRTUAL_ENV_IMAGE as virtual_env_image
RUN rm_unneeded_r_library_files.sh

ARG GATK_BUILD_DEP="git git-lfs openjdk-8-jdk"
ARG GATK_RUN_DEP="openjdk-8-jre-headless libgomp1"
ARG GATK_BUILD_DEP="git git-lfs openjdk-17-jdk"
ARG GATK_RUN_DEP="openjdk-17-jre-headless libgomp1"
ARG GATK_COMMIT
ARG GATK_JAR
ARG DEBIAN_FRONTEND=noninteractive
@@ -166,7 +166,7 @@ Read the full EvidenceQC documentation [here](https://github.com/broadinstitute/

Read the full TrainGCNV documentation [here](https://github.com/broadinstitute/gatk-sv#gcnv-training-1).
* Before running this workflow, create the batches (~100-500 samples) you will use for the rest of the pipeline based on sample coverage, WGD score (from `02-EvidenceQC`), and PCR status. These will likely not be the same as the batches you used for `02-EvidenceQC`.
* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. If your `sample_set` contains fewer than 100 samples (not recommended), you will need to edit the `n_samples_subsample` parameter to be less than or equal to the number of samples.
* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. To modify this behavior, you can set the `n_samples_subsample` parameter to the number of samples to use for training.

#### 04-GatherBatchEvidence

10 changes: 5 additions & 5 deletions inputs/values/dockers.json
@@ -1,6 +1,6 @@
{
"name": "dockers",
"cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-06-04-v0.28.5-beta-a8dfecba",
"cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-08-27-v0.29-beta-6b27c39f",
"condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
"gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT",
"gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
@@ -10,10 +10,10 @@
"melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:a85c92f",
"scramble_docker": "us.gcr.io/broad-dsde-methods/markw/scramble:mw-scramble-99af4c50",
"samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-08-27-v0.29-beta-6b27c39f",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-08-27-v0.29-beta-6b27c39f",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-08-27-v0.29-beta-6b27c39f",
"wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7",
"igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9",
"duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9",
@@ -28,5 +28,5 @@
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/markw/sv-utils:mw-train-genotype-filtering-a9479501",
"gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a",
"str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6",
"denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-07-02-v0.28.5-beta-d9530265"
"denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-08-27-v0.29-beta-6b27c39f"
}
10 changes: 5 additions & 5 deletions inputs/values/dockers_azure.json
@@ -1,6 +1,6 @@
{
"name": "dockers",
"cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2024-06-04-v0.28.5-beta-a8dfecba",
"cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2024-08-27-v0.29-beta-6b27c39f",
"condense_counts_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432",
"gatk_docker": "vahid.azurecr.io/gatk-sv/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT",
"gatk_docker_pesr_override": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432",
@@ -10,10 +10,10 @@
"melt_docker": "vahid.azurecr.io/melt:a85c92f",
"scramble_docker": "vahid.azurecr.io/scramble:mw-scramble-99af4c50",
"samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2024-08-27-v0.29-beta-6b27c39f",
"sv_base_mini_docker": "vahid.azurecr.io/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265",
"sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265",
"sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-08-27-v0.29-beta-6b27c39f",
"sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-08-27-v0.29-beta-6b27c39f",
"wham_docker": "vahid.azurecr.io/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7",
"igv_docker": "vahid.azurecr.io/gatk-sv/igv:mw-xz-fixes-2-b1be6a9",
"duphold_docker": "vahid.azurecr.io/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9",
@@ -28,5 +28,5 @@
"sv_utils_docker": "vahid.azurecr.io/gatk-sv/sv-utils:2024-01-24-v0.28.4-beta-9debd6d7",
"gq_recalibrator_docker": "vahid.azurecr.io/markw/gatk:mw-tb-form-sv-filter-training-data-899360a",
"str": "vahid.azurecr.io/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6",
"denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-07-02-v0.28.5-beta-d9530265"
"denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-08-27-v0.29-beta-6b27c39f"
}
@@ -37,22 +37,25 @@ awk '{if ($NF~"SR") print $4}' int.bed> pass.srtest.txt
echo "step1"

# Join RD and SR genotypes and filter same as PE
cat $petrainfile|fgrep -wf pass.srtest.txt > sr.train.include.txt
cat $petrainfile \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' pass.srtest.txt - \
> sr.train.include.txt
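# Illustration only (hypothetical throwaway files, not part of the pipeline): the gawk
# expression above replaces `fgrep -wf pass.srtest.txt`. `fgrep -w` matches a listed ID as a
# word anywhere on the line, while the gawk form (ARGIND is gawk-specific) requires an exact
# match on a single field, so an ID that also appears in another column can no longer match.
#   printf 'id_7\n' > keep.txt
#   printf 'id_7\tchrA\nid_9\tid_7\n' > calls.tsv
#   fgrep -wf keep.txt calls.tsv                                          # prints both rows
#   awk -F'\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' keep.txt calls.tsv  # prints only the id_7 row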

join -j 1 -a 1 -e "2" -o 1.2 1.3 1.4 2.2 \
join -j 1 -a 1 -e "2" -o 1.2 1.3 1.4 2.2 \
<(zcat ${SR_sum} \
| fgrep -wf sr.train.include.txt \
| awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' sr.train.include.txt - \
| awk '{print $1"@"$2 "\t" $0}' \
| fgrep -wf two.sided.pass.txt \
| awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' two.sided.pass.txt - \
| sort -k1,1 ) \
<(zcat $RD_melted_genotypes|fgrep -wf sr.train.include.txt \
<(zcat $RD_melted_genotypes \
| awk 'ARGIND==1{ids[$1]; next} ($4 in ids)' sr.train.include.txt - \
| awk '{print $4"@"$5 "\t" $6}' \
| fgrep -wf two.sided.pass.txt \
| awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' two.sided.pass.txt - \
| sort -k1,1) \
| tr ' ' '\t' \
> SR.RD.merged.txt
> SR.RD.merged.txt

# Get cutoffs to filter out incorrectly label hom in R and treat combine het (1 and 3) and hom (0 and 4) copy states
# Get cutoffs to filter out incorrectly label hom in R and treat combine het (1 and 3) and hom (0 and 4) copy states
# throw out any copy state calls that have reads less than with p=0.05 away from copy state 1 or 3

het_cutoff=$(awk '{print $1"@"$2"\t" $3 "\t" $4}' SR.RD.merged.txt \
@@ -74,7 +77,7 @@ median_hom=$(awk '{if ($NF==0 || $NF==4) print $3}' SR.RD.hetfilter.merged.txt
-e 'median(d)' \
| tr '\n' '\t' \
| awk '{print $NF}')
##get std from 1 && 3 for hom restriction###
##get std from 1 && 3 for hom restriction###
sd_het=$(awk '{if ($NF==1 || $NF==3) print $3}' SR.RD.hetfilter.merged.txt \
| Rscript -e 'd<-scan("stdin", quiet=TRUE)' \
-e 'mad(d)' \
@@ -84,28 +87,28 @@ sd_het=$(awk '{if ($NF==1 || $NF==3) print $3}' SR.RD.hetfilter.merged.txt \
##Genotype SR genotype (0-ref, then estimate copy state based on copy state that is 1 sd from sd_het )##
zcat ${SR_sum} \
| awk '{print $0 "\t" $1"@"$2}' \
| fgrep -wf two.sided.pass.txt \
| awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($4 in ids)' two.sided.pass.txt - \
| cut -f1-3 \
| awk -v var=$sr_count -v var1=$median_hom -v var2=$sd_het '{if ($3<var) print $1,$2,$3,0;else if ($3<=var1-var2) print $1,$2,$3,1; else print $1,$2,$3,int($3/(var1/2)+0.5)}' \
> sr.geno.final.txt
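# How the awk above maps SR counts to copy states, with illustrative numbers
# (suppose sr_count=3, median_hom=20, sd_het=4):
#   count < 3             -> 0  (too little SR evidence; treated as reference)
#   count <= 20 - 4 = 16  -> 1  (het-level support)
#   otherwise             -> int(count/(median_hom/2) + 0.5), e.g. 18 reads -> 2, 31 reads -> 3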

zcat ${SR_sum} \
| awk '{print $0 "\t" $1"@"$2}' \
| fgrep -wvf two.sided.pass.txt \
| awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($4 in ids))' two.sided.pass.txt - \
| cut -f1-3 \
| awk '{print $1,$2,$3,0}' \
>> sr.geno.final.txt


gzip sr.geno.final.txt
gzip -f sr.geno.final.txt
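# (-f lets gzip overwrite an existing sr.geno.final.txt.gz instead of failing, e.g. on a task retry)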

zcat ${SR_sum} \
| awk '{print $0 "\t" $1"@"$2}' \
| cut -f1-3 \
| awk -v var=$sr_count -v var1=$median_hom -v var2=$sd_het '{if ($3<var) print $1,$2,$3,0;else if ($3<=var1-var2) print $1,$2,$3,1; else print $1,$2,$3,int($3/(var1/2)+0.5)}' \
| gzip \
> sr.geno.final.oneside.txt.gz

echo "step3"
##filter by quality of site by looking at % of calls with ##
##Allow just one side##
@@ -140,15 +143,18 @@ echo "step4"
##pull out cnvs gt1kb and not located on x or y##
zcat $RD_melted_genotypes|egrep -v "^X|^Y"|awk '{if ($3-$2>=1000) print $4}'|sort -u>idsgt1kb.txt

awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \
<(zcat $RD_melted_genotypes|awk -F'\t' -v OFS='\t' '{if ($6!=2) print $4,$5}') \
> nonref_rd.txt

zcat $pegenotypes \
|fgrep -wf <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \
|awk '{if ($NF>0) print $1"@"$2}' \
|cat - <(fgrep -wf <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \
<(zcat $RD_melted_genotypes|awk '{if ($6!=2) print $4"@"$5}')) \
|fgrep -wf idsgt1kb.txt \
|fgrep -wf pass.srtest.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) - \
|awk -F'\t' -v OFS='\t' '{if ($NF>0) print $1,$2}' \
|cat - nonref_rd.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' idsgt1kb.txt - \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' pass.srtest.txt - \
|sort -u \
|tr '\t' '@' \
>pass.pe_rd.txt
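# Net effect of the block above: pass.pe_rd.txt lists variant@sample pairs for variants in
# sr.final.ids.oneside.txt that have PE support or a non-reference RD genotype, are among the
# >=1kb autosomal variants (idsgt1kb.txt), and passed the SR test (pass.srtest.txt).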

##look for optimal cutoffs for SR variants using a 1% freq cutoff##
@@ -159,28 +165,28 @@ cat recover.txt \
|sort -k1,1 \
|join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \
|tr ' ' '\t' \
|fgrep -wf pass.pe_rd.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($5 in ids)' pass.pe_rd.txt - \
>recover.single.txt

cat recover.bothsides.txt \
|sort -k1,1 \
|join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \
|tr ' ' '\t' \
|fgrep -wf pass.pe_rd.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($5 in ids)' pass.pe_rd.txt - \
>recover.both.txt

cat recover.txt \
|sort -k1,1 \
|join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \
|tr ' ' '\t' \
|fgrep -wvf pass.pe_rd.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($5 in ids))' pass.pe_rd.txt - \
>recover.single.fail.txt

cat recover.bothsides.txt \
|sort -k1,1 \
|join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \
|tr ' ' '\t' \
|fgrep -wvf pass.pe_rd.txt \
|awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($5 in ids))' pass.pe_rd.txt - \
>recover.both.fail.txt
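# The four joins above split recovered variants by one-sided vs. both-sided SR support and by
# membership in pass.pe_rd.txt; the negated test `!($5 in ids)` plays the role of the old
# `fgrep -wvf pass.pe_rd.txt`, keeping rows whose joined variant@sample key is absent from the pass list.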

echo "step5"
48 changes: 22 additions & 26 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -3,74 +3,70 @@
import logging


def process_bed_file(input_bed, n_per_split, bca=True):
def process_bed_file(input_bed, n_per_split, bca=True, digits=9):
SVTYPE_FIELD = 5
END_FIELD = 2
START_FIELD = 1

# Check the conditions to generate prefixes for the output files
# Conditions for each category of variants
condition_prefixes = {
'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
(int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and
(int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
'bca': {'condition': lambda line: bca and line[SVTYPE_FIELD] not in ['DEL', 'DUP'] and not line[SVTYPE_FIELD].startswith('INS')},
'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD].startswith('INS')}
}
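# The four categories above, one shard series per prefix:
#   gt5kb / lt5kb: DEL or DUP records split at 5 kb
#   bca:           all other SV types (only when --bca is set), excluding insertions
#   ins:           SVTYPE values starting with 'INS' (only when --bca is set)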

# Create trackers for the current file information
current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
current_suffixes = {prefix: 0 for prefix in condition_prefixes.keys()}

with open(input_bed, 'r') as infile:
for line in infile:
line = line.strip('\n').split('\t')
# This line swaps the last two columns so the sample names are in the fifth column and SV type in the last
line[4], line[5] = line[5], line[4]
for prefix, conditions in condition_prefixes.items():
# If a line matches a condition add it to the appropriate file
# If a line matches a condition add it to the appropriate category
if conditions['condition'](line):
current_lines[prefix].append('\t'.join(line))
current_counts[prefix] += 1
# If a file has met the number of records per file create a new file with the next suffix and write
# the current line to that new file
# If a category has the specified number of records, create a new file and write the current records
if current_counts[prefix] == n_per_split:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
output_file = get_file_name(prefix, current_suffixes[prefix], digits)
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(current_lines[prefix]))
# Keep track of which files have been written after reaching the max number of files
# Log the file name that was created
logging.info(f"File '{output_file}' written.")
# Update the tracking information
current_lines[prefix] = []
current_counts[prefix] = 0
current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
# Handle the samples after files with the given number of lines per file have been written
current_suffixes[prefix] = current_suffixes[prefix] + 1
# Handle the remaining records
for prefix, lines in current_lines.items():
if lines:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
output_file = get_file_name(prefix, current_suffixes[prefix], digits)
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(lines))
logging.info(f"File '{output_file}' written.")


# Create a function to appropriately add a suffix to each corresponding file
def increment_suffix(suffix):
alphabet = 'abcdefghijklmnopqrstuvwxyz'
if suffix == 'z' * 6:
raise ValueError('All possible files generated.')
else:
index = alphabet.index(suffix[0])
next_char = alphabet[(index + 1) % 26]
return next_char + suffix[1:]
def get_file_name(prefix, suffix, digits):
if len(str(suffix)) > digits:
raise ValueError('No more files can be generated with the current naming scheme. '
'Increase the digits parameter or the n parameter to proceed.')
return f"{prefix}.{str(suffix).zfill(digits)}.bed"


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--bed", help="Path to input bed file", required=True)
parser.add_argument("--n", help="number of variants per file", required=True, type=int)
parser.add_argument("--n", help="number of variants per output file", required=True, type=int)
parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs",
action='store_true')
parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix")
parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
args = parser.parse_args()

@@ -79,7 +75,7 @@ def main():
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % log_level)
logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
process_bed_file(args.bed, args.n, args.bca)
process_bed_file(args.bed, args.n, args.bca, args.digits)


if __name__ == '__main__':