Add TrainGCNV input specifying subset list of samples for training #294

Merged: 2 commits, Feb 11, 2022
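For context, a hypothetical inputs fragment exercising the new option, written as a Python dict mirroring a Cromwell/Terra inputs JSON (sample IDs, count file names, and the docker value are placeholders; sample_ids_training_subset is the input this PR adds):

import json

# Hypothetical inputs (placeholder values). sv_pipeline_base_docker is
# required whenever either subsetting option is used.
inputs = {
    "TrainGCNV.samples": ["s1", "s2", "s3", "s4"],
    "TrainGCNV.count_files": ["s1.counts.tsv.gz", "s2.counts.tsv.gz",
                              "s3.counts.tsv.gz", "s4.counts.tsv.gz"],
    "TrainGCNV.sample_ids_training_subset": ["s2", "s4"],
    "TrainGCNV.sv_pipeline_base_docker": "<sv-pipeline-base-docker>",
}
print(json.dumps(inputs, indent=2))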
wdl/TrainGCNV.wdl (25 changes: 20 additions & 5 deletions)
@@ -18,8 +18,12 @@ workflow TrainGCNV {
File reference_index # Index (.fai), must be in same dir as fasta
File reference_dict # Dictionary (.dict), must be in same dir as fasta

+ # Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker
+ # Assumes all other inputs correspond to the full sample list. Intended for Terra
Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
Int subsample_seed = 42
+ # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided
+ Array[String]? sample_ids_training_subset

# Condense read counts
Int? condense_num_bins
@@ -85,7 +89,7 @@
String linux_docker
String gatk_docker
String condense_counts_docker
- String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
+ String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples

# Runtime configuration overrides
RuntimeAttr? condense_counts_runtime_attr
@@ -100,20 +104,31 @@
RuntimeAttr? runtime_attr_explode
}

- if (defined(n_samples_subsample)) {
+ if (defined(sample_ids_training_subset)) {
+ call util.GetSubsampledIndices {
+ input:
+ all_strings = write_lines(samples),
+ subset_strings = write_lines(select_first([sample_ids_training_subset])),
+ prefix = cohort,
+ sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
+ }
+ }
+
+ if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
call util.RandomSubsampleStringArray {
input:
- strings = samples,
+ strings = write_lines(samples),
seed = subsample_seed,
subset_size = select_first([n_samples_subsample]),
prefix = cohort,
sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
}
}

- Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
+ Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])

scatter (i in sample_indices) {
+ String sample_ids_ = samples[i]
call cov.CondenseReadCounts as CondenseReadCounts {
input:
counts = count_files[i],
@@ -138,7 +153,7 @@
preprocessed_intervals = CountsToIntervals.out,
filter_intervals = filter_intervals,
counts = CondenseReadCounts.out,
- count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]),
+ count_entity_ids = sample_ids_,
cohort_entity_id = cohort,
contig_ploidy_priors = contig_ploidy_priors,
num_intervals_per_scatter = num_intervals_per_scatter,
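As a reviewer aid, a minimal Python sketch of the index-selection precedence the workflow now implements: an explicit training subset wins over random subsampling, and the full cohort is the default. The sorted order in the random branch is an assumption based on the Utils.wdl task below; sample names are illustrative.

import random

def training_indices(samples, sample_ids_training_subset=None,
                     n_samples_subsample=None, subsample_seed=42):
    # Explicit subset overrides n_samples_subsample, mirroring the
    # select_first chain in the workflow above.
    if sample_ids_training_subset is not None:
        subset = set(sample_ids_training_subset)
        if not subset.issubset(samples):
            raise ValueError("Subset list must be a subset of full list")
        return [i for i, s in enumerate(samples) if s in subset]
    if n_samples_subsample is not None:
        random.seed(subsample_seed)
        return sorted(random.sample(range(len(samples)), n_samples_subsample))
    return list(range(len(samples)))

samples = ["sampleA", "sampleB", "sampleC", "sampleD"]  # hypothetical IDs
print(training_indices(samples, sample_ids_training_subset=["sampleB", "sampleD"]))  # [1, 3]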
wdl/Utils.wdl (59 changes: 56 additions & 3 deletions)
@@ -159,7 +159,7 @@ task RunQC {

task RandomSubsampleStringArray {
input {
- Array[String] strings
+ File strings
Int seed
Int subset_size
String prefix
@@ -172,7 +172,7 @@

RuntimeAttr default_attr = object {
cpu_cores: 1,
- mem_gb: 3.75,
+ mem_gb: 1,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
@@ -185,7 +185,7 @@
set -euo pipefail
python3 <<CODE
import random
- string_array = ['~{sep="','" strings}']
+ string_array = [line.rstrip() for line in open("~{strings}", 'r')]
array_len = len(string_array)
if ~{subset_size} > array_len:
raise ValueError("Subsample quantity ~{subset_size} cannot > array length %d" % array_len)
@@ -218,6 +218,59 @@ task RandomSubsampleStringArray {
}
}
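The visible diff collapses RandomSubsampleStringArray before the sampling step itself, so here is a self-contained Python sketch of the file-based behavior the revised task implies. The use of random.sample and the sorted index output are assumptions, not code from this PR.

import random

def random_subsample(strings_file, seed, subset_size):
    # Read one string per line, as the revised task now does.
    string_array = [line.rstrip() for line in open(strings_file, 'r')]
    if subset_size > len(string_array):
        raise ValueError("Subsample quantity cannot exceed array length")
    random.seed(seed)
    indices = sorted(random.sample(range(len(string_array)), subset_size))
    return indices, [string_array[i] for i in indices]

# Hypothetical usage: idx, names = random_subsample("samples.list", seed=42, subset_size=100)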

+ task GetSubsampledIndices {
+ input {
+ File all_strings
+ File subset_strings
+ String prefix
+ String sv_pipeline_base_docker
+ RuntimeAttr? runtime_attr_override
+ }
+
+ String subsample_indices_filename = "~{prefix}.subsample_indices.list"
+
+ RuntimeAttr default_attr = object {
+ cpu_cores: 1,
+ mem_gb: 1,
+ disk_gb: 10,
+ boot_disk_gb: 10,
+ preemptible_tries: 3,
+ max_retries: 1
+ }
+ RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+ command <<<
+
+ set -euo pipefail
+ python3 <<CODE
+ all_strings = [line.rstrip() for line in open("~{all_strings}", 'r')]
+ subset_strings = {line.rstrip() for line in open("~{subset_strings}", 'r')}
+ if not subset_strings.issubset(set(all_strings)):
+ raise ValueError("Subset list must be a subset of full list")
+ with open("~{subsample_indices_filename}", 'w') as indices:
+ for i, string in enumerate(all_strings):
+ if string in subset_strings:
+ indices.write(f"{i}\n")
+ CODE
+
+ >>>
+
+ output {
+ Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
+ }
+
+ runtime {
+ cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+ memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+ disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+ bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+ docker: sv_pipeline_base_docker
+ preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+ maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+ }
+ }
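A quick demonstration of the GetSubsampledIndices logic outside WDL, showing why downstream arrays such as count_files stay aligned: indices are emitted in original cohort order, regardless of the order of the subset list (sample IDs are hypothetical):

all_samples = ["s1", "s2", "s3", "s4"]   # full cohort, in input order
subset = {"s4", "s2"}                    # subset membership; its order is irrelevant
indices = [i for i, s in enumerate(all_samples) if s in subset]
print(indices)  # [1, 3] -> all_samples[1] == "s2", all_samples[3] == "s4"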


task SubsetPedFile {
input {
File ped_file