diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl index fd03bff31..167b5045e 100644 --- a/wdl/TrainGCNV.wdl +++ b/wdl/TrainGCNV.wdl @@ -18,8 +18,12 @@ workflow TrainGCNV { File reference_index # Index (.fai), must be in same dir as fasta File reference_dict # Dictionary (.dict), must be in same dir as fasta + # Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker + # Assumes all other inputs correspond to the full sample list. Intended for Terra Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100) Int subsample_seed = 42 + # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided + Array[String]? sample_ids_training_subset # Condense read counts Int? condense_num_bins @@ -85,7 +89,7 @@ workflow TrainGCNV { String linux_docker String gatk_docker String condense_counts_docker - String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples + String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples # Runtime configuration overrides RuntimeAttr? condense_counts_runtime_attr @@ -100,10 +104,20 @@ workflow TrainGCNV { RuntimeAttr? runtime_attr_explode } - if (defined(n_samples_subsample)) { + if (defined(sample_ids_training_subset)) { + call util.GetSubsampledIndices { + input: + all_strings = write_lines(samples), + subset_strings = write_lines(select_first([sample_ids_training_subset])), + prefix = cohort, + sv_pipeline_base_docker = select_first([sv_pipeline_base_docker]) + } + } + + if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) { call util.RandomSubsampleStringArray { input: - strings = samples, + strings = write_lines(samples), seed = subsample_seed, subset_size = select_first([n_samples_subsample]), prefix = cohort, @@ -111,9 +125,10 @@ workflow TrainGCNV { } } - Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) + Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) scatter (i in sample_indices) { + String sample_ids_ = samples[i] call cov.CondenseReadCounts as CondenseReadCounts { input: counts = count_files[i], @@ -138,7 +153,7 @@ workflow TrainGCNV { preprocessed_intervals = CountsToIntervals.out, filter_intervals = filter_intervals, counts = CondenseReadCounts.out, - count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]), + count_entity_ids = sample_ids_, cohort_entity_id = cohort, contig_ploidy_priors = contig_ploidy_priors, num_intervals_per_scatter = num_intervals_per_scatter, diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index a5a14353e..f3d8b81ed 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -159,7 +159,7 @@ task RunQC { task RandomSubsampleStringArray { input { - Array[String] strings + File strings Int seed Int subset_size String prefix @@ -172,7 +172,7 @@ task RandomSubsampleStringArray { RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 3.75, + mem_gb: 1, disk_gb: 10, boot_disk_gb: 10, preemptible_tries: 3, @@ -185,7 +185,7 @@ task RandomSubsampleStringArray { set -euo pipefail python3 < array_len: raise ValueError("Subsample quantity ~{subset_size} cannot > array length %d" % array_len) @@ -218,6 +218,59 @@ task RandomSubsampleStringArray { } } +task GetSubsampledIndices { + input { + File all_strings + File subset_strings + String prefix + String sv_pipeline_base_docker + RuntimeAttr? runtime_attr_override + } + + String subsample_indices_filename = "~{prefix}.subsample_indices.list" + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -euo pipefail + python3 <>> + + output { + Array[Int] subsample_indices_array = read_lines(subsample_indices_filename) + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_base_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + task SubsetPedFile { input { File ped_file