From c8df30760a652dcfe029ea1c3c0668b4d7e0a35b Mon Sep 17 00:00:00 2001 From: epiercehoffman Date: Mon, 28 Jun 2021 10:28:54 -0400 Subject: [PATCH] add option to pass a file containing list of SNP VCF shard paths (#191) --- README.md | 4 ++-- .../cohort_mode/cohort_mode_workspace_dashboard.md.tmpl | 6 +++--- wdl/Module00c.wdl | 7 +++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f78c529d3..5656fc05d 100644 --- a/README.md +++ b/README.md @@ -246,7 +246,7 @@ Trains a gCNV model for use in [Module 00c](#module00c). The WDL can be found at ## Module 00c -Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above]("#cohort-mode") for more information on batching. +Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching. #### Prerequisites: * [Module 00a](#module00a) @@ -255,7 +255,7 @@ Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence int #### Inputs: * PED file (updated with [Module 00b](#module00b) sex assignments, including sex = 0 for sex aneuploidies. Calls will not be made on sex chromosomes when sex = 0 in order to avoid generating many confusing calls or upsetting normalized copy numbers for the batch.) 
-* Per-sample GVCFs generated with HaplotypeCaller (`gvcfs` input), or a jointly-genotyped VCF (position-sharded, `snp_vcfs` input) +* Per-sample GVCFs generated with HaplotypeCaller (`gvcfs` input), or a jointly-genotyped VCF (position-sharded, `snp_vcfs` input or `snp_vcfs_shard_list` input) * Read count, BAF, PE, and SR files ([Module 00a](#module00a)) * Caller VCFs ([Module 00a](#module00a)) * Contig ploidy model and gCNV model files (gCNV training) diff --git a/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 21f6d5f14..8fb63c26c 100644 --- a/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -29,9 +29,9 @@ The following cohort-level or batch-level inputs are also required: |`String`|`sample_set_id`|Batch identifier| |`String`|`sample_set_set_id`|Cohort identifier| |`File`|`cohort_ped_file`|Path to the GCS location of a family structure definitions file in [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Sex aneuploidies (detected in Module 00b) should be entered as sex = 0.| -|`Array[File]`|`snp_vcfs`|Paths to the GCS locations of a jointly-genotyped, position-sharded SNP VCF**| +|`Array[File]`|`snp_vcfs`|Paths to the GCS locations of a jointly-genotyped, position-sharded SNP VCF. Alternatively, provide a GCS path to a text file containing one SNP VCF shard path per line using the `File` input `snp_vcfs_shard_list`.**| -**Only one of `gvcf` or `snp_vcfs` is required +**Only one of `gvcf`, `snp_vcfs`, or `snp_vcfs_shard_list` is required ### Pipeline outputs @@ -163,7 +163,7 @@ To create batches (in the `sample_set` table), the easiest way is to upload a ta #### Module00c * Use the same `sample_set` definitions you used for `train-gCNV`. 
-* The default configuration for `module00c` in this workspace uses sample GVCFs. To use a position-sharded joint SNP VCF instead, delete the `gvcfs` input, provide your file(s) for `snp_vcfs`, and click "Save". The `snp_vcfs` argument should be formatted as an `Array[File]`, ie. `["gs://bucket/shard1.vcf", "gs://bucket/shard2.vcf"]`. +* The default configuration for `module00c` in this workspace uses sample GVCFs. To use a position-sharded joint SNP VCF instead, delete the `gvcfs` input, provide your file(s) for `snp_vcfs`, and click "Save". The `snp_vcfs` argument should be formatted as an `Array[File]`, i.e., `["gs://bucket/shard1.vcf", "gs://bucket/shard2.vcf"]`. Alternatively, provide the input `snp_vcfs_shard_list`: a GCS path to a text file containing one SNP VCF shard path per line (this option is useful if the `Array[File]` of `snp_vcfs` shards is too long for Terra to handle). * If you are using GVCFs in a requester-pays bucket, you must provide the Terra billing project for the workspace to the `gvcf_gcs_project_for_requester_pays` argument as a string, surrounded by double-quotes. #### Module01 and Module02 diff --git a/wdl/Module00c.wdl b/wdl/Module00c.wdl index ee0087755..4f33ff733 100644 --- a/wdl/Module00c.wdl +++ b/wdl/Module00c.wdl @@ -66,6 +66,8 @@ workflow Module00c { # BAF Option #2, position-sharded VCFs Array[File]? snp_vcfs File? snp_vcf_header # Only use if snp vcfs are unheadered + # Text file with paths to SNP VCF shards, one per line. Use instead of snp_vcfs if Array[File] is too long to manage + File? snp_vcfs_shard_list # Sample ids in vcf, where vcf_samples[i] corresponds to samples[i]. Only use if sample ids are different in vcf Array[String]? 
vcf_samples @@ -267,10 +269,11 @@ workflow Module00c { } } - if (!defined(BAF_files) && !defined(gvcfs) && defined(snp_vcfs)) { + if (!defined(BAF_files) && !defined(gvcfs) && (defined(snp_vcfs) || defined(snp_vcfs_shard_list))) { + Array[File] snp_vcfs_ = if (defined(snp_vcfs)) then select_first([snp_vcfs]) else read_lines(select_first([snp_vcfs_shard_list])) call sbaf.BAFFromShardedVCF { input: - vcfs = select_first([snp_vcfs]), + vcfs = snp_vcfs_, vcf_header = snp_vcf_header, samples = select_first([vcf_samples, samples]), batch = batch,