From c8df30760a652dcfe029ea1c3c0668b4d7e0a35b Mon Sep 17 00:00:00 2001
From: epiercehoffman <epierceh@broadinstitute.org>
Date: Mon, 28 Jun 2021 10:28:54 -0400
Subject: [PATCH] add option to pass a file containing list of SNP VCF shard
 paths (#191)

---
 README.md                                                  | 4 ++--
 .../cohort_mode/cohort_mode_workspace_dashboard.md.tmpl    | 6 +++---
 wdl/Module00c.wdl                                          | 7 +++++--
 3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index f78c529d3..5656fc05d 100644
--- a/README.md
+++ b/README.md
@@ -246,7 +246,7 @@ Trains a gCNV model for use in [Module 00c](#module00c). The WDL can be found at
 
 
 ## <a name="module00c">Module 00c</a>
-Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above]("#cohort-mode") for more information on batching.
+Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching.
 
 #### Prerequisites:
 * [Module 00a](#module00a)
@@ -255,7 +255,7 @@ Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence int
 
 #### Inputs:
 * PED file (updated with [Module 00b](#module00b) sex assignments, including sex = 0 for sex aneuploidies. Calls will not be made on sex chromosomes when sex = 0 in order to avoid generating many confusing calls or upsetting normalized copy numbers for the batch.)
-* Per-sample GVCFs generated with HaplotypeCaller (`gvcfs` input), or a jointly-genotyped VCF (position-sharded, `snp_vcfs` input)
+* Per-sample GVCFs generated with HaplotypeCaller (`gvcfs` input), or a jointly-genotyped, position-sharded VCF (`snp_vcfs` input, or `snp_vcfs_shard_list` input: a text file listing one shard path per line)
 * Read count, BAF, PE, and SR files ([Module 00a](#module00a))
 * Caller VCFs ([Module 00a](#module00a))
 * Contig ploidy model and gCNV model files (gCNV training)
diff --git a/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl
index 21f6d5f14..8fb63c26c 100644
--- a/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl
+++ b/input_templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl
@@ -29,9 +29,9 @@ The following cohort-level or batch-level inputs are also required:
 |`String`|`sample_set_id`|Batch identifier|
 |`String`|`sample_set_set_id`|Cohort identifier|
 |`File`|`cohort_ped_file`|Path to the GCS location of a family structure definitions file in [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Sex aneuploidies (detected in Module 00b) should be entered as sex = 0.|
-|`Array[File]`|`snp_vcfs`|Paths to the GCS locations of a jointly-genotyped, position-sharded SNP VCF**|
+|`Array[File]`|`snp_vcfs`|Paths to the GCS locations of a jointly-genotyped, position-sharded SNP VCF. Alternatively, provide a GCS path to a text file containing one SNP VCF shard path per line using the `File` input `snp_vcfs_shard_list`.**|
 
-**Only one of `gvcf` or `snp_vcfs` is required
+**Only one of `gvcf`, `snp_vcfs`, or `snp_vcfs_shard_list` is required
 
 ### Pipeline outputs
 
@@ -163,7 +163,7 @@ To create batches (in the `sample_set` table), the easiest way is to upload a ta
 #### Module00c
 
 * Use the same `sample_set` definitions you used for `train-gCNV`.
-* The default configuration for `module00c` in this workspace uses sample GVCFs. To use a position-sharded joint SNP VCF instead, delete the `gvcfs` input, provide your file(s) for `snp_vcfs`, and click "Save". The `snp_vcfs` argument should be formatted as an `Array[File]`, ie. `["gs://bucket/shard1.vcf", "gs://bucket/shard2.vcf"]`.
+* The default configuration for `module00c` in this workspace uses sample GVCFs. To use a position-sharded joint SNP VCF instead, delete the `gvcfs` input, provide your file(s) for `snp_vcfs`, and click "Save". The `snp_vcfs` argument should be formatted as an `Array[File]`, i.e. `["gs://bucket/shard1.vcf", "gs://bucket/shard2.vcf"]`. Alternatively, provide the input `snp_vcfs_shard_list`: a GCS path to a text file containing one SNP VCF shard path per line (this option is useful if the `Array[File]` of `snp_vcfs` shards is too long for Terra to handle).
 * If you are using GVCFs in a requester-pays bucket, you must provide the Terra billing project for the workspace to the `gvcf_gcs_project_for_requester_pays` argument as a string, surrounded by double-quotes.
 
 #### Module01 and Module02
diff --git a/wdl/Module00c.wdl b/wdl/Module00c.wdl
index ee0087755..4f33ff733 100644
--- a/wdl/Module00c.wdl
+++ b/wdl/Module00c.wdl
@@ -66,6 +66,8 @@ workflow Module00c {
     # BAF Option #2, position-sharded VCFs
     Array[File]? snp_vcfs
     File? snp_vcf_header  # Only use if snp vcfs are unheadered
+    # Text file with paths to SNP VCF shards, one per line. Use instead of snp_vcfs if Array[File] is too long to manage
+    File? snp_vcfs_shard_list
     # Sample ids in vcf, where vcf_samples[i] corresponds to samples[i]. Only use if sample ids are different in vcf
     Array[String]? vcf_samples
 
@@ -267,10 +269,11 @@ workflow Module00c {
     }
   }
 
-  if (!defined(BAF_files) && !defined(gvcfs) && defined(snp_vcfs)) {
+  if (!defined(BAF_files) && !defined(gvcfs) && (defined(snp_vcfs) || defined(snp_vcfs_shard_list))) {
+    Array[File] snp_vcfs_ = if (defined(snp_vcfs)) then select_first([snp_vcfs]) else read_lines(select_first([snp_vcfs_shard_list]))
     call sbaf.BAFFromShardedVCF {
       input:
-        vcfs = select_first([snp_vcfs]),
+        vcfs = snp_vcfs_,
         vcf_header = snp_vcf_header,
         samples = select_first([vcf_samples, samples]),
         batch = batch,