Performed a round of ablation on new annotation-based filtering tools. (#8131)

* Performed a round of ablation on new annotation-based filtering tools.
* Removed Javadoc tags unsupported by Barclay in VETS tool documentation and fixed other minor documentation issues.
broadinstitute · Nov 28, 2023 · ea6ae82 · ea6ae82
1 parent b921612
commit ea6ae82
Show file tree

Hide file tree

Showing 108 changed files with 269 additions and 710 deletions.
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
@@ -28,7 +28,6 @@ fi
 echo "Docker build done =========="
 
 sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
 
 echo "Running Filtering WDL through cromwell"
 
@@ -41,6 +40,3 @@ done
 FIN
 cat $WORKING_DIR/vcf_site_level_filtering_mod.json
 java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json
-
-cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
-java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -29,8 +29,9 @@ workflow JointVcfFiltering {
         String resource_args
 
         String? model_backend
-        File? python_script
+        File? training_python_script
         File? hyperparameters_json
+        File? scoring_python_script
 
         String? extract_extra_args
         String? train_extra_args
@@ -55,9 +56,9 @@ workflow JointVcfFiltering {
         model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
         python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
         hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
-        extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
-        train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
-        score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+        extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+        train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-unlabeled learning, etc. See GATK documentation for this tool."
+        score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
     }
 
     call ExtractVariantAnnotations {
@@ -79,7 +80,7 @@ workflow JointVcfFiltering {
             annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
             unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
             model_backend = model_backend,
-            python_script = python_script,
+            python_script = training_python_script,
             hyperparameters_json = hyperparameters_json,
             output_prefix = output_prefix,
             extra_args = train_extra_args,
@@ -101,6 +102,8 @@ workflow JointVcfFiltering {
                 extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
                 model_prefix = output_prefix,
                 model_files = TrainVariantAnnotationsModel.model_files,
+                model_backend = model_backend,
+                python_script = scoring_python_script,
                 extra_args = score_extra_args,
                 gatk_docker = gatk_docker,
                 gatk_override = gatk_override,
@@ -251,6 +254,8 @@ task ScoreVariantAnnotations {
         File extracted_vcf_idx
         String model_prefix
         Array[File] model_files
+        String? model_backend
+        File? python_script
         String? extra_args
         File? monitoring_script
 
@@ -287,6 +292,8 @@ task ScoreVariantAnnotations {
                 ~{resource_args} \
                 --resource:extracted,extracted=true ~{extracted_vcf} \
                 --model-prefix model-files/~{model_prefix}.train \
+                ~{"--model-backend " + model_backend} \
+                ~{"--python-script " + python_script} \
                 ~{extra_args}
     }
 

diff --git a/.../org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/.../org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
@@ -31,7 +31,7 @@
  * Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.
  *
  * <p>
- *     This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the
+ *     This tool is primarily intended to be used as the first step in a variant-filtering workflow that supersedes the
  *     {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata
  *     from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled
  *     resource VCFs (e.g., training or calibration VCFs). Input sites that are present in the resources are considered
@@ -65,7 +65,7 @@
  * <ul>
  *     <li>
  *         Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, 
- *         if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
+ *         if at least one allele-specific annotation with "Number=A" is specified).
  *     </li>
  *     <li>
  *         Annotations to extract.
@@ -78,13 +78,12 @@
  *     </li>
  *     <li>
  *         (Optional) Resource VCF file(s). Each resource should be tagged with a label, which will be assigned to
- *         extracted sites that are present in the resource. In typical use, the {@value LabeledVariantAnnotationsData#TRAINING_LABEL}
- *         and {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels should be used to tag at least one resource
- *         apiece. The resulting sets of sites will be used for model training and conversion of scores to
+ *         extracted sites that are present in the resource. In typical use, the "training"
+ *         and "calibration" labels should be used to tag at least one resource apiece.
+ *         The resulting sets of sites will be used for model training and conversion of scores to
  *         calibration-set sensitivity, respectively; the trustworthiness of the respective resources should be
- *         taken into account accordingly. The {@value LabeledVariantAnnotationsData#SNP_LABEL} label is
- *         reserved by the tool, as it is used to label sites determined to be SNPs, and thus it cannot be used to tag
- *         provided resources.
+ *         taken into account accordingly. The "snp" label is reserved by the tool, as it is used to label sites
+ *         determined to be SNPs, and thus it cannot be used to tag provided resources.
  *     </li>
  *     <li>
  *         (Optional) Maximum number of unlabeled variants (or alleles) to randomly sample with reservoir sampling.
@@ -128,19 +127,19 @@
  *         <p>
  *             Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations).
  *             See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details.
- *             If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele;
+ *             In allele-specific mode (i.e., when allele-specific annotations are requested), each record corresponds to an individual allele;
  *             otherwise, each record corresponds to a variant site, which may contain multiple alleles.
- *             Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce
+ *             Storage of alleles can be omitted using the "--omit-alleles-in-hdf5" argument, which will reduce
  *             the size of the file. This file will only be produced if resources are provided and the number of extracted
  *             labeled sites is nonzero.
  *         </p>
  *
  *     </li>
  *     <li>
- *         Labeled sites-only VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME}
+ *         Labeled sites-only VCF file and index. The VCF will not be gzipped if the "--do-not-gzip-vcf-output"
  *         argument is set to true. The VCF can be provided as a resource in subsequent runs of
  *         {@link ScoreVariantAnnotations} and used to indicate labeled sites that were extracted.
- *         This can be useful if the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument was used to
+ *         This can be useful if the "--intervals/-L" argument was used to
  *         subset sites in training or calibration resources for extraction; this may occur when setting up
  *         training/validation/test splits, for example. Note that records for the random sample of unlabeled sites are
  *         currently not included in the VCF.
@@ -149,7 +148,7 @@
  *         (Optional) Unlabeled-annotations HDF5 file. This will have the same directory structure as in the
  *         labeled-annotations HDF5 file. However, note that records are currently written in the order they
  *         appear in the downsampling reservoir after random sampling, and hence, are not in genomic order.
- *         This file will only be produced if a nonzero value of the {@value MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME}
+ *         This file will only be produced if a nonzero value of the "--maximum-number-of-unlabeled-variants"
  *         argument is provided.
  *     </li>
  * </ul>
@@ -158,9 +157,9 @@
  *
  * <p>
  *     Extract annotations from training/calibration SNP/INDEL sites, producing the outputs
- *     1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}.
+ *     1) extract.annot.hdf5, 2) extract.vcf.gz, and 3) extract.vcf.gz.tbi.
  *     The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel}
- *     to train a model using a positive-only approach. Note that the {@value MODE_LONG_NAME} arguments are made
+ *     to train a model using a positive-only approach. Note that the "--mode" arguments are made
  *     explicit here, although both SNP and INDEL modes are selected by default.
  *
  * <pre>
@@ -182,11 +181,10 @@
  * <p>
  *     Extract annotations from both training/calibration SNP/INDEL sites and a random sample of
  *     1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs
- *     1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz},
- *     and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
- *     to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}).
- *     Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
- *     selected by default.
+ *     1) extract.annot.hdf5, 2) extract.unlabeled.annot.hdf5, 3) extract.vcf.gz,
+ *     and 4) extract.vcf.gz.tbi. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
+ *     to train a model using a positive-unlabeled approach. Note that the "--mode" arguments
+ *     are made explicit here, although both SNP and INDEL modes are selected by default.
  *
  * <pre>
  *     gatk ExtractVariantAnnotations \
@@ -200,17 +198,23 @@
  *          --mode INDEL \
  *          --resource:indel-training,training=true indel-training.vcf \
  *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
- *          --maximum-number-of-unlableled-variants 1000000
+ *          --maximum-number-of-unlabeled-variants 1000000 \
  *          -O extract
  * </pre>
  * </p>
  *
  * <p>
+ *     Note that separate SNP and INDEL resources are shown in the above examples purely for demonstration purposes,
+ *     as are separate training and calibration resources. However, it may be desirable to specify combined
+ *     resource(s); e.g., "--resource:snp-and-indel-resource,training=true,calibration=true snp-and-indel-resource.vcf".
+ * </p>
+ *
+ * <p>
  *     In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of
- *     unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5},
- *     2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}.
+ *     unlabeled sites, producing the outputs 1) extract.unlabeled.annot.hdf5,
+ *     2) extract.vcf.gz (which will contain no records), and 3) extract.vcf.gz.tbi.
  *     This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for
- *     exploratory analyses. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both
+ *     exploratory analyses. Note that the "--mode" arguments are made explicit here, although both
  *     SNP and INDEL modes are selected by default.
  *
  * <pre>
@@ -221,12 +225,20 @@
  *          -A annotation_N \
  *          --mode SNP \
  *          --mode INDEL \
- *          --maximum-number-of-unlableled-variants 1000000
+ *          --maximum-number-of-unlabeled-variants 1000000 \
  *          -O extract
  * </pre>
  * </p>
  *
- * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
+ * <p>
+ *     Alternatively, if resource VCFs are unavailable, one might want to specify the input VCF itself as a resource
+ *     and extract annotations for the input variants (or a subset thereof). Again, this may be useful for
+ *     exploratory analyses.
+ * </p>
+ *
+ * <p>
+ *     DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
+ * </p>
  *
  * @author Samuel Lee &lt;slee@broadinstitute.org&gt;
  */
@@ -249,11 +261,10 @@ public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWa
             doc = "Maximum number of unlabeled variants to extract. " +
                     "If greater than zero, reservoir sampling will be used to randomly sample this number " +
                     "of sites from input sites that are not present in the specified resources. " +
-                    "Choice of this number should be guided by considerations for training the negative model in " +
+                    "Choice of this number should be guided by considerations for training the model in " +
                     "TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " +
                     "expected size of the labeled training set or that is compatible with available memory resources. " +
-                    "Note that in allele-specific mode (--" + LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME +
-                    " true), this argument limits the number of variant records, rather than the number of alleles.",
+                    "Note that in allele-specific mode, this argument limits the number of variant records, rather than the number of alleles.",
             minValue = 0)
     private int maximumNumberOfUnlabeledVariants = 0;