Extract Cohort optimizations [VS-493] [VS-1516] #9055

Merged
merged 16 commits on Dec 10, 2024
4 changes: 3 additions & 1 deletion .dockstore.yml
@@ -173,6 +173,7 @@ workflows:
branches:
- master
- ah_var_store
- vs_1516_yolo
tags:
- /.*/
- name: GvsImportGenomes
@@ -241,6 +242,7 @@ workflows:
branches:
- master
- ah_var_store
- vs_1516_yolo
tags:
- /.*/
- name: GvsWithdrawSamples
@@ -314,7 +316,7 @@ workflows:
branches:
- master
- ah_var_store
- vs_1490_fix_curate_input_array_files
- vs_1516_yolo
tags:
- /.*/
- name: GvsIngestTieout
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -74,7 +74,7 @@ task GetToolVersions {
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-11-25-alpine-913039adf8f4"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-11-24-gatkbase-1807487d5912"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-11-24-gatkbase-5b5c307bdb5e"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
ExtractCohortEngine.java
@@ -6,7 +6,6 @@
import htsjdk.variant.vcf.VCFHeader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.LongRange;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureContext;
@@ -19,9 +18,9 @@
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.StorageAPIAvroReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.TableReference;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.localsort.AvroSortingCollectionCodec;
import org.broadinstitute.hellbender.utils.gvs.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
@@ -86,6 +85,15 @@ public class ExtractCohortEngine {

private final Consumer<VariantContext> variantContextConsumer;

private static class VariantIterables {
public Iterable<GenericRecord> vets;
Collaborator
Maybe we should brainstorm on a better name for the vets table and related? It now collides with the new name for VQSR Lite

Collaborator (Author)
ugh yes, good point

public Iterable<GenericRecord> refRanges;
public VariantIterables(Iterable<GenericRecord> vets, Iterable<GenericRecord> refRanges) {
this.vets = vets;
this.refRanges = refRanges;
}
}
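
The holder above exists so the source-specific builders can hand back their two record streams instead of driving the extraction themselves, as the reshaped traverse() below shows. On a Java 16+ toolchain the same holder could be written as a one-line record; a sketch under that assumption, not the PR's code:

// Hypothetical record equivalent of VariantIterables (requires Java 16+):
record VariantIterables(Iterable<GenericRecord> vets, Iterable<GenericRecord> refRanges) {}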

List<String> getFilterSetInfoTableFields() {
return SchemaUtils.YNG_FIELDS;
}
@@ -234,6 +242,18 @@ private void processBytesScanned(StorageAPIAvroReader reader) {
}

public void traverse() {

SortedSet<Long> sampleIdsToExtract = new TreeSet<>(this.sampleIdToName.keySet());
VariantBitSet vbs = new VariantBitSet(minLocation, maxLocation);
VariantIterables variantIterables;
if (fqRangesExtractVetTable != null) {
variantIterables = createVariantIterablesFromUnsortedExtractTableBigQueryRanges(fqRangesExtractVetTable, fqRangesExtractRefTable, vbs);
} else if (vetRangesFQDataSet != null) {
variantIterables = createVariantIterablesFromUnsortedBigQueryRanges(vetRangesFQDataSet, sampleIdsToExtract, vbs);
} else {
variantIterables = createVariantsIterablesFromUnsortedAvroRanges(vetAvroFileName, refRangesAvroFileName, vbs, presortedAvroFiles);
}

// First allele here is the ref, followed by the alts associated with that ref. We need this because at this
// point the alleles haven't been joined and remapped to one reference allele.
final Map<Long, Map<Allele, Map<Allele, Double>>> fullScoreMap = new HashMap<>();
@@ -260,16 +280,26 @@ public void traverse() {

// get filter info (vqslod/sensitivity & yng values)
try (StorageAPIAvroReader reader = new StorageAPIAvroReader(filterSetInfoTableRef, rowRestrictionWithFilterSetName, projectID)) {

long recordsProcessed = 0;
long recordsDropped = 0;
for (final GenericRecord queryRow : reader) {
if (++recordsProcessed % 100000 == 0) {
logger.info("Processed " + recordsProcessed + " filter set info records, dropped " + recordsDropped + ".");
}
final ExtractCohortFilterRecord filterRow = new ExtractCohortFilterRecord(queryRow, getVQScoreFieldName(), getScoreFieldName());

final long location = filterRow.getLocation();
final Allele ref = Allele.create(filterRow.getRefAllele(), true);
final Allele alt = Allele.create(filterRow.getAltAllele(), false);

if (!vbs.containsVariant(location, location + Math.max(ref.length(), alt.length()))) {
++recordsDropped;
continue;
}
final Double score = filterRow.getScore();
final Double vqsScore = filterRow.getVqScore();
final String yng = filterRow.getYng();
final Allele ref = Allele.create(filterRow.getRefAllele(), true);
final Allele alt = Allele.create(filterRow.getAltAllele(), false);

fullScoreMap.putIfAbsent(location, new HashMap<>());
fullScoreMap.get(location).putIfAbsent(ref, new HashMap<>());
fullScoreMap.get(location).get(ref).put(alt, score);
Expand All @@ -280,15 +310,25 @@ public void traverse() {
fullYngMap.get(location).putIfAbsent(ref, new HashMap<>());
fullYngMap.get(location).get(ref).put(alt, yng);
}
logger.info("Processed " + recordsProcessed + " filter set info records, dropped " + recordsDropped + ".");
processBytesScanned(reader);
}
}

// load site-level filter data into data structure
if (filterSetSiteTableRef != null) {
try (StorageAPIAvroReader reader = new StorageAPIAvroReader(filterSetSiteTableRef, rowRestrictionWithFilterSetName, projectID)) {
long recordsProcessed = 0;
long recordsDropped = 0;
for (final GenericRecord queryRow : reader) {
if (++recordsProcessed % 10000 == 0) {
logger.info("Processed " + recordsProcessed + " filter set sites records, dropped " + recordsDropped + ".");
}
long location = Long.parseLong(queryRow.get(SchemaUtils.LOCATION_FIELD_NAME).toString());
if (!vbs.containsVariant(location, location + 1)) {
++recordsDropped;
continue;
}
List<String> filters = Arrays.asList(queryRow.get(SchemaUtils.FILTERS).toString().split(","));
siteFilterMap.put(location, filters);
}
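
Both pruning loops above rely on the VariantBitSet, which is presumably populated while the vet data is scanned into the iterables at the top of traverse(): a filter record whose location window contains no extracted variant can never match, so it is dropped before it ever reaches the score/YNG maps. A minimal sketch of such a structure, assuming end-exclusive windows and locations encoded as contiguous longs (hypothetical class, not the PR's VariantBitSet):

import java.util.BitSet;

// Sketch of a location-window bit set (hypothetical; the PR's VariantBitSet is the real one).
final class LocationBitSet {
    private final long minLocation;
    private final long span;
    private final BitSet bits;

    LocationBitSet(long minLocation, long maxLocation) {
        this.minLocation = minLocation;
        this.span = maxLocation - minLocation + 1;
        this.bits = new BitSet((int) span);
    }

    // Called while streaming vet rows: mark this location as carrying a variant.
    void setVariant(long location) {
        bits.set((int) (location - minLocation));
    }

    // True if any position in the end-exclusive window [start, end) is marked.
    // Filter rows whose window is empty can be skipped without changing output.
    boolean containsVariant(long start, long end) {
        int from = (int) Math.max(0, start - minLocation);
        int to = (int) Math.min(span, end - minLocation);
        if (from >= to) return false;
        int next = bits.nextSetBit(from);
        return next >= 0 && next < to;
    }
}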
@@ -320,17 +360,7 @@ public void traverse() {
throw new GATKException("Can not process cross-contig boundaries for Ranges implementation");
}

SortedSet<Long> sampleIdsToExtract = new TreeSet<>(this.sampleIdToName.keySet());
if (fqRangesExtractVetTable != null) {
createVariantsFromUnsortedExtractTableBigQueryRanges(fqRangesExtractVetTable, fqRangesExtractRefTable,
sampleIdsToExtract, minLocation, maxLocation, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);
} else if (vetRangesFQDataSet != null) {
createVariantsFromUnsortedBigQueryRanges(vetRangesFQDataSet, sampleIdsToExtract, minLocation, maxLocation,
fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);
} else {
createVariantsFromUnsortedAvroRanges(vetAvroFileName, refRangesAvroFileName, sampleIdsToExtract, minLocation,
maxLocation, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested, presortedAvroFiles);
}
createVariantsFromSortedRanges(sampleIdsToExtract, variantIterables, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);

logger.debug("Finished Initializing Reader");

@@ -1049,25 +1079,16 @@ private SortingCollection<GenericRecord> createSortedReferenceRangeCollectionFro
}


private void createVariantsFromUnsortedBigQueryRanges(
private VariantIterables createVariantIterablesFromUnsortedBigQueryRanges(
final String fqDatasetName,
final SortedSet<Long> sampleIdsToExtract,
final Long minLocation,
final Long maxLocation,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullScoreMap,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullVQScoreMap,
final Map<Long, Map<Allele, Map<Allele, String>>> fullYngMap,
final Map<String, Integer> samplePloidyMap,
final Map<Long, List<String>> siteFilterMap,
final boolean noVQScoreFilteringRequested) {
VariantBitSet vbs) {

// We could handle this by making a map of BitSets or something, but it seems unnecessary to support this
if (!SchemaUtils.decodeContig(minLocation).equals(SchemaUtils.decodeContig(maxLocation))) {
throw new GATKException("Can not process cross-contig boundaries");
}

VariantBitSet vbs = new VariantBitSet(minLocation, maxLocation);

SortingCollection<GenericRecord> sortedVet = createSortedVetCollectionFromBigQuery(projectID,
fqDatasetName,
sampleIdsToExtract,
@@ -1085,32 +1106,22 @@ private void createVariantsFromUnsortedBigQueryRanges(
localSortMaxRecordsInRam,
vbs);

createVariantsFromSortedRanges(sampleIdsToExtract, sortedVet, sortedReferenceRange, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);
return new VariantIterables(sortedVet, sortedReferenceRange);
}

//
// BEGIN REF RANGES COHORT EXTRACT
//
private void createVariantsFromUnsortedExtractTableBigQueryRanges(
private VariantIterables createVariantIterablesFromUnsortedExtractTableBigQueryRanges(
final String fqVetTable,
final String fqRefTable,
final SortedSet<Long> sampleIdsToExtract,
final Long minLocation,
final Long maxLocation,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullScoreMap,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullVQScoreMap,
final Map<Long, Map<Allele, Map<Allele, String>>> fullYngMap,
final Map<String, Integer> samplePloidyMap,
final Map<Long, List<String>> siteFilterMap,
final boolean noVQScoreFilteringRequested) {
VariantBitSet vbs) {

// We could handle this by making a map of BitSets or something, but it seems unnecessary to support this
if (!SchemaUtils.decodeContig(minLocation).equals(SchemaUtils.decodeContig(maxLocation))) {
throw new GATKException("Can not process cross-contig boundaries");
}

VariantBitSet vbs = new VariantBitSet(minLocation, maxLocation);

SortingCollection<GenericRecord> sortedVet = createSortedVetCollectionFromExtractTableBigQuery(projectID,
fqVetTable,
minLocation,
@@ -1126,7 +1137,7 @@ private void createVariantsFromUnsortedExtractTableBigQueryRanges(
localSortMaxRecordsInRam,
vbs);

createVariantsFromSortedRanges(sampleIdsToExtract, sortedVet, sortedReferenceRange, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);
return new VariantIterables(sortedVet, sortedReferenceRange);
}

private SortingCollection<GenericRecord> createSortedVetCollectionFromExtractTableBigQuery(final String projectID,
@@ -1185,18 +1196,10 @@ private SortingCollection<GenericRecord> createSortedReferenceRangeCollectionFro
//
// END REF RANGES COHORT EXTRACT
//
private void createVariantsFromUnsortedAvroRanges(
private VariantIterables createVariantsIterablesFromUnsortedAvroRanges(
final GATKPath vetAvroFileName,
final GATKPath refRangesAvroFileName,
final SortedSet<Long> sampleIdsToExtract,
final Long minLocation,
final Long maxLocation,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullScoreMap,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullVQScoreMap,
final Map<Long, Map<Allele, Map<Allele, String>>> fullYngMap,
final Map<String, Integer> samplePloidyMap,
final Map<Long, List<String>> siteFilterMap,
final boolean noVQScoreFilteringRequested,
VariantBitSet vbs,
final boolean presortedAvroFiles) {

final AvroFileReader vetReader = new AvroFileReader(vetAvroFileName);
Expand All @@ -1209,8 +1212,6 @@ private void createVariantsFromUnsortedAvroRanges(
sortedVet = vetReader;
sortedReferenceRange = refRangesReader;
} else {
VariantBitSet vbs = new VariantBitSet(minLocation, maxLocation);

SortingCollection<GenericRecord> localSortedVet = getAvroSortingCollection(vetReader.getSchema(), localSortMaxRecordsInRam);
addToVetSortingCollection(localSortedVet, vetReader, vbs);

@@ -1221,13 +1222,11 @@
sortedReferenceRange = localSortedReferenceRange;
}

createVariantsFromSortedRanges(sampleIdsToExtract, sortedVet, sortedReferenceRange, fullScoreMap, fullVQScoreMap, fullYngMap, samplePloidyMap, siteFilterMap, noVQScoreFilteringRequested);

return new VariantIterables(sortedVet, sortedReferenceRange);
}
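
One note on the Avro path above: when the inputs are already coordinate-sorted, the readers are passed through untouched and the disk-backed SortingCollection pass is skipped entirely. The decision reduces to a generic "sort only when the producer didn't" helper; a simplified in-memory sketch (hypothetical names; the real code spills to disk rather than buffering):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Simplified stand-in for the presorted-vs-local-sort decision; the PR's code
// uses a disk-backed SortingCollection instead of this in-memory buffer.
final class MaybeSort {
    static <T> Iterable<T> maybeSort(Iterable<T> rows, boolean presorted, Comparator<T> byLocation) {
        if (presorted) {
            return rows; // trust the producer's ordering; no second pass over the data
        }
        List<T> buffer = new ArrayList<>();
        rows.forEach(buffer::add);
        buffer.sort(byLocation);
        return buffer;
    }
}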

void createVariantsFromSortedRanges(final SortedSet<Long> sampleIdsToExtract,
final Iterable<GenericRecord> sortedVet,
Iterable<GenericRecord> sortedReferenceRange,
final VariantIterables variantIterables,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullScoreMap,
final Map<Long, Map<Allele, Map<Allele, Double>>> fullVQScoreMap,
final Map<Long, Map<Allele, Map<Allele, String>>> fullYngMap,
@@ -1255,9 +1254,9 @@ void createVariantsFromSortedRanges(final SortedSet<Long> sampleIdsToExtract,
referenceCache.put(sampleId, new TreeSet<>());
}

Iterator<GenericRecord> sortedReferenceRangeIterator = sortedReferenceRange.iterator();
Iterator<GenericRecord> sortedReferenceRangeIterator = variantIterables.refRanges.iterator();

for (final GenericRecord sortedRow : sortedVet) {
for (final GenericRecord sortedRow : variantIterables.vets) {
final ExtractCohortRecord vetRow = new ExtractCohortRecord(sortedRow);
long variantLocation = vetRow.getLocation();
long variantSample = vetRow.getSampleId();
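
The single consumer co-iterates the two sorted streams: it walks the vet rows and advances the reference-range iterator only as far as each variant's location, caching reference state per sample along the way. A stripped-down sketch of that merge pattern (hypothetical Row type; the real method also handles filters, ploidy, and scores):

import java.util.Iterator;

// Two-stream merge over location-sorted inputs (simplified; hypothetical types, Java 16+).
final class CoIterationSketch {
    record Row(long location, long sampleId) {}

    static void walk(Iterable<Row> variants, Iterable<Row> refRanges) {
        Iterator<Row> refs = refRanges.iterator();
        Row pendingRef = refs.hasNext() ? refs.next() : null;
        for (Row variant : variants) {
            // Drain reference ranges up to this variant's location, caching per-sample state.
            while (pendingRef != null && pendingRef.location() <= variant.location()) {
                // ...record pendingRef in a per-sample reference cache...
                pendingRef = refs.hasNext() ? refs.next() : null;
            }
            // ...emit the variant using the cached reference state...
        }
    }
}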