Skip to content

Commit

Permalink
Fix for NPE when GDB has too many alleles (#7738)
Browse files Browse the repository at this point in the history
* Fix for NPE when GDB has too many alleles and many GQ0 hom-ref genotypes
allow variant to pass QUAL filter
  • Loading branch information
ldgauthier authored Mar 30, 2022
1 parent cb813ed commit a85c25f
Show file tree
Hide file tree
Showing 7 changed files with 3,477 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ public static List<Allele> calculateMostLikelyAlleles(final VariantContext vc, f
Utils.nonNull(vc, "vc is null");
Utils.validateArg(defaultPloidy > 0, () -> "default ploidy must be > 0 but defaultPloidy=" + defaultPloidy);
Utils.validateArg(numAltAllelesToKeep > 0, () -> "numAltAllelesToKeep must be > 0, but numAltAllelesToKeep=" + numAltAllelesToKeep);
//allow PLs or GPs (as for GATK-DRAGEN), but we need some kind of genotype data
Utils.validateArg(vc.getGenotypes().stream().anyMatch(g -> g.hasPL() || g.hasExtendedAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY)), () -> "Most likely alleles cannot be calculated without likelihoods");
//NOTE: this is used in the reblocking case when we have a hom-ref GT and real ALTs
final boolean allHomRefData = vc.getGenotypes().stream().allMatch(g -> g.hasPL() && g.getPL()[0] == 0); //PL=[0,0,0] is okay, we just don't want confident variants

final boolean hasSymbolicNonRef = vc.hasAllele(Allele.NON_REF_ALLELE);
Expand All @@ -306,6 +309,10 @@ public static List<Allele> calculateMostLikelyAlleles(final VariantContext vc, f
}

final double[] likelihoodSums = calculateLikelihoodSums(vc, defaultPloidy, allHomRefData);
if (MathUtils.sum(likelihoodSums) == 0.0 && !allHomRefData) {
throw new IllegalStateException("No likelihood sum exceeded zero -- method was called for variant data " +
"with no variant information.");
}
return filterToMaxNumberOfAltAllelesBasedOnScores(numAltAllelesToKeep, vc.getAlleles(), likelihoodSums);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,13 @@ public Config getConfiguration() {
}

/**
* Main entry function to calculate genotypes of a given VC with corresponding GLs that is shared across genotypers (namely GGVCFs and HC).
* Main entry function to calculate genotypes of a given VC with corresponding GLs that is shared across
* genotypers (namely GGVCFs and HC).
*
* Completes a variant context with genotype calls and associated annotations given the genotype likelihoods and
* the model that need to be applied.
* the model that need to be applied. Hom-ref likelihoods can be approximated from GQs, but if no genotype has
* likelihoods then that variant is either all-ref or contains variants with no likelihoods, and in both cases
* we want to exit.
*
* @param vc Input variant context to complete.
* @return VC with assigned genotypes
Expand Down Expand Up @@ -393,7 +396,7 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
*/
protected final boolean cannotBeGenotyped(final VariantContext vc) {
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) {
&& vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) { //likelihoods may be missing when reading from GenomicsDB if there are more alts that GDB args allow
return false;
}
// protect against too many alternate alleles that we can't even run AF on:
Expand All @@ -402,7 +405,7 @@ protected final boolean cannotBeGenotyped(final VariantContext vc) {
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
return true;
}else {
logger.warn("No genotype contained sufficient data to recalculate genotypes. Site will be skipped at location "
logger.warn("No genotype contained sufficient data to recalculate site and allele qualities. Site will be skipped at location "
+ vc.getContig() + ":" + vc.getStart());
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,20 @@ public AFCalculationResult calculate(final VariantContext vc) {
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
*
* @param vc the VariantContext holding the alleles and sample information. The VariantContext
* must have at least 1 alternative allele
* must have at least 1 alternative allele and at least one variant genotype with likelihoods.
* Hom-ref genotype likelihoods can be approximated, but the result will be zero allele counts without
* likelihoods for a non-reference genotype.
* @return result (for programming convenience)
*/
public AFCalculationResult calculate(final VariantContext vc, final int defaultPloidy) {
Utils.nonNull(vc, "VariantContext cannot be null");
Utils.validate(vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods),
"VariantContext at " + vc.getContig() + ":" + vc.getStart() + "must contain at least one " +
"genotype with likelihoods -- did this VC exceed the max number of alt alleles?");
final int numAlleles = vc.getNAlleles();
final List<Allele> alleles = vc.getAlleles();
Utils.validateArg( numAlleles > 1, () -> "VariantContext has only a single reference allele, but getLog10PNonRef requires at least one at all " + vc);
Utils.validateArg( numAlleles > 1, () -> "VariantContext at " + vc.getContig() + ":" + vc.getStart() +
"has only a single reference allele, but getLog10PNonRef requires at least alternate allele");

final double[] priorPseudocounts = alleles.stream()
.mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == vc.getReference().length() ? snpPseudocount : indelPseudocount)).toArray();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
}

// Genotype::getLikelihoods returns a new array, so modification in-place is safe
final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
final double[] normalizedLikelihoods;
if (g.hasLikelihoods()) {
normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
} else {
throw new IllegalStateException("Genotype has no likelihoods: " + g.toString());
}
final double[] biallelicLikelihoods;

//if there are multiple alts, use the biallelic PLs for the best alt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,23 @@ public void testGDBMaxAltsEqualsGGVCFsMaxAlts() {
File output = runGenotypeGVCFS(genomicsDBUri, null, args, b37_reference_20_21);
}

@Test
public void testGenotypingOnVCWithMissingPLs() {
//this regression test input:
// 1) is missing PLs because it had more than the allowed number of alts for GDB
// 2) has enough GQ0 samples to achieve a QUAL high enough to pass the initial threshold
final String input = toolsTestDir + "/walkers/GenotypeGVCFs/test.tooManyAltsNoPLs.g.vcf";
final List<String> args = new ArrayList<String>();
args.add("-G");
args.add("StandardAnnotation");
args.add("-G");
args.add("AS_StandardAnnotation");
final File output = runGenotypeGVCFS(input, null, args, hg38Reference);
final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
//only variant is a big variant that shouldn't get output because it is missing PLs, but we should skip it gracefully
Assert.assertEquals(outputData.getRight().size(), 0);
}

private void runAndCheckGenomicsDBOutput(final ArgumentsBuilder args, final File expected, final File output) {
Utils.resetRandomGenerator();
runCommandLine(args);
Expand Down
Loading

0 comments on commit a85c25f

Please sign in to comment.