Skip to content

Commit

Permalink
feat: update validation for obs['in_tissue'] to include descendants o…
Browse files Browse the repository at this point in the history
…f Visiium (#1124)

Co-authored-by: Evan Molinelli <[email protected]>
  • Loading branch information
ejmolinelli and Evan Molinelli authored Nov 25, 2024
1 parent 0392f2b commit 0c9f9af
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 37 deletions.
14 changes: 14 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import os
import sys
from base64 import b85encode
from functools import lru_cache
from typing import Dict, List, Union

import anndata as ad
import numpy as np
from cellxgene_ontology_guide.ontology_parser import OntologyParser
from scipy import sparse
from xxhash import xxh3_64_intdigest

Expand Down Expand Up @@ -151,3 +153,15 @@ def get_hash_digest_column(dataframe):
.astype(np.uint64)
.apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii"))
)


@lru_cache()
def is_ontological_descendant_of(onto: OntologyParser, term: str, target: str, include_self: bool = True) -> bool:
"""
Determines if :term is an ontological descendant of :target and whether to include :term==:target.
This function is cached and is safe to call many times.
#TODO:[EM] needs testing
"""
return term in set(onto.get_term_descendants(target, include_self))
88 changes: 51 additions & 37 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from scipy import sparse

from . import gencode, schema
from .utils import SPARSE_MATRIX_TYPES, get_matrix_format, getattr_anndata, read_h5ad
from .utils import SPARSE_MATRIX_TYPES, get_matrix_format, getattr_anndata, is_ontological_descendant_of, read_h5ad

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -211,7 +211,7 @@ def _validate_curie_ancestors(
is_valid_term_id = ONTOLOGY_PARSER.is_valid_term_id(term_id)
is_valid_ancestor_id = ONTOLOGY_PARSER.is_valid_term_id(ancestor)
if is_valid_term_id & is_valid_ancestor_id:
is_descendant = ancestor in ONTOLOGY_PARSER.get_term_ancestors(term_id)
is_descendant = ancestor in ONTOLOGY_PARSER.get_term_ancestors(term_id, inclusive)
checks.append(is_descendant)

if True not in checks:
Expand Down Expand Up @@ -1477,18 +1477,25 @@ def _validate_spatial_cell_type_ontology_term_id(self):
:rtype none
"""
# Exit if:
# - not Visium and is_single is True as no further checks are necessary
# - in_tissue is not specified as checks are dependent on this value
if not self._is_visium_including_descendants() and self._is_single() or "in_tissue" not in self.adata.obs:
self._is_visium_including_descendants()
self._is_single()
self._is_visium_and_is_single_true()

# skip checks if not a valid spatial assay with a corresponding "in_tissue" column
if not self.is_visium_and_is_single_true:
# not a valid spatial assay
return
elif self.is_visium_and_is_single_true and "in_tissue" not in self.adata.obs.columns:
# valid spatial assay, but missing "in_tissue" column
return

# Validate cell type: must be "unknown" if Visium and is_single is True and in_tissue is 0.
if (
self._is_visium_including_descendants()
& (self.adata.obs["in_tissue"] == 0)
& (self.adata.obs["cell_type_ontology_term_id"] != "unknown")
).any():
# Validate all out of tissue (in_tissue==0) spatial spots have unknown cell ontology term
is_spatial = self.adata.obs["assay_ontology_term_id"].apply(
lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)
)
is_not_tissue = self.adata.obs["in_tissue"] == 0
is_not_unknown = self.adata.obs["cell_type_ontology_term_id"] != "unknown"
if (is_spatial & is_not_tissue & is_not_unknown).any():
self.errors.append(
f"obs['cell_type_ontology_term_id'] must be 'unknown' when {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0}."
)
Expand All @@ -1500,11 +1507,21 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int,
:rtype none
"""
# check for visium status and then is visium and single
# techdebt: the following lines are order dependent. Violates idempotence.
self._is_visium_including_descendants()
self._is_single()
self._is_visium_and_is_single_true()

# Tissue position is foribidden if assay is not Visium and is_single is True.
if tissue_position_name in self.adata.obs and (
not self._is_visium_and_is_single_true()
not (self.is_visium_and_is_single_true)
or (
~(self.adata.obs["assay_ontology_term_id"] == ASSAY_VISIUM)
~(
self.adata.obs["assay_ontology_term_id"].apply(
lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True)
)
)
& (self.adata.obs[tissue_position_name].notnull())
).any()
):
Expand All @@ -1521,7 +1538,11 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int,
if (
tissue_position_name not in self.adata.obs
or (
(self.adata.obs["assay_ontology_term_id"] == ASSAY_VISIUM)
(
self.adata.obs["assay_ontology_term_id"].apply(
lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True)
)
)
& (self.adata.obs[tissue_position_name].isnull())
).any()
):
Expand Down Expand Up @@ -1767,34 +1788,27 @@ def _is_visium(self) -> bool:

def _is_visium_including_descendants(self) -> bool:
"""
Determine if the assay_ontology_term_id is Visium (descendant of EFO:0010961).
Determine if the assay_ontology_term_id is Visium (inclusive descendant of EFO:0010961).
Returns True if ANY assay_ontology_term_id is a Visium descendant
:return True if assay_ontology_term_id is Visium, False otherwise.
:rtype bool
"""
if self.is_visium is None:
assay_ontology_term_id = self.adata.obs.get("assay_ontology_term_id")

if assay_ontology_term_id is not None:
# Convert to a regular Series if it's Categorical
assay_ontology_term_id = pd.Series(assay_ontology_term_id)
_assay_key = "assay_ontology_term_id"
includes_and_visium = False

# Check if any term is a descendant of ASSAY_VISIUM
try:
visium_results = assay_ontology_term_id.apply(
lambda term: ASSAY_VISIUM
in list(ONTOLOGY_PARSER.get_lowest_common_ancestors(ASSAY_VISIUM, term))
)
self.is_visium = visium_results.astype(bool).any()
except KeyError as e:
# This generally means the assay_ontology_term_id is invalid, but we want the error to be raised
# by our explicit validator checks, not this implicit one.
logger.warning(f"KeyError processing assay_ontology_term_id ontology: {e}")
self.is_visium = False
else:
self.is_visium = False
# only compute if not already stored
if self.is_visium is None and _assay_key in self.adata.obs.columns:
# check if any assay_ontology_term_ids are descendants of VISIUM
includes_and_visium = (
self.adata.obs[_assay_key]
.apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True))
.any()
)

return self.is_visium
# save state and return
self.is_visium = includes_and_visium
return includes_and_visium

def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None):
"""
Expand Down
21 changes: 21 additions & 0 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,27 @@ def test_column_presence_assay(self, validator_with_adata):
"to missing dependent column in adata.obs.",
]

@pytest.mark.parametrize(
"assay_ontology_term_id, is_descendant",
[("EFO:0010961", True), ("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)],
)
def test_column_presence_in_tissue(self, validator_with_visium_assay, assay_ontology_term_id, is_descendant):
"""
Spatial assays that are descendants of visium must have a valid "in_tissue" column.
"""
validator: Validator = validator_with_visium_assay

# reset and test
validator.reset()
validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id
validator._validate_spatial_tissue_position("in_tissue", 0, 1)
if is_descendant:
assert validator.errors == []
else:
assert validator.errors == [
"obs['in_tissue'] is only allowed for descendants of obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True."
]

@pytest.mark.parametrize("reserved_column", schema_def["components"]["obs"]["reserved_columns"])
def test_obs_reserved_columns_presence(self, validator_with_adata, reserved_column):
"""
Expand Down

0 comments on commit 0c9f9af

Please sign in to comment.