Skip to content

Commit

Permalink
add donor id check
Browse files Browse the repository at this point in the history
  • Loading branch information
joyceyan committed Nov 27, 2024
1 parent 8008bcd commit 477f592
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 2 deletions.
19 changes: 17 additions & 2 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,8 @@ def _validate_genetic_ancestry(self):
Performs row-based validation of the genetic_ancestry_X fields. This ensures that a valid row must be:
- all float('nan') if organism is not homo sapiens or info is unavailable
- sum to 1.0
Additionally, verifies that all rows with the same donor_id must have the same genetic ancestry values
"""
ancestry_columns = [
"genetic_ancestry_African",
Expand All @@ -432,16 +434,28 @@ def _validate_genetic_ancestry(self):
]

organism_column = "organism_ontology_term_id"
donor_id_column = "donor_id"

# Skip any additional validation if the genetic ancestry, organism, or donor_id columns are not present
# An error for missing columns will be raised at a different point
required_columns = ancestry_columns + [organism_column]
required_columns = ancestry_columns + [organism_column, donor_id_column]
for column in required_columns:
if column not in self.adata.obs.columns:
return

donor_id_to_ancestry_values = dict()

def is_valid_row(row):
ancestry_values = row[ancestry_columns]

# If ancestry values are different for the same donor id, then this row is invalid
donor_id = row[donor_id_column]
if donor_id in donor_id_to_ancestry_values:
if not donor_id_to_ancestry_values[donor_id].equals(ancestry_values):
return False
else:
donor_id_to_ancestry_values[donor_id] = ancestry_values

# All values are NaN. This is always valid, regardless of organism
if ancestry_values.isna().all():
return True
Expand Down Expand Up @@ -470,7 +484,8 @@ def is_valid_row(row):
if invalid_rows.any():
invalid_indices = self.adata.obs.index[invalid_rows].tolist()
self.errors.append(
f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. If "
f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All "
f"observations with the same donor_id must contain the same genetic_ancestry_* values. If "
f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic"
f"ancestry values MUST be float('nan'). If organism_ontolology_term_id is 'NCBITaxon:9606' "
f"for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the "
Expand Down
27 changes: 27 additions & 0 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,33 @@ def test_genetic_ancestry__invalid(
validator.validate_adata()
assert len(validator.errors) > 0

def test_genetic_ancestry_same_donor_id(self, validator_with_adata):
    """
    Rows that share a donor_id must also share identical genetic_ancestry_X values.
    """
    validator = validator_with_adata
    saved_donor_ids = validator.adata.obs["donor_id"].copy()

    # Overwrite the second row with the first row's values so both rows carry the same
    # donor id and the same genetic ancestry values. Validation is expected to pass.
    first_row_values = validator.adata.obs.iloc[0].values
    validator.adata.obs.iloc[1] = first_row_values
    validator.validate_adata()
    assert validator.errors == []

    # Give the two rows differing genetic ancestry values while they still share a
    # donor id. Validation is now expected to report at least one error.
    conflicting_ancestry = {
        "genetic_ancestry_African": [1.0, 0.0],
        "genetic_ancestry_East_Asian": [0.0, 1.0],
        "genetic_ancestry_European": [0.0, 0.0],
        "genetic_ancestry_Indigenous_American": [0.0, 0.0],
        "genetic_ancestry_Oceanian": [0.0, 0.0],
        "genetic_ancestry_South_Asian": [0.0, 0.0],
    }
    for column_name, per_row_values in conflicting_ancestry.items():
        validator.adata.obs[column_name] = per_row_values
    validator.validate_adata()
    assert len(validator.errors) > 0

    # Restore the original donor ids so the two rows belong to different donors again.
    # Differing ancestry values are then allowed and validation is expected to pass.
    validator.adata.obs["donor_id"] = saved_donor_ids
    validator.validate_adata()
    assert validator.errors == []


class TestVar:
"""
Expand Down

0 comments on commit 477f592

Please sign in to comment.