Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add genetic ancestry fields for schema 5.3 #1132

Merged
merged 5 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,15 @@ components:
- "cell culture"
- "organoid"
- "tissue"
genetic_ancestry_African:
type: genetic_ancestry_value
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i initially tried doing something like:

type: float
rule: genetic_ancestry_African >= 0 and genetic_ancestry_African <= 1 or genetic_ancestry_African == float('nan')

However, I couldn't quite figure out how to get the NaN check to work correctly with the query syntax, so I created a new genetic_ancestry_value type instead.

genetic_ancestry_East_Asian:
type: genetic_ancestry_value
genetic_ancestry_European:
type: genetic_ancestry_value
genetic_ancestry_Indigenous_American:
type: genetic_ancestry_value
genetic_ancestry_Oceanian:
type: genetic_ancestry_value
genetic_ancestry_South_Asian:
type: genetic_ancestry_value
92 changes: 92 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,94 @@ def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, spar
self.number_non_zero[matrix_name] = nnz
return nnz

def _validate_genetic_ancestry(self):
    """
    Performs row-based validation of the genetic_ancestry_* columns in obs.

    A row is considered valid when either:
    - every genetic_ancestry_* value is NaN (always accepted, regardless of organism), or
    - the organism is 'NCBITaxon:9606', none of the values is NaN, every value is a
      float/int, and the values sum to 1.0 within an absolute tolerance of 1e-6.

    Nothing is checked when a required column is missing or when obs has no rows.
    When at least one row is invalid, a single message listing the offending obs
    indices is appended to self.errors. Returns None.
    """
    ancestry_columns = [
        "genetic_ancestry_African",
        "genetic_ancestry_East_Asian",
        "genetic_ancestry_European",
        "genetic_ancestry_Indigenous_American",
        "genetic_ancestry_Oceanian",
        "genetic_ancestry_South_Asian",
    ]
    organism_column = "organism_ontology_term_id"
    obs = self.adata.obs

    # Skip any additional validation if the genetic ancestry or organism columns are not present.
    # An error for missing columns will be raised at a different point.
    if any(column not in obs.columns for column in ancestry_columns + [organism_column]):
        return

    # With zero rows there is nothing to validate, and a row-wise apply on an empty frame is
    # not guaranteed to yield a boolean mask that can be inverted below.
    if len(obs) == 0:
        return

    def is_valid_row(row):
        ancestry_values = row[ancestry_columns]
        missing = ancestry_values.isna()

        # All values are NaN. This is always valid, regardless of organism.
        if missing.all():
            return True

        # A mix of NaN and non-NaN values is never valid.
        if missing.any():
            return False

        # Only Homo sapiens rows may carry actual ancestry values.
        if row[organism_column] != "NCBITaxon:9606":
            return False

        # Check the types first so that the sum is only computed on numeric values.
        all_numeric = ancestry_values.apply(lambda x: isinstance(x, (float, int))).all()
        # The sum of genetic ancestry values should be approximately 1.0
        return bool(all_numeric and abs(ancestry_values.sum() - 1.0) <= 1e-6)

    invalid_rows = ~obs.apply(is_valid_row, axis=1)

    if invalid_rows.any():
        invalid_indices = obs.index[invalid_rows].tolist()
        self.errors.append(
            f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. If "
            "organism_ontology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic "
            "ancestry values MUST be float('nan'). If organism_ontology_term_id is 'NCBITaxon:9606' "
            "for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the "
            "sum of all genetic_ancestry_* fields must be equal to 1.0"
        )

def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str):
    """
    Checks a single genetic_ancestry_value column of obs, value by value.

    The column itself must have a float dtype; otherwise a dtype error is recorded in
    self.errors and no per-value check is run. Each value is then accepted when it is:
    - a number between 0 and 1 (both ends included), or
    - float('nan')

    All rejected values are reported together in one message appended to self.errors.
    """
    if column.dtype != float:
        self.errors.append(f"Column '{column_name}' in obs must be float, not '{column.dtype.name}'.")
        return

    def _acceptable(entry):
        # NaN is only accepted as a real float (float('nan') / numpy.nan); None is rejected.
        if isinstance(entry, float) and pd.isna(entry):
            return True
        # Anything else must be numeric and lie within the closed interval [0, 1].
        return bool(isinstance(entry, (float, int)) and 0 <= entry <= 1)

    # Keep only the entries that failed the per-value check.
    accepted_mask = column.map(_acceptable)
    offending = column[~accepted_mask]

    if offending.empty:
        return

    self.errors.append(
        f"Column '{column_name}' in obs contains invalid values: {offending.to_list()}. "
        f"Valid values are floats between 0 and 1 or float('nan')."
    )

def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: str, df_name: str):
"""
Validates the "is_feature_filtered" in adata.var. This column must be bool, and for genes that are set to
Expand Down Expand Up @@ -505,6 +593,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
if column_def.get("type") == "feature_is_filtered":
self._validate_column_feature_is_filtered(column, column_name, df_name)

if column_def.get("type") == "genetic_ancestry_value":
self._validate_individual_genetic_ancestry_value(column, column_name)

if "enum" in column_def:
bad_enums = [v for v in column.drop_duplicates() if v not in column_def["enum"]]
if bad_enums:
Expand Down Expand Up @@ -781,6 +872,7 @@ def _validate_dataframe(self, df_name: str):
f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
f"zero observations. These categories will be removed when `--add-labels` flag is present."
)
self._validate_genetic_ancestry()
categorical_types = {type(x) for x in column.dtype.categories.values}
# Check for columns that have illegal categories, which are not supported by anndata 0.8.0
# TODO: check if this can be removed after upgading to anndata 0.10.0
Expand Down
90 changes: 90 additions & 0 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
"HsapDv:0000003",
"donor_1",
"nucleus",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -62,6 +68,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -78,6 +90,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down Expand Up @@ -144,6 +162,12 @@
"donor_1",
"na",
0,
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
2,
Expand All @@ -161,6 +185,12 @@
"donor_2",
"na",
1,
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -180,6 +210,12 @@
"donor_id",
"suspension_type",
"in_tissue",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand All @@ -203,6 +239,12 @@
"HsapDv:0000003",
"donor_1",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -217,6 +259,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -233,6 +281,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand All @@ -255,6 +309,12 @@
"HsapDv:0000003",
"donor_1",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -269,6 +329,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -285,6 +351,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down Expand Up @@ -493,6 +565,12 @@
"tissue:1",
"sre:1",
"development_stage:1",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"cell_type:1",
Expand All @@ -503,6 +581,12 @@
"tissue:1",
"sre:1",
"development_stage:1",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -515,6 +599,12 @@
"tissue_ontology_term_id",
"self_reported_ethnicity_ontology_term_id",
"development_stage_ontology_term_id",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down
Binary file modified cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
Binary file not shown.
Loading
Loading