From 8008bcd68502dd945a8fff756fc1930f34097ef0 Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:39:10 -0800 Subject: [PATCH] update comments --- .../cellxgene_schema/validate.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index d9b477200..9744ea702 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -422,7 +422,6 @@ def _validate_genetic_ancestry(self): - all float('nan') if organism is not homo sapiens or info is unavailable - sum to 1.0 """ - # Extract the relevant genetic ancestry columns ancestry_columns = [ "genetic_ancestry_African", "genetic_ancestry_East_Asian", @@ -432,7 +431,6 @@ def _validate_genetic_ancestry(self): "genetic_ancestry_South_Asian", ] - # Extract the organism ontology column organism_column = "organism_ontology_term_id" # Skip any additional validation if the genetic ancestry or organism columns are not present @@ -448,7 +446,8 @@ def is_valid_row(row): if ancestry_values.isna().all(): return True - # If any values are NaN, and we didn't return earlier, then this is invalid + # If any values are NaN, and we didn't return in the earlier all NaN check, then + # this is invalid if ancestry_values.isna().any(): return False @@ -466,18 +465,16 @@ def is_valid_row(row): return False - # Identify invalid rows invalid_rows = ~self.adata.obs.apply(is_valid_row, axis=1) - # If there are invalid rows, raise an error if invalid_rows.any(): invalid_indices = self.adata.obs.index[invalid_rows].tolist() self.errors.append( - f"obs rows with indices {invalid_indices} have invalid genetic ancestry values. If " - f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then the value " - f"MUST be float('nan'). 
If organism_ontolology_term_id is 'NCBITaxon:9606' for Homo sapiens, " - f"then the value MUST be a float('nan') if unavailable; otherwise, the sum of all " - f"genetic_ancestry_* fields must be equal to 1.0" + f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. If " + f"organism_ontology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic " + f"ancestry values MUST be float('nan'). If organism_ontology_term_id is 'NCBITaxon:9606' " + f"for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the " + f"sum of all genetic_ancestry_* fields must be equal to 1.0" ) def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str): @@ -493,7 +490,7 @@ def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_ def is_individual_value_valid(value): if isinstance(value, (float, int)) and 0 <= value <= 1: return True - # Ensures only float('nan') is valid, None is invalid + # Ensures only float('nan') or numpy.nan is valid, None is invalid if isinstance(value, float) and pd.isna(value): return True return False