From e4d5a58a8adfd2a9aa925372826813c8c07d3da3 Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:20:28 -0800 Subject: [PATCH] fix comments --- .../cellxgene_schema/validate.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 2943143be..4b6e0cb4a 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -427,7 +427,7 @@ def _validate_genetic_ancestry(self): organism_column = "organism_ontology_term_id" # Skip any additional validation if the genetic ancestry or organism columns are not present - # An error for missing columns will be raised at a different point in + # An error for missing columns will be raised at a different point required_columns = ancestry_columns + [organism_column] for column in required_columns: if column not in self.adata.obs.columns: @@ -435,19 +435,20 @@ def _validate_genetic_ancestry(self): def is_valid_row(row): ancestry_values = row[ancestry_columns] - # All values are NaN + # All values are NaN. This is always valid, regardless of organism if ancestry_values.isna().all(): return True - # If one value is NaN, then all must be NaN + # If any values are NaN, and we didn't return earlier, then this is invalid if ancestry_values.isna().any(): return False - # If organism is not homo sapiens, then it must be all NaN + # If organism is not homo sapiens, and we didn't return in the earlier all NaN check, + # then this row is invalid if row[organism_column] != "NCBITaxon:9606": return False - # The sum of values is approximately 1.0 + # The sum of genetic ancestry values should be approximately 1.0 if ancestry_values.apply(lambda x: isinstance(x, (float, int))).all(): if abs(ancestry_values.sum() - 1.0) <= 1e-6: return True @@ -464,9 +465,8 @@ def is_valid_row(row): f"obs rows with indices {invalid_indices} have invalid genetic ancestry values. If " f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then the value " f"MUST be float('nan'). If organism_ontolology_term_id is 'NCBITaxon:9606' for Homo sapiens, " - f"then the value MUST be a float('nan') if unavailable; otherwise, the value MUST be " - f"the genetic ancestry percentage of 'HANCESTRO:0010' for African expressed as a float " - f"greater than or equal to 0.0 and less than or equal to 1.0" + f"then the value MUST be a float('nan') if unavailable; otherwise, the sum of all " + f"genetic_ancestry_* fields must be equal to 1.0" ) def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str):