diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index d14a0442e..28a153d73 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -186,7 +186,12 @@ components: type: curie dependencies: - # If tissue_type is tissue OR organoid - rule: "tissue_type == 'tissue' | tissue_type == 'organoid'" + rule: + column: tissue_type + match_exact: + terms: + - tissue + - organoid error_message_suffix: >- When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a descendant term id of 'UBERON:0001062' (anatomical entity). @@ -199,7 +204,11 @@ components: UBERON: - UBERON:0001062 - # If tissue_type is cell culture - rule: "tissue_type == 'cell culture'" + rule: + column: tissue_type + match_exact: + terms: + - cell culture error_message_suffix: >- When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be either a CL term (excluding 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), @@ -222,7 +231,11 @@ components: type: curie dependencies: - # If organism is Human - rule: "organism_ontology_term_id == 'NCBITaxon:9606'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:9606 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:9606' (Homo sapiens), self_reported_ethnicity_ontology_term_id MUST be formatted as one @@ -285,7 +298,11 @@ components: type: curie dependencies: - # If organism is Human - rule: "organism_ontology_term_id == 'NCBITaxon:9606'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:9606 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:9606' (Homo sapiens), 'development_stage_ontology_term_id' MUST be the most accurate descendant of 'HsapDv:0000001' or unknown. @@ -300,7 +317,11 @@ components: exceptions: - unknown - # If organism is Mouse - rule: "organism_ontology_term_id == 'NCBITaxon:10090'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:10090 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:10090' (Mus musculus), 'development_stage_ontology_term_id' MUST be the most accurate descendant of 'MmusDv:0000001' or unknown. @@ -353,227 +374,70 @@ components: selected the most appropriate value for the assay(s) between 'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com during submission so that the assay(s) can be added to the schema definition document. dependencies: - - # If assay_ontology_term_id is EFO:0030080 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'cell' or 'nucleus' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0030080 - inclusive: True + - EFO:0030080 + - EFO:0010184 + match_exact: + terms: + - EFO:0010010 + - EFO:0008722 + - EFO:0010550 + - EFO:0008780 + - EFO:0700010 + - EFO:0700011 + - EFO:0009919 + - EFO:0030060 + - EFO:0022490 + - EFO:0030028 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030080 or its descendants enum: - "cell" - "nucleus" - - # If assay_ontology_term_id is EFO:0007045 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'nucleus' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0007045 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0007045 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010184 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0010184 - inclusive: True + - EFO:0007045 + - EFO:0002761 + match_exact: + terms: + - EFO:0008720 + - EFO:0030026 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010184 or its descendants enum: - - "cell" - "nucleus" - - # If assay_ontology_term_id is EFO:0008994 or its descendants, 'suspension_type' MUST be 'na' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - #'suspension_type' MUST be 'cell' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0008994 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008994 or its descendants - enum: - - "na" - - # If assay_ontology_term_id is EFO:0008919 or its descendants, 'suspension_type' MUST be 'cell' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0008919 - inclusive: True + - EFO:0008919 + match_exact: + terms: + - EFO:0030002 + - EFO:0008853 + - EFO:0008796 + - EFO:0700003 + - EFO:0700004 + - EFO:0008953 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008919 or its descendants enum: - "cell" - - # If assay_ontology_term_id is EFO:0002761 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'na' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0002761 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0002761 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008720, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008720'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008720 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0008722, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008722'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008722 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030002, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0030002'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030002 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008853, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008853'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008853 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0030026, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030026'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030026 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010550, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010550'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010550 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008796, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008796'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008796 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700003, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700003'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700003 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700004, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700004'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700004 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008780, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008780'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008780 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008953, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008953'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008953 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0700011, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700011'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700011 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0009919, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0009919'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0009919 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030060, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030060'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030060 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0022490, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0022490'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0022490 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030028, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030028'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030028 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008992, 'suspension_type' MUST be 'na' - rule: "assay_ontology_term_id == 'EFO:0008992'" + - EFO:0008994 + match_exact: + terms: + - EFO:0008992 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008992 enum: - "na" tissue_type: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 6b62f796b..292cc8ec7 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -13,7 +13,6 @@ import scipy from anndata._core.sparse_dataset import SparseDataset from cellxgene_ontology_guide.ontology_parser import OntologyParser -from pandas.errors import UndefinedVariableError from scipy import sparse from . import gencode, schema @@ -630,7 +629,9 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s f"these features must be 0." ) - def _validate_column(self, column: pd.Series, column_name: str, df_name: str, column_def: dict): + def _validate_column( + self, column: pd.Series, column_name: str, df_name: str, column_def: dict, default_error_message_suffix=None + ): """ Given a schema definition and the column of a dataframe, verify that the column satisfies the schema. If there are any errors, it adds them to self.errors @@ -640,6 +641,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co :param str df_name: Name of the dataframe :param dict column_def: schema definition for this specific column, e.g. schema_def["obs"]["columns"]["cell_type_ontology_term_id"] + :param str default_error_message_suffix: default error message suffix to be added to errors found here :rtype None """ @@ -708,10 +710,11 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co self._validate_curie_str(term_str, column_name, column_def["curie_constraints"]) # Add error suffix to errors found here - if "error_message_suffix" in column_def: + error_message_suffix = column_def.get("error_message_suffix", default_error_message_suffix) + if error_message_suffix: error_total_count = len(self.errors) for i in range(error_original_count, error_total_count): - self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"] + self.errors[i] = self.errors[i] + " " + error_message_suffix def _validate_column_dependencies( self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[dict] @@ -731,73 +734,38 @@ def _validate_column_dependencies( """ all_rules = [] - for dependency_def in dependencies: - if "complex_rule" in dependency_def: - if "match_ancestors" in dependency_def["complex_rule"]: - query_fn, args = self._generate_match_ancestors_query_fn( - dependency_def["complex_rule"]["match_ancestors"] - ) - term_id, ontologies, ancestors, ancestor_inclusive = args - query_exp = f"@query_fn({term_id}, {ontologies}, {ancestors}, {ancestor_inclusive})" - elif "rule" in dependency_def: - query_exp = dependency_def["rule"] - else: - continue - + terms_to_match = set() + column_to_match = dependency_def["rule"]["column"] + if "match_ancestors_inclusive" in dependency_def["rule"]: + ancestors = dependency_def["rule"]["match_ancestors_inclusive"]["ancestors"] + for ancestor in ancestors: + terms_to_match.update(ONTOLOGY_PARSER.get_term_descendants(ancestor, include_self=True)) + if "match_exact" in dependency_def["rule"]: + terms_to_match.update(dependency_def["rule"]["match_exact"]["terms"]) try: - column = getattr(df.query(query_exp, engine="python"), column_name) - except UndefinedVariableError: + match_query = df[column_to_match].isin(terms_to_match) + match_df = df[match_query] + column = getattr(match_df, column_name) + error_message_suffix = dependency_def.get("error_message_suffix", None) + if not error_message_suffix: + matched_values = list(getattr(match_df, column_to_match).unique()) + error_message_suffix = f"when '{column_to_match}' is in {matched_values}" + except KeyError: self.errors.append( f"Checking values with dependencies failed for adata.{df_name}['{column_name}'], " f"this is likely due to missing dependent column in adata.{df_name}." ) return pd.Series(dtype=np.float64) - all_rules.append(query_exp) - - self._validate_column(column, column_name, df_name, dependency_def) + all_rules.append(match_query) + self._validate_column(column, column_name, df_name, dependency_def, error_message_suffix) - # Set column with the data that's left - all_rules = " | ".join(all_rules) - column = getattr(df.query("not (" + all_rules + " )", engine="python"), column_name) + # Return column of data that was not matched by any of the rules + column = getattr(df[~np.logical_or.reduce(all_rules)], column_name) return column - def _generate_match_ancestors_query_fn(self, rule_def: Dict): - """ - Generates vectorized function and args to query a pandas dataframe. Function will determine whether values from - a specified column is a descendant term to a group of specified ancestors, returning a Bool. - :param rule_def: defines arguments to pass into vectorized ancestor match validation function - :return: Tuple(function, Tuple(str, List[str], List[str])) - """ - validate_curie_ancestors_vectorized = np.vectorize(self._validate_curie_ancestors) - ancestor_map = rule_def["ancestors"] - inclusive = rule_def["inclusive"] - - # hack: pandas dataframe query doesn't support Dict inputs - ontology_keys = [] - ancestor_list = [] - for key, val in ancestor_map.items(): - ontology_keys.append(key) - ancestor_list.append(val) - - def is_ancestor_match( - term_id: str, - ontologies: List[str], - ancestors: List[str], - ancestor_inclusive: bool, - ) -> bool: - allowed_ancestors = dict(zip(ontologies, ancestors)) - return validate_curie_ancestors_vectorized(term_id, allowed_ancestors, inclusive=ancestor_inclusive) - - return is_ancestor_match, ( - rule_def["column"], - ontology_keys, - ancestor_list, - inclusive, - ) - def _validate_list(self, list_name: str, current_list: List[str], element_type: str): """ Validates the elements of a list based on the type definition. Adds errors to self.errors if any diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index bf83c97b7..f78ad6dac 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1484,13 +1484,15 @@ def test_suspension_type(self, validator, assay, suspension_types): if "na" in suspension_types: invalid_suspension_type = "nucleus" if "nucleus" not in suspension_types else "cell" obs = validator.adata.obs - obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type - obs.loc[obs.index[1], "assay_ontology_term_id"] = assay + obs["suspension_type"] = invalid_suspension_type + obs["assay_ontology_term_id"] = assay + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay}" + f"'assay_ontology_term_id' is in ['{assay}']" ] @pytest.mark.parametrize( @@ -1517,13 +1519,15 @@ def test_suspension_type_ancestors_inclusive(self, validator_with_adata, assay, if "na" in suspension_types: invalid_suspension_type = "nucleus" if "nucleus" not in suspension_types else "cell" obs["suspension_type"] = obs["suspension_type"].cat.remove_unused_categories() - obs.loc[obs.index[1], "assay_ontology_term_id"] = assay - obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type + obs["suspension_type"] = invalid_suspension_type + obs["assay_ontology_term_id"] = assay + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay} or its descendants" + f"'assay_ontology_term_id' is in ['{assay}']" ] def test_suspension_type_with_descendant_term_id_failure(self, validator_with_adata): @@ -1533,14 +1537,15 @@ def test_suspension_type_with_descendant_term_id_failure(self, validator_with_ad """ validator = validator_with_adata obs = validator.adata.obs - obs.loc[obs.index[0], "assay_ontology_term_id"] = "EFO:0022615" # descendant of EFO:0008994 - obs.loc[obs.index[0], "suspension_type"] = "nucleus" - + obs["suspension_type"] = "nucleus" + obs["assay_ontology_term_id"] = "EFO:0022615" # descendant of EFO:0008994 + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ "ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " "'['nucleus']'. Values must be one of ['na'] when " - "'assay_ontology_term_id' is EFO:0008994 or its descendants" + "'assay_ontology_term_id' is in ['EFO:0022615']" ] def test_suspension_type_with_descendant_term_id_success(self, validator_with_adata):