Skip to content

Commit

Permalink
fix: determine species from gene file, not from gene prefix (#1253)
Browse files Browse the repository at this point in the history
  • Loading branch information
joyceyan authored Feb 7, 2025
1 parent 3107d81 commit b7e96bf
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 37 deletions.
42 changes: 7 additions & 35 deletions cellxgene_schema_cli/cellxgene_schema/gencode.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,48 +30,20 @@ def get_organism_from_feature_id(
feature_id: str,
) -> Union[SupportedOrganisms, None]:
"""
Infers the organism of a feature id based on the prefix of a feature id, e.g. ENSG means Homo sapiens
Determines organism based on which gene file the feature id was in
:param str feature_id: the feature id
:rtype Union[ontology.SupportedOrganisms, None]
:return: the organism the feature id is from
"""

if feature_id.startswith("ENSG") or feature_id.startswith("ENST"):
return SupportedOrganisms.HOMO_SAPIENS
elif feature_id.startswith("ENSMUS"):
return SupportedOrganisms.MUS_MUSCULUS
elif feature_id.startswith("ENSSAS"):
return SupportedOrganisms.SARS_COV_2
elif feature_id.startswith("ERCC-"):
return SupportedOrganisms.ERCC
elif feature_id.startswith("FB") or feature_id.startswith("RR"):
return SupportedOrganisms.DROSOPHILA_MELANOGASTER
elif feature_id.startswith("ENSDARG"):
return SupportedOrganisms.DANIO_RERIO
elif feature_id.startswith("WBGene"):
return SupportedOrganisms.CAENORHABDITIS_ELEGANS
elif feature_id.startswith("ENSCJAG"):
return SupportedOrganisms.CALLITHRIX_JACCHUS
elif feature_id.startswith("ENSGGOG"):
return SupportedOrganisms.GORILLA_GORILLA
elif feature_id.startswith("ENSMFAG"):
return SupportedOrganisms.MACACA_FASCICULARIS
elif feature_id.startswith("ENSMMUG"):
return SupportedOrganisms.MACACA_MULATTA
elif feature_id.startswith("ENSMICG"):
return SupportedOrganisms.MICROCEBUS_MURINUS
elif feature_id.startswith("ENSOCUG"):
return SupportedOrganisms.ORYCTOLAGUS_CUNICULUS
elif feature_id.startswith("ENSPTRG"):
return SupportedOrganisms.PAN_TROGLODYTES
elif feature_id.startswith("ENSRNOG"):
return SupportedOrganisms.RATTUS_NORVEGICUS
elif feature_id.startswith("ENSSSCG"):
return SupportedOrganisms.SUS_SCROFA
else:
return None
for organism in SupportedOrganisms:
gene_checker = get_gene_checker(organism)
if gene_checker.is_valid_id(feature_id):
return organism

return None


class GeneChecker:
Expand Down
1 change: 1 addition & 0 deletions cellxgene_schema_cli/tests/test_gencode.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def test_valid_genes(self, species, valid_genes):
assert geneChecker.is_valid_id(gene_id)
assert geneChecker.get_symbol(gene_id) == gene_label
assert geneChecker.get_length(gene_id) == gene_length
assert gencode.get_organism_from_feature_id(gene_id) == species

@pytest.mark.parametrize("species,invalid_genes", invalid_genes.items())
def test_invalid_genes(self, species, invalid_genes):
Expand Down
4 changes: 2 additions & 2 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1879,7 +1879,7 @@ def test_feature_id_non_existent_ensembl(self, validator_with_adata, component_n
component.set_index(pd.Index(new_index), inplace=True)

validator.validate_adata()
assert validator.errors == [f"ERROR: 'ENSG000' is not a valid feature ID in '{component_name}'."]
assert len(validator.errors) > 0

@pytest.mark.parametrize("component_name", ["var", "raw.var"])
def test_feature_id_non_existent_ercc(self, validator_with_adata, component_name):
Expand All @@ -1896,7 +1896,7 @@ def test_feature_id_non_existent_ercc(self, validator_with_adata, component_name
component.set_index(pd.Index(new_index), inplace=True)

validator.validate_adata()
assert validator.errors == [f"ERROR: 'ERCC-000000' is not a valid feature ID in '{component_name}'."]
assert len(validator.errors) > 0

def test_should_warn_for_low_gene_count(self, validator_with_adata):
"""
Expand Down

0 comments on commit b7e96bf

Please sign in to comment.