feat: support for all secondary species (#1250)

Co-authored-by: Evan Molinelli <[email protected]>
chanzuckerberg · Feb 7, 2025 · 241cefd · 241cefd
1 parent 0aaf700
commit 241cefd
Show file tree

Hide file tree

Showing 24 changed files with 136 additions and 19 deletions.
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode.py b/cellxgene_schema_cli/cellxgene_schema/gencode.py
@@ -7,13 +7,23 @@
 
 
 class SupportedOrganisms(enum.Enum):
+    """
+    Supported organisms; each member's value is that organism's NCBITaxon ontology term id string.
+    """
+    # NOTE: these could be enumerated from loading the `schema_definition.yaml` and scraping the 'organism_ontology_term_id' constraints
     HOMO_SAPIENS = "NCBITaxon:9606"
     MUS_MUSCULUS = "NCBITaxon:10090"
     SARS_COV_2 = "NCBITaxon:2697049"
     ERCC = "NCBITaxon:32630"
     DROSOPHILA_MELANOGASTER = "NCBITaxon:7227"
     DANIO_RERIO = "NCBITaxon:7955"
     CAENORHABDITIS_ELEGANS = "NCBITaxon:6239"
+    MACACA_FASCICULARIS = "NCBITaxon:9541"
+    ORYCTOLAGUS_CUNICULUS = "NCBITaxon:9986"
+    CALLITHRIX_JACCHUS = "NCBITaxon:9483"
+    GORILLA_GORILLA = "NCBITaxon:9595"
+    MACACA_MULATTA = "NCBITaxon:9544"
+    PAN_TROGLODYTES = "NCBITaxon:9598"
+    SUS_SCROFA = "NCBITaxon:9823"
+    MICROCEBUS_MURINUS = "NCBITaxon:30608"
+    RATTUS_NORVEGICUS = "NCBITaxon:10116"
 
 
 def get_organism_from_feature_id(
@@ -42,6 +52,24 @@ def get_organism_from_feature_id(
         return SupportedOrganisms.DANIO_RERIO
     elif feature_id.startswith("WBGene"):
         return SupportedOrganisms.CAENORHABDITIS_ELEGANS
+    elif feature_id.startswith("ENSCJAG"):
+        return SupportedOrganisms.CALLITHRIX_JACCHUS
+    elif feature_id.startswith("ENSGGOG"):
+        return SupportedOrganisms.GORILLA_GORILLA
+    elif feature_id.startswith("ENSMFAG"):
+        return SupportedOrganisms.MACACA_FASCICULARIS
+    elif feature_id.startswith("ENSMMUG"):
+        return SupportedOrganisms.MACACA_MULATTA
+    elif feature_id.startswith("ENSMICG"):
+        return SupportedOrganisms.MICROCEBUS_MURINUS
+    elif feature_id.startswith("ENSOCUG"):
+        return SupportedOrganisms.ORYCTOLAGUS_CUNICULUS
+    elif feature_id.startswith("ENSPTRG"):
+        return SupportedOrganisms.PAN_TROGLODYTES
+    elif feature_id.startswith("ENSRNOG"):
+        return SupportedOrganisms.RATTUS_NORVEGICUS
+    elif feature_id.startswith("ENSSSCG"):
+        return SupportedOrganisms.SUS_SCROFA
     else:
         return None
 
@@ -59,6 +87,15 @@ class GeneChecker:
         ),
         SupportedOrganisms.DANIO_RERIO: os.path.join(env.GENCODE_DIR, "genes_danio_rerio.csv.gz"),
         SupportedOrganisms.CAENORHABDITIS_ELEGANS: os.path.join(env.GENCODE_DIR, "genes_caenorhabditis_elegans.csv.gz"),
+        SupportedOrganisms.MACACA_FASCICULARIS: os.path.join(env.GENCODE_DIR, "genes_macaca_fascicularis.csv.gz"),
+        SupportedOrganisms.ORYCTOLAGUS_CUNICULUS: os.path.join(env.GENCODE_DIR, "genes_oryctolagus_cuniculus.csv.gz"),
+        SupportedOrganisms.CALLITHRIX_JACCHUS: os.path.join(env.GENCODE_DIR, "genes_callithrix_jacchus.csv.gz"),
+        SupportedOrganisms.GORILLA_GORILLA: os.path.join(env.GENCODE_DIR, "genes_gorilla_gorilla.csv.gz"),
+        SupportedOrganisms.MACACA_MULATTA: os.path.join(env.GENCODE_DIR, "genes_macaca_mulatta.csv.gz"),
+        SupportedOrganisms.PAN_TROGLODYTES: os.path.join(env.GENCODE_DIR, "genes_pan_troglodytes.csv.gz"),
+        SupportedOrganisms.SUS_SCROFA: os.path.join(env.GENCODE_DIR, "genes_sus_scrofa.csv.gz"),
+        SupportedOrganisms.MICROCEBUS_MURINUS: os.path.join(env.GENCODE_DIR, "genes_microcebus_murinus.csv.gz"),
+        SupportedOrganisms.RATTUS_NORVEGICUS: os.path.join(env.GENCODE_DIR, "genes_rattus_norvegicus.csv.gz"),
     }
 
     def __init__(self, species: SupportedOrganisms):

diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/gene_info.yml b/cellxgene_schema_cli/cellxgene_schema/gencode_files/gene_info.yml
@@ -22,3 +22,30 @@ zebrafish:
 roundworm:
   description: caenorhabditis_elegans
   url: https://ftp.ensembl.org/pub/release-113/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.113.gtf.gz
+macaque:
+  description: macaca_fascicularis
+  url: https://ftp.ensembl.org/pub/release-113/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_6.0.113.gtf.gz
+rabbit:
+  description: oryctolagus_cuniculus
+  url: https://ftp.ensembl.org/pub/release-113/gtf/oryctolagus_cuniculus/Oryctolagus_cuniculus.OryCun2.0.113.gtf.gz
+marmoset:
+  description: callithrix_jacchus
+  url: https://ftp.ensembl.org/pub/release-113/gtf/callithrix_jacchus/Callithrix_jacchus.mCalJac1.pat.X.113.gtf.gz
+gorilla:
+  description: gorilla_gorilla
+  url: https://ftp.ensembl.org/pub/release-113/gtf/gorilla_gorilla/Gorilla_gorilla.gorGor4.113.gtf.gz
+rhesus_macaque:
+  description: macaca_mulatta
+  url: https://ftp.ensembl.org/pub/release-113/gtf/macaca_mulatta/Macaca_mulatta.Mmul_10.113.gtf.gz
+troglodyte:
+  description: pan_troglodytes
+  url: https://ftp.ensembl.org/pub/release-113/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.113.gtf.gz
+pig:
+  description: sus_scrofa
+  url: https://ftp.ensembl.org/pub/release-113/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.113.gtf.gz
+lemur:
+  description: microcebus_murinus
+  url: https://ftp.ensembl.org/pub/release-113/gtf/microcebus_murinus/Microcebus_murinus.Mmur_3.0.113.gtf.gz
+rat: 
+  description: rattus_norvegicus
+  url: https://ftp.ensembl.org/pub/release-113/gtf/rattus_norvegicus/Rattus_norvegicus.mRatBN7.2.113.gtf.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_caenorhabditis_elegans.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_caenorhabditis_elegans.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_callithrix_jacchus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_callithrix_jacchus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_danio_rerio.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_danio_rerio.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_drosophila_melanogaster.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_drosophila_melanogaster.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_gorilla_gorilla.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_gorilla_gorilla.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_homo_sapiens.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_homo_sapiens.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_macaca_fascicularis.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_macaca_fascicularis.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_macaca_mulatta.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_macaca_mulatta.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_microcebus_murinus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_microcebus_murinus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_mus_musculus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_mus_musculus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_oryctolagus_cuniculus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_oryctolagus_cuniculus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_pan_troglodytes.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_pan_troglodytes.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_rattus_norvegicus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_rattus_norvegicus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_sus_scrofa.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_sus_scrofa.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -175,10 +175,39 @@ components:
             to_column: disease
       organism_ontology_term_id:
         type: curie
-        error_message_suffix: "Only descendant term ids of 'NCBITaxon:33208' for metazoan are allowed."
+        error_message_suffix: "Only explicitly enumerated species are allowed. See Schema"
         curie_constraints:
           ontologies:
             - NCBITaxon
+          allowed:
+            terms:
+              NCBITaxon:
+                - NCBITaxon:9606
+                - NCBITaxon:10090
+                - NCBITaxon:2697049
+                - NCBITaxon:32630
+                - NCBITaxon:7227
+                - NCBITaxon:7955
+                - NCBITaxon:6239
+                - NCBITaxon:9541
+                - NCBITaxon:9986
+                - NCBITaxon:9483
+                - NCBITaxon:9595
+                - NCBITaxon:9544
+                - NCBITaxon:9598
+                - NCBITaxon:9823
+                - NCBITaxon:30608
+                - NCBITaxon:10116
+            ancestors:
+              NCBITaxon:
+                - NCBITaxon:9541
+                - NCBITaxon:9544
+                - NCBITaxon:10090
+                - NCBITaxon:9986
+                - NCBITaxon:9598
+                - NCBITaxon:10116
+                - NCBITaxon:9823
+
         add_labels:
           - type: curie
             to_column: organism

diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py
@@ -197,3 +197,8 @@ def is_ontological_descendant_of(onto: OntologyParser, term: str, target: str, i
     #TODO:[EM] needs testing
     """
     return term in set(onto.get_term_descendants(target, include_self))
+
+
+@lru_cache()
+def get_descendants(onto: OntologyParser, term: str, include_self: bool = True) -> List[str]:
+    """
+    Memoized wrapper around `onto.get_term_descendants`.
+
+    :param onto: object whose `get_term_descendants` method is called; it is part of the
+        `lru_cache` key, so it must be hashable
+    :param term: ontology term id passed to `get_term_descendants`
+    :param include_self: forwarded unchanged as the `include_self` keyword of `get_term_descendants`
+
+    :return: whatever `onto.get_term_descendants` returns. A cache hit hands back the very same
+        object as the first call, so callers should treat the result as read-only.
+    """
+    # Pass the caller's flag through instead of a literal True so that include_self=False is honoured.
+    return onto.get_term_descendants(term, include_self=include_self)
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -21,6 +21,7 @@
 from .utils import (
     SPARSE_MATRIX_TYPES,
     SUPPORTED_SPARSE_MATRIX_TYPES,
+    get_descendants,
     get_matrix_format,
     getattr_anndata,
     is_ontological_descendant_of,
@@ -802,16 +803,9 @@ def _validate_column_dependencies(
             terms_to_match = set()
             column_to_match = dependency_def["rule"]["column"]
             if "match_ancestors_inclusive" in dependency_def["rule"]:
-                print("rule", dependency_def["rule"])
                 ancestors = dependency_def["rule"]["match_ancestors_inclusive"]["ancestors"]
                 for ancestor in ancestors:
-                    print("ancestor", ancestor)
-                    term_descendants = ONTOLOGY_PARSER.get_term_descendants(ancestor, include_self=True)
-                    print("term_descendants", term_descendants)
-                    print("len(terms to match) before", len(terms_to_match))
-                    terms_to_match.update(ONTOLOGY_PARSER.get_term_descendants(ancestor, include_self=True))
-                    print("len(terms to match) after", len(terms_to_match))
-                print("terms to match", terms_to_match)
+                    terms_to_match.update(get_descendants(ONTOLOGY_PARSER, ancestor, True))
             if "match_exact" in dependency_def["rule"]:
                 terms_to_match.update(dependency_def["rule"]["match_exact"]["terms"])
             try:

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py b/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
@@ -4,8 +4,19 @@
 invalid_species = ["Caenorhabditis elegans"]
 
 valid_genes = {
-    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 2404)},
+    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53_ENSG00000141510", 2404)},
     gencode.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 1797)},
+    gencode.SupportedOrganisms.CAENORHABDITIS_ELEGANS: {"WBGene00000003": ("aat-2", 1738)},
+    gencode.SupportedOrganisms.CALLITHRIX_JACCHUS: {"ENSCJAG00000071296": ("U4_ENSCJAG00000071296", 141)},
+    gencode.SupportedOrganisms.DANIO_RERIO: {"ENSDARG00000009657": ("fgfr1op2", 1088)},
+    gencode.SupportedOrganisms.GORILLA_GORILLA: {"ENSGGOG00000010861": ("CAMSAP2_ENSGGOG00000010861", 7438)},
+    gencode.SupportedOrganisms.MACACA_FASCICULARIS: {"ENSMFAG00000001539": ("DFFB_ENSMFAG00000001539", 1174)},
+    gencode.SupportedOrganisms.MACACA_MULATTA: {"ENSMMUG00000000634": ("ZNF692_ENSMMUG00000000634", 1944)},
+    gencode.SupportedOrganisms.MICROCEBUS_MURINUS: {"ENSMICG00000026886": ("CIR1_ENSMICG00000026886", 1807)},
+    gencode.SupportedOrganisms.ORYCTOLAGUS_CUNICULUS: {"ENSOCUG00000025472": ("SNORD42_ENSOCUG00000025472", 67)},
+    gencode.SupportedOrganisms.PAN_TROGLODYTES: {"ENSPTRG00000000799": ("HOOK1_ENSPTRG00000000799", 5839)},
+    gencode.SupportedOrganisms.RATTUS_NORVEGICUS: {"ENSRNOG00000070901": ("Irgq_ENSRNOG00000070901", 6116)},
+    gencode.SupportedOrganisms.SUS_SCROFA: {"ENSSSCG00000031382": ("C9orf40_ENSSSCG00000031382", 3815)},
 }
 
 valid_genes_same_name_diff_species = {
@@ -25,6 +36,17 @@
 invalid_genes = {
     gencode.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
     gencode.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
+    gencode.SupportedOrganisms.CAENORHABDITIS_ELEGANS: {"WBGene_00000003": ("aat-2", 1738)},
+    gencode.SupportedOrganisms.CALLITHRIX_JACCHUS: {"ENSCJAG_00000071296": ("U4_ENSCJAG00000071296", 141)},
+    gencode.SupportedOrganisms.DANIO_RERIO: {"ENSDARG_00000009657": ("fgfr1op2", 1088)},
+    gencode.SupportedOrganisms.GORILLA_GORILLA: {"ENSGGOG_00000010861": ("CAMSAP2_ENSGGOG00000010861", 7438)},
+    gencode.SupportedOrganisms.MACACA_FASCICULARIS: {"ENSMFAG_00000001539": ("DFFB_ENSMFAG00000001539", 1174)},
+    gencode.SupportedOrganisms.MACACA_MULATTA: {"ENSMMUG_00000000634": ("ZNF692_ENSMMUG00000000634", 1944)},
+    gencode.SupportedOrganisms.MICROCEBUS_MURINUS: {"ENSMICG_00000026886": ("CIR1_ENSMICG00000026886", 1807)},
+    gencode.SupportedOrganisms.ORYCTOLAGUS_CUNICULUS: {"ENSOCUG_00000025472": ("SNORD42_ENSOCUG00000025472", 67)},
+    gencode.SupportedOrganisms.PAN_TROGLODYTES: {"ENSPTRG_00000000799": ("HOOK1_ENSPTRG00000000799", 5839)},
+    gencode.SupportedOrganisms.RATTUS_NORVEGICUS: {"ENSRNOG_00000070901": ("Irgq_ENSRNOG00000070901", 6116)},
+    gencode.SupportedOrganisms.SUS_SCROFA: {"ENSSSCG_00000031382": ("C9orf40_ENSSSCG00000031382", 3815)},
 }
 
 # For ontology checker

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py
@@ -402,7 +402,8 @@
 var_expected = pd.DataFrame(
     [
         ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 1061, "synthetic"],
-        ["gene", False, "MACF1", "NCBITaxon:9606", 2821, "protein_coding"],
+        # ["gene", False, "MACF1", "NCBITaxon:9606", 2821, "protein_coding"],
+        ["gene", False, "MACF1_ENSG00000127603", "NCBITaxon:9606", 2821, "protein_coding"],
         ["gene", False, "Trp53", "NCBITaxon:10090", 1797, "protein_coding"],
         ["gene", False, "S_ENSSASG00005000004", "NCBITaxon:2697049", 3822, "protein_coding"],
         ["gene", False, "FBtr0472816_df_nrg", "NCBITaxon:7227", 22, "ncRNA"],

diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -795,7 +795,7 @@ def test_development_stage_ontology_term_id_all_species(self, validator_with_ada
         validator = validator_with_adata
         obs = validator.adata.obs
         # Fail case not an UBERON term
-        obs.loc[obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:10114"
+        obs.loc[obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:9541"
         obs.loc[obs.index[0], "development_stage_ontology_term_id"] = "EFO:0000001"
         obs.loc[
             obs.index[0],
@@ -812,7 +812,7 @@ def test_development_stage_ontology_term_id_all_species(self, validator_with_ada
         # All other it MUST be descendants of UBERON:0000105 and not UBERON:0000071
         # Fail case UBERON:0000071
         validator.errors = []
-        obs.loc[obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:10114"
+        obs.loc[obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:9541"
         obs.loc[obs.index[0], "development_stage_ontology_term_id"] = "UBERON:0000071"
         obs.loc[
             obs.index[0],
@@ -1226,7 +1226,7 @@ def test_self_reported_ethnicity_ontology_term_id__multi_term_list(self, validat
 
     def test_organism_ontology_term_id(self, validator_with_adata):
         """
-        organism_ontology_term_id categorical with str categories. This MUST be a descendant of NCBITaxon:33208.
+        organism_ontology_term_id categorical with str categories. This MUST be one of approved enumerated species.
         """
         validator = validator_with_adata
         obs = validator.adata.obs
@@ -1241,8 +1241,7 @@ def test_organism_ontology_term_id(self, validator_with_adata):
         ] = "na"
         validator.validate_adata()
         assert validator.errors == [
-            "ERROR: 'EFO:0000001' in 'organism_ontology_term_id' is not a valid "
-            "ontology term id of 'NCBITaxon'. Only descendant term ids of 'NCBITaxon:33208' for metazoan are allowed."
+            "ERROR: 'EFO:0000001' in 'organism_ontology_term_id' is not a valid ontology term id of 'NCBITaxon'. Only explicitly enumerated species are allowed. See Schema"
         ]
 
     def test_tissue_ontology_term_id_base(self, validator_with_adata):

diff --git a/scripts/schema_bump_dry_run_genes/tests/test_gene_bump_dry_run.py b/scripts/schema_bump_dry_run_genes/tests/test_gene_bump_dry_run.py
@@ -18,9 +18,12 @@ def test_get_diff_map(tmp_path):  # type: ignore
             fp.write("test")
     with patch("scripts.schema_bump_dry_run_genes.gene_bump_dry_run.GENCODE_DIR", tmp_path):
         diff_map = get_diff_map()
-    assert len(diff_map) == 7
-    for key in diff_map:  # type: ignore
-        assert key in ["NCBITaxon:9606", "NCBITaxon:10090", "NCBITaxon:2697049", "NCBITaxon:32630", "NCBITaxon:7227", "NCBITaxon:7955", "NCBITaxon:6239"]  # type: ignore
+
+    # one diff-map for each of our supported species
+    assert len(diff_map) == len(SupportedOrganisms)
+
+    # each species should have a diff map
+    assert len({x.value for x in SupportedOrganisms}.difference(list(diff_map))) == 0
 
 
 @pytest.fixture