From 4700bbd51627d262c82989bbcdda6a17cb8dd6fc Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 11:58:19 +0100 Subject: [PATCH 01/17] Add entity similarity endpoint --- mira/dkg/api.py | 37 +++++++++++++++++++++++++++++++++++-- mira/dkg/utils.py | 8 +++++--- mira/dkg/wsgi.py | 1 + setup.cfg | 2 ++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 9f1975b40..f53a662c2 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -1,14 +1,16 @@ """API endpoints.""" +import itertools as itt from typing import Any, List, Mapping, Optional, Union import pydantic -from fastapi import APIRouter, Body, Path, Query, Request, HTTPException +from fastapi import APIRouter, Body, HTTPException, Path, Query, Request from neo4j.graph import Relationship from pydantic import BaseModel, Field +from scipy.spatial import distance from typing_extensions import Literal -from mira.dkg.client import Entity, AskemEntity +from mira.dkg.client import AskemEntity, Entity from mira.dkg.utils import DKG_REFINER_RELS __all__ = [ @@ -444,3 +446,34 @@ def common_parent( entity = request.app.state.client.get_common_parents(query.curie1, query.curie2) return entity + + +class Distance(BaseModel): + """Represents the distance between two entities.""" + + source: str = Field(..., title="source CURIE") + target: str = Field(..., title="target CURIE") + distance: float = Field(..., title="cosine distance") + + +@api_blueprint.post("/entity_similarity", response_model=List[Distance]) +def entity_similarity( + request: Request, + sources: List[str] = Body(..., title="source CURIEs", examples=[["ido:0000566", "ido:0000567"]]), + targets: List[str] = Body(..., title="target CURIEs", examples=[["ido:0000566", "ido:0000567"]]), +): + """Get the pairwise similarities between elements referenced by CURIEs in the first list and second list.""" + vectors = request.app.state.client.vectors + rv = [] + for source, target in itt.product(sources, targets): + source_vector = vectors.get(source) + if not source_vector: + continue + target_vector = vectors.get(target) + if not target_vector: + continue + cosine_distance = distance.cosine(source_vector, target_vector) + rv.append( + Distance(source=source, target=target, distance=cosine_distance) + ) + return rv diff --git a/mira/dkg/utils.py b/mira/dkg/utils.py index 56b9c5397..9bb981d4c 100644 --- a/mira/dkg/utils.py +++ b/mira/dkg/utils.py @@ -1,11 +1,12 @@ """Utilities and constants for the MIRA app.""" from dataclasses import dataclass -from typing import List +from typing import Dict, List +import numpy as np from gilda.grounder import Grounder -from mira.dkg.client import Neo4jClient, Entity +from mira.dkg.client import Entity, Neo4jClient from mira.metamodel import RefinementClosure __all__ = [ @@ -17,12 +18,13 @@ @dataclass class MiraState: - """All of the state associated with the MIRA app.""" + """Represents the state associated with the MIRA app.""" client: Neo4jClient grounder: Grounder refinement_closure: RefinementClosure lexical_dump: List[Entity] + vectors: Dict[str, np.array] #: A list of all prefixes used in MIRA diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index 4af73fbe1..b170968aa 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -95,6 +95,7 @@ def startup_event(): grounder=client.get_grounder(PREFIXES), refinement_closure=RefinementClosure(client.get_transitive_closure()), lexical_dump=client.get_lexical(), + # TODO load vectors! ) flask_app.register_blueprint(ui_blueprint) diff --git a/setup.cfg b/setup.cfg index 72b6bb17d..f5f60d9e1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -75,6 +75,8 @@ web = python-libsbml lxml bioregistry + scipy + numpy uvicorn = uvicorn gunicorn = From 9c31c1212435a310758e2e29436479c8cc1f34ec Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:10:16 +0100 Subject: [PATCH 02/17] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f30e0b6dd..9fb4e43f0 100644 --- a/.gitignore +++ b/.gitignore @@ -134,5 +134,6 @@ scratch/ docs/_site docker/edges.tsv.gz docker/nodes.tsv.gz +docker/embeddings.tsv.gz mira/dkg/resources/ncit.obo docker/epi.sh From 0757641db8943228e090a3d7a87216b85eb59900 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:10:52 +0100 Subject: [PATCH 03/17] Load vectors --- docker/Dockerfile | 1 + docker/Dockerfile.local | 1 + mira/dkg/api.py | 3 +++ mira/dkg/wsgi.py | 26 +++++++++++++++++++++----- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 667a53f85..7f458c3d6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -22,6 +22,7 @@ ENV MIRA_DOMAIN=${domain} # Download graph content and ingest into neo4j RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/nodes.tsv.gz && \ wget -O /sw/edges.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/edges.tsv.gz && \ + wget -O /sw/embeddings.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/embeddings.tsv.gz && \ sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \ sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local index abeb5b6ba..436c38eb5 100644 --- a/docker/Dockerfile.local +++ b/docker/Dockerfile.local @@ -17,6 +17,7 @@ ARG branch=main # Add graph content COPY nodes.tsv.gz /sw/nodes.tsv.gz COPY edges.tsv.gz /sw/edges.tsv.gz +COPY embeddings.tsv.gz /sw/embeddings.tsv.gz # Ingest graph content into neo4j RUN sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \ diff --git a/mira/dkg/api.py b/mira/dkg/api.py index f53a662c2..75e304dab 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -464,6 +464,9 @@ def entity_similarity( ): """Get the pairwise similarities between elements referenced by CURIEs in the first list and second list.""" vectors = request.app.state.client.vectors + if not vectors: + raise HTTPException(status_code=500, detail="No entity vectors available") + rv = [] for source, target in itt.product(sources, targets): source_vector = vectors.get(source) diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index b170968aa..203f9f8e7 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -1,10 +1,14 @@ """Neo4j client module.""" +import csv +import gzip import logging import os +from pathlib import Path from textwrap import dedent import flask +import numpy as np from fastapi import FastAPI from fastapi.middleware.wsgi import WSGIMiddleware from flask_bootstrap import Bootstrap5 @@ -16,14 +20,15 @@ from mira.dkg.utils import PREFIXES, MiraState from mira.metamodel import RefinementClosure -logger = logging.getLogger(__name__) - - __all__ = [ "flask_app", "app", ] +logger = logging.getLogger(__name__) + +HERE = Path(__file__).parent.resolve() +EMBEDDINGS_PATH = HERE.joinpath("embeddings.tsv.gz") DOMAIN = os.getenv("MIRA_DOMAIN") tags_metadata = [ @@ -46,7 +51,7 @@ { "name": "relations", "description": "Query relation data", - } + }, ] @@ -87,6 +92,17 @@ def startup_event(): logger.info("Running app startup function") Bootstrap5(flask_app) + if not EMBEDDINGS_PATH.is_file(): + vectors = {} + else: + with gzip.open(EMBEDDINGS_PATH, "rt") as file: + reader = csv.reader(file, delimiter="\t") + next(reader) # skip header + vectors = { + curie: np.array([float(p) for p in parts]) + for curie, *parts in reader + } + # Set MIRA_NEO4J_URL in the environment # to point this somewhere specific client = Neo4jClient() @@ -95,7 +111,7 @@ def startup_event(): grounder=client.get_grounder(PREFIXES), refinement_closure=RefinementClosure(client.get_transitive_closure()), lexical_dump=client.get_lexical(), - # TODO load vectors! + vectors=vectors, ) flask_app.register_blueprint(ui_blueprint) From c175412ca6e63c30179e42792276b4e5ef4debee Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:16:23 +0100 Subject: [PATCH 04/17] Update README.md --- docker/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/README.md b/docker/README.md index d6fc9bfa7..f4056eecd 100644 --- a/docker/README.md +++ b/docker/README.md @@ -22,6 +22,7 @@ this folder and use: export DOMAIN=epi cp ~/.data/mira/$DOMAIN/nodes.tsv.gz nodes.tsv.gz cp ~/.data/mira/$DOMAIN/edges.tsv.gz edges.tsv.gz +cp ~/.data/mira/$DOMAIN/embeddings.tsv.gz embeddings.tsv.gz # Build docker docker build --file Dockerfile.local --tag mira_$DOMAIN_dkg:latest . From daff8d0ca300877c0b23999c46d519fddd20958e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:28:15 +0100 Subject: [PATCH 05/17] Fix loading of misc example --- mira/dkg/model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mira/dkg/model.py b/mira/dkg/model.py index 4c209829c..804e0eacd 100644 --- a/mira/dkg/model.py +++ b/mira/dkg/model.py @@ -728,14 +728,19 @@ def askepetrinet_model_comparison( return resp -flux_span_path = docker_test_file_path if docker_test_file_path.exists() else \ - test_file_path +if docker_test_file_path.exists(): + flux_span_query_example = json.loads(docker_test_file_path.read_text()) +elif test_file_path.exists(): + flux_span_query_example = json.loads(test_file_path.read_text()) +else: + flux_span_query_example = None + class FluxSpanQuery(BaseModel): model: Dict[str, Any] = Field( ..., - example=json.load(flux_span_path.open()), + example=flux_span_query_example, description="The model to recover the ODE-semantics from.", ) From 3ecbf10bf517bf117aa42ed193e2272dece29527 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:35:36 +0100 Subject: [PATCH 06/17] Update git references --- docker/Dockerfile | 4 ++-- docker/Dockerfile.local | 2 +- mira/dkg/model.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7f458c3d6..1cb8683ff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,7 +29,7 @@ RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/bui # Python packages RUN python -m pip install --upgrade pip && \ - python -m pip install git+https://github.com/indralab/mira.git@main#egg=mira[web,uvicorn,dkg-client] && \ + python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client] && \ python -m pip uninstall -y flask_bootstrap && \ python -m pip uninstall -y bootstrap_flask && \ python -m pip install bootstrap_flask && \ @@ -38,7 +38,7 @@ RUN python -m pip install --upgrade pip && \ python -m pip install --no-dependencies --ignore-requires-python sbmlmath # Copy the example json for reconstructing the ode semantics -RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/indralab/mira/main/tests/sir_flux_span.json +RUN wget -O /sw/sir_flux_span.json https://raw.githubusercontent.com/gyorilab/mira/main/tests/sir_flux_span.json COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local index 436c38eb5..d8ac4e8f9 100644 --- a/docker/Dockerfile.local +++ b/docker/Dockerfile.local @@ -25,7 +25,7 @@ RUN sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/ne neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz # Python packages -RUN python -m pip install git+https://github.com/indralab/mira.git@$branch#egg=mira[web,uvicorn,dkg-client] && \ +RUN python -m pip install git+https://github.com/gyorilab/mira.git@$branch#egg=mira[web,uvicorn,dkg-client] && \ python -m pip uninstall -y flask_bootstrap && \ python -m pip uninstall -y bootstrap_flask && \ python -m pip install bootstrap_flask diff --git a/mira/dkg/model.py b/mira/dkg/model.py index 804e0eacd..c1dc6d15a 100644 --- a/mira/dkg/model.py +++ b/mira/dkg/model.py @@ -736,7 +736,6 @@ def askepetrinet_model_comparison( flux_span_query_example = None - class FluxSpanQuery(BaseModel): model: Dict[str, Any] = Field( ..., From 8322697b2f9ed04f93deb3f95ef62a3defc56aaf Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:42:00 +0100 Subject: [PATCH 07/17] Allow all-by-all --- mira/dkg/api.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 75e304dab..69e74e759 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -459,16 +459,30 @@ class Distance(BaseModel): @api_blueprint.post("/entity_similarity", response_model=List[Distance]) def entity_similarity( request: Request, - sources: List[str] = Body(..., title="source CURIEs", examples=[["ido:0000566", "ido:0000567"]]), - targets: List[str] = Body(..., title="target CURIEs", examples=[["ido:0000566", "ido:0000567"]]), + sources: List[str] = Body( + ..., + title="source CURIEs", + examples=[["ido:0000511", "ido:0000592", "ido:0000597", "ido:0000514"]], + ), + targets: Optional[List[str]] = Body( + default=None, + title="target CURIEs", + description="If not given, source queries used for all-by-all comparison", + examples=[["ido:0000566", "ido:0000567"]], + ), ): """Get the pairwise similarities between elements referenced by CURIEs in the first list and second list.""" vectors = request.app.state.client.vectors if not vectors: - raise HTTPException(status_code=500, detail="No entity vectors available") - + raise HTTPException( + status_code=500, detail="No entity vectors available" + ) + if targets is None: + targets = sources rv = [] for source, target in itt.product(sources, targets): + if source == target: + continue source_vector = vectors.get(source) if not source_vector: continue From 1242d08ae2db02823da67956070cdbf2bbc204b5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 29 Nov 2023 12:47:24 +0100 Subject: [PATCH 08/17] Add tag, fix typo in vector access, and add testing code comment --- mira/dkg/api.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 69e74e759..ae411cd15 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -456,7 +456,9 @@ class Distance(BaseModel): distance: float = Field(..., title="cosine distance") -@api_blueprint.post("/entity_similarity", response_model=List[Distance]) +@api_blueprint.post( + "/entity_similarity", response_model=List[Distance], tags=["entities"] +) def entity_similarity( request: Request, sources: List[str] = Body( @@ -472,7 +474,23 @@ def entity_similarity( ), ): """Get the pairwise similarities between elements referenced by CURIEs in the first list and second list.""" - vectors = request.app.state.client.vectors + """Test locally with: + + import requests + + def main(): + curies = ["ido:0000511", "ido:0000592", "ido:0000597", "ido:0000514"] + res = requests.post( + "http://0.0.0.0:8771/api/entity_similarity", + json={"sources": curies, "targets": curies}, + ) + res.raise_for_status() + print(res.json()) + + if __name__ == "__main__": + main() + """ + vectors = request.app.state.vectors if not vectors: raise HTTPException( status_code=500, detail="No entity vectors available" From 73c7ab963b393e33e58584c3808383983ed7a60d Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 09:56:31 -0800 Subject: [PATCH 09/17] Try new path for embeddings file --- mira/dkg/utils.py | 5 +++++ mira/dkg/wsgi.py | 9 ++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mira/dkg/utils.py b/mira/dkg/utils.py index 9bb981d4c..3600c24cc 100644 --- a/mira/dkg/utils.py +++ b/mira/dkg/utils.py @@ -1,6 +1,7 @@ """Utilities and constants for the MIRA app.""" from dataclasses import dataclass +from pathlib import Path from typing import Dict, List import numpy as np @@ -13,6 +14,7 @@ "MiraState", "PREFIXES", "DKG_REFINER_RELS", + "DOCKER_ROOT", ] @@ -71,3 +73,6 @@ class MiraState: #: A list of all relation types that are considered refinement relations DKG_REFINER_RELS = ["subclassof", "part_of"] + +#: The root path of the MIRA app when running in a container +DOCKER_ROOT = Path("/sw") diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index 203f9f8e7..14ce425ab 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -17,7 +17,7 @@ from mira.dkg.client import Neo4jClient from mira.dkg.grounding import grounding_blueprint from mira.dkg.ui import ui_blueprint -from mira.dkg.utils import PREFIXES, MiraState +from mira.dkg.utils import PREFIXES, MiraState, DOCKER_ROOT from mira.metamodel import RefinementClosure __all__ = [ @@ -27,8 +27,7 @@ logger = logging.getLogger(__name__) -HERE = Path(__file__).parent.resolve() -EMBEDDINGS_PATH = HERE.joinpath("embeddings.tsv.gz") +EMBEDDINGS_PATH_DOCKER = DOCKER_ROOT / "embeddings.tsv.gz" DOMAIN = os.getenv("MIRA_DOMAIN") tags_metadata = [ @@ -92,10 +91,10 @@ def startup_event(): logger.info("Running app startup function") Bootstrap5(flask_app) - if not EMBEDDINGS_PATH.is_file(): + if not EMBEDDINGS_PATH_DOCKER.is_file(): vectors = {} else: - with gzip.open(EMBEDDINGS_PATH, "rt") as file: + with gzip.open(EMBEDDINGS_PATH_DOCKER, "rt") as file: reader = csv.reader(file, delimiter="\t") next(reader) # skip header vectors = { From 11c50c5d2cae29cf5846e83ac6076747514a95d6 Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 10:54:11 -0800 Subject: [PATCH 10/17] Better variable name --- mira/dkg/utils.py | 4 ++-- mira/dkg/wsgi.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mira/dkg/utils.py b/mira/dkg/utils.py index 3600c24cc..6b62b5023 100644 --- a/mira/dkg/utils.py +++ b/mira/dkg/utils.py @@ -14,7 +14,7 @@ "MiraState", "PREFIXES", "DKG_REFINER_RELS", - "DOCKER_ROOT", + "DOCKER_FILES_ROOT", ] @@ -75,4 +75,4 @@ class MiraState: DKG_REFINER_RELS = ["subclassof", "part_of"] #: The root path of the MIRA app when running in a container -DOCKER_ROOT = Path("/sw") +DOCKER_FILES_ROOT = Path("/sw") diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index 14ce425ab..12a8b8821 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -4,7 +4,6 @@ import gzip import logging import os -from pathlib import Path from textwrap import dedent import flask @@ -17,7 +16,7 @@ from mira.dkg.client import Neo4jClient from mira.dkg.grounding import grounding_blueprint from mira.dkg.ui import ui_blueprint -from mira.dkg.utils import PREFIXES, MiraState, DOCKER_ROOT +from mira.dkg.utils import PREFIXES, MiraState, DOCKER_FILES_ROOT from mira.metamodel import RefinementClosure __all__ = [ @@ -27,7 +26,7 @@ logger = logging.getLogger(__name__) -EMBEDDINGS_PATH_DOCKER = DOCKER_ROOT / "embeddings.tsv.gz" +EMBEDDINGS_PATH_DOCKER = DOCKER_FILES_ROOT / "embeddings.tsv.gz" DOMAIN = os.getenv("MIRA_DOMAIN") tags_metadata = [ From 5ec2ce9bc170d7d6f15b6c58436869edc282792e Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 10:54:43 -0800 Subject: [PATCH 11/17] Warn of missing embeddings file --- mira/dkg/wsgi.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index 12a8b8821..0f233ae75 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -91,6 +91,10 @@ def startup_event(): Bootstrap5(flask_app) if not EMBEDDINGS_PATH_DOCKER.is_file(): + logger.warning( + f"Embeddings file {EMBEDDINGS_PATH_DOCKER} not found, skipping " + f"loading of embeddings" + ) vectors = {} else: with gzip.open(EMBEDDINGS_PATH_DOCKER, "rt") as file: From 1b3e2f956da986d430a57930a47ec633f9ff08c6 Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 11:20:50 -0800 Subject: [PATCH 12/17] Set embeddings path in Dockerfile with default --- docker/Dockerfile | 4 +++- mira/dkg/wsgi.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1cb8683ff..7254d906f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,13 +16,15 @@ RUN apt-get update && \ ARG version=2023-10-19 ARG domain=climate +ARG embeddings_path=/sw/embeddings.tsv.gz # This latter is used in the code ENV MIRA_DOMAIN=${domain} +ENV EMBEDDINGS_PATH=${embeddings_path} # Download graph content and ingest into neo4j RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/nodes.tsv.gz && \ wget -O /sw/edges.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/edges.tsv.gz && \ - wget -O /sw/embeddings.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/embeddings.tsv.gz && \ + wget -O $embeddings_path https://askem-mira.s3.amazonaws.com/dkg/$domain/build/$version/embeddings.tsv.gz && \ sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \ sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes /sw/nodes.tsv.gz --relationships /sw/edges.tsv.gz diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index 0f233ae75..eab98adf8 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -26,7 +26,9 @@ logger = logging.getLogger(__name__) -EMBEDDINGS_PATH_DOCKER = DOCKER_FILES_ROOT / "embeddings.tsv.gz" +EMBEDDINGS_PATH_DOCKER = os.getenv( + "EMBEDDINGS_PATH", DOCKER_FILES_ROOT / "embeddings.tsv.gz" +) DOMAIN = os.getenv("MIRA_DOMAIN") tags_metadata = [ From 1ea0b0dde8de9149628de964983882ebd39dcf2b Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 11:34:45 -0800 Subject: [PATCH 13/17] Set path as Path --- mira/dkg/wsgi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mira/dkg/wsgi.py b/mira/dkg/wsgi.py index eab98adf8..179cea73c 100644 --- a/mira/dkg/wsgi.py +++ b/mira/dkg/wsgi.py @@ -4,6 +4,7 @@ import gzip import logging import os +from pathlib import Path from textwrap import dedent import flask @@ -26,8 +27,8 @@ logger = logging.getLogger(__name__) -EMBEDDINGS_PATH_DOCKER = os.getenv( - "EMBEDDINGS_PATH", DOCKER_FILES_ROOT / "embeddings.tsv.gz" +EMBEDDINGS_PATH_DOCKER = Path( + os.getenv("EMBEDDINGS_PATH", DOCKER_FILES_ROOT / "embeddings.tsv.gz") ) DOMAIN = os.getenv("MIRA_DOMAIN") From 8f46c0f8c69770b795102bc2d8caf32cc20f218c Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 13:13:23 -0800 Subject: [PATCH 14/17] Fix condition --- mira/dkg/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index ae411cd15..874d4a458 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -502,10 +502,10 @@ def main(): if source == target: continue source_vector = vectors.get(source) - if not source_vector: + if source_vector is None: continue target_vector = vectors.get(target) - if not target_vector: + if target_vector is None: continue cosine_distance = distance.cosine(source_vector, target_vector) rv.append( From b450d9d435108edb2b22e1f4025142ee7853439c Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 13:24:16 -0800 Subject: [PATCH 15/17] Update local example --- mira/dkg/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 874d4a458..73bfb3e5d 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -479,7 +479,7 @@ def entity_similarity( import requests def main(): - curies = ["ido:0000511", "ido:0000592", "ido:0000597", "ido:0000514"] + curies = ["probonto:k0000000", "probonto:k0000007", "probonto:k0000008"] res = requests.post( "http://0.0.0.0:8771/api/entity_similarity", json={"sources": curies, "targets": curies}, From 8aca4c071db34d66d6922b446c595ec21eb456db Mon Sep 17 00:00:00 2001 From: kkaris Date: Wed, 29 Nov 2023 13:45:56 -0800 Subject: [PATCH 16/17] Set embeddings_path in Dockerfile.local --- docker/Dockerfile.local | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local index d8ac4e8f9..c1bf465c1 100644 --- a/docker/Dockerfile.local +++ b/docker/Dockerfile.local @@ -13,11 +13,13 @@ RUN apt-get update && \ ln -s /usr/bin/python3 /usr/bin/python ARG branch=main +ARG embeddings_path=/sw/embeddings.tsv.gz +ENV EMBEDDINGS_PATH=${embeddings_path} # Add graph content COPY nodes.tsv.gz /sw/nodes.tsv.gz COPY edges.tsv.gz /sw/edges.tsv.gz -COPY embeddings.tsv.gz /sw/embeddings.tsv.gz +COPY embeddings.tsv.gz ${embeddings_path} # Ingest graph content into neo4j RUN sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \ From c1f54c2663335dd63e79fd0009497392904ad438 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 30 Nov 2023 14:26:25 +0100 Subject: [PATCH 17/17] Update README.md --- docker/README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docker/README.md b/docker/README.md index f4056eecd..d009ec30d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -25,22 +25,20 @@ cp ~/.data/mira/$DOMAIN/edges.tsv.gz edges.tsv.gz cp ~/.data/mira/$DOMAIN/embeddings.tsv.gz embeddings.tsv.gz # Build docker -docker build --file Dockerfile.local --tag mira_$DOMAIN_dkg:latest . +docker build --file Dockerfile.local --tag mira:latest . ``` Once the build finished, you can run the container locally as: ```shell # Option 1: run in the background -docker run --detach -p 8771:8771 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 --name mira_$DOMAIN_dkg mira_$DOMAIN_dkg:latest +docker run --detach -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 --name mira mira:latest # Option 2: run ephemerally -docker run -p 8771:8771 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 mira_$DOMAIN_dkg:latest +docker run -p 8771:8771 -p 7687:7687 -e MIRA_NEO4J_URL=bolt://0.0.0.0:7687 mira:latest ``` -This exposes a REST API at `http://localhost:8771`. Note that the `--detach` flag -runs the container in the background. If you want to expose Neo4j's bolt port, also -add `-p 7687:7687`. Note that +This exposes a REST API at `http://localhost:8771`. This also exposes Neo4j's bolt port at port 7687. ## MIRA Metaregistry