Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/improve streamlit #132

Merged
merged 6 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ test-ingest:
poetry run pytest ingest/tests --cov=ingest -v --cov-report=term-missing --cov-fail-under=40

test-django:
docker-compose up -d db
docker-compose run django-app poetry run pytest django_app/tests/ --ds redbox_app.settings -v --cov=redbox_app.redbox_core --cov-fail-under 10
docker compose up -d --wait db
docker compose run django-app poetry run pytest django_app/tests/ --ds redbox_app.settings -v --cov=redbox_app.redbox_core --cov-fail-under 10

lint:
poetry run ruff check .
Expand All @@ -61,10 +61,10 @@ checktypes:
# poetry run mypy legacy_app --follow-imports skip --ignore-missing-imports

check-migrations:
docker-compose build django-app
docker-compose run django-app poetry run python django_app/manage.py migrate
docker-compose run django-app poetry run python django_app/manage.py makemigrations --check
docker compose build django-app
docker compose run django-app poetry run python django_app/manage.py migrate
docker compose run django-app poetry run python django_app/manage.py makemigrations --check

reset-db:
docker-compose down db --volumes
docker-compose up -d db
docker compose down db --volumes
docker compose up -d db
2 changes: 1 addition & 1 deletion core_api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ RUN pip install poetry
ADD redbox/ /app/redbox
ADD pyproject.toml poetry.lock /app/
WORKDIR /app/
RUN poetry install --no-root --no-ansi --with api --without ai,ingest,dev,worker
RUN poetry install --no-root --no-ansi --with api --without ai,ingest,dev,worker,pytest-django,streamlit-app

WORKDIR /app

Expand Down
5 changes: 2 additions & 3 deletions django_app/redbox_app/redbox_core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ def homepage_view(request):


def documents_view(request):

# Testing with dummy data for now
# Testing with dummy data for now
if not File.objects.exists():
File.objects.create(name="Document 1", path="#download1", processing_status=ProcessingStatusEnum.complete)
File.objects.create(name="Document 2", path="#download2", processing_status=ProcessingStatusEnum.parsing)
Expand Down Expand Up @@ -54,4 +53,4 @@ def remove_doc_view(request, doc_id: str):
request,
template_name="remove-doc.html",
context={"request": request, "doc_id": doc_id, "doc_name": doc_name},
)
)
2 changes: 1 addition & 1 deletion django_app/redbox_app/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
path("accounts/", include("allauth.urls")),
path("documents/", views.documents_view, name="documents"),
path("upload/", views.upload_view, name="upload"),
path("remove-doc/<str:doc_id>", views.remove_doc_view, name="remove_doc")
path("remove-doc/<str:doc_id>", views.remove_doc_view, name="remove_doc"),
]

urlpatterns = info_urlpatterns + other_urlpatterns
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ services:
networks:
- redbox-app-network
streamlit-app:
image: redbox-app:latest
image: redbox-streamlit-app:latest
build:
context: .
dockerfile: ./streamlit_app/Dockerfile
Expand Down
2 changes: 1 addition & 1 deletion embed/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ ADD pyproject.toml poetry.lock /app/
ADD ./download_embedder.py /app/
ADD ./model_db.py /app/
WORKDIR /app/
RUN poetry install --no-root --no-ansi --with worker,api --without ai,ingest,dev
RUN poetry install --no-root --no-ansi --with worker,api --without ai,ingest,dev,pytest-django,streamlit-app

# Add the rest of the files
ADD ./embed/src/app.py /app/app.py
Expand Down
2 changes: 1 addition & 1 deletion ingest/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ADD download_embedder.py /app/
ADD model_db.py /app/

WORKDIR /app/
RUN poetry install --no-root --no-ansi --with worker,ingest --without ai,dev,api,django-app,pytest-django
RUN poetry install --no-root --no-ansi --with worker,ingest --without ai,dev,api,django-app,pytest-django,streamlit-app

# Download the model
RUN poetry run download-model
Expand Down
8 changes: 2 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,15 @@ boto3-stubs = "^1.34.67"
streamlit = "^1.32.2"
lxml = "^5.1.0"
loguru = "^0.7.2"
streamlit-feedback = "^0.1.3"
html2markdown = "^0.1.7"

[tool.poetry.group.ai.dependencies]
anthropic = "^0.21.1"
litellm = "^1.32.7"
openai = "^1.14.2"


[tool.poetry.group.streamlitapp.dependencies]
cognitojwt = "^1.4.1"
pandas = "^2.2.1"
html2markdown = "^0.1.7"
streamlit-feedback = "^0.1.3"

[tool.poetry.group.django-app.dependencies]
python = "^3.11.2"
django = "^4.2.8"
Expand Down
2 changes: 1 addition & 1 deletion redbox/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RUN apt-get update
RUN pip install poetry

ADD ../pyproject.toml ../poetry.lock ./
RUN poetry install --no-root --no-ansi --with worker,api --without ai,streamlit-app,ingest,dev
RUN poetry install --no-root --no-ansi --with worker,api --without ai,streamlit-app,ingest,dev,pytest-django

ADD ./redbox /app/redbox
ADD ./Makefile /app/
Expand Down
34 changes: 25 additions & 9 deletions streamlit_app/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,41 @@
FROM python:3.11
FROM python:3.11-buster as builder


RUN apt-get update
RUN apt-get install -y libgl-dev libmagic-dev inetutils-ping

RUN pip install poetry

ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1 \
POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app/

ADD pyproject.toml poetry.lock ./
# Streamlit app needs a lot of dependencies

RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --no-root --no-ansi --with streamlit-app,ai,ingest,embed --without dev,worker,api,pytest-django --no-root

FROM python:3.11-slim-buster as runtime

RUN apt-get update
RUN apt-get install -y libgl-dev libmagic-dev inetutils-ping


ENV VIRTUAL_ENV=/app/.venv \
PATH="/app/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

WORKDIR /app/

ADD ./download_embedder.py /app/
ADD ./model_db.py /app/
RUN poetry install --no-root --no-ansi --with streamlit-app,ai,ingest --without dev,worker,api
RUN python download_embedder.py

ADD redbox/ /app/redbox
ADD streamlit_app/ /app

# Download the model

RUN poetry run download-model

EXPOSE 8501

ENTRYPOINT ["poetry", "run", "streamlit", "run", "--server.address", "0.0.0.0", "--server.port", "8501", "Welcome.py" ]
ENTRYPOINT ["streamlit", "run", "--server.address", "0.0.0.0", "--server.port", "8501", "Welcome.py" ]
3 changes: 3 additions & 0 deletions streamlit_app/pages/5_Persona_Chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@

# Model selector


def change_selected_model():
load_llm_handler(ENV, update=True)
st.write(st.session_state.llm)


persona_select = st.sidebar.selectbox(
"What is your role?",
options=st.session_state.available_personas,
Expand Down Expand Up @@ -72,6 +74,7 @@ def get_files_by_uuid(file_uuids):
files = st.session_state.storage_handler.read_items(file_uuids, "File")
return files


def render_citation_response(response):
cited_chunks = [
(
Expand Down
53 changes: 16 additions & 37 deletions streamlit_app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Optional

import boto3
import cognitojwt
import dotenv
import html2markdown
import pandas as pd
Expand All @@ -18,17 +17,13 @@
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains.base import Chain
from langchain.schema.output import LLMResult
from langchain.vectorstores.elasticsearch import (
ApproxRetrievalStrategy,
ElasticsearchStore,
)
from langchain.vectorstores.elasticsearch import ApproxRetrievalStrategy, ElasticsearchStore
from langchain_community.chat_models import ChatLiteLLM
from langchain_community.embeddings import SentenceTransformerEmbeddings
from loguru import logger
from lxml.html.clean import Cleaner
from sentence_transformers import SentenceTransformer
from streamlit.web.server.websocket_headers import _get_websocket_headers

from model_db import SentenceTransformerDB
from redbox.llm.llm_base import LLMHandler
from redbox.models.feedback import Feedback
from redbox.models.file import File
Expand Down Expand Up @@ -134,22 +129,18 @@ def init_session_state() -> dict:
"Foreign Policy Experts",
]

if "model_db" not in st.session_state:
st.session_state.model_db = SentenceTransformerDB()
st.session_state.model_db.init_from_disk()

if "embedding_model" not in st.session_state:
available_models = []
models = {}
for dirpath, _, filenames in os.walk("models"):
# Check if the current directory contains a file named "config.json"
if "pytorch_model.bin" in filenames:
# If it does, print the path to the directory
available_models.append(dirpath)
for model_name in st.session_state.model_db:
available_models.append(model_name)

for model_path in available_models:
model_name = model_path.split("/")[-3]
model = model_name.split("--")[-1]
models[model] = SentenceTransformer(model_path)
default_model = available_models[0]

model_name = ENV.get("EMBEDDING_MODEL", "all-mpnet-base-v2")
st.session_state.embedding_model = models[model_name]
st.session_state.embedding_model = st.session_state.model_db[default_model]

if "BUCKET_NAME" not in st.session_state:
st.session_state.BUCKET_NAME = f"redbox-storage-{st.session_state.user_uuid}"
Expand All @@ -160,9 +151,7 @@ def init_session_state() -> dict:
# The bucket does not exist or you have no access.
if err.response["Error"]["Code"] == "404":
print("The bucket does not exist.")
st.session_state.s3_client.create_bucket(
Bucket=st.session_state.BUCKET_NAME
)
st.session_state.s3_client.create_bucket(Bucket=st.session_state.BUCKET_NAME)
print("Bucket created successfully.")
else:
raise err
Expand All @@ -178,9 +167,7 @@ def init_session_state() -> dict:
],
basic_auth=(ENV["ELASTIC_USER"], ENV["ELASTIC_PASSWORD"]),
)
st.session_state.storage_handler = ElasticsearchStorageHandler(
es_client=es, root_index="redbox-data"
)
st.session_state.storage_handler = ElasticsearchStorageHandler(es_client=es, root_index="redbox-data")

if st.session_state.user_uuid == "dev":
st.sidebar.info("**DEV MODE**")
Expand Down Expand Up @@ -245,9 +232,7 @@ def init_session_state() -> dict:
return ENV


def get_link_html(
page: str, text: str, query_dict: Optional[dict] = None, target: str = "_self"
) -> str:
def get_link_html(page: str, text: str, query_dict: Optional[dict] = None, target: str = "_self") -> str:
"""Returns a link in HTML format

Args:
Expand Down Expand Up @@ -403,16 +388,12 @@ def st_render(self, file: File) -> None:
"""

render_method = self.render_methods[file.type]
stream = st.session_state.s3_client.get_object(
Bucket=st.session_state.BUCKET_NAME, Key=file.name
)
stream = st.session_state.s3_client.get_object(Bucket=st.session_state.BUCKET_NAME, Key=file.name)
file_bytes = stream["Body"].read()
render_method(file, file_bytes)

def _render_pdf(self, file: File, page_number: Optional[int] = None) -> None:
stream = st.session_state.s3_client.get_object(
Bucket=st.session_state.BUCKET_NAME, Key=file.name
)
stream = st.session_state.s3_client.get_object(Bucket=st.session_state.BUCKET_NAME, Key=file.name)
base64_pdf = base64.b64encode(stream["Body"].read()).decode("utf-8")

if page_number is not None:
Expand Down Expand Up @@ -444,9 +425,7 @@ def _render_csv(self, file: File, file_bytes: bytes) -> None:
st.dataframe(df, use_container_width=True)

def _render_eml(self, file: File, file_bytes: bytes) -> None:
st.markdown(
self.cleaner.clean_html(file_bytes.decode("utf-8")), unsafe_allow_html=True
)
st.markdown(self.cleaner.clean_html(file_bytes.decode("utf-8")), unsafe_allow_html=True)

def _render_html(self, file: File, file_bytes: bytes) -> None:
markdown_html = html2markdown.convert(file_bytes.decode("utf-8"))
Expand Down
Loading