From 96a1e98b65025166e260f434f597fbcc2d7caa50 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 10:32:52 +0100 Subject: [PATCH 1/8] added publisher-handler --- core_api/src/publisher_handler.py | 23 +++++++++++++++++++++++ core_api/src/routes/file.py | 6 +++--- 2 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 core_api/src/publisher_handler.py diff --git a/core_api/src/publisher_handler.py b/core_api/src/publisher_handler.py new file mode 100644 index 000000000..efd34d086 --- /dev/null +++ b/core_api/src/publisher_handler.py @@ -0,0 +1,23 @@ +from faststream.redis import RedisBroker + +from redbox.models import File + + +class FilePublisher: + """This class is a bit of a hack to overcome a shortcoming (bug?) in faststream + whereby the broker is not automatically connected in sub-applications. + + TODO: fix this properly, or raise an issue against faststream + """ + + def __init__(self, broker: RedisBroker, queue_name: str): + self.connected = False + self.broker = broker + self.queue_name = queue_name + + async def publish(self, file: File): + if not self.connected: + # we only do this once + await self.broker.connect() + self.connected = True + return self.broker.publish(file, self.queue_name) diff --git a/core_api/src/routes/file.py b/core_api/src/routes/file.py index c8367cf71..5f153d28d 100644 --- a/core_api/src/routes/file.py +++ b/core_api/src/routes/file.py @@ -5,6 +5,7 @@ from faststream.redis.fastapi import RedisRouter from pydantic import AnyHttpUrl +from core_api.src.publisher_handler import FilePublisher from redbox.models import Chunk, File, FileStatus, Settings from redbox.storage import ElasticsearchStorageHandler @@ -23,8 +24,8 @@ # === Queues === router = RedisRouter(url=env.redis_url) -publisher = router.publisher(env.ingest_queue_name) +file_publisher = FilePublisher(router.broker, env.ingest_queue_name) # === Storage === @@ -69,8 +70,7 @@ async def create_upload_file(name: str, type: str, location: AnyHttpUrl) -> UUID storage_handler.write_item(file) log.info(f"publishing {file.uuid}") - await router.broker.connect() - await publisher.publish(file) + await file_publisher.publish(file) return file.uuid From c7d0d599702b2654b64b5c29f6ff586ff5507fc5 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 13:11:56 +0100 Subject: [PATCH 2/8] awaiting publish --- core_api/src/publisher_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core_api/src/publisher_handler.py b/core_api/src/publisher_handler.py index efd34d086..b744be62d 100644 --- a/core_api/src/publisher_handler.py +++ b/core_api/src/publisher_handler.py @@ -20,4 +20,4 @@ async def publish(self, file: File): # we only do this once await self.broker.connect() self.connected = True - return self.broker.publish(file, self.queue_name) + await self.broker.publish(file, self.queue_name) From b81f95307a96d3cdc34be7f237891fb0f796e3b1 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 15:45:48 +0100 Subject: [PATCH 3/8] File now just bucket/key --- core_api/src/routes/file.py | 21 +++------ core_api/tests/conftest.py | 15 +------ core_api/tests/routes/test_file.py | 17 +++----- django_app/redbox_app/redbox_core/client.py | 14 +++--- django_app/redbox_app/redbox_core/views.py | 8 ++-- ingester/tests/conftest.py | 13 +----- redbox/models/file.py | 48 ++------------------- redbox/parsing/chunkers.py | 2 +- redbox/parsing/file_chunker.py | 29 ++++++++++++- tests/test_e2e.py | 11 +---- 10 files changed, 57 insertions(+), 121 deletions(-) diff --git a/core_api/src/routes/file.py b/core_api/src/routes/file.py index 5f153d28d..b4253ee7b 100644 --- a/core_api/src/routes/file.py +++ b/core_api/src/routes/file.py @@ -3,7 +3,6 @@ from fastapi import FastAPI, HTTPException from faststream.redis.fastapi import RedisRouter -from pydantic import AnyHttpUrl from core_api.src.publisher_handler import FilePublisher from redbox.models import Chunk, File, FileStatus, Settings @@ -49,30 +48,22 @@ @file_app.post("/", tags=["file"]) -async def create_upload_file(name: str, type: str, location: AnyHttpUrl) -> UUID: - """Upload a file to the object store and create a record in the database +async def add_file(file: File) -> File: + """Create a File record in the database Args: - name (str): The file name to be recorded - type (str): The file type to be recorded - location (AnyHttpUrl): The presigned file resource location + file (File): The file to be recorded Returns: - UUID: The file uuid from the elastic database + File: The file uuid from the elastic database """ - file = File( - name=name, - url=str(location), # avoids JSON serialisation error - content_type=type, - ) - storage_handler.write_item(file) log.info(f"publishing {file.uuid}") await file_publisher.publish(file) - return file.uuid + return file @file_app.get("/{file_uuid}", response_model=File, tags=["file"]) @@ -99,7 +90,7 @@ def delete_file(file_uuid: UUID) -> File: File: The file that was deleted """ file = storage_handler.read_item(file_uuid, model_type="File") - s3.delete_object(Bucket=env.bucket_name, Key=file.name) + s3.delete_object(Bucket=env.bucket_name, Key=file.key) storage_handler.delete_item(file) chunks = storage_handler.get_file_chunks(file.uuid) diff --git a/core_api/tests/conftest.py b/core_api/tests/conftest.py index c73a5611f..beddb62a4 100644 --- a/core_api/tests/conftest.py +++ b/core_api/tests/conftest.py @@ -48,11 +48,6 @@ def elasticsearch_storage_handler(es_client): @pytest.fixture def file(s3_client, file_pdf_path) -> YieldFixture[File]: - """ - TODO: this is a cut and paste of core_api:create_upload_file - When we come to test core_api we should think about - the relationship between core_api and the ingester app - """ file_name = os.path.basename(file_pdf_path) file_type = f'.{file_name.split(".")[-1]}' @@ -64,15 +59,7 @@ def file(s3_client, file_pdf_path) -> YieldFixture[File]: Tagging=f"file_type={file_type}", ) - authenticated_s3_url = s3_client.generate_presigned_url( - "get_object", - Params={"Bucket": env.bucket_name, "Key": file_name}, - ExpiresIn=3600, - ) - - # Strip off the query string (we don't need the keys) - simple_s3_url = authenticated_s3_url.split("?")[0] - file_record = File(name=file_name, url=simple_s3_url, content_type=file_type) + file_record = File(key=file_name, bucket=env.bucket_name) yield file_record diff --git a/core_api/tests/routes/test_file.py b/core_api/tests/routes/test_file.py index efff97d2a..113ec8806 100644 --- a/core_api/tests/routes/test_file.py +++ b/core_api/tests/routes/test_file.py @@ -25,19 +25,12 @@ async def test_post_file_upload(s3_client, app_client, elasticsearch_storage_han ExtraArgs={"Tagging": "file_type=pdf"}, ) - authenticated_s3_url = s3_client.generate_presigned_url( - "get_object", - Params={"Bucket": env.bucket_name, "Key": file_key}, - ExpiresIn=3600, - ) - async with TestRedisBroker(router.broker): response = app_client.post( "/file", - params={ - "name": "filename", - "type": ".pdf", - "location": authenticated_s3_url, + json={ + "key": file_key, + "bucket": env.bucket_name, }, ) assert response.status_code == 200 @@ -61,7 +54,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk I Expect to see it removed from s3 and elastic-search, including the chunks """ # check assets exist - assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.name) + assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid) @@ -72,7 +65,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk # check assets dont exist with pytest.raises(Exception): - s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.name) + s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) with pytest.raises(NotFoundError): elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") diff --git a/django_app/redbox_app/redbox_core/client.py b/django_app/redbox_app/redbox_core/client.py index b19443509..d9051668d 100644 --- a/django_app/redbox_app/redbox_core/client.py +++ b/django_app/redbox_app/redbox_core/client.py @@ -47,20 +47,16 @@ def url(self) -> str: def upload_file(self, s3_url: str, name: str, extension: str): if self.host == "testserver": file = { - "url": "s3 url", - "content_type": "application/pdf", - "name": "my-test-file.pdf", - "text": "once upon a time....", - "processing_status": "uploaded", + "key": "my-test-file.pdf", + "bucket": settings.BUCKET_NAME, } return file response = requests.post( f"{self.url}/file", - params={ - "name": name, - "type": extension, - "location": s3_url, + json={ + "key": name, + "bucket": settings.BUCKET_NAME, }, ) if response.status_code != 201: diff --git a/django_app/redbox_app/redbox_core/views.py b/django_app/redbox_app/redbox_core/views.py index 0eee4eada..a910a1b2e 100644 --- a/django_app/redbox_app/redbox_core/views.py +++ b/django_app/redbox_app/redbox_core/views.py @@ -74,7 +74,7 @@ def documents_view(request): def get_file_extension(file): # TODO: use a third party checking service to validate this - _, extension = os.path.splitext(file.name) + _, extension = os.path.splitext(file.key) return extension @@ -88,7 +88,7 @@ def upload_view(request): file_extension = get_file_extension(uploaded_file) - if uploaded_file.name is None: + if uploaded_file.key is None: errors["upload_doc"].append("File has no name") if uploaded_file.content_type is None: errors["upload_doc"].append("File has no content-type") @@ -105,7 +105,7 @@ def upload_view(request): Bucket=settings.BUCKET_NAME, Fileobj=uploaded_file, Key=file_key, - ExtraArgs={"Tagging": f"file_type={uploaded_file.content_type}"}, + ExtraArgs={"Tagging": f"file_type={file_extension}"}, Config=TransferConfig( multipart_chunksize=CHUNK_SIZE, preferred_transfer_client="auto", @@ -132,7 +132,7 @@ def upload_view(request): try: api.upload_file( - uploaded_file.name, + uploaded_file.key, file_extension, simple_s3_url, ) diff --git a/ingester/tests/conftest.py b/ingester/tests/conftest.py index 5552dc01a..27a4d3ed6 100644 --- a/ingester/tests/conftest.py +++ b/ingester/tests/conftest.py @@ -60,18 +60,9 @@ def file(s3_client, file_pdf_path): Tagging=f"file_type={file_type}", ) - authenticated_s3_url = s3_client.generate_presigned_url( - "get_object", - Params={"Bucket": env.bucket_name, "Key": file_name}, - ExpiresIn=3600, - ) - - # Strip off the query string (we don't need the keys) - simple_s3_url = authenticated_s3_url.split("?")[0] file_record = File( - name=file_name, - url=simple_s3_url, - content_type=file_type, + key=file_name, + bucket=env.bucket_name, ) yield file_record diff --git a/redbox/models/file.py b/redbox/models/file.py index 01e472fdc..3e33af961 100644 --- a/redbox/models/file.py +++ b/redbox/models/file.py @@ -18,53 +18,11 @@ class ProcessingStatusEnum(str, Enum): complete = "complete" -class ContentType(str, Enum): - EML = ".eml" - HTML = ".html" - HTM = ".htm" - JSON = ".json" - MD = ".md" - MSG = ".msg" - RST = ".rst" - RTF = ".rtf" - TXT = ".txt" - XML = ".xml" - JPEG = ".jpeg" # Must have tesseract installed - PNG = ".png" # Must have tesseract installed - CSV = ".csv" - DOC = ".doc" - DOCX = ".docx" - EPUB = ".epub" - ODT = ".odt" - PDF = ".pdf" - PPT = ".ppt" - PPTX = ".pptx" - TSV = ".tsv" - XLSX = ".xlsx" - - class File(PersistableModel): - url: AnyUrl = Field(description="s3 url") - content_type: ContentType = Field(description="content_type of file") - name: str = Field(description="file name") - text: Optional[str] = Field(description="file content", default=None) - - @computed_field - def text_hash(self) -> str: - return hashlib.md5( - (self.text or "").encode(encoding="UTF-8", errors="strict"), - usedforsecurity=False, - ).hexdigest() - - @computed_field - def token_count(self) -> int: - return len(encoding.encode(self.text or "")) + """Reference to file stored on s3""" - def to_document(self) -> Document: - return Document( - page_content=f"Title: {self.name}\n\n{self.text}\n\n", - metadata={"source": self.url}, - ) + key: str = Field(description="file key") + bucket: str = Field(description="s3 bucket") class Chunk(PersistableModel): diff --git a/redbox/parsing/chunkers.py b/redbox/parsing/chunkers.py index 18c20d219..579ac8bd5 100644 --- a/redbox/parsing/chunkers.py +++ b/redbox/parsing/chunkers.py @@ -10,7 +10,7 @@ def other_chunker(file: File) -> list[Chunk]: authenticated_s3_url = s3_client.generate_presigned_url( "get_object", - Params={"Bucket": env.bucket_name, "Key": file.name}, + Params={"Bucket": env.bucket_name, "Key": file.key}, ExpiresIn=3600, ) diff --git a/redbox/parsing/file_chunker.py b/redbox/parsing/file_chunker.py index a3f4a15b8..03bb81097 100644 --- a/redbox/parsing/file_chunker.py +++ b/redbox/parsing/file_chunker.py @@ -1,10 +1,37 @@ +from enum import Enum + from sentence_transformers import SentenceTransformer -from redbox.models.file import Chunk, ContentType, File +from redbox.models.file import Chunk, File from redbox.parsing.chunk_clustering import cluster_chunks from redbox.parsing.chunkers import other_chunker +class ContentType(str, Enum): + EML = ".eml" + HTML = ".html" + HTM = ".htm" + JSON = ".json" + MD = ".md" + MSG = ".msg" + RST = ".rst" + RTF = ".rtf" + TXT = ".txt" + XML = ".xml" + JPEG = ".jpeg" # Must have tesseract installed + PNG = ".png" # Must have tesseract installed + CSV = ".csv" + DOC = ".doc" + DOCX = ".docx" + EPUB = ".epub" + ODT = ".odt" + PDF = ".pdf" + PPT = ".ppt" + PPTX = ".pptx" + TSV = ".tsv" + XLSX = ".xlsx" + + class FileChunker: """A class to wrap unstructured and generate compliant chunks from files""" diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 0dacd5bb7..62701c861 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -23,18 +23,11 @@ def test_upload_to_elastic(file_pdf_path, s3_client): ExtraArgs={"Tagging": "file_type=pdf"}, ) - authenticated_s3_url = s3_client.generate_presigned_url( - "get_object", - Params={"Bucket": bucket_name, "Key": file_key}, - ExpiresIn=3600, - ) - response = requests.post( url="http://localhost:5002/file", params={ - "name": "filename", - "type": ".pdf", - "location": authenticated_s3_url, + "name": file_key, + "bucket": bucket_name, }, ) assert response.status_code == 200 From 245de7d3d1abe5dcda5f6226dffa21829e356288 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 15:51:48 +0100 Subject: [PATCH 4/8] fixed django client --- django_app/redbox_app/redbox_core/client.py | 4 ++-- django_app/redbox_app/redbox_core/views.py | 24 ++++----------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/django_app/redbox_app/redbox_core/client.py b/django_app/redbox_app/redbox_core/client.py index d9051668d..dd34d41f8 100644 --- a/django_app/redbox_app/redbox_core/client.py +++ b/django_app/redbox_app/redbox_core/client.py @@ -44,10 +44,10 @@ def __init__(self, host: str, port: int): def url(self) -> str: return f"{self.host}:{self.port}" - def upload_file(self, s3_url: str, name: str, extension: str): + def upload_file(self, name: str): if self.host == "testserver": file = { - "key": "my-test-file.pdf", + "key": name, "bucket": settings.BUCKET_NAME, } return file diff --git a/django_app/redbox_app/redbox_core/views.py b/django_app/redbox_app/redbox_core/views.py index a910a1b2e..bce4f472f 100644 --- a/django_app/redbox_app/redbox_core/views.py +++ b/django_app/redbox_app/redbox_core/views.py @@ -74,7 +74,7 @@ def documents_view(request): def get_file_extension(file): # TODO: use a third party checking service to validate this - _, extension = os.path.splitext(file.key) + _, extension = os.path.splitext(file.name) return extension @@ -88,7 +88,7 @@ def upload_view(request): file_extension = get_file_extension(uploaded_file) - if uploaded_file.key is None: + if uploaded_file.name is None: errors["upload_doc"].append("File has no name") if uploaded_file.content_type is None: errors["upload_doc"].append("File has no content-type") @@ -105,7 +105,7 @@ def upload_view(request): Bucket=settings.BUCKET_NAME, Fileobj=uploaded_file, Key=file_key, - ExtraArgs={"Tagging": f"file_type={file_extension}"}, + ExtraArgs={"Tagging": f"file_type={uploaded_file.content_type}"}, Config=TransferConfig( multipart_chunksize=CHUNK_SIZE, preferred_transfer_client="auto", @@ -115,27 +115,11 @@ def upload_view(request): ), ) - # TODO: Handle S3 upload errors - authenticated_s3_url = s3.generate_presigned_url( - "get_object", - Params={ - "Bucket": settings.BUCKET_NAME, - "Key": file_key, - }, - ExpiresIn=3600, - ) - # Strip off the query string (we don't need the keys) - simple_s3_url = authenticated_s3_url.split("?")[0] - # ingest file api = CoreApiClient(host=settings.CORE_API_HOST, port=settings.CORE_API_PORT) try: - api.upload_file( - uploaded_file.key, - file_extension, - simple_s3_url, - ) + api.upload_file(uploaded_file.name) # TODO: update improved File object with elastic uuid uploaded = True except ValueError as value_error: From 2d1611b4b2ccf7f9a85ebc18e1fd406b303d0392 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 16:10:18 +0100 Subject: [PATCH 5/8] params -> json --- tests/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 62701c861..e11a5a305 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -25,7 +25,7 @@ def test_upload_to_elastic(file_pdf_path, s3_client): response = requests.post( url="http://localhost:5002/file", - params={ + json={ "name": file_key, "bucket": bucket_name, }, From e93cbec5aa0a65b7e1d737d83c38652dcb672fda Mon Sep 17 00:00:00 2001 From: George Burton <8233643+gecBurton@users.noreply.github.com> Date: Sat, 13 Apr 2024 17:48:00 +0100 Subject: [PATCH 6/8] Update tests/test_e2e.py --- tests/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index e11a5a305..9cbd7a92f 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -26,7 +26,7 @@ def test_upload_to_elastic(file_pdf_path, s3_client): response = requests.post( url="http://localhost:5002/file", json={ - "name": file_key, + "key": file_key, "bucket": bucket_name, }, ) From c61376d0ad7f0041079f4da727eb923cc4dce814 Mon Sep 17 00:00:00 2001 From: George Burton Date: Sat, 13 Apr 2024 18:29:50 +0100 Subject: [PATCH 7/8] fetching uuid from payload --- tests/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 9cbd7a92f..8f873873f 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -31,7 +31,7 @@ def test_upload_to_elastic(file_pdf_path, s3_client): }, ) assert response.status_code == 200 - file_uuid = response.json() + file_uuid = response.json()["uuid"] timeout = 120 start_time = time.time() From 5c4b99e2a04b846c22243de1d48340f536ef3e44 Mon Sep 17 00:00:00 2001 From: George Burton Date: Mon, 15 Apr 2024 09:14:18 +0100 Subject: [PATCH 8/8] reformatted --- django_app/redbox_app/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/django_app/redbox_app/settings.py b/django_app/redbox_app/settings.py index 2ff621450..f76cd97cb 100644 --- a/django_app/redbox_app/settings.py +++ b/django_app/redbox_app/settings.py @@ -152,7 +152,7 @@ "s3.amazonaws.com", ) CSP_SCRIPT_SRC = ( - "'self'", + "'self'", "plausible.io", "'sha256-GUQ5ad8JK5KmEWmROf3LZd9ge94daqNvd8xy9YS1iDw='", )