bugfix/implicit single connection #224

Merged

23 changes: 23 additions & 0 deletions core_api/src/publisher_handler.py
@@ -0,0 +1,23 @@
from faststream.redis import RedisBroker

from redbox.models import File


class FilePublisher:
    """This class is a bit of a hack to overcome a shortcoming (bug?) in faststream
    whereby the broker is not automatically connected in sub-applications.

    TODO: fix this properly, or raise an issue against faststream
    """

    def __init__(self, broker: RedisBroker, queue_name: str):
        self.connected = False
        self.broker = broker
        self.queue_name = queue_name

    async def publish(self, file: File):
        if not self.connected:
            # we only do this once
            await self.broker.connect()
            self.connected = True
        await self.broker.publish(file, self.queue_name)
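
For context, a minimal sketch of how this wrapper might be used from the API process — the broker URL and queue name below are illustrative assumptions, not values from this PR:

# Hypothetical usage sketch: build the publisher once at module load;
# the first publish() call is what actually opens the broker connection.
from faststream.redis import RedisBroker

from core_api.src.publisher_handler import FilePublisher
from redbox.models import File

broker = RedisBroker("redis://localhost:6379")  # assumed Redis URL
file_publisher = FilePublisher(broker, "ingest")  # assumed queue name

async def ingest(file: File) -> None:
    # connect() happens lazily inside publish(), sidestepping the
    # sub-application auto-connect issue described in the docstring
    await file_publisher.publish(file)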
27 changes: 9 additions & 18 deletions core_api/src/routes/file.py
@@ -3,8 +3,8 @@

from fastapi import FastAPI, HTTPException
from faststream.redis.fastapi import RedisRouter
-from pydantic import AnyHttpUrl

+from core_api.src.publisher_handler import FilePublisher
from redbox.models import Chunk, File, FileStatus, Settings
from redbox.storage import ElasticsearchStorageHandler

@@ -23,8 +23,8 @@

# === Queues ===
router = RedisRouter(url=env.redis_url)
-publisher = router.publisher(env.ingest_queue_name)
+file_publisher = FilePublisher(router.broker, env.ingest_queue_name)

# === Storage ===

@@ -48,31 +48,22 @@


@file_app.post("/", tags=["file"])
-async def create_upload_file(name: str, type: str, location: AnyHttpUrl) -> UUID:
-    """Upload a file to the object store and create a record in the database
+async def add_file(file: File) -> File:
+    """Create a File record in the database

    Args:
-        name (str): The file name to be recorded
-        type (str): The file type to be recorded
-        location (AnyHttpUrl): The presigned file resource location
+        file (File): The file to be recorded

    Returns:
-        UUID: The file uuid from the elastic database
+        File: The file record created in the elastic database
    """

-    file = File(
-        name=name,
-        url=str(location), # avoids JSON serialisation error
-        content_type=type,
-    )
-
    storage_handler.write_item(file)

    log.info(f"publishing {file.uuid}")
-    await router.broker.connect()
-    await publisher.publish(file)
+    await file_publisher.publish(file)

-    return file.uuid
+    return file


@file_app.get("/{file_uuid}", response_model=File, tags=["file"])
@@ -99,7 +90,7 @@ def delete_file(file_uuid: UUID) -> File:
        File: The file that was deleted
    """
    file = storage_handler.read_item(file_uuid, model_type="File")
-    s3.delete_object(Bucket=env.bucket_name, Key=file.name)
+    s3.delete_object(Bucket=env.bucket_name, Key=file.key)
    storage_handler.delete_item(file)

    chunks = storage_handler.get_file_chunks(file.uuid)
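
The endpoint contract changes shape as well as name: file metadata and a presigned URL in query parameters give way to a File reference posted as JSON, and the response body is the full record rather than a bare UUID. A sketch of the client-side difference, with illustrative host, key, and bucket values:

import requests

# Before: file metadata and a presigned location as query parameters
requests.post(
    "http://localhost:5002/file",
    params={"name": "report.pdf", "type": ".pdf", "location": "https://..."},
)

# After: an s3 key/bucket pair as a JSON body; the uuid is read
# from the returned File record
response = requests.post(
    "http://localhost:5002/file",
    json={"key": "report.pdf", "bucket": "my-bucket"},  # illustrative values
)
file_uuid = response.json()["uuid"]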
15 changes: 1 addition & 14 deletions core_api/tests/conftest.py
@@ -48,11 +48,6 @@ def elasticsearch_storage_handler(es_client):

@pytest.fixture
def file(s3_client, file_pdf_path) -> YieldFixture[File]:
-    """
-    TODO: this is a cut and paste of core_api:create_upload_file
-    When we come to test core_api we should think about
-    the relationship between core_api and the ingester app
-    """
    file_name = os.path.basename(file_pdf_path)
    file_type = f'.{file_name.split(".")[-1]}'
@@ -64,15 +59,7 @@
        Tagging=f"file_type={file_type}",
    )

-    authenticated_s3_url = s3_client.generate_presigned_url(
-        "get_object",
-        Params={"Bucket": env.bucket_name, "Key": file_name},
-        ExpiresIn=3600,
-    )
-
-    # Strip off the query string (we don't need the keys)
-    simple_s3_url = authenticated_s3_url.split("?")[0]
-    file_record = File(name=file_name, url=simple_s3_url, content_type=file_type)
+    file_record = File(key=file_name, bucket=env.bucket_name)

    yield file_record

17 changes: 5 additions & 12 deletions core_api/tests/routes/test_file.py
@@ -25,19 +25,12 @@ async def test_post_file_upload(s3_client, app_client, elasticsearch_storage_han
        ExtraArgs={"Tagging": "file_type=pdf"},
    )

-    authenticated_s3_url = s3_client.generate_presigned_url(
-        "get_object",
-        Params={"Bucket": env.bucket_name, "Key": file_key},
-        ExpiresIn=3600,
-    )
-
    async with TestRedisBroker(router.broker):
        response = app_client.post(
            "/file",
-            params={
-                "name": "filename",
-                "type": ".pdf",
-                "location": authenticated_s3_url,
+            json={
+                "key": file_key,
+                "bucket": env.bucket_name,
            },
        )
        assert response.status_code == 200
@@ -61,7 +54,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk
    I expect to see it removed from s3 and elastic-search, including the chunks
    """
    # check assets exist
-    assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.name)
+    assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key)
    assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")
    assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid)

@@ -72,7 +65,7 @@

    # check assets dont exist
    with pytest.raises(Exception):
-        s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.name)
+        s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key)

    with pytest.raises(NotFoundError):
        elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")
16 changes: 6 additions & 10 deletions django_app/redbox_app/redbox_core/client.py
@@ -44,23 +44,19 @@ def __init__(self, host: str, port: int):
    def url(self) -> str:
        return f"{self.host}:{self.port}"

-    def upload_file(self, s3_url: str, name: str, extension: str):
+    def upload_file(self, name: str):
        if self.host == "testserver":
            file = {
-                "url": "s3 url",
-                "content_type": "application/pdf",
-                "name": "my-test-file.pdf",
-                "text": "once upon a time....",
-                "processing_status": "uploaded",
+                "key": name,
+                "bucket": settings.BUCKET_NAME,
            }
            return file

        response = requests.post(
            f"{self.url}/file",
-            params={
-                "name": name,
-                "type": extension,
-                "location": s3_url,
+            json={
+                "key": name,
+                "bucket": settings.BUCKET_NAME,
            },
        )
        if response.status_code != 201:
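
With the presigned-URL plumbing gone, callers only supply the object key; the bucket comes from Django settings inside the client. Roughly how the Django app now drives it — a sketch based on the views.py change below, with an illustrative file name:

from django.conf import settings

from redbox_app.redbox_core.client import CoreApiClient

api = CoreApiClient(host=settings.CORE_API_HOST, port=settings.CORE_API_PORT)
# Only the key is passed; the client fills in settings.BUCKET_NAME itself
file = api.upload_file("report.pdf")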
18 changes: 1 addition & 17 deletions django_app/redbox_app/redbox_core/views.py
@@ -115,27 +115,11 @@ def upload_view(request):
        ),
    )

-    # TODO: Handle S3 upload errors
-    authenticated_s3_url = s3.generate_presigned_url(
-        "get_object",
-        Params={
-            "Bucket": settings.BUCKET_NAME,
-            "Key": file_key,
-        },
-        ExpiresIn=3600,
-    )
-    # Strip off the query string (we don't need the keys)
-    simple_s3_url = authenticated_s3_url.split("?")[0]
-
    # ingest file
    api = CoreApiClient(host=settings.CORE_API_HOST, port=settings.CORE_API_PORT)

    try:
-        api.upload_file(
-            uploaded_file.name,
-            file_extension,
-            simple_s3_url,
-        )
+        api.upload_file(uploaded_file.name)
        # TODO: update improved File object with elastic uuid
        uploaded = True
    except ValueError as value_error:
2 changes: 1 addition & 1 deletion django_app/redbox_app/settings.py
@@ -152,7 +152,7 @@
"s3.amazonaws.com",
)
CSP_SCRIPT_SRC = (
"'self'",
"'self'",
"plausible.io",
"'sha256-GUQ5ad8JK5KmEWmROf3LZd9ge94daqNvd8xy9YS1iDw='",
)
13 changes: 2 additions & 11 deletions ingester/tests/conftest.py
@@ -60,18 +60,9 @@ def file(s3_client, file_pdf_path):
        Tagging=f"file_type={file_type}",
    )

-    authenticated_s3_url = s3_client.generate_presigned_url(
-        "get_object",
-        Params={"Bucket": env.bucket_name, "Key": file_name},
-        ExpiresIn=3600,
-    )
-
-    # Strip off the query string (we don't need the keys)
-    simple_s3_url = authenticated_s3_url.split("?")[0]
    file_record = File(
-        name=file_name,
-        url=simple_s3_url,
-        content_type=file_type,
+        key=file_name,
+        bucket=env.bucket_name,
    )

    yield file_record
48 changes: 3 additions & 45 deletions redbox/models/file.py
@@ -18,53 +18,11 @@ class ProcessingStatusEnum(str, Enum):
    complete = "complete"


-class ContentType(str, Enum):
-    EML = ".eml"
-    HTML = ".html"
-    HTM = ".htm"
-    JSON = ".json"
-    MD = ".md"
-    MSG = ".msg"
-    RST = ".rst"
-    RTF = ".rtf"
-    TXT = ".txt"
-    XML = ".xml"
-    JPEG = ".jpeg"  # Must have tesseract installed
-    PNG = ".png"  # Must have tesseract installed
-    CSV = ".csv"
-    DOC = ".doc"
-    DOCX = ".docx"
-    EPUB = ".epub"
-    ODT = ".odt"
-    PDF = ".pdf"
-    PPT = ".ppt"
-    PPTX = ".pptx"
-    TSV = ".tsv"
-    XLSX = ".xlsx"
-
-
class File(PersistableModel):
-    url: AnyUrl = Field(description="s3 url")
-    content_type: ContentType = Field(description="content_type of file")
-    name: str = Field(description="file name")
-    text: Optional[str] = Field(description="file content", default=None)
-
-    @computed_field
-    def text_hash(self) -> str:
-        return hashlib.md5(
-            (self.text or "").encode(encoding="UTF-8", errors="strict"),
-            usedforsecurity=False,
-        ).hexdigest()
-
-    @computed_field
-    def token_count(self) -> int:
-        return len(encoding.encode(self.text or ""))
-
-    def to_document(self) -> Document:
-        return Document(
-            page_content=f"<Doc{self.uuid}>Title: {self.name}\n\n{self.text}</Doc{self.uuid}>\n\n",
-            metadata={"source": self.url},
-        )
+    """Reference to file stored on s3"""
+
+    key: str = Field(description="file key")
+    bucket: str = Field(description="s3 bucket")


class Chunk(PersistableModel):
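
The model becomes a pure pointer into s3 — content, hashing, token counting and Document conversion all leave the persistence layer. A minimal construction sketch, with illustrative key and bucket values:

from redbox.models import File

# Only the object's location is stored; anything needing the bytes
# derives access from key + bucket (e.g. a presigned URL, as below).
file = File(key="report.pdf", bucket="my-bucket")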
2 changes: 1 addition & 1 deletion redbox/parsing/chunkers.py
@@ -10,7 +10,7 @@
def other_chunker(file: File) -> list[Chunk]:
    authenticated_s3_url = s3_client.generate_presigned_url(
        "get_object",
-        Params={"Bucket": env.bucket_name, "Key": file.name},
+        Params={"Bucket": env.bucket_name, "Key": file.key},
        ExpiresIn=3600,
    )
29 changes: 28 additions & 1 deletion redbox/parsing/file_chunker.py
@@ -1,10 +1,37 @@
+from enum import Enum
+
from sentence_transformers import SentenceTransformer

-from redbox.models.file import Chunk, ContentType, File
+from redbox.models.file import Chunk, File
from redbox.parsing.chunk_clustering import cluster_chunks
from redbox.parsing.chunkers import other_chunker


+class ContentType(str, Enum):
+    EML = ".eml"
+    HTML = ".html"
+    HTM = ".htm"
+    JSON = ".json"
+    MD = ".md"
+    MSG = ".msg"
+    RST = ".rst"
+    RTF = ".rtf"
+    TXT = ".txt"
+    XML = ".xml"
+    JPEG = ".jpeg"  # Must have tesseract installed
+    PNG = ".png"  # Must have tesseract installed
+    CSV = ".csv"
+    DOC = ".doc"
+    DOCX = ".docx"
+    EPUB = ".epub"
+    ODT = ".odt"
+    PDF = ".pdf"
+    PPT = ".ppt"
+    PPTX = ".pptx"
+    TSV = ".tsv"
+    XLSX = ".xlsx"
+
+
class FileChunker:
    """A class to wrap unstructured and generate compliant chunks from files"""

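
Moving ContentType here keeps the list of supported extensions next to the only code that acts on it. A hypothetical helper (not in the diff) showing how the enum might gate incoming file keys:

from pathlib import Path

from redbox.parsing.file_chunker import ContentType

def is_chunkable(key: str) -> bool:
    # Compare a file key's extension against the enum's values
    suffix = Path(key).suffix.lower()
    return suffix in {content_type.value for content_type in ContentType}

assert is_chunkable("report.pdf")
assert not is_chunkable("archive.zip")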
15 changes: 4 additions & 11 deletions tests/test_e2e.py
@@ -23,22 +23,15 @@ def test_upload_to_elastic(file_pdf_path, s3_client):
        ExtraArgs={"Tagging": "file_type=pdf"},
    )

-    authenticated_s3_url = s3_client.generate_presigned_url(
-        "get_object",
-        Params={"Bucket": bucket_name, "Key": file_key},
-        ExpiresIn=3600,
-    )
-
    response = requests.post(
        url="http://localhost:5002/file",
-        params={
-            "name": "filename",
-            "type": ".pdf",
-            "location": authenticated_s3_url,
+        json={
+            "key": file_key,
+            "bucket": bucket_name,
        },
    )
    assert response.status_code == 200
-    file_uuid = response.json()
+    file_uuid = response.json()["uuid"]

    timeout = 120
    start_time = time.time()