Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete expired files with management command #518

Merged
merged 3 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
from datetime import timedelta

from botocore.exceptions import BotoCoreError
from django.conf import settings
from django.core.management import BaseCommand
from django.utils import timezone
from redbox_app.redbox_core.client import CoreApiClient
from redbox_app.redbox_core.models import File, StatusEnum
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)
core_api = CoreApiClient(host=settings.CORE_API_HOST, port=settings.CORE_API_PORT)


class Command(BaseCommand):
help = """This should be run daily per environment to remove expired data.
It removes Files that have exceeded their expiry date.
"""

def handle(self, *_args, **_kwargs):
cutoff_date = timezone.now() - timedelta(seconds=settings.FILE_EXPIRY_IN_SECONDS)

self.stdout.write(self.style.NOTICE(f"Deleting Files expired before {cutoff_date}"))
counter = 0

for file in File.objects.filter(last_referenced__lt=cutoff_date):
logger.debug(
"Deleting file object %s, last_referenced %s",
file,
file.last_referenced,
)

try:
core_api.delete_file(file.core_file_uuid, file.user)
file.delete_from_s3()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be part of the File.delete method 🤔 , probably doesnt add much and just over-complicates things?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We currently have a File.delete that actually deletes the file, whereas this just removes the file contents but keeps a record of the File in Django (for auditing processes). I think this was a product decision a few weeks ago.


except RequestException as e:
logger.exception("Error deleting file object %s using core-api", file, exc_info=e)
file.status = StatusEnum.errored
file.save()
except BotoCoreError as e:
logger.exception("Error deleting file object %s from storage", file, exc_info=e)
file.status = StatusEnum.errored
file.save()
else:
file.status = StatusEnum.deleted
file.save()
logger.debug("File object %s deleted", file)

counter += 1

self.stdout.write(self.style.SUCCESS(f"Successfully deleted {counter} file objects"))
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 5.0.6 on 2024-06-05 06:46

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("redbox_core", "0011_alter_file_processing_status"),
]

operations = [
migrations.AlterField(
model_name="file",
name="status",
field=models.CharField(
choices=[
("uploaded", "Uploaded"),
("parsing", "Parsing"),
("chunking", "Chunking"),
("embedding", "Embedding"),
("indexing", "Indexing"),
("complete", "Complete"),
("unknown", "Unknown"),
("deleted", "Deleted"),
("errored", "Errored"),
]
),
),
]
1 change: 1 addition & 0 deletions django_app/redbox_app/redbox_core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class StatusEnum(models.TextChoices):
complete = "complete"
unknown = "unknown"
deleted = "deleted"
errored = "errored"


class File(UUIDPrimaryKeyBase, TimeStampedModel):
Expand Down
11 changes: 6 additions & 5 deletions django_app/redbox_app/redbox_core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
StatusEnum,
User,
)
from requests.exceptions import HTTPError
from requests.exceptions import RequestException
from yarl import URL

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -66,7 +66,8 @@ def homepage_view(request):

@login_required
def documents_view(request):
files = File.objects.filter(user=request.user).exclude(status=StatusEnum.deleted).order_by("-created_at")
hidden_statuses = [StatusEnum.deleted, StatusEnum.errored]
files = File.objects.filter(user=request.user).exclude(status__in=hidden_statuses).order_by("-created_at")

return render(
request,
Expand Down Expand Up @@ -130,7 +131,7 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> list[str]:
else:
try:
upload_file_response = core_api.upload_file(file.unique_name, user)
except HTTPError as e:
except RequestException as e:
logger.exception("Error uploading file object %s.", file, exc_info=e)
file.delete()
errors.append("failed to connect to core-api")
Expand All @@ -148,7 +149,7 @@ def remove_doc_view(request, doc_id: uuid):
if request.method == "POST":
try:
core_api.delete_file(file.core_file_uuid, request.user)
except HTTPError as e:
except RequestException as e:
logger.exception("Error deleting file object %s.", file, exc_info=e)
errors.append("There was an error deleting this file")

Expand Down Expand Up @@ -240,7 +241,7 @@ def file_status_api_view(request: HttpRequest) -> JsonResponse:
return JsonResponse({"status": StatusEnum.unknown.label})
try:
core_file_status_response = core_api.get_file_status(file_id=file.core_file_uuid, user=request.user)
except HTTPError as ex:
except RequestException as ex:
logger.exception("File object information from core not found - file does not exist %s.", file_id, exc_info=ex)
if not file.status:
file.status = StatusEnum.unknown.label
Expand Down
95 changes: 94 additions & 1 deletion django_app/tests/management/test_commands.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import uuid
from datetime import UTC, datetime, timedelta
from io import StringIO
from unittest import mock

import pytest
import requests
from botocore.exceptions import UnknownClientMethodError
from django.conf import settings
from django.core.management import CommandError, call_command
from django.utils import timezone
from magic_link.models import MagicLink
from redbox_app.redbox_core.models import User
from redbox_app.redbox_core.models import File, StatusEnum, User
from requests_mock import Mocker

# === show_magiclink_url command tests ===


@pytest.mark.django_db()
Expand Down Expand Up @@ -55,3 +64,87 @@ def test_command_output_with_valid_links(alice: User):

# Then
assert out.getvalue().strip() == link.get_absolute_url()


# === delete_expired_data command tests ===
EXPIRED_FILE_DATE = timezone.now() - timedelta(seconds=(settings.FILE_EXPIRY_IN_SECONDS + 60))


@pytest.mark.parametrize(
("last_referenced", "should_delete"),
[
(EXPIRED_FILE_DATE, True),
(timezone.now(), False),
],
)
@pytest.mark.django_db()
def test_delete_expired_data(uploaded_file: File, requests_mock: Mocker, last_referenced: datetime, should_delete: bool):
# Given
mock_file = uploaded_file
mock_file.last_referenced = last_referenced
mock_file.save()

requests_mock.delete(
f"http://{settings.CORE_API_HOST}:{settings.CORE_API_PORT}/file/{mock_file.core_file_uuid}",
status_code=201,
json={
"key": mock_file.original_file_name,
"bucket": settings.BUCKET_NAME,
"uuid": str(uuid.uuid4()),
},
)

# When
call_command("delete_expired_data")

# Then
is_deleted = File.objects.get(id=mock_file.id).status == StatusEnum.deleted
assert is_deleted == should_delete


@pytest.mark.django_db()
def test_delete_expired_data_with_api_error(uploaded_file: File, requests_mock: Mocker):
# Given
mock_file = uploaded_file
mock_file.last_referenced = EXPIRED_FILE_DATE
mock_file.save()

(
requests_mock.delete(
f"http://{settings.CORE_API_HOST}:{settings.CORE_API_PORT}/file/{mock_file.core_file_uuid}",
exc=requests.exceptions.HTTPError,
),
)

# When
call_command("delete_expired_data")

# Then
assert File.objects.get(id=mock_file.id).status == StatusEnum.errored


@pytest.mark.django_db()
def test_delete_expired_data_with_s3_error(uploaded_file: File, requests_mock: Mocker):
with mock.patch("redbox_app.redbox_core.models.File.delete_from_s3") as s3_mock:
s3_mock.side_effect = UnknownClientMethodError(method_name="")

# Given
mock_file = uploaded_file
mock_file.last_referenced = EXPIRED_FILE_DATE
mock_file.save()

requests_mock.delete(
f"http://{settings.CORE_API_HOST}:{settings.CORE_API_PORT}/file/{mock_file.core_file_uuid}",
status_code=201,
json={
"key": mock_file.original_file_name,
"bucket": settings.BUCKET_NAME,
"uuid": str(uuid.uuid4()),
},
)

# When
call_command("delete_expired_data")

# Then
assert File.objects.get(id=mock_file.id).status == StatusEnum.errored
4 changes: 2 additions & 2 deletions django_app/tests_playwright/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ class DocumentRow(NamedTuple):
class DocumentsPage(SignedInBasePage):
def get_expected_page_title(self) -> str:
return "Documents - Redbox Copilot"

def delete_latest_document(self) -> "DocumentDeletePage":
self.page.get_by_role("button", name="Remove").first.click()
return DocumentDeletePage(self.page)
Expand All @@ -172,7 +172,7 @@ def get_all_document_rows(self) -> list[DocumentRow]:
class DocumentDeletePage(SignedInBasePage):
def get_expected_page_title(self) -> str:
return "Remove document - Redbox Copilot"

def confirm_deletion(self) -> "DocumentsPage":
self.page.get_by_role("button", name="Remove").click()
return DocumentsPage(self.page)
Expand Down
Loading