Skip to content

Commit

Permalink
Redirect recipes during indexing when an earlier-known-origin-URL is …
Browse files Browse the repository at this point in the history
…discovered (#84)
  • Loading branch information
jayaddison authored Dec 12, 2023
1 parent db1c0e1 commit 03e19f2
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 2 deletions.
59 changes: 59 additions & 0 deletions migrations/versions/0ed6bcd27647_add_hash_ids_to_url_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Add hash-ids to URL models
Revision ID: 0ed6bcd27647
Revises: a34cbcbedf7c
Create Date: 2023-12-12 18:26:21.409576
"""
from alembic import op
from pymmh3 import hash_bytes
import sqlalchemy as sa

from reciperadar.models.url import BaseURL


# revision identifiers, used by Alembic.
revision = '0ed6bcd27647'
down_revision = 'a34cbcbedf7c'
branch_labels = None
depends_on = None


def upgrade():
    """Add a nullable string ``id`` column to both URL tables and back-fill it.

    Each existing row's ``id`` is produced by passing the UTF-8 encoded result
    of ``hash_bytes(url)`` to ``BaseURL.generate_id``; rows are matched by
    their ``url`` value when the generated id is written back.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('crawl_urls', schema=None) as batch_op:
        batch_op.add_column(sa.Column('id', sa.String(), nullable=True))

    with op.batch_alter_table('recipe_urls', schema=None) as batch_op:
        batch_op.add_column(sa.Column('id', sa.String(), nullable=True))

    # ### end Alembic commands ###

    connection = op.get_bind()

    # (select statement, update statement) per table, processed in order.
    backfill_statements = (
        (
            'SELECT url FROM crawl_urls',
            'UPDATE crawl_urls SET id = :generated_id WHERE url = :url',
        ),
        (
            'SELECT url FROM recipe_urls',
            'UPDATE recipe_urls SET id = :generated_id WHERE url = :url',
        ),
    )
    for select_sql, update_sql in backfill_statements:
        rows = connection.execute(sa.text(select_sql)).fetchall()
        for (url,) in rows:
            url_hash = hash_bytes(url).encode("utf-8")
            generated_id = BaseURL.generate_id(url_hash)
            connection.execute(
                sa.text(update_sql),
                parameters={"generated_id": generated_id, "url": url},
            )


def downgrade():
    """Drop the ``id`` column from ``recipe_urls`` and then ``crawl_urls``."""
    # ### commands auto generated by Alembic - please adjust! ###
    for table_name in ('recipe_urls', 'crawl_urls'):
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.drop_column('id')

    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Make URL hash-id required and indexed
Revision ID: 330fe0f5e304
Revises: 0ed6bcd27647
Create Date: 2023-12-12 18:41:13.134572
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '330fe0f5e304'
down_revision = '0ed6bcd27647'
branch_labels = None
depends_on = None


def upgrade():
    """Make the ``id`` column non-nullable and add a non-unique index on it.

    Applied to ``crawl_urls`` first, then ``recipe_urls``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    table_indexes = (
        ('crawl_urls', 'ix_crawl_urls_id'),
        ('recipe_urls', 'ix_recipe_urls_id'),
    )
    for table_name, index_name in table_indexes:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.alter_column(
                'id',
                existing_type=sa.VARCHAR(),
                nullable=False,
            )
            batch_op.create_index(batch_op.f(index_name), ['id'], unique=False)

    # ### end Alembic commands ###


def downgrade():
    """Drop the ``id`` index and make the ``id`` column nullable again.

    Applied to ``recipe_urls`` first, then ``crawl_urls``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    table_indexes = (
        ('recipe_urls', 'ix_recipe_urls_id'),
        ('crawl_urls', 'ix_crawl_urls_id'),
    )
    for table_name, index_name in table_indexes:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.drop_index(batch_op.f(index_name))
            batch_op.alter_column(
                'id',
                existing_type=sa.VARCHAR(),
                nullable=True,
            )

    # ### end Alembic commands ###
2 changes: 1 addition & 1 deletion reciperadar/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def to_doc(self):
return {
c.name: getattr(self, c.name)
for c in self.__table__.columns
if not c.foreign_keys
if not c.foreign_keys and getattr(self, c.name) is not None
}


Expand Down
10 changes: 9 additions & 1 deletion reciperadar/models/recipes/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ def product_names(self):

@property
def hidden(self):
return not all([ingredient.product for ingredient in self.ingredients])
if self.redirected_id:
return True
if not all([ingredient.product for ingredient in self.ingredients]):
return True
return False

@property
def recipe_url(self):
Expand Down Expand Up @@ -96,6 +100,9 @@ def from_doc(doc):
servings=doc["servings"],
time=doc["time"],
rating=doc["rating"],
indexed_at=doc["indexed_at"],
redirected_id=doc.get("redirected_id"),
redirected_at=doc.get("redirected_at"),
)

@property
Expand Down Expand Up @@ -206,6 +213,7 @@ def to_doc(self):
else self.aggregate_ingredient_nutrition
)
data["nutrition_source"] = "crawler" if self.nutrition else "aggregation"
data["redirected_id"] = self.redirected_id # explicit foreign key serialization
data["is_dairy_free"] = self.is_dairy_free
data["is_gluten_free"] = self.is_gluten_free
data["is_vegan"] = self.is_vegan
Expand Down
4 changes: 4 additions & 0 deletions reciperadar/models/url.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from datetime import datetime, timedelta
import httpx
from pymmh3 import hash_bytes
from tld import get_tld

from reciperadar import db
Expand Down Expand Up @@ -30,10 +31,13 @@ class BackoffException(Exception):

def __init__(self, *args, **kwargs):
    """Derive ``id`` and ``domain`` keyword arguments from ``url`` when given.

    If a ``url`` keyword is present, ``id`` is set from
    ``BaseURL.generate_id`` applied to the UTF-8 encoded ``hash_bytes`` of the
    URL, and ``domain`` is set to the ``fld`` attribute of the ``get_tld``
    result. All arguments are then forwarded to the parent constructor.
    """
    if "url" in kwargs:
        url = kwargs["url"]
        # The id is assigned before the tld lookup, as in the original order.
        kwargs["id"] = BaseURL.generate_id(hash_bytes(url).encode("utf-8"))
        tld_result = get_tld(url, as_object=True, search_private=False)
        kwargs["domain"] = tld_result.fld
    super().__init__(*args, **kwargs)

id = db.Column(db.String, nullable=False, index=True)
url = db.Column(db.String, primary_key=True)
domain = db.Column(db.String)
earliest_crawled_at = db.Column(db.DateTime)
Expand Down
6 changes: 6 additions & 0 deletions reciperadar/workers/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ def index_recipe(recipe_id):
db.session.close()
return

# Display only the oldest-known recipe of record; redirect all others to it
earliest_crawl = CrawlURL.find_earliest_crawl(recipe.dst)
if earliest_crawl and recipe.id != earliest_crawl.id:
recipe.redirected_id = earliest_crawl.id
print(f"Redirected {recipe.id} to {earliest_crawl.id} url={earliest_crawl.url}")

if recipe.index():
print(f"Indexed {recipe.id} for url={recipe.src}")
db.session.commit()
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def raw_recipe_hit(products):
"domain": "example.com",
"servings": 2,
"rating": 4.5,
"indexed_at": "1970-01-01T01:02:03.456789",
},
"inner_hits": {"ingredients": {"hits": {"hits": []}}},
}

0 comments on commit 03e19f2

Please sign in to comment.