Skip to content

Commit

Permalink
Refactor: relocate find_earliest_crawl, find_latest_crawl methods (#83)
Browse files: browse the repository at this point in the history
  • Loading branch information
jayaddison authored Dec 11, 2023
1 parent 6dadabb commit 1b1e951
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 25 deletions.
26 changes: 12 additions & 14 deletions reciperadar/models/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,10 @@ def _make_request(self):
self.resolves_to = response.json()["url"]["resolves_to"]
return response


class RecipeURL(BaseURL):
__tablename__ = "recipe_urls"

recipe_scrapers_version = db.Column(db.String)

def find_earliest_crawl(self):
@staticmethod
def find_earliest_crawl(url):
earliest_crawl = (
db.session.query(CrawlURL)
.filter_by(resolves_to=self.url)
.cte(recursive=True)
db.session.query(CrawlURL).filter_by(resolves_to=url).cte(recursive=True)
)

previous_step = db.aliased(earliest_crawl)
Expand All @@ -107,11 +100,10 @@ def find_earliest_crawl(self):
.first()
)

def find_latest_crawl(self):
@staticmethod
def find_latest_crawl(url):
latest_crawl = (
db.session.query(CrawlURL)
.filter_by(resolves_to=self.url)
.cte(recursive=True)
db.session.query(CrawlURL).filter_by(resolves_to=url).cte(recursive=True)
)

previous_step = db.aliased(latest_crawl)
Expand All @@ -125,6 +117,12 @@ def find_latest_crawl(self):
.first()
)


class RecipeURL(BaseURL):
__tablename__ = "recipe_urls"

recipe_scrapers_version = db.Column(db.String)

def _make_request(self):
response = httpx.post(
url="http://crawler-service/crawl",
Expand Down
8 changes: 2 additions & 6 deletions reciperadar/workers/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,17 +132,13 @@ def crawl_recipe(url):
"""

# Find any more-recent crawls of this URL, allowing detection of duplicates
latest_crawl = recipe_url.find_latest_crawl()
latest_crawl = CrawlURL.find_latest_crawl(recipe_url.url)
if not latest_crawl:
print(f"Failed to find latest crawl for url={url}")
return

latest_recipe_url = db.session.get(RecipeURL, latest_crawl.url) or RecipeURL(
url=latest_crawl.url
)

# Find the first-known crawl for the latest URL, and consider it the origin
earliest_crawl = latest_recipe_url.find_earliest_crawl()
earliest_crawl = CrawlURL.find_earliest_crawl(latest_crawl.url)
if not earliest_crawl:
print(f"Failed to find earliest crawl for url={url}")
return
Expand Down
9 changes: 4 additions & 5 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ def test_crawl_url_timeline(db_session):
for step in path:
db_session.add(step)

recipe = RecipeURL(url="//example.org/C")
earliest_crawl = recipe.find_earliest_crawl()
latest_crawl = recipe.find_latest_crawl()
url = "//example.org/C"
earliest_crawl = CrawlURL.find_earliest_crawl(url)
latest_crawl = CrawlURL.find_latest_crawl(url)

assert earliest_crawl.url == "//example.org/A"
assert latest_crawl.url == "//example.org/D"
Expand Down Expand Up @@ -91,8 +91,7 @@ def test_crawl_url_relocation_stability(utcnow_mock, db_session, respx_mock):
url.crawl()
db_session.add(url)

recipe_url = RecipeURL(url=to_url)
origin = recipe_url.find_earliest_crawl()
origin = CrawlURL.find_earliest_crawl(to_url)
origin_urls.add(origin.url)

assert len(origin_urls) == 1

0 comments on commit 1b1e951

Please sign in to comment.