Log settings changes and ignores #221

Open · wants to merge 1 commit into master
Log settings changes and ignores
JustAnotherArchivist committed May 7, 2022
commit 3e62eb4f014eae2ebca8b266f6f91876d1485513
libgrabsite/wpull_hooks.py: 24 changes (23 additions, 1 deletion)
@@ -9,6 +9,7 @@
 import traceback
 import asyncio
 import urllib.parse
+import logging
 
 from wpull.application.hook import Actions
 from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
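
A note on the new import: logging.getLogger("grab_site.wpull_plugin") (added in the next hunk) returns a child of the "grab_site" logger, so these messages propagate to whatever handlers the parent has. A minimal sketch of a parent configuration that would capture them; the handler, file name, and format here are illustrative assumptions, not grab-site's actual logging setup:

	import logging

	# Assumed setup: attach a file handler to the parent "grab_site" logger;
	# records from the child "grab_site.wpull_plugin" propagate up to it.
	parent = logging.getLogger("grab_site")
	parent.setLevel(logging.INFO)
	handler = logging.FileHandler("grab-site.log", encoding="utf-8")
	handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
	parent.addHandler(handler)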
@@ -146,13 +147,15 @@ def activate(self):
 		self.loop = asyncio.get_event_loop()
 		self.enable_stdio_capture()
 		self.add_signal_handlers()
+		self.logger = logging.getLogger("grab_site.wpull_plugin")
 		self.init_job_data()
 		self.init_ws()
 		self.setup_watchers()
 		self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
 		self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
 		self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
 		self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
+		self.compiled_ignores = []
 		self.update_ignores()
 		super().activate()
 
@@ -255,6 +258,7 @@ def update_max_content_length(self):
 			return
 		with open(self.watchers["max_content_length"].fname, "r") as f:
 			self.job_data["max_content_length"] = int(f.read().strip())
+		self.logger.info(f"Settings change: max_content_length = {self.job_data['max_content_length']}")
 
 	@swallow_exception
 	def update_delay(self):
@@ -266,6 +270,8 @@ def update_delay(self):
 			self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
 		else:
 			self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
+		max_string = f"-{self.job_data['delay_max']}" if self.job_data["delay_min"] != self.job_data["delay_max"] else ""
+		self.logger.info(f"Settings change: delay = {self.job_data['delay_min']}{max_string}")
 
 	@swallow_exception
 	def update_concurrency(self):
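
The two added lines reconstruct the user-facing "min-max" form from the parsed values, so a range logs as e.g. "delay = 1250-3000" and a fixed delay as "delay = 500". A standalone sketch of the round trip; parse_delay and its "-" test are illustrative, since the branch condition sits just outside this hunk:

	def parse_delay(content):
		# "1250-3000" -> (1250, 3000); "500" -> (500, 500)
		if "-" in content:
			lo, hi = (int(s) for s in content.split("-", 1))
			return lo, hi
		n = int(content)
		return n, n

	delay_min, delay_max = parse_delay("1250-3000")
	max_string = f"-{delay_max}" if delay_min != delay_max else ""
	print(f"Settings change: delay = {delay_min}{max_string}")  # delay = 1250-3000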
@@ -278,6 +284,7 @@ def update_concurrency(self):
 			concurrency = 1
 		self.job_data["concurrency"] = concurrency
 		self.app_session.factory["PipelineSeries"].concurrency = concurrency
+		self.logger.info(f"Settings change: concurrency = {concurrency}")
 
 	stop_path = cf("stop")
 	def should_stop(self):
@@ -298,6 +305,9 @@ def update_video(self):
 	@swallow_exception
 	def update_scrape(self):
 		scrape = path_exists_with_cache(self.scrape_path)
+		if scrape == self.job_data["scrape"]:
+			return
+		self.logger.info(f"Settings change: scrape = {scrape}")
 		self.job_data["scrape"] = scrape
 		if not scrape:
 			# Empty the list of scrapers, which will stop scraping for new URLs
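
The early return added here also acts as change detection: update_scrape() can be invoked when nothing actually changed, and without the guard each invocation would emit a duplicate "Settings change" line and re-run the scraper teardown below. The same pattern in isolation, with illustrative names:

	def update_setting(state, key, new_value, log):
		if new_value == state[key]:
			return  # unchanged: no log line, no side effects
		log(f"Settings change: {key} = {new_value}")
		state[key] = new_value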
@@ -329,6 +339,15 @@ def update_ignores(self):
 		for ig in sorted(ignores):
 			self.print_to_terminal(f"\t{ig}")
 
+		# Log changes
+		old_ignores = set(x[0] for x in self.compiled_ignores)
+		added_ignores = ignores - old_ignores
+		removed_ignores = old_ignores - ignores
+		for ig in added_ignores:
+			self.logger.info(f"Adding ignore: {ig}")
+		for ig in removed_ignores:
+			self.logger.info(f"Removing ignore: {ig}")
+
 		self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
 		self.combined_ignore_regexp = compile_combined_regexp(ignores)
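
The added/removed logging is a plain set difference against the previously compiled patterns, which is why activate() now initializes self.compiled_ignores = [] before the first update_ignores() call: the first run diffs against an empty set and logs every initial pattern as added. The diff step in isolation, with made-up patterns:

	old_ignores = {r"^https?://example\.com/ads/", r"\.exe$"}
	new_ignores = {r"\.exe$", r"\.iso$"}

	added_ignores = new_ignores - old_ignores    # {'\\.iso$'}
	removed_ignores = old_ignores - new_ignores  # {'^https?://example\\.com/ads/'}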

@@ -366,7 +385,9 @@ def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
 		if should_ignore:
 			if not self.job_data["suppress_ignore_reports"]:
 				pattern = self.get_specific_ignore_pattern(url)
-				self.maybe_log_ignore(url, pattern)
+			else:
+				pattern = "[ignore pattern match]"
+			self.maybe_log_ignore(url, pattern)
 			return False
 
 		# If we get here, none of our ignores apply. Return the original verdict.
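
Note the restructuring here: when ignore reports are suppressed there is no need to look up which specific pattern matched, so a generic "[ignore pattern match]" placeholder is used instead, and maybe_log_ignore() now runs unconditionally. That is what lets every ignore reach the log (next hunk) even while terminal reports stay suppressed.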
@@ -405,6 +426,7 @@ def handle_result(self, url_info, record_info, error_info, response):
 		return Actions.NORMAL
 
 	def maybe_log_ignore(self, url, pattern):
+		self.logger.info(f"Ignoring ‘{url}’: {pattern}")
 		if not self.job_data["suppress_ignore_reports"]:
 			self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
 		self.put_ws_queue({
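
End to end, each ignored URL now produces one INFO record on grab_site.wpull_plugin regardless of suppress_ignore_reports. With Python's default "%(levelname)s:%(name)s:%(message)s" record format (what it actually looks like depends on how grab-site's handlers are configured), an illustrative entry would render as:

	INFO:grab_site.wpull_plugin:Ignoring ‘https://example.com/index.php?action=edit’: \?action=edit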