diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 35e3c59..bb5f2d4 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -53,6 +53,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
         self.browser = None
         self.hooks = {
             'on_browser_created': None,
+            'on_page_created': None,
             'on_user_agent_updated': None,
             'on_execution_started': None,
             'before_goto': None,
@@ -178,6 +179,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
                 )
                 await context.set_extra_http_headers(self.headers)
                 page = await context.new_page()
+                page = await self.execute_hook('on_page_created', page)
                 self.sessions[session_id] = (context, page, time.time())
         else:
             context = await self.browser.new_context(
@@ -186,6 +188,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
             )
             await context.set_extra_http_headers(self.headers)
             page = await context.new_page()
+            page = await self.execute_hook('on_page_created', page)
 
         try:
             if self.verbose:
@@ -355,6 +358,7 @@ async def crawl_with_semaphore(url):
     async def take_screenshot(self, url: str) -> str:
         async with await self.browser.new_context(user_agent=self.user_agent) as context:
             page = await context.new_page()
+            page = await self.execute_hook('on_page_created', page)
             try:
                 await page.goto(url, wait_until="domcontentloaded")
                 screenshot = await page.screenshot(full_page=True)
diff --git a/tests/async/test_crawler_strategy.py b/tests/async/test_crawler_strategy.py
index a507058..d3e049d 100644
--- a/tests/async/test_crawler_strategy.py
+++ b/tests/async/test_crawler_strategy.py
@@ -53,6 +53,37 @@ async def test_hook(page):
     assert result.success
     assert "background-color: red" in result.html
 
+@pytest.mark.asyncio
+async def test_response_inspection():
+    responses = []
+
+    async def log_response(response):
+        responses.append({
+            'url': response.url,
+            'status': response.status,
+            'headers': response.headers
+        })
+
+    async def on_page_created(page):
+        page.on("response", log_response)
+        return page
+
+    crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
+    crawler_strategy.set_hook("on_page_created", on_page_created)
+
+    async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler:
+        url = "https://httpbin.org/get"
+        result = await crawler.arun(url=url, bypass_cache=True)
+
+        assert result.success
+        assert len(responses) > 0
+
+        main_response = next((r for r in responses if r['url'] == url), None)
+        assert main_response is not None
+        assert main_response['status'] == 200
+        assert 'content-type' in main_response['headers']
+        assert main_response['headers']['content-type'] == 'application/json'
+
 @pytest.mark.asyncio
 async def test_screenshot():
     async with AsyncWebCrawler(verbose=True) as crawler: