Fix crawling error in AsyncWebCrawler

Related to unclecode#105 Fix the 'NoneType' object has no attribute 'get' error in `AsyncWebCrawler`. * **crawl4ai/async_webcrawler.py** - Add a check in the `arun` method to ensure `html` is not `None` before further processing. - Raise a descriptive error if `html` is `None`. * **crawl4ai/async_crawler_strategy.py** - Add a check in the `crawl` method of the `AsyncPlaywrightCrawlerStrategy` class to handle cases where `html` is `None`. - Raise a descriptive error if `html` is `None`. * **tests/async/test_basic_crawling.py** - Add a test case to verify handling of `None` values for the `html` variable in the `test_invalid_url` function. * **tests/async/test_error_handling.py** - Add a test case to verify handling of `None` values for the `html` variable in the `test_network_error` function. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/unclecode/crawl4ai/issues/105?shareId=XXXX-XXXX-XXXX-XXXX).
theguy000 · Oct 3, 2024 · 3ec7de9 · 3ec7de9
1 parent 4750810
commit 3ec7de9
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 75 deletions.
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
@@ -230,7 +230,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
 
         try:
             if self.verbose:
-                print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
+                print(f"[LOG] � Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
 
             if self.use_cached_html:
                 cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
@@ -296,6 +296,8 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
                     raise RuntimeError(f"Wait condition failed: {str(e)}")
 
             html = await page.content()
+            if html is None:
+                raise ValueError(f"Failed to crawl {url}: HTML content is None")
             page = await self.execute_hook('before_return_html', page, html)
 
             if self.verbose:
@@ -404,4 +406,4 @@ async def take_screenshot(self, url: str) -> str:
                 img.save(buffered, format="JPEG")
                 return base64.b64encode(buffered.getvalue()).decode('utf-8')
             finally:
-                await page.close()
+                await page.close()
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
@@ -45,7 +45,7 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
 
     async def awarmup(self):
         if self.verbose:
-            print("[LOG] 🌤️  Warming up the AsyncWebCrawler")
+            print("[LOG] �  Warming up the AsyncWebCrawler")
         await async_db_manager.ainit_db()
         await self.arun(
             url="https://google.com/",
@@ -55,7 +55,7 @@ async def awarmup(self):
         )
         self.ready = True
         if self.verbose:
-            print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
+            print("[LOG] � AsyncWebCrawler is ready to crawl")
 
     async def arun(
         self,
@@ -108,9 +108,12 @@ async def arun(
                 t2 = time.time()
                 if verbose:
                     print(
-                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                        f"[LOG] � Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                     )
 
+            if html is None:
+                raise ValueError(f"Failed to crawl {url}: HTML content is None")
+
             crawl_result = await self.aprocess_html(
                 url,
                 html,
@@ -133,7 +136,7 @@ async def arun(
         except Exception as e:
             if not hasattr(e, "msg"):
                 e.msg = str(e)
-            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+            print(f"[ERROR] � Failed to crawl {url}, error: {e.msg}")
             return CrawlResult(url=url, html="", success=False, error_message=e.msg)
 
     async def arun_many(
@@ -148,7 +151,7 @@ async def arun_many(
         user_agent: str = None,
         verbose=True,
         **kwargs,
-    ) -> List[CrawlResult]:
+    ) -> List<CrawlResult]:
         tasks = [
             self.arun(
                 url,
@@ -198,7 +201,7 @@ async def aprocess_html(
             )
             if verbose:
                 print(
-                    f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
+                    f"[LOG] � Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
                 )
 
             if result is None:
@@ -217,7 +220,7 @@ async def aprocess_html(
         if extracted_content is None and extraction_strategy and chunking_strategy:
             if verbose:
                 print(
-                    f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
+                    f"[LOG] � Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
                 )
 
             # Check if extraction strategy is type of JsonCssExtractionStrategy
@@ -232,7 +235,7 @@ async def aprocess_html(
 
         if verbose:
             print(
-                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
+                f"[LOG] � Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
             )
 
         screenshot = None if not screenshot else screenshot

diff --git a/tests/async/test_basic_crawling.py b/tests/async/test_basic_crawling.py
@@ -28,6 +28,7 @@ async def test_invalid_url():
         result = await crawler.arun(url=url, bypass_cache=True)
         assert not result.success
         assert result.error_message
+        assert "HTML content is None" in result.error_message
 
 @pytest.mark.asyncio
 async def test_multiple_urls():
@@ -78,4 +79,4 @@ async def test_concurrent_crawling_performance():
 
 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
diff --git a/tests/async/test_error_handling.py b/tests/async/test_error_handling.py
@@ -1,78 +1,79 @@
-# import os
-# import sys
-# import pytest
-# import asyncio
+import os
+import sys
+import pytest
+import asyncio
 
-# # Add the parent directory to the Python path
-# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-# sys.path.append(parent_dir)
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
 
-# from crawl4ai.async_webcrawler import AsyncWebCrawler
-# from crawl4ai.utils import InvalidCSSSelectorError
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.utils import InvalidCSSSelectorError
 
-# class AsyncCrawlerWrapper:
-#     def __init__(self):
-#         self.crawler = None
+class AsyncCrawlerWrapper:
+    def __init__(self):
+        self.crawler = None
 
-#     async def setup(self):
-#         self.crawler = AsyncWebCrawler(verbose=True)
-#         await self.crawler.awarmup()
+    async def setup(self):
+        self.crawler = AsyncWebCrawler(verbose=True)
+        await self.crawler.awarmup()
 
-#     async def cleanup(self):
-#         if self.crawler:
-#             await self.crawler.aclear_cache()
+    async def cleanup(self):
+        if self.crawler:
+            await self.crawler.aclear_cache()
 
-# @pytest.fixture(scope="module")
-# def crawler_wrapper():
-#     wrapper = AsyncCrawlerWrapper()
-#     asyncio.get_event_loop().run_until_complete(wrapper.setup())
-#     yield wrapper
-#     asyncio.get_event_loop().run_until_complete(wrapper.cleanup())
+@pytest.fixture(scope="module")
+def crawler_wrapper():
+    wrapper = AsyncCrawlerWrapper()
+    asyncio.get_event_loop().run_until_complete(wrapper.setup())
+    yield wrapper
+    asyncio.get_event_loop().run_until_complete(wrapper.cleanup())
+
+@pytest.mark.asyncio
+async def test_network_error(crawler_wrapper):
+    url = "https://www.nonexistentwebsite123456789.com"
+    result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
+    assert not result.success
+    assert "Failed to crawl" in result.error_message
+    assert "HTML content is None" in result.error_message
 
 # @pytest.mark.asyncio
-# async def test_network_error(crawler_wrapper):
-#     url = "https://www.nonexistentwebsite123456789.com"
-#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
+# async def test_timeout_error(crawler_wrapper):
+#     # Simulating a timeout by using a very short timeout value
+#     url = "https://www.nbcnews.com/business"
+#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
 #     assert not result.success
-#     assert "Failed to crawl" in result.error_message
-
-# # @pytest.mark.asyncio
-# # async def test_timeout_error(crawler_wrapper):
-# #     # Simulating a timeout by using a very short timeout value
-# #     url = "https://www.nbcnews.com/business"
-# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
-# #     assert not result.success
-# #     assert "timeout" in result.error_message.lower()
+#     assert "timeout" in result.error_message.lower()
 
-# # @pytest.mark.asyncio
-# # async def test_invalid_css_selector(crawler_wrapper):
-# #     url = "https://www.nbcnews.com/business"
-# #     with pytest.raises(InvalidCSSSelectorError):
-# #         await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")
+# @pytest.mark.asyncio
+# async def test_invalid_css_selector(crawler_wrapper):
+#     url = "https://www.nbcnews.com/business"
+#     with pytest.raises(InvalidCSSSelectorError):
+#         await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")
 
-# # @pytest.mark.asyncio
-# # async def test_js_execution_error(crawler_wrapper):
-# #     url = "https://www.nbcnews.com/business"
-# #     invalid_js = "This is not valid JavaScript code;"
-# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
-# #     assert not result.success
-# #     assert "JavaScript" in result.error_message
+# @pytest.mark.asyncio
+# async def test_js_execution_error(crawler_wrapper):
+#     url = "https://www.nbcnews.com/business"
+#     invalid_js = "This is not valid JavaScript code;"
+#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
+#     assert not result.success
+#     assert "JavaScript" in result.error_message
 
-# # @pytest.mark.asyncio
-# # async def test_empty_page(crawler_wrapper):
-# #     # Use a URL that typically returns an empty page
-# #     url = "http://example.com/empty"
-# #     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
-# #     assert result.success  # The crawl itself should succeed
-# #     assert not result.markdown.strip()  # The markdown content should be empty or just whitespace
+# @pytest.mark.asyncio
+# async def test_empty_page(crawler_wrapper):
+#     # Use a URL that typically returns an empty page
+#     url = "http://example.com/empty"
+#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
+#     assert result.success  # The crawl itself should succeed
+#     assert not result.markdown.strip()  # The markdown content should be empty or just whitespace
 
-# # @pytest.mark.asyncio
-# # async def test_rate_limiting(crawler_wrapper):
-# #     # Simulate rate limiting by making multiple rapid requests
-# #     url = "https://www.nbcnews.com/business"
-# #     results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
-# #     assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)
+# @pytest.mark.asyncio
+# async def test_rate_limiting(crawler_wrapper):
+#     # Simulate rate limiting by making multiple rapid requests
+#     url = "https://www.nbcnews.com/business"
+#     results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
+#     assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)
 
-# # Entry point for debugging
-# if __name__ == "__main__":
-#     pytest.main([__file__, "-v"])
+# Entry point for debugging
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])