Merge pull request #148 from kaibadash/issues/147
fix: #147 Set `purgeOnStart: true` to process multiple sites as a server
marcelovicentegc authored Feb 26, 2024
2 parents 6a417bf + 5a2a565 commit 892cd9d
Showing 2 changed files with 80 additions and 76 deletions.
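The crux of the fix: Crawlee persists its default request queue and dataset under ./storage, so when gpt-crawler runs as a long-lived server handling one site after another, a later crawl would otherwise find the previous run's queue already marked as handled and produce stale or empty results. Purging that storage at the start of each run restores the expected one-shot behavior. Below is a minimal sketch of the server-style flow this targets, assuming the crawl and write helpers exported from src/core.ts; the two site configs and output file names are hypothetical.

    import { crawl, write } from "./core.js";

    // Two hypothetical sites crawled back to back by the same process.
    const configs = [
      {
        url: "https://example.com/docs",
        match: "https://example.com/docs/**",
        maxPagesToCrawl: 50,
        outputFileName: "example-com.json",
      },
      {
        url: "https://example.org/guide",
        match: "https://example.org/guide/**",
        maxPagesToCrawl: 50,
        outputFileName: "example-org.json",
      },
    ];

    for (const config of configs) {
      // With purgeOnStart: true, each run starts from a clean default
      // storage instead of reusing the previous site's request queue.
      await crawl(config);
      await write(config);
    }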
6 changes: 2 additions & 4 deletions CHANGELOG.md
@@ -1,14 +1,12 @@
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)

-
 ### Bug Fixes

-* linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
-
+- linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))

 ### Features

-* add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
+- add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))

 # [1.3.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.2.1...v1.3.0) (2024-01-06)

150 changes: 78 additions & 72 deletions src/core.ts
@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
+import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
@@ -54,83 +54,89 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
-    crawler = new PlaywrightCrawler({
-      // Use the requestHandler to process each of the crawled pages.
-      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-        const title = await page.title();
-        pageCounter++;
-        log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
-        );
-
-        // Use custom handling for XPath selector
-        if (config.selector) {
-          if (config.selector.startsWith("/")) {
-            await waitForXPath(
-              page,
-              config.selector,
-              config.waitForSelectorTimeout ?? 1000,
-            );
-          } else {
-            await page.waitForSelector(config.selector, {
-              timeout: config.waitForSelectorTimeout ?? 1000,
-            });
-          }
-        }
-
-        const html = await getPageHtml(page, config.selector);
-
-        // Save results as JSON to ./storage/datasets/default
-        await pushData({ title, url: request.loadedUrl, html });
-
-        if (config.onVisitPage) {
-          await config.onVisitPage({ page, pushData });
-        }
-
-        // Extract links from the current page
-        // and add them to the crawling queue.
-        await enqueueLinks({
-          globs:
-            typeof config.match === "string" ? [config.match] : config.match,
-          exclude:
-            typeof config.exclude === "string"
-              ? [config.exclude]
-              : config.exclude ?? [],
-        });
-      },
-      // Comment this option to scrape the full website.
-      maxRequestsPerCrawl: config.maxPagesToCrawl,
-      // Uncomment this option to see the browser window.
-      // headless: false,
-      preNavigationHooks: [
-        // Abort requests for certain resource types
-        async ({ request, page, log }) => {
-          // If there are no resource exclusions, return
-          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
-          if (RESOURCE_EXCLUSTIONS.length === 0) {
-            return;
-          }
-          if (config.cookie) {
-            const cookies = (
-              Array.isArray(config.cookie) ? config.cookie : [config.cookie]
-            ).map((cookie) => {
-              return {
-                name: cookie.name,
-                value: cookie.value,
-                url: request.loadedUrl,
-              };
-            });
-            await page.context().addCookies(cookies);
-          }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
-            route.abort("aborted"),
-          );
-          log.info(
-            `Aborting requests for as this is a resource excluded route`,
-          );
-        },
-      ],
-    });
+    crawler = new PlaywrightCrawler(
+      {
+        // Use the requestHandler to process each of the crawled pages.
+        async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+          const title = await page.title();
+          pageCounter++;
+          log.info(
+            `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          );
+
+          // Use custom handling for XPath selector
+          if (config.selector) {
+            if (config.selector.startsWith("/")) {
+              await waitForXPath(
+                page,
+                config.selector,
+                config.waitForSelectorTimeout ?? 1000,
+              );
+            } else {
+              await page.waitForSelector(config.selector, {
+                timeout: config.waitForSelectorTimeout ?? 1000,
+              });
+            }
+          }
+
+          const html = await getPageHtml(page, config.selector);
+
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
+
+          if (config.onVisitPage) {
+            await config.onVisitPage({ page, pushData });
+          }
+
+          // Extract links from the current page
+          // and add them to the crawling queue.
+          await enqueueLinks({
+            globs:
+              typeof config.match === "string" ? [config.match] : config.match,
+            exclude:
+              typeof config.exclude === "string"
+                ? [config.exclude]
+                : config.exclude ?? [],
+          });
+        },
+        // Comment this option to scrape the full website.
+        maxRequestsPerCrawl: config.maxPagesToCrawl,
+        // Uncomment this option to see the browser window.
+        // headless: false,
+        preNavigationHooks: [
+          // Abort requests for certain resource types
+          async ({ request, page, log }) => {
+            // If there are no resource exclusions, return
+            const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+            if (RESOURCE_EXCLUSTIONS.length === 0) {
+              return;
+            }
+            if (config.cookie) {
+              const cookies = (
+                Array.isArray(config.cookie) ? config.cookie : [config.cookie]
+              ).map((cookie) => {
+                return {
+                  name: cookie.name,
+                  value: cookie.value,
+                  url: request.loadedUrl,
+                };
+              });
+              await page.context().addCookies(cookies);
+            }
+            await page.route(
+              `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`,
+              (route) => route.abort("aborted"),
+            );
+            log.info(
+              `Aborting requests for as this is a resource excluded route`,
+            );
+          },
+        ],
+      },
+      new Configuration({
+        purgeOnStart: true,
+      }),
+    );

   const isUrlASitemap = /sitemap.*\.xml$/.test(config.url);

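For reference, the Crawlee API being used: crawler constructors accept an optional Configuration as their second argument, which scopes settings such as purgeOnStart to that instance (globally, Crawlee reads the CRAWLEE_PURGE_ON_START environment variable). A stripped-down sketch of the pattern, with a placeholder request handler in place of the real one:

    import { Configuration, PlaywrightCrawler } from "crawlee";

    const crawler = new PlaywrightCrawler(
      {
        async requestHandler({ request, log }) {
          // Placeholder handler: just log each visited URL.
          log.info(`Visited ${request.loadedUrl}`);
        },
        maxRequestsPerCrawl: 10,
      },
      // Purge the default request queue and dataset before the run starts.
      new Configuration({ purgeOnStart: true }),
    );

    await crawler.run(["https://example.com"]);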