From 13324c598c47d065d4a11aa6b930678adf3d4a90 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:27:20 +0800 Subject: [PATCH 1/5] feat: add match config multiple pattern --- config.ts | 30 ++++++++++++++++++++++++++++++ package-lock.json | 4 +++- package.json | 1 + src/config.ts | 30 +++++++++++++++++++++++++++--- src/core.ts | 47 +++++++++++++++++++++++++++++++++++++---------- 5 files changed, 98 insertions(+), 14 deletions(-) diff --git a/config.ts b/config.ts index bc2d22e0..18894f80 100644 --- a/config.ts +++ b/config.ts @@ -6,3 +6,33 @@ export const defaultConfig: Config = { maxPagesToCrawl: 50, outputFileName: "output.json", }; + + +// const treeEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/tree/main' +// const blobEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/blob/main' + +// export const defaultConfig: Config = { +// url: "https://github.com/BuilderIO/gpt-crawler/tree/main", +// match: [ +// { +// // skip the pattern you do not want to crawl +// // pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", +// pattern: `${treeEndPointUrl}/**`, +// skip: true +// }, +// { +// // speical case for .md +// // for .md, we need to crawl the raw content in the .markdown-body selector +// // pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md', +// pattern: `${blobEndPointUrl}/**/*.md`, +// selector: '.markdown-body' +// }, +// { +// // other files like .js, .ts, .json, etc +// pattern: `${blobEndPointUrl}/**`, +// selector: '#read-only-cursor-text-area' +// }, +// ], +// maxPagesToCrawl: 50, +// outputFileName: "output.json", +// }; diff --git a/package-lock.json b/package-lock.json index aed0f5ac..f787b4ec 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "cross-env": "^7.0.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "minimatch": "^9.0.3", "playwright": "*", "prettier": "^3.1.0", "zod": "^3.22.4" @@ -4382,7 +4383,8 @@ }, "node_modules/minimatch": { "version": "9.0.3", - "license": "ISC", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dependencies": { "brace-expansion": "^2.0.1" }, diff --git a/package.json b/package.json index 9a31f847..ce7be4df 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "cross-env": "^7.0.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "minimatch": "^9.0.3", "playwright": "*", "prettier": "^3.1.0", "zod": "^3.22.4" diff --git a/src/config.ts b/src/config.ts index 3a28886f..314a4541 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,6 +4,29 @@ import type { Page } from "playwright"; const Page: z.ZodType = z.any(); +export const OriginMatch = z.string().or(z.array(z.string())) + +export const PatternMatch = z.array(z.object({ + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /* + * Skip to grap this for this pattern + * @default false + */ + skip: z.boolean().optional() +})) + export const configSchema = z.object({ /** * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap @@ -17,8 +40,8 @@ export const configSchema = z.object({ * @example 
"https://www.builder.io/c/docs/**" * @default "" */ - match: z.string().or(z.array(z.string())), - + match: OriginMatch.or(PatternMatch) + , /** * Selector to grab the inner text from * @example ".docs-builder-container" @@ -61,4 +84,5 @@ export const configSchema = z.object({ }); export type Config = z.infer; - +export type PatternMatchType = z.infer; +export type OriginMatchType = z.infer; diff --git a/src/core.ts b/src/core.ts index 278686be..439653ea 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,9 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import {Config, configSchema} from "./config.js"; +import {Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType} from "./config.js"; import { Page } from "playwright"; +import { minimatch } from 'minimatch' let pageCounter = 0; @@ -23,7 +24,7 @@ export function getPageHtml(page: Page, selector = "body") { } else { // Handle as a CSS selector const el = document.querySelector(selector) as HTMLElement | null; - return el?.innerText || ""; + return el?.innerText || el?.innerHTML || ""; } }, selector); } @@ -70,8 +71,36 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...` ); + let globs: string | string[] = [] + // Use custom handling for XPath selector - if (config.selector) { + if (PatternMatch.safeParse(config.match).success) { + const matchPattern = config.match as PatternMatchType + globs = matchPattern.map(s => s.pattern) + const matchedPattern = matchPattern.find((match) => { + return minimatch(request.url, match.pattern); + }) + if (matchedPattern && !matchedPattern.skip) { + const selector = matchedPattern?.selector || 'body'; + if (selector.startsWith("/")) { + await waitForXPath( + page, + selector, + config.waitForSelectorTimeout ?? 1000 + ); + } else { + await page.waitForSelector(selector, { + timeout: config.waitForSelectorTimeout ?? 1000, + }); + } + const html = await getPageHtml(page, selector); + + // Save results as JSON to ./storage/datasets/default + await pushData({ title, url: request.loadedUrl, html }); + } + } else if (OriginMatch.safeParse(config.match).success && config.selector) { + const match = config.match as OriginMatchType + globs = typeof match === "string" ? [match] : match if (config.selector.startsWith("/")) { await waitForXPath( page, @@ -83,12 +112,11 @@ export async function crawl(config: Config) { timeout: config.waitForSelectorTimeout ?? 1000, }); } - } + const html = await getPageHtml(page, config.selector); - const html = await getPageHtml(page, config.selector); - - // Save results as JSON to ./storage/datasets/default - await pushData({ title, url: request.loadedUrl, html }); + // Save results as JSON to ./storage/datasets/default + await pushData({ title, url: request.loadedUrl, html }); + } if (config.onVisitPage) { await config.onVisitPage({ page, pushData }); @@ -97,8 +125,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs: - typeof config.match === "string" ? [config.match] : config.match, + globs }); }, // Comment this option to scrape the full website. 
From 5c505c72fa42b48d59046a092aa5ea3fce4d957f Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:29:09 +0800 Subject: [PATCH 2/5] fix: typo --- src/config.ts | 3 +-- src/core.ts | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/config.ts b/src/config.ts index 314a4541..9a733d30 100644 --- a/src/config.ts +++ b/src/config.ts @@ -40,8 +40,7 @@ export const configSchema = z.object({ * @example "https://www.builder.io/c/docs/**" * @default "" */ - match: OriginMatch.or(PatternMatch) - , + match: OriginMatch.or(PatternMatch), /** * Selector to grab the inner text from * @example ".docs-builder-container" diff --git a/src/core.ts b/src/core.ts index 439653ea..c67cbffd 100644 --- a/src/core.ts +++ b/src/core.ts @@ -73,7 +73,6 @@ export async function crawl(config: Config) { let globs: string | string[] = [] - // Use custom handling for XPath selector if (PatternMatch.safeParse(config.match).success) { const matchPattern = config.match as PatternMatchType globs = matchPattern.map(s => s.pattern) @@ -82,6 +81,7 @@ export async function crawl(config: Config) { }) if (matchedPattern && !matchedPattern.skip) { const selector = matchedPattern?.selector || 'body'; + // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, @@ -101,6 +101,7 @@ export async function crawl(config: Config) { } else if (OriginMatch.safeParse(config.match).success && config.selector) { const match = config.match as OriginMatchType globs = typeof match === "string" ? [match] : match + // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( page, From 368cc96cf463629f7856a7b631d3825182c35b48 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:57:10 +0800 Subject: [PATCH 3/5] fix: readme --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 53d07eef..f089c4aa 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,11 @@ type Config = { /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ url: string; /** Pattern to match against for links on a page to subsequently crawl */ - match: string; + match: string | string[] | { + pattern: string; + selector?: string | undefined; // Selector to grab the inner text from + skip?: boolean | undefined; // Whether skip to not grab any content from this pattern + }[]; /** Selector to grab the inner text from */ selector: string; /** Don't crawl more than this many pages */ From f620176c488ed181a62f14f3f672344b5633a87a Mon Sep 17 00:00:00 2001 From: FTAndy Date: Fri, 8 Dec 2023 15:32:14 +0800 Subject: [PATCH 4/5] feat: update config.ts to improve flexibility and performance - Change the `match` property in the `Config` type to accept an array of string or string[] - Remove unused code in `config.ts` - Update the `PatternMatch` schema in `config.ts` to include a new property `skip` - Modify the default config values in `config.ts` - Update the `OriginMatch` schema in `config.ts` to accept an array of strings - Fix a typo in the README.md file Signed-off-by: FTAndy --- README.md | 2 +- config.ts | 32 +------------------------------- src/config.ts | 13 +++++++++---- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 4aeb6a6b..55db7170 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ type Config = { url: string; /** Pattern to match against for links on a page to subsequently crawl */ match: 
string | string[] | { - pattern: string; + pattern: string; // url glob expressions from https://github.com/isaacs/minimatch selector?: string | undefined; // Selector to grab the inner text from skip?: boolean | undefined; // Whether skip to not grab any content from this pattern }[]; diff --git a/config.ts b/config.ts index 18894f80..8dbe5516 100644 --- a/config.ts +++ b/config.ts @@ -5,34 +5,4 @@ export const defaultConfig: Config = { match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, outputFileName: "output.json", -}; - - -// const treeEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/tree/main' -// const blobEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/blob/main' - -// export const defaultConfig: Config = { -// url: "https://github.com/BuilderIO/gpt-crawler/tree/main", -// match: [ -// { -// // skip the pattern you do not want to crawl -// // pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", -// pattern: `${treeEndPointUrl}/**`, -// skip: true -// }, -// { -// // speical case for .md -// // for .md, we need to crawl the raw content in the .markdown-body selector -// // pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md', -// pattern: `${blobEndPointUrl}/**/*.md`, -// selector: '.markdown-body' -// }, -// { -// // other files like .js, .ts, .json, etc -// pattern: `${blobEndPointUrl}/**`, -// selector: '#read-only-cursor-text-area' -// }, -// ], -// maxPagesToCrawl: 50, -// outputFileName: "output.json", -// }; +}; \ No newline at end of file diff --git a/src/config.ts b/src/config.ts index 23f3b297..04d5c327 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,6 +4,11 @@ import type { Page } from "playwright"; const Page: z.ZodType = z.any(); +/** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @default "" + */ export const OriginMatch = z.string().or(z.array(z.string())) export const PatternMatch = z.array(z.object({ @@ -20,10 +25,10 @@ export const PatternMatch = z.array(z.object({ * @default "body" */ selector: z.string().optional(), - /* - * Skip to grap this for this pattern - * @default false - */ + /** + * Skip to grap inner text for this pattern + * @default false + */ skip: z.boolean().optional() })) From 9d184b9c7b1470abf7cea3e458335f219276c900 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Fri, 8 Dec 2023 16:39:10 +0800 Subject: [PATCH 5/5] refactor: refactor core functionality for improved readability - Modify the `README.md` file: - Change the `match` property type to accept an array of strings. - Modify the `src/config.ts` file: - Change the `OriginMatch` property type to accept an array of strings. - Change the `PatternMatch` property type to accept an array of objects. - Modify the `src/core.ts` file: - Add import statements for `minimatch`, `Config`, `PatternMatch`, and `OriginMatch`. - Modify the `crawl` function: - Change the `globs` variable declaration to include a semicolon at the end. - Change the condition for checking `matchedPattern` to use the optional chaining operator. - Add a missing semicolon in the `page.waitForSelector` call. - Move the code inside the `else if` condition to a separate block for better readability. 
Signed-off-by: FTAndy --- README.md | 13 ++++++++----- config.ts | 2 +- src/config.ts | 44 +++++++++++++++++++++++--------------------- src/core.ts | 36 +++++++++++++++++++++++------------- 4 files changed, 55 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 55db7170..165bce8e 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,14 @@ type Config = { /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ url: string; /** Pattern to match against for links on a page to subsequently crawl */ - match: string | string[] | { - pattern: string; // url glob expressions from https://github.com/isaacs/minimatch - selector?: string | undefined; // Selector to grab the inner text from - skip?: boolean | undefined; // Whether skip to not grab any content from this pattern - }[]; + match: + | string + | string[] + | { + pattern: string; // url glob expressions from https://github.com/isaacs/minimatch + selector?: string | undefined; // Selector to grab the inner text from + skip?: boolean | undefined; // Whether skip to not grab any content from this pattern + }[]; /** Selector to grab the inner text from */ selector: string; /** Don't crawl more than this many pages */ diff --git a/config.ts b/config.ts index 8dbe5516..bc2d22e0 100644 --- a/config.ts +++ b/config.ts @@ -5,4 +5,4 @@ export const defaultConfig: Config = { match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, outputFileName: "output.json", -}; \ No newline at end of file +}; diff --git a/src/config.ts b/src/config.ts index 04d5c327..2195a661 100644 --- a/src/config.ts +++ b/src/config.ts @@ -9,28 +9,30 @@ const Page: z.ZodType = z.any(); * @example "https://www.builder.io/c/docs/**" * @default "" */ -export const OriginMatch = z.string().or(z.array(z.string())) +export const OriginMatch = z.string().or(z.array(z.string())); -export const PatternMatch = z.array(z.object({ - /** - * Pattern to match against for links on a page to subsequently crawl - * @example "https://www.builder.io/c/docs/**" - * @refer https://github.com/isaacs/minimatch - * @default "" - */ - pattern: z.string(), - /** - * Selector to grab the inner text from, limited to pattern - * @example ".docs-builder-container" - * @default "body" - */ - selector: z.string().optional(), - /** - * Skip to grap inner text for this pattern - * @default false - */ - skip: z.boolean().optional() -})) +export const PatternMatch = z.array( + z.object({ + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /** + * Skip to grap inner text for this pattern + * @default false + */ + skip: z.boolean().optional(), + }), +); export const configSchema = z.object({ /** diff --git a/src/core.ts b/src/core.ts index 7b13045a..150df333 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,15 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import { minimatch } from 'minimatch' -import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js"; +import { minimatch } from "minimatch"; +import { + Config, + configSchema, + PatternMatch, 
+ PatternMatchType, + OriginMatch, + OriginMatchType, +} from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; @@ -72,22 +79,22 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, ); - let globs: string | string[] = [] + let globs: string | string[] = []; if (PatternMatch.safeParse(config.match).success) { - const matchPattern = config.match as PatternMatchType - globs = matchPattern.map(s => s.pattern) + const matchPattern = config.match as PatternMatchType; + globs = matchPattern.map((s) => s.pattern); const matchedPattern = matchPattern.find((match) => { return minimatch(request.url, match.pattern); - }) + }); if (matchedPattern && !matchedPattern.skip) { - const selector = matchedPattern?.selector || 'body'; + const selector = matchedPattern?.selector || "body"; // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, selector, - config.waitForSelectorTimeout ?? 1000 + config.waitForSelectorTimeout ?? 1000, ); } else { await page.waitForSelector(selector, { @@ -95,13 +102,16 @@ export async function crawl(config: Config) { }); } const html = await getPageHtml(page, selector); - + // Save results as JSON to ./storage/datasets/default await pushData({ title, url: request.loadedUrl, html }); } - } else if (OriginMatch.safeParse(config.match).success && config.selector) { - const match = config.match as OriginMatchType - globs = typeof match === "string" ? [match] : match + } else if ( + OriginMatch.safeParse(config.match).success && + config.selector + ) { + const match = config.match as OriginMatchType; + globs = typeof match === "string" ? [match] : match; // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( @@ -127,7 +137,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs + globs, }); }, // Comment this option to scrape the full website.
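After the final patch, requestHandler picks a selector per URL as follows: if `config.match` parses as a PatternMatch array, every `pattern` is passed as a glob to enqueueLinks, the first entry whose pattern minimatch-es the current URL decides the selector, and entries marked `skip` are still crawled for links but never pushed to the dataset. The standalone sketch below illustrates just that selection step; `pickSelector` is a hypothetical helper written for illustration, not part of the patch, which performs the equivalent logic inline.

```ts
// Standalone sketch of the pattern-selection step added to src/core.ts.
// `pickSelector` is a hypothetical helper for illustration only; the patch
// runs the equivalent find/skip/fallback logic inline in requestHandler.
import { minimatch } from "minimatch";

type PatternEntry = { pattern: string; selector?: string; skip?: boolean };

function pickSelector(url: string, match: PatternEntry[]): string | null {
  // First entry whose glob matches the URL wins, as in core.ts.
  const matched = match.find((m) => minimatch(url, m.pattern));
  if (!matched || matched.skip) return null; // no match or skipped: nothing is scraped
  return matched.selector || "body"; // core.ts falls back to "body"
}

const match: PatternEntry[] = [
  { pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", skip: true },
  { pattern: "https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md", selector: ".markdown-body" },
  { pattern: "https://github.com/BuilderIO/gpt-crawler/blob/main/**", selector: "#read-only-cursor-text-area" },
];

console.log(pickSelector("https://github.com/BuilderIO/gpt-crawler/blob/main/README.md", match)); // ".markdown-body"
console.log(pickSelector("https://github.com/BuilderIO/gpt-crawler/tree/main/src", match));       // null (skip: true)
```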