From 13324c598c47d065d4a11aa6b930678adf3d4a90 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:27:20 +0800 Subject: [PATCH 1/5] feat: add match config multiple pattern --- config.ts | 30 ++++++++++++++++++++++++++++++ package-lock.json | 4 +++- package.json | 1 + src/config.ts | 30 +++++++++++++++++++++++++++--- src/core.ts | 47 +++++++++++++++++++++++++++++++++++++---------- 5 files changed, 98 insertions(+), 14 deletions(-) diff --git a/config.ts b/config.ts index bc2d22e0..18894f80 100644 --- a/config.ts +++ b/config.ts @@ -6,3 +6,33 @@ export const defaultConfig: Config = { maxPagesToCrawl: 50, outputFileName: "output.json", }; + + +// const treeEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/tree/main' +// const blobEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/blob/main' + +// export const defaultConfig: Config = { +// url: "https://github.com/BuilderIO/gpt-crawler/tree/main", +// match: [ +// { +// // skip the pattern you do not want to crawl +// // pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", +// pattern: `${treeEndPointUrl}/**`, +// skip: true +// }, +// { +// // speical case for .md +// // for .md, we need to crawl the raw content in the .markdown-body selector +// // pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md', +// pattern: `${blobEndPointUrl}/**/*.md`, +// selector: '.markdown-body' +// }, +// { +// // other files like .js, .ts, .json, etc +// pattern: `${blobEndPointUrl}/**`, +// selector: '#read-only-cursor-text-area' +// }, +// ], +// maxPagesToCrawl: 50, +// outputFileName: "output.json", +// }; diff --git a/package-lock.json b/package-lock.json index aed0f5ac..f787b4ec 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "cross-env": "^7.0.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "minimatch": "^9.0.3", "playwright": "*", "prettier": "^3.1.0", "zod": "^3.22.4" @@ -4382,7 +4383,8 @@ }, "node_modules/minimatch": { "version": "9.0.3", - "license": "ISC", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dependencies": { "brace-expansion": "^2.0.1" }, diff --git a/package.json b/package.json index 9a31f847..ce7be4df 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "cross-env": "^7.0.3", "glob": "^10.3.10", "inquirer": "^9.2.12", + "minimatch": "^9.0.3", "playwright": "*", "prettier": "^3.1.0", "zod": "^3.22.4" diff --git a/src/config.ts b/src/config.ts index 3a28886f..314a4541 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,6 +4,29 @@ import type { Page } from "playwright"; const Page: z.ZodType = z.any(); +export const OriginMatch = z.string().or(z.array(z.string())) + +export const PatternMatch = z.array(z.object({ + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /* + * Skip to grap this for this pattern + * @default false + */ + skip: z.boolean().optional() +})) + export const configSchema = z.object({ /** * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap @@ -17,8 +40,8 @@ export const configSchema = z.object({ * @example 
"https://www.builder.io/c/docs/**" * @default "" */ - match: z.string().or(z.array(z.string())), - + match: OriginMatch.or(PatternMatch) + , /** * Selector to grab the inner text from * @example ".docs-builder-container" @@ -61,4 +84,5 @@ export const configSchema = z.object({ }); export type Config = z.infer; - +export type PatternMatchType = z.infer; +export type OriginMatchType = z.infer; diff --git a/src/core.ts b/src/core.ts index 278686be..439653ea 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,9 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import {Config, configSchema} from "./config.js"; +import {Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType} from "./config.js"; import { Page } from "playwright"; +import { minimatch } from 'minimatch' let pageCounter = 0; @@ -23,7 +24,7 @@ export function getPageHtml(page: Page, selector = "body") { } else { // Handle as a CSS selector const el = document.querySelector(selector) as HTMLElement | null; - return el?.innerText || ""; + return el?.innerText || el?.innerHTML || ""; } }, selector); } @@ -70,8 +71,36 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...` ); + let globs: string | string[] = [] + // Use custom handling for XPath selector - if (config.selector) { + if (PatternMatch.safeParse(config.match).success) { + const matchPattern = config.match as PatternMatchType + globs = matchPattern.map(s => s.pattern) + const matchedPattern = matchPattern.find((match) => { + return minimatch(request.url, match.pattern); + }) + if (matchedPattern && !matchedPattern.skip) { + const selector = matchedPattern?.selector || 'body'; + if (selector.startsWith("/")) { + await waitForXPath( + page, + selector, + config.waitForSelectorTimeout ?? 1000 + ); + } else { + await page.waitForSelector(selector, { + timeout: config.waitForSelectorTimeout ?? 1000, + }); + } + const html = await getPageHtml(page, selector); + + // Save results as JSON to ./storage/datasets/default + await pushData({ title, url: request.loadedUrl, html }); + } + } else if (OriginMatch.safeParse(config.match).success && config.selector) { + const match = config.match as OriginMatchType + globs = typeof match === "string" ? [match] : match if (config.selector.startsWith("/")) { await waitForXPath( page, @@ -83,12 +112,11 @@ export async function crawl(config: Config) { timeout: config.waitForSelectorTimeout ?? 1000, }); } - } + const html = await getPageHtml(page, config.selector); - const html = await getPageHtml(page, config.selector); - - // Save results as JSON to ./storage/datasets/default - await pushData({ title, url: request.loadedUrl, html }); + // Save results as JSON to ./storage/datasets/default + await pushData({ title, url: request.loadedUrl, html }); + } if (config.onVisitPage) { await config.onVisitPage({ page, pushData }); @@ -97,8 +125,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs: - typeof config.match === "string" ? [config.match] : config.match, + globs }); }, // Comment this option to scrape the full website. 
From 5c505c72fa42b48d59046a092aa5ea3fce4d957f Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:29:09 +0800 Subject: [PATCH 2/5] fix: typo --- src/config.ts | 3 +-- src/core.ts | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/config.ts b/src/config.ts index 314a4541..9a733d30 100644 --- a/src/config.ts +++ b/src/config.ts @@ -40,8 +40,7 @@ export const configSchema = z.object({ * @example "https://www.builder.io/c/docs/**" * @default "" */ - match: OriginMatch.or(PatternMatch) - , + match: OriginMatch.or(PatternMatch), /** * Selector to grab the inner text from * @example ".docs-builder-container" diff --git a/src/core.ts b/src/core.ts index 439653ea..c67cbffd 100644 --- a/src/core.ts +++ b/src/core.ts @@ -73,7 +73,6 @@ export async function crawl(config: Config) { let globs: string | string[] = [] - // Use custom handling for XPath selector if (PatternMatch.safeParse(config.match).success) { const matchPattern = config.match as PatternMatchType globs = matchPattern.map(s => s.pattern) @@ -82,6 +81,7 @@ export async function crawl(config: Config) { }) if (matchedPattern && !matchedPattern.skip) { const selector = matchedPattern?.selector || 'body'; + // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, @@ -101,6 +101,7 @@ export async function crawl(config: Config) { } else if (OriginMatch.safeParse(config.match).success && config.selector) { const match = config.match as OriginMatchType globs = typeof match === "string" ? [match] : match + // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( page, From 368cc96cf463629f7856a7b631d3825182c35b48 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Thu, 30 Nov 2023 17:57:10 +0800 Subject: [PATCH 3/5] fix: readme --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 53d07eef..f089c4aa 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,11 @@ type Config = { /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ url: string; /** Pattern to match against for links on a page to subsequently crawl */ - match: string; + match: string | string[] | { + pattern: string; + selector?: string | undefined; // Selector to grab the inner text from + skip?: boolean | undefined; // Whether skip to not grab any content from this pattern + }[]; /** Selector to grab the inner text from */ selector: string; /** Don't crawl more than this many pages */ From f620176c488ed181a62f14f3f672344b5633a87a Mon Sep 17 00:00:00 2001 From: FTAndy Date: Fri, 8 Dec 2023 15:32:14 +0800 Subject: [PATCH 4/5] feat: update config.ts to improve flexibility and performance - Change the `match` property in the `Config` type to accept an array of string or string[] - Remove unused code in `config.ts` - Update the `PatternMatch` schema in `config.ts` to include a new property `skip` - Modify the default config values in `config.ts` - Update the `OriginMatch` schema in `config.ts` to accept an array of strings - Fix a typo in the README.md file Signed-off-by: FTAndy --- README.md | 2 +- config.ts | 32 +------------------------------- src/config.ts | 13 +++++++++---- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 4aeb6a6b..55db7170 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ type Config = { url: string; /** Pattern to match against for links on a page to subsequently crawl */ match: 
string | string[] | { - pattern: string; + pattern: string; // url glob expressions from https://github.com/isaacs/minimatch selector?: string | undefined; // Selector to grab the inner text from skip?: boolean | undefined; // Whether skip to not grab any content from this pattern }[]; diff --git a/config.ts b/config.ts index 18894f80..8dbe5516 100644 --- a/config.ts +++ b/config.ts @@ -5,34 +5,4 @@ export const defaultConfig: Config = { match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, outputFileName: "output.json", -}; - - -// const treeEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/tree/main' -// const blobEndPointUrl = 'https://github.com/BuilderIO/gpt-crawler/blob/main' - -// export const defaultConfig: Config = { -// url: "https://github.com/BuilderIO/gpt-crawler/tree/main", -// match: [ -// { -// // skip the pattern you do not want to crawl -// // pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", -// pattern: `${treeEndPointUrl}/**`, -// skip: true -// }, -// { -// // speical case for .md -// // for .md, we need to crawl the raw content in the .markdown-body selector -// // pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md', -// pattern: `${blobEndPointUrl}/**/*.md`, -// selector: '.markdown-body' -// }, -// { -// // other files like .js, .ts, .json, etc -// pattern: `${blobEndPointUrl}/**`, -// selector: '#read-only-cursor-text-area' -// }, -// ], -// maxPagesToCrawl: 50, -// outputFileName: "output.json", -// }; +}; \ No newline at end of file diff --git a/src/config.ts b/src/config.ts index 23f3b297..04d5c327 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,6 +4,11 @@ import type { Page } from "playwright"; const Page: z.ZodType = z.any(); +/** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @default "" + */ export const OriginMatch = z.string().or(z.array(z.string())) export const PatternMatch = z.array(z.object({ @@ -20,10 +25,10 @@ export const PatternMatch = z.array(z.object({ * @default "body" */ selector: z.string().optional(), - /* - * Skip to grap this for this pattern - * @default false - */ + /** + * Skip to grap inner text for this pattern + * @default false + */ skip: z.boolean().optional() })) From 9d184b9c7b1470abf7cea3e458335f219276c900 Mon Sep 17 00:00:00 2001 From: FTAndy Date: Fri, 8 Dec 2023 16:39:10 +0800 Subject: [PATCH 5/5] refactor: refactor core functionality for improved readability - Modify the `README.md` file: - Change the `match` property type to accept an array of strings. - Modify the `src/config.ts` file: - Change the `OriginMatch` property type to accept an array of strings. - Change the `PatternMatch` property type to accept an array of objects. - Modify the `src/core.ts` file: - Add import statements for `minimatch`, `Config`, `PatternMatch`, and `OriginMatch`. - Modify the `crawl` function: - Change the `globs` variable declaration to include a semicolon at the end. - Change the condition for checking `matchedPattern` to use the optional chaining operator. - Add a missing semicolon in the `page.waitForSelector` call. - Move the code inside the `else if` condition to a separate block for better readability. 
Signed-off-by: FTAndy --- README.md | 13 ++++++++----- config.ts | 2 +- src/config.ts | 44 +++++++++++++++++++++++--------------------- src/core.ts | 36 +++++++++++++++++++++++------------- 4 files changed, 55 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 55db7170..165bce8e 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,14 @@ type Config = { /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ url: string; /** Pattern to match against for links on a page to subsequently crawl */ - match: string | string[] | { - pattern: string; // url glob expressions from https://github.com/isaacs/minimatch - selector?: string | undefined; // Selector to grab the inner text from - skip?: boolean | undefined; // Whether skip to not grab any content from this pattern - }[]; + match: + | string + | string[] + | { + pattern: string; // url glob expressions from https://github.com/isaacs/minimatch + selector?: string | undefined; // Selector to grab the inner text from + skip?: boolean | undefined; // Whether skip to not grab any content from this pattern + }[]; /** Selector to grab the inner text from */ selector: string; /** Don't crawl more than this many pages */ diff --git a/config.ts b/config.ts index 8dbe5516..bc2d22e0 100644 --- a/config.ts +++ b/config.ts @@ -5,4 +5,4 @@ export const defaultConfig: Config = { match: "https://www.builder.io/c/docs/**", maxPagesToCrawl: 50, outputFileName: "output.json", -}; \ No newline at end of file +}; diff --git a/src/config.ts b/src/config.ts index 04d5c327..2195a661 100644 --- a/src/config.ts +++ b/src/config.ts @@ -9,28 +9,30 @@ const Page: z.ZodType = z.any(); * @example "https://www.builder.io/c/docs/**" * @default "" */ -export const OriginMatch = z.string().or(z.array(z.string())) +export const OriginMatch = z.string().or(z.array(z.string())); -export const PatternMatch = z.array(z.object({ - /** - * Pattern to match against for links on a page to subsequently crawl - * @example "https://www.builder.io/c/docs/**" - * @refer https://github.com/isaacs/minimatch - * @default "" - */ - pattern: z.string(), - /** - * Selector to grab the inner text from, limited to pattern - * @example ".docs-builder-container" - * @default "body" - */ - selector: z.string().optional(), - /** - * Skip to grap inner text for this pattern - * @default false - */ - skip: z.boolean().optional() -})) +export const PatternMatch = z.array( + z.object({ + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /** + * Skip to grap inner text for this pattern + * @default false + */ + skip: z.boolean().optional(), + }), +); export const configSchema = z.object({ /** diff --git a/src/core.ts b/src/core.ts index 7b13045a..150df333 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,15 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import { minimatch } from 'minimatch' -import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js"; +import { minimatch } from "minimatch"; +import { + Config, + configSchema, + PatternMatch, 
+ PatternMatchType, + OriginMatch, + OriginMatchType, +} from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; @@ -72,22 +79,22 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, ); - let globs: string | string[] = [] + let globs: string | string[] = []; if (PatternMatch.safeParse(config.match).success) { - const matchPattern = config.match as PatternMatchType - globs = matchPattern.map(s => s.pattern) + const matchPattern = config.match as PatternMatchType; + globs = matchPattern.map((s) => s.pattern); const matchedPattern = matchPattern.find((match) => { return minimatch(request.url, match.pattern); - }) + }); if (matchedPattern && !matchedPattern.skip) { - const selector = matchedPattern?.selector || 'body'; + const selector = matchedPattern?.selector || "body"; // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, selector, - config.waitForSelectorTimeout ?? 1000 + config.waitForSelectorTimeout ?? 1000, ); } else { await page.waitForSelector(selector, { @@ -95,13 +102,16 @@ export async function crawl(config: Config) { }); } const html = await getPageHtml(page, selector); - + // Save results as JSON to ./storage/datasets/default await pushData({ title, url: request.loadedUrl, html }); } - } else if (OriginMatch.safeParse(config.match).success && config.selector) { - const match = config.match as OriginMatchType - globs = typeof match === "string" ? [match] : match + } else if ( + OriginMatch.safeParse(config.match).success && + config.selector + ) { + const match = config.match as OriginMatchType; + globs = typeof match === "string" ? [match] : match; // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( @@ -127,7 +137,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs + globs, }); }, // Comment this option to scrape the full website.
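After the final patch, requestHandler picks a selector per URL as follows: if `config.match` parses as a PatternMatch array, every `pattern` is passed as a glob to enqueueLinks, the first entry whose pattern minimatch-es the current URL decides the selector, and entries marked `skip` are still crawled for links but never pushed to the dataset. The standalone sketch below illustrates just that selection step; `pickSelector` is a hypothetical helper written for illustration, not part of the patch, which performs the equivalent logic inline.

```ts
// Standalone sketch of the pattern-selection step added to src/core.ts.
// `pickSelector` is a hypothetical helper for illustration only; the patch
// runs the equivalent find/skip/fallback logic inline in requestHandler.
import { minimatch } from "minimatch";

type PatternEntry = { pattern: string; selector?: string; skip?: boolean };

function pickSelector(url: string, match: PatternEntry[]): string | null {
  // First entry whose glob matches the URL wins, as in core.ts.
  const matched = match.find((m) => minimatch(url, m.pattern));
  if (!matched || matched.skip) return null; // no match or skipped: nothing is scraped
  return matched.selector || "body"; // core.ts falls back to "body"
}

const match: PatternEntry[] = [
  { pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**", skip: true },
  { pattern: "https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md", selector: ".markdown-body" },
  { pattern: "https://github.com/BuilderIO/gpt-crawler/blob/main/**", selector: "#read-only-cursor-text-area" },
];

console.log(pickSelector("https://github.com/BuilderIO/gpt-crawler/blob/main/README.md", match)); // ".markdown-body"
console.log(pickSelector("https://github.com/BuilderIO/gpt-crawler/tree/main/src", match));       // null (skip: true)
```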