diff --git a/README.md b/README.md
index 43bfe4c7..165bce8e 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string;
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // URL glob expression from https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content for this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
diff --git a/package-lock.json b/package-lock.json
index 37ef208c..14a06b80 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -16,6 +16,7 @@
         "glob": "^10.3.10",
         "gpt-tokenizer": "^2.1.2",
         "inquirer": "^9.2.12",
+        "minimatch": "^9.0.3",
         "playwright": "*",
         "zod": "^3.22.4"
       },
diff --git a/package.json b/package.json
index 7ff8cc5d..b85718a5 100644
--- a/package.json
+++ b/package.json
@@ -13,6 +13,7 @@
     "glob": "^10.3.10",
     "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
+    "minimatch": "^9.0.3",
     "playwright": "*",
     "zod": "^3.22.4"
   },
diff --git a/src/config.ts b/src/config.ts
index 7e5f5fbf..2195a661 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -4,6 +4,36 @@ import type { Page } from "playwright";
 
 const Page: z.ZodType<Page> = z.any();
 
+/**
+ * Pattern to match against for links on a page to subsequently crawl
+ * @example "https://www.builder.io/c/docs/**"
+ * @default ""
+ */
+export const OriginMatch = z.string().or(z.array(z.string()));
+
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
+     * @see https://github.com/isaacs/minimatch
+     * @default ""
+     */
+    pattern: z.string(),
+    /**
+     * Selector to grab the inner text from, limited to this pattern
+     * @example ".docs-builder-container"
+     * @default "body"
+     */
+    selector: z.string().optional(),
+    /**
+     * Whether to skip grabbing the inner text for this pattern
+     * @default false
+     */
+    skip: z.boolean().optional(),
+  }),
+);
+
 export const configSchema = z.object({
   /**
    * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
@@ -17,8 +47,7 @@ export const configSchema = z.object({
    * @example "https://www.builder.io/c/docs/**"
    * @default ""
    */
-  match: z.string().or(z.array(z.string())),
-
+  match: OriginMatch.or(PatternMatch),
   /**
    * Selector to grab the inner text from
    * @example ".docs-builder-container"
@@ -73,3 +102,5 @@ export const configSchema = z.object({
 });
 
 export type Config = z.infer<typeof configSchema>;
+export type PatternMatchType = z.infer<typeof PatternMatch>;
+export type OriginMatchType = z.infer<typeof OriginMatch>;
diff --git a/src/core.ts b/src/core.ts
index 8e03bbe5..150df333 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -2,7 +2,15 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { Config, configSchema } from "./config.js";
+import { minimatch } from "minimatch";
+import {
+  Config,
+  configSchema,
+  PatternMatch,
+  PatternMatchType,
+  OriginMatch,
+  OriginMatchType,
+} from "./config.js";
 import { Page } from "playwright";
 import { isWithinTokenLimit } from "gpt-tokenizer";
@@ -24,7 +32,7 @@ export function getPageHtml(page: Page, selector = "body") {
     } else {
       // Handle as a CSS selector
       const el = document.querySelector(selector) as HTMLElement | null;
-      return el?.innerText || "";
+      return el?.innerText || el?.innerHTML || "";
     }
   }, selector);
 }
@@ -71,8 +79,40 @@ export async function crawl(config: Config) {
           `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
         );
 
-        // Use custom handling for XPath selector
-        if (config.selector) {
+        let globs: string | string[] = [];
+
+        if (PatternMatch.safeParse(config.match).success) {
+          const matchPattern = config.match as PatternMatchType;
+          globs = matchPattern.map((s) => s.pattern);
+          const matchedPattern = matchPattern.find((match) => {
+            return minimatch(request.url, match.pattern);
+          });
+          if (matchedPattern && !matchedPattern.skip) {
+            const selector = matchedPattern?.selector || "body";
+            // Use custom handling for XPath selector
+            if (selector.startsWith("/")) {
+              await waitForXPath(
+                page,
+                selector,
+                config.waitForSelectorTimeout ?? 1000,
+              );
+            } else {
+              await page.waitForSelector(selector, {
+                timeout: config.waitForSelectorTimeout ?? 1000,
+              });
+            }
+            const html = await getPageHtml(page, selector);
+
+            // Save results as JSON to ./storage/datasets/default
+            await pushData({ title, url: request.loadedUrl, html });
+          }
+        } else if (
+          OriginMatch.safeParse(config.match).success &&
+          config.selector
+        ) {
+          const match = config.match as OriginMatchType;
+          globs = typeof match === "string" ? [match] : match;
+          // Use custom handling for XPath selector
           if (config.selector.startsWith("/")) {
             await waitForXPath(
               page,
@@ -84,12 +124,11 @@
               timeout: config.waitForSelectorTimeout ?? 1000,
             });
           }
-        }
+          const html = await getPageHtml(page, config.selector);
 
-        const html = await getPageHtml(page, config.selector);
-
-        // Save results as JSON to ./storage/datasets/default
-        await pushData({ title, url: request.loadedUrl, html });
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
+        }
 
         if (config.onVisitPage) {
           await config.onVisitPage({ page, pushData });
@@ -98,8 +137,7 @@
         // Extract links from the current page
         // and add them to the crawling queue.
         await enqueueLinks({
-          globs:
-            typeof config.match === "string" ? [config.match] : config.match,
+          globs,
         });
       },
       // Comment this option to scrape the full website.
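
With this change, `match` still accepts a string or string array, and additionally an array of `{ pattern, selector?, skip? }` entries so one crawl can pair different selectors with different minimatch globs. Below is a minimal sketch of a config that exercises the new shape, assuming the repository's root `config.ts` / `defaultConfig` convention; the URLs, selectors, and import path are illustrative placeholders, not part of this diff:

```ts
import { Config } from "./src/config";

// Sketch only: URLs and selectors are placeholders.
export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: [
    {
      // minimatch glob; pages matching it are scraped with this selector
      pattern: "https://www.builder.io/c/docs/**",
      selector: ".docs-builder-container",
    },
    {
      // No selector given, so the crawler falls back to "body"
      pattern: "https://www.builder.io/blog/**",
    },
    {
      // Still used for link discovery, but no content is saved for these pages
      pattern: "https://www.builder.io/c/docs/archive/**",
      skip: true,
    },
  ],
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
```

Note that every `pattern` is added to the `enqueueLinks` globs, so a `skip: true` entry keeps matching pages in the crawl frontier while suppressing their output.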