-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat: Multiple Match Pattern Config; Pattern Avoid; Grab Content with innerHTML Compatible #97
base: main
Are you sure you want to change the base?
Changes from 4 commits
13324c5
5c505c7
368cc96
b615473
f620176
9d184b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,8 @@ | |
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; | ||
import { readFile, writeFile } from "fs/promises"; | ||
import { glob } from "glob"; | ||
import { Config, configSchema } from "./config.js"; | ||
import { minimatch } from 'minimatch' | ||
import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js"; | ||
import { Page } from "playwright"; | ||
import { isWithinTokenLimit } from "gpt-tokenizer"; | ||
|
||
|
@@ -24,7 +25,7 @@ export function getPageHtml(page: Page, selector = "body") { | |
} else { | ||
// Handle as a CSS selector | ||
const el = document.querySelector(selector) as HTMLElement | null; | ||
return el?.innerText || ""; | ||
return el?.innerText || el?.innerHTML || ""; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It occurs to me that this could introduce some undesired content into the crawler output, since it would also cover cases such as grabbing script contents and whitespace. For example: <div>
<script>alert("Hello!");</script>
</div> Would grab: I'm likely not seeing the whole picture, though. Can you provide some practical examples you intended to cover with these changes, @FTAndy? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @marcelovicentegc Yes, when I grab code block content from the tag I think the API There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do think the |
||
} | ||
}, selector); | ||
} | ||
|
@@ -71,8 +72,37 @@ export async function crawl(config: Config) { | |
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, | ||
); | ||
|
||
// Use custom handling for XPath selector | ||
if (config.selector) { | ||
let globs: string | string[] = [] | ||
|
||
if (PatternMatch.safeParse(config.match).success) { | ||
const matchPattern = config.match as PatternMatchType | ||
globs = matchPattern.map(s => s.pattern) | ||
const matchedPattern = matchPattern.find((match) => { | ||
return minimatch(request.url, match.pattern); | ||
}) | ||
if (matchedPattern && !matchedPattern.skip) { | ||
const selector = matchedPattern?.selector || 'body'; | ||
// Use custom handling for XPath selector | ||
if (selector.startsWith("/")) { | ||
await waitForXPath( | ||
page, | ||
selector, | ||
config.waitForSelectorTimeout ?? 1000 | ||
); | ||
} else { | ||
await page.waitForSelector(selector, { | ||
timeout: config.waitForSelectorTimeout ?? 1000, | ||
}); | ||
} | ||
const html = await getPageHtml(page, selector); | ||
|
||
// Save results as JSON to ./storage/datasets/default | ||
await pushData({ title, url: request.loadedUrl, html }); | ||
} | ||
} else if (OriginMatch.safeParse(config.match).success && config.selector) { | ||
const match = config.match as OriginMatchType | ||
globs = typeof match === "string" ? [match] : match | ||
// Use custom handling for XPath selector | ||
if (config.selector.startsWith("/")) { | ||
await waitForXPath( | ||
page, | ||
|
@@ -84,12 +114,11 @@ export async function crawl(config: Config) { | |
timeout: config.waitForSelectorTimeout ?? 1000, | ||
}); | ||
} | ||
} | ||
const html = await getPageHtml(page, config.selector); | ||
|
||
const html = await getPageHtml(page, config.selector); | ||
|
||
// Save results as JSON to ./storage/datasets/default | ||
await pushData({ title, url: request.loadedUrl, html }); | ||
// Save results as JSON to ./storage/datasets/default | ||
await pushData({ title, url: request.loadedUrl, html }); | ||
} | ||
|
||
if (config.onVisitPage) { | ||
await config.onVisitPage({ page, pushData }); | ||
|
@@ -98,8 +127,7 @@ export async function crawl(config: Config) { | |
// Extract links from the current page | ||
// and add them to the crawling queue. | ||
await enqueueLinks({ | ||
globs: | ||
typeof config.match === "string" ? [config.match] : config.match, | ||
globs | ||
}); | ||
}, | ||
// Comment this option to scrape the full website. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey @FTAndy, thanks for this PR! Is this commented code useful?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the test code for the new `match` parameter option, because the project has no test process. Should I delete it, or just put this example in the README?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed