Skip to content

Commit

Permalink
feat: Customizing the selector can sometimes cause the web crawler to fail
Browse files Browse the repository at this point in the history
  • Loading branch information
kukuze committed Feb 29, 2024
1 parent 4f799c5 commit b7ce8d1
Showing 1 changed file with 16 additions and 22 deletions.
38 changes: 16 additions & 22 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,34 +65,28 @@ export async function crawl(config: Config) {
);

// Use custom handling for XPath selector
let html: string | null = null; // Initialize to null (or the empty string "")
if (config.selector) {
if (config.selector.startsWith("/")) {
await waitForXPath(
page,
config.selector,
config.waitForSelectorTimeout ?? 1000,
);
html = await getPageHtml(page, config.selector);
} else {
try {
// Try waiting for the CSS selector, catching any exception it may throw
await page.waitForSelector(config.selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
html = await getPageHtml(page, config.selector);
} catch (error) {
// If waiting for the CSS selector fails, log a message and wait for <body> instead
console.log(
`CSS Selector "${config.selector}" not found. Waiting for <body> instead.`,
let effectiveSelector = config.selector;
if (effectiveSelector) {
try {
if (effectiveSelector.startsWith("/")) {
await waitForXPath(
page,
effectiveSelector,
config.waitForSelectorTimeout ?? 1000,
);
await page.waitForSelector("body", {
} else {
await page.waitForSelector(effectiveSelector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
html = await getPageHtml(page, "body");
}
} catch (error) {
console.log(
`Selector "${config.selector}" not found. Defaulting to <body>.`,
);
effectiveSelector = undefined;
}
}
const html = await getPageHtml(page, effectiveSelector);
// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });

Expand Down

0 comments on commit b7ce8d1

Please sign in to comment.