From af64584d2de7f9cde49d7be7398ed557bf6a2546 Mon Sep 17 00:00:00 2001 From: Aure7138 Date: Sun, 21 Jul 2024 02:10:17 +0800 Subject: [PATCH 1/3] push commit --- config.ts | 1 + src/config.ts | 4 ++++ src/core.ts | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/config.ts b/config.ts index f5c958df..6a24846b 100644 --- a/config.ts +++ b/config.ts @@ -6,4 +6,5 @@ export const defaultConfig: Config = { maxPagesToCrawl: 50, outputFileName: "output.json", maxTokens: 2000000, + // proxyUrls: ["http://username:password@proxyserver:port"], // socks5://username:password@proxyserver:port }; diff --git a/src/config.ts b/src/config.ts index 787744ce..0e4f0159 100644 --- a/src/config.ts +++ b/src/config.ts @@ -85,6 +85,10 @@ export const configSchema = z.object({ * @example 5000 */ maxTokens: z.number().int().positive().optional(), + /** Optional proxy server + * @example ['http://username:password@proxyserver:port', 'socks5://username:password@proxyserver:port'] + */ + proxyUrls: z.array(z.string()).optional(), }); export type Config = z.infer<typeof configSchema>; diff --git a/src/core.ts b/src/core.ts index c996f2bb..f533d776 100644 --- a/src/core.ts +++ b/src/core.ts @@ -1,5 +1,5 @@ // For more information, see https://crawlee.dev/ -import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee"; +import { Configuration, PlaywrightCrawler, ProxyConfiguration, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; import { Config, configSchema } from "./config.js"; @@ -54,8 +54,13 @@ export async function crawl(config: Config) { if (process.env.NO_CRAWL !== "true") { // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. + const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: config.proxyUrls, + }); + crawler = new PlaywrightCrawler( { + proxyConfiguration, // Use the requestHandler to process each of the crawled pages. 
async requestHandler({ request, page, enqueueLinks, log, pushData }) { const title = await page.title(); From 7f1fd214c411d83ec409ce7185736d0f85e205f2 Mon Sep 17 00:00:00 2001 From: Aure7138 Date: Sat, 3 Aug 2024 20:51:12 +0800 Subject: [PATCH 2/3] Prettier --- CHANGELOG.md | 3 +-- src/core.ts | 9 +++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac73b4e7..4ab0556d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,8 @@ # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05) - ### Features -* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) +- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15) diff --git a/src/core.ts b/src/core.ts index f533d776..9c709988 100644 --- a/src/core.ts +++ b/src/core.ts @@ -1,5 +1,10 @@ // For more information, see https://crawlee.dev/ -import { Configuration, PlaywrightCrawler, ProxyConfiguration, downloadListOfUrls } from "crawlee"; +import { + Configuration, + PlaywrightCrawler, + ProxyConfiguration, + downloadListOfUrls, +} from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; import { Config, configSchema } from "./config.js"; @@ -101,7 +106,7 @@ export async function crawl(config: Config) { exclude: typeof config.exclude === "string" ? [config.exclude] - : config.exclude ?? [], + : (config.exclude ?? []), }); }, // Comment this option to scrape the full website. 
From cf03b8ab0a1e4ac8d92477a31b5c9498073d70bb Mon Sep 17 00:00:00 2001 From: Aure7138 Date: Sat, 3 Aug 2024 20:57:12 +0800 Subject: [PATCH 3/3] Prettier --- src/core.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.ts b/src/core.ts index 9c709988..2e19c4e0 100644 --- a/src/core.ts +++ b/src/core.ts @@ -106,7 +106,7 @@ export async function crawl(config: Config) { exclude: typeof config.exclude === "string" ? [config.exclude] - : (config.exclude ?? []), + : config.exclude ?? [], }); }, // Comment this option to scrape the full website.