Skip to content

Commit

Permalink
Merge pull request #170 from Aure7138/proxy
Browse files Browse the repository at this point in the history
feat: proxy support
  • Loading branch information
steve8708 authored Aug 6, 2024
2 parents b62a002 + cf03b8a commit 73dfaef
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
3 changes: 1 addition & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05)


### Features

* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))

# [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)

Expand Down
1 change: 1 addition & 0 deletions config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ export const defaultConfig: Config = {
maxPagesToCrawl: 50,
outputFileName: "output.json",
maxTokens: 2000000,
// proxyUrls: ["http://username:password@proxyserver:port", "socks5://username:password@proxyserver:port"],
};
4 changes: 4 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ export const configSchema = z.object({
* @example 5000
*/
maxTokens: z.number().int().positive().optional(),
/** Optional list of proxy server URLs to route crawler requests through
 * @example ["http://username:password@proxyserver:port", "socks5://username:password@proxyserver:port"]
 */
proxyUrls: z.array(z.string()).optional(),
});

export type Config = z.infer<typeof configSchema>;
12 changes: 11 additions & 1 deletion src/core.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
// For more information, see https://crawlee.dev/
import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import {
Configuration,
PlaywrightCrawler,
ProxyConfiguration,
downloadListOfUrls,
} from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
Expand Down Expand Up @@ -54,8 +59,13 @@ export async function crawl(config: Config) {
if (process.env.NO_CRAWL !== "true") {
// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const proxyConfiguration = new ProxyConfiguration({
proxyUrls: config.proxyUrls,
});

crawler = new PlaywrightCrawler(
{
proxyConfiguration,
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
const title = await page.title();
Expand Down

0 comments on commit 73dfaef

Please sign in to comment.