From 91663ea5a7eb58167516d5c07ccba724ad6dd1b6 Mon Sep 17 00:00:00 2001 From: arvinxx Date: Fri, 18 Aug 2023 00:59:01 +0800 Subject: [PATCH] :fire: refactor: remove webCrawler --- plugins/index.ts | 2 -- plugins/webCrawler/index.ts | 22 ------------------ plugins/webCrawler/runner.ts | 45 ------------------------------------ plugins/webCrawler/type.ts | 35 ---------------------------- 4 files changed, 104 deletions(-) delete mode 100644 plugins/webCrawler/index.ts delete mode 100644 plugins/webCrawler/runner.ts delete mode 100644 plugins/webCrawler/type.ts diff --git a/plugins/index.ts b/plugins/index.ts index aab679e..8358a2e 100644 --- a/plugins/index.ts +++ b/plugins/index.ts @@ -1,8 +1,6 @@ import { PluginItem } from '../types/pluginItem'; import searchEngine from './searchEngine'; -import webCrawler from './webCrawler'; export const PluginsMap: Record = { [searchEngine.name]: searchEngine, - [webCrawler.name]: webCrawler, }; diff --git a/plugins/webCrawler/index.ts b/plugins/webCrawler/index.ts deleted file mode 100644 index 55e6e6a..0000000 --- a/plugins/webCrawler/index.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { PluginItem } from '../../types/pluginItem'; -import runner from './runner'; -import { Result } from './type'; - -const schema = { - description: '提取网页内容并总结', - name: 'websiteCrawler', - parameters: { - properties: { - url: { - description: '网页内容', - type: 'string', - }, - }, - required: ['url'], - type: 'object', - }, -}; - -const getWeather: PluginItem = { avatar: '🕸', name: 'websiteCrawler', runner, schema }; - -export default getWeather; diff --git a/plugins/webCrawler/runner.ts b/plugins/webCrawler/runner.ts deleted file mode 100644 index 084458c..0000000 --- a/plugins/webCrawler/runner.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { PluginRunner } from '../../types/pluginItem'; -import { ParserResponse, Result } from './type'; - -const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io'; -const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN; - -// service from: https://github.com/lobehub/html-parser/tree/master -const HTML_PARSER_URL = process.env.HTML_PARSER_URL; - -const runner: PluginRunner<{ url: string }, Result> = async ({ url }) => { - const input = { - gotoOptions: { waitUntil: 'networkidle2' }, - url, - }; - - try { - const res = await fetch(`${BASE_URL}/content?token=${BROWSERLESS_TOKEN}`, { - body: JSON.stringify(input), - headers: { - 'Content-Type': 'application/json', - }, - method: 'POST', - }); - const html = await res.text(); - - const parserBody = { html, url }; - - const parseRes = await fetch(`${HTML_PARSER_URL}`, { - body: JSON.stringify(parserBody), - headers: { - 'Content-Type': 'application/json', - }, - method: 'POST', - }); - - const { title, textContent, siteName } = (await parseRes.json()) as ParserResponse; - - return { content: textContent, title, url, website: siteName }; - } catch (error) { - console.error(error); - return { content: '抓取失败', errorMessage: (error as any).message, url }; - } -}; - -export default runner; diff --git a/plugins/webCrawler/type.ts b/plugins/webCrawler/type.ts deleted file mode 100644 index 946fb61..0000000 --- a/plugins/webCrawler/type.ts +++ /dev/null @@ -1,35 +0,0 @@ -export type Result = { - content: string; - title?: string; - url: string; - website?: string; -}; - -export interface ParserResponse { - /** author metadata */ - byline: string; - - /** HTML string of processed article content */ - content: string; - - /** content direction */ - dir: string; - - /** article description, or short excerpt from the content */ - excerpt: string; - - /** content language */ - lang: string; - - /** length of an article, in characters */ - length: number; - - /** name of the site */ - siteName: string; - - /** text content of the article, with all the HTML tags removed */ - textContent: string; - - /** article title */ - title: string; -}