Skip to content

Commit

Permalink
feat: Add read_website tool to fetch and parse website content
Browse files Browse the repository at this point in the history
  • Loading branch information
onuratakan committed Dec 9, 2024
1 parent e48c1b0 commit f3219ad
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 2 deletions.
2 changes: 1 addition & 1 deletion gpt_computer_assistant/mcp/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,5 +232,5 @@ def websearch():
def mcp_tools():
    """
    Return the cached list of MCP tools, building it on first use.

    The tool list is constructed lazily and memoized in the module-level
    ``the_tools_`` global so repeated calls do not re-create the tools.
    """
    global the_tools_
    if the_tools_ is None:
        # Only the filesystem tools are enabled here; memory_tool() and
        # fetch() were intentionally dropped from the default set.
        the_tools_ = file_system_tool()
    return the_tools_
54 changes: 53 additions & 1 deletion gpt_computer_assistant/standard_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,59 @@ def register_tool(func):
return func



@register_tool
@wrapper
def read_website(url: str, max_content_length: int = 5000, timeout: float = 15.0) -> dict:
    """
    Fetch a web page and return its title, metadata, visible text, and links.

    Args:
        url: Absolute URL of the page to fetch.
        max_content_length: Maximum number of characters of page text to
            return; longer text is truncated at a word boundary with "...".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        On success, a dict with keys ``meta`` (selected meta-tag values),
        ``title`` (page title or ""), ``content`` (whitespace-normalized
        visible text), and ``sub_links`` (list of ``{"title", "link"}``
        dicts with hrefs resolved to absolute URLs).
        On failure, a dict with a single ``error`` key describing the problem.
    """
    try:
        # A timeout is required: without it a stalled server would hang
        # the tool indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text
    except requests.RequestException as e:
        return {"error": f"Failed to retrieve the website content: {e}"}

    soup = BeautifulSoup(html, "html.parser")

    # Meta tags worth surfacing; OpenGraph tags use the "property"
    # attribute while classic meta tags use "name", so try both.
    meta_properties = [
        "og:description",
        "og:site_name",
        "og:title",
        "og:type",
        "og:url",
        "description",
        "keywords",
        "author",
    ]
    meta = {}
    for property_name in meta_properties:
        tag = soup.find("meta", property=property_name) or soup.find(
            "meta", attrs={"name": property_name}
        )
        if tag:
            meta[property_name] = tag.get("content", "")

    # Drop script/style elements so their source doesn't pollute the text.
    for ignore_tag in soup(["script", "style"]):
        ignore_tag.decompose()

    # Guard both a missing <title> tag and an empty one (.string is None),
    # which would otherwise raise AttributeError on .strip().
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    content = soup.body.get_text(separator="\n") if soup.body else ""

    links = []
    for a in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page URL.
        link_url = urljoin(url, a["href"])
        links.append({"title": a.text.strip(), "link": link_url})

    # Collapse runs of whitespace so the text reads cleanly.
    content = re.sub(r"[\n\r\t]+", "\n", content)
    content = re.sub(r" +", " ", content)
    content = re.sub(r"[\n ]{3,}", "\n\n", content)
    content = content.strip()

    # Truncate at a word boundary to stay within the requested budget.
    if len(content) > max_content_length:
        content = content[:max_content_length].rsplit(" ", 1)[0] + "..."

    return {"meta": meta, "title": title, "content": content, "sub_links": links}


@register_tool
Expand Down

0 comments on commit f3219ad

Please sign in to comment.