Files
vtb/server/mcp_tools/web_search.py
2026-03-05 18:45:04 +08:00

182 lines
5.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""网络搜索工具"""
import httpx
from typing import List, Dict, Optional
async def web_search(
query: str,
num_results: int = 10,
language: str = "zh-CN"
) -> dict:
"""
执行网络搜索并返回结果
Args:
query: 搜索查询关键词
num_results: 返回结果数量,默认为 10最多 20
language: 搜索语言默认为中文zh-CN也支持 "en-US"
Returns:
dict: 包含搜索结果的字典,包括:
- query: 搜索关键词
- total_results: 估计的总结果数
- results: 结果列表,每个结果包含:
- title: 网页标题
- url: 网页链接
- snippet: 网页摘要
- source: 来源网站
Example:
>>> result = await web_search("Python 异步编程")
>>> for item in result['results']:
... print(f"{item['title']}: {item['url']}")
>>> result = await web_search("latest AI news", num_results=5)
>>> print(len(result['results']))
5
"""
# 限制结果数量
num_results = min(max(1, num_results), 20)
# 使用 DuckDuckGo HTML 版本进行搜索(免费,无需 API key
base_url = "https://html.duckduckgo.com/html/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
params = {
"q": query,
"kl": language,
}
try:
async with httpx.AsyncClient(timeout=15.0) as client:
response = await client.post(base_url, data=params, headers=headers)
response.raise_for_status()
# 解析 HTML 结果
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
results = []
result_divs = soup.find_all('div', class_='result')
for div in result_divs[:num_results]:
try:
# 提取标题和链接
title_elem = div.find('a', class_='result__a')
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get('href', '')
# 提取摘要
snippet_elem = div.find('a', class_='result__snippet')
snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
# 提取来源
source_elem = div.find('span', class_='result__url')
source = source_elem.get_text(strip=True) if source_elem else ""
results.append({
"title": title,
"url": url,
"snippet": snippet,
"source": source,
})
except Exception:
continue
return {
"query": query,
"total_results": len(results),
"results": results,
}
except httpx.TimeoutException:
return {
"error": "请求超时,请稍后重试",
"query": query,
"results": []
}
except httpx.HTTPError as e:
return {
"error": f"搜索失败: {str(e)}",
"query": query,
"results": []
}
except Exception as e:
return {
"error": f"解析搜索结果失败: {str(e)}",
"query": query,
"results": []
}
async def search_url(url: str) -> dict:
"""
获取指定 URL 的网页内容摘要
Args:
url: 要获取的网页 URL
Returns:
dict: 包含网页内容的字典,包括:
- url: 网页 URL
- title: 网页标题
- content: 网页正文内容(纯文本)
- error: 错误信息(如果失败)
Example:
>>> result = await search_url("https://example.com")
>>> print(result['title'])
Example Domain
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
try:
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
response = await client.get(url, headers=headers)
response.raise_for_status()
# 解析 HTML
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# 提取标题
title = soup.find('title')
title_text = title.get_text(strip=True) if title else ""
# 提取主要内容(移除脚本和样式)
for script in soup(["script", "style"]):
script.decompose()
# 获取正文
content = soup.get_text(separator='\n', strip=True)
return {
"url": url,
"title": title_text,
"content": content[:5000], # 限制长度
}
except httpx.TimeoutException:
return {
"error": "请求超时",
"url": url
}
except httpx.HTTPError as e:
return {
"error": f"获取网页失败: {str(e)}",
"url": url
}
except Exception as e:
return {
"error": f"解析网页失败: {str(e)}",
"url": url
}