"""Web search tools (DuckDuckGo HTML scraping, no API key required)."""

from typing import Dict, List, Optional
from urllib.parse import parse_qs, urlparse

import httpx

# Desktop-browser User-Agent shared by both functions so the sites serve a
# full HTML page instead of a bot/captcha response.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)


def _resolve_ddg_url(href: str) -> str:
    """Unwrap a DuckDuckGo redirect link to the real target URL.

    The HTML endpoint wraps result links as ``//duckduckgo.com/l/?uddg=<url>``;
    when that shape is detected, return the decoded ``uddg`` parameter,
    otherwise return *href* unchanged.
    """
    try:
        parsed = urlparse(href)
        if "duckduckgo.com" in parsed.netloc and parsed.path.startswith("/l/"):
            target = parse_qs(parsed.query).get("uddg")
            if target:
                return target[0]  # parse_qs already percent-decodes the value
    except ValueError:
        pass  # malformed href: fall through and return it as-is
    return href


def _parse_result_div(div) -> Optional[Dict[str, str]]:
    """Extract one search hit from a DuckDuckGo result ``<div>``.

    Returns a dict with title/url/snippet/source, or ``None`` when the div
    is malformed — callers skip such entries (best-effort parsing).
    """
    try:
        title_elem = div.find('a', class_='result__a')
        if not title_elem:
            return None
        snippet_elem = div.find('a', class_='result__snippet')
        source_elem = div.find('span', class_='result__url')
        return {
            "title": title_elem.get_text(strip=True),
            "url": _resolve_ddg_url(title_elem.get('href', '')),
            "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "",
            "source": source_elem.get_text(strip=True) if source_elem else "",
        }
    except Exception:
        return None


async def web_search(
    query: str,
    num_results: int = 10,
    language: str = "zh-CN",
) -> dict:
    """Run a web search and return structured results.

    Uses the DuckDuckGo HTML endpoint (free, no API key required).

    Args:
        query: Search query string.
        num_results: Number of results to return; clamped to 1..20 (default 10).
        language: Search locale, e.g. "zh-CN" (default) or "en-US".

    Returns:
        dict: On success:
            - query: the original query
            - total_results: number of results actually returned
            - results: list of dicts, each with:
                - title: page title
                - url: page link (redirect wrappers resolved to the real URL)
                - snippet: page excerpt
                - source: source site
        On failure: a dict with "error", "query" and an empty "results" list.

    Example:
        >>> result = await web_search("Python async programming")
        >>> for item in result['results']:
        ...     print(f"{item['title']}: {item['url']}")

        >>> result = await web_search("latest AI news", num_results=5)
        >>> print(len(result['results']))
        5
    """
    # Clamp requested result count to the supported range.
    num_results = min(max(1, num_results), 20)

    base_url = "https://html.duckduckgo.com/html/"
    headers = {"User-Agent": _USER_AGENT}
    params = {
        "q": query,
        "kl": language,
    }

    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.post(base_url, data=params, headers=headers)
            response.raise_for_status()

        # Local import: bs4 is an optional third-party dependency, only
        # needed when a response actually has to be parsed.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        results: List[Dict[str, str]] = []
        for div in soup.find_all('div', class_='result')[:num_results]:
            item = _parse_result_div(div)
            if item is not None:
                results.append(item)

        return {
            "query": query,
            "total_results": len(results),
            "results": results,
        }
    except httpx.TimeoutException:
        return {
            "error": "请求超时,请稍后重试",
            "query": query,
            "results": [],
        }
    except httpx.HTTPError as e:
        return {
            "error": f"搜索失败: {str(e)}",
            "query": query,
            "results": [],
        }
    except Exception as e:
        return {
            "error": f"解析搜索结果失败: {str(e)}",
            "query": query,
            "results": [],
        }


async def search_url(url: str) -> dict:
    """Fetch a web page and return a plain-text content summary.

    Args:
        url: URL of the page to fetch (redirects are followed).

    Returns:
        dict: On success:
            - url: the requested URL
            - title: page title
            - content: page body as plain text, truncated to 5000 characters
        On failure: a dict with "error" and "url".

    Example:
        >>> result = await search_url("https://example.com")
        >>> print(result['title'])
        Example Domain
    """
    headers = {"User-Agent": _USER_AGENT}

    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()

        # Local import: see note in web_search.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('title')
        title_text = title.get_text(strip=True) if title else ""

        # Drop script/style nodes so get_text() yields only human-readable body text.
        for script in soup(["script", "style"]):
            script.decompose()

        content = soup.get_text(separator='\n', strip=True)

        return {
            "url": url,
            "title": title_text,
            "content": content[:5000],  # cap payload size
        }
    except httpx.TimeoutException:
        return {
            "error": "请求超时",
            "url": url,
        }
    except httpx.HTTPError as e:
        return {
            "error": f"获取网页失败: {str(e)}",
            "url": url,
        }
    except Exception as e:
        return {
            "error": f"解析网页失败: {str(e)}",
            "url": url,
        }