commit 5073036034773f1c1865d75f8c141f223cb1dd9b Author: gouhanke <12219217+gouhanke@user.noreply.gitee.com> Date: Thu Mar 5 18:26:08 2026 +0800 初始化 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..4b5a294 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda" +} \ No newline at end of file diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000..07b248d Binary files /dev/null and b/__pycache__/main.cpython-312.pyc differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..bf180f1 --- /dev/null +++ b/index.html @@ -0,0 +1,190 @@ + + + + + + VLM 虚拟主播 + + + + +

🤖 多模态虚拟主播 (VLM)

+ +
+
+ +
+ +
+ Avatar + 等待互动... +
+
+ + + +
+ +
+ +
正在准备摄像头和麦克风...
+
# --- main.py ---
"""Entry point: serve the FastAPI application with uvicorn."""
import uvicorn

from server.app import app
from server.config import SERVER_HOST, SERVER_PORT

if __name__ == "__main__":
    # Bind address and port come from the central config module.
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)


# --- server/__init__.py ---
"""Server package: re-export the FastAPI application object."""
from .app import app

__all__ = ["app"]
# --- server/agent_service.py ---
"""Vision-language agent service wrapping an AutoGen AssistantAgent."""
import asyncio

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import MultiModalMessage, TextMessage
from autogen_core import Image
from autogen_core.models import ModelFamily
from autogen_ext.models.ollama import OllamaChatCompletionClient

from . import config
from .mcp_tools import load_mcp_tools


class AvatarAgentService:
    """Lazily builds one multimodal assistant agent and produces text replies."""

    def __init__(self) -> None:
        # Ollama-backed VLM client; capabilities are declared explicitly
        # because the model family is unknown to autogen.
        self._model_client = OllamaChatCompletionClient(
            model=config.OLLAMA_MODEL,
            model_info={
                "vision": True,
                "function_calling": True,
                "json_output": True,
                "family": ModelFamily.UNKNOWN,
                "structured_output": True,
            },
        )
        self._agent: AssistantAgent | None = None
        self._agent_lock = asyncio.Lock()  # guards one-time agent creation

    async def _create_agent(self) -> AssistantAgent:
        """Build the assistant, attaching MCP tools when any are configured."""
        tools = await load_mcp_tools()
        return AssistantAgent(
            name="avatar",
            model_client=self._model_client,
            system_message=config.SYSTEM_MESSAGE,
            tools=tools or None,
            # Only reflect on tool output when tools actually exist.
            reflect_on_tool_use=bool(tools),
        )

    async def _get_agent(self) -> AssistantAgent:
        """Return the shared agent, creating it at most once (double-checked lock)."""
        if self._agent is not None:
            return self._agent
        async with self._agent_lock:
            if self._agent is None:
                self._agent = await self._create_agent()
            return self._agent

    async def reply(self, user_text: str, image_b64: str) -> str:
        """Answer *user_text* in the context of a base64-encoded camera frame.

        Returns the last text message the "avatar" agent emitted, or ""
        when the stream produced no such message.
        """
        agent = await self._get_agent()
        frame = Image.from_base64(image_b64)
        task = MultiModalMessage(source="user", content=[user_text, frame])

        answer = ""
        # run_stream yields intermediate events; keep the final avatar text.
        async for event in agent.run_stream(task=task):
            if isinstance(event, TextMessage) and event.source == "avatar":
                answer = event.content
        return answer


# --- server/app.py ---
"""FastAPI application wiring: create the app and mount the WebSocket router."""
from fastapi import FastAPI

from .ws import router as ws_router

app = FastAPI()
app.include_router(ws_router)


# --- server/config.py ---
"""Central configuration: prompts, speech settings, server and MCP wiring."""
import os
import shlex


def _env_bool(name: str, default: bool) -> bool:
    """Read env var *name* as a boolean; accepts 1/true/yes/on (any case)."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def _env_args(name: str, default: str = "") -> list[str]:
    """Read env var *name* as a shell-style argument list (shlex rules)."""
    raw = os.getenv(name, default)
    return shlex.split(raw) if raw.strip() else []


SYSTEM_MESSAGE = (
    "你是一个友好、幽默的AI虚拟主播。你可以看到用户摄像头传来的画面,也能听到他们的话。"
    "请用简短、自然、热情的中文口语回答,每次回答控制在两三句话以内,不要输出任何 Markdown 格式。"
    "当用户询问实时天气、最新新闻或网页信息时,优先使用可用工具先查询再回答。"
)

# faster-whisper speech-to-text settings.
WHISPER_MODEL_NAME = "base"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
WHISPER_LANGUAGE = "zh"
WHISPER_BEAM_SIZE = 5

# edge-tts voice and the Ollama vision model tag.
TTS_VOICE = "zh-CN-XiaoxiaoNeural"
OLLAMA_MODEL = "qwen3-vl:latest"

# NOTE(review): 0.0.0.0 binds every interface — fine for a LAN demo,
# confirm before exposing this service publicly.
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8000

ENABLE_MCP_TOOLS = _env_bool("ENABLE_MCP_TOOLS", True)
MCP_SERVER_READ_TIMEOUT_SECONDS = float(os.getenv("MCP_SERVER_READ_TIMEOUT_SECONDS", "30"))

MCP_WEATHER_SERVER_COMMAND = os.getenv("MCP_WEATHER_SERVER_COMMAND", "")
MCP_WEATHER_SERVER_ARGS = _env_args("MCP_WEATHER_SERVER_ARGS")

MCP_WEBSEARCH_SERVER_COMMAND = os.getenv("MCP_WEBSEARCH_SERVER_COMMAND", "")
MCP_WEBSEARCH_SERVER_ARGS = _env_args("MCP_WEBSEARCH_SERVER_ARGS")
dataclasses import dataclass +from typing import Any + +from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools + +from . import config + + +@dataclass(frozen=True) +class MCPServerConfig: + name: str + command: str + args: list[str] + + +def _configured_servers() -> list[MCPServerConfig]: + if not config.ENABLE_MCP_TOOLS: + return [] + + servers: list[MCPServerConfig] = [] + if config.MCP_WEATHER_SERVER_COMMAND: + servers.append( + MCPServerConfig( + name="weather", + command=config.MCP_WEATHER_SERVER_COMMAND, + args=config.MCP_WEATHER_SERVER_ARGS, + ) + ) + if config.MCP_WEBSEARCH_SERVER_COMMAND: + servers.append( + MCPServerConfig( + name="websearch", + command=config.MCP_WEBSEARCH_SERVER_COMMAND, + args=config.MCP_WEBSEARCH_SERVER_ARGS, + ) + ) + return servers + + +async def load_mcp_tools() -> list[Any]: + configured_servers = _configured_servers() + if not configured_servers: + print("ℹ️ MCP 工具未配置,跳过加载。") + return [] + + loaded_tools: list[Any] = [] + tool_names: set[str] = set() + + for server in configured_servers: + params = StdioServerParams( + command=server.command, + args=server.args, + read_timeout_seconds=config.MCP_SERVER_READ_TIMEOUT_SECONDS, + ) + try: + server_tools = await mcp_server_tools(params) + for tool in server_tools: + if tool.name in tool_names: + print(f"⚠️ MCP 工具重名,已跳过: {tool.name}") + continue + loaded_tools.append(tool) + tool_names.add(tool.name) + print(f"✅ MCP 服务已加载: {server.name} ({len(server_tools)} tools)") + except Exception as exc: + print(f"⚠️ MCP 服务加载失败: {server.name}, error={exc}") + + if loaded_tools: + print(f"✅ MCP 工具总数: {len(loaded_tools)}") + else: + print("ℹ️ 未加载到任何 MCP 工具。") + return loaded_tools diff --git a/server/speech.py b/server/speech.py new file mode 100644 index 0000000..18d4540 --- /dev/null +++ b/server/speech.py @@ -0,0 +1,35 @@ +import base64 + +import edge_tts +from faster_whisper import WhisperModel + +from . 
# --- server/speech.py ---
"""Speech services: local Whisper speech-to-text and edge-tts text-to-speech."""
import base64

import edge_tts
from faster_whisper import WhisperModel

from . import config


class SpeechService:
    """Blocking speech-to-text plus async text-to-speech."""

    def __init__(self) -> None:
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe the audio file at *audio_path*. Blocking and CPU-bound."""
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Render *text* to speech and return it as a base64 audio data URL."""
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)
        # Collect chunks and join once: `bytes +=` in a loop is quadratic
        # for long replies.
        chunks: list[bytes] = []
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                chunks.append(chunk["data"])
        audio_b64 = base64.b64encode(b"".join(chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"


# --- server/ws.py ---
"""WebSocket endpoint: audio + camera frame in, text + synthesized speech out."""
import asyncio
import base64
import json
import os
import tempfile

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from .agent_service import AvatarAgentService
from .speech import SpeechService
from .ws_messages import send_audio_message, send_text_message

router = APIRouter()
speech_service = SpeechService()
agent_service = AvatarAgentService()


def _save_audio_to_temp_file(audio_b64: str) -> str:
    """Decode base64 audio into a temp .webm file; the caller must delete it."""
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    try:
        temp_file.write(base64.b64decode(audio_b64))
        return temp_file.name
    finally:
        temp_file.close()


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Main loop: for each user_input message run STT -> VLM -> TTS.

    NOTE(review): the trailing <br> tags in the status strings were
    reconstructed from a mangled dump — confirm against the frontend.
    """
    await websocket.accept()
    print("✅ WebSocket 连接成功!准备就绪。")

    try:
        while True:
            data = json.loads(await websocket.receive_text())
            if data.get("type") != "user_input":
                continue

            # Strip a possible "data:...;base64," prefix from both payloads.
            audio_b64 = data["audio"].split(",")[-1]
            image_b64 = data["image"].split(",")[-1]

            audio_path = _save_audio_to_temp_file(audio_b64)
            try:
                await send_text_message(websocket, "[👂 正在辨识语音...]<br>")
                # Whisper transcription is blocking and CPU-bound: run it in
                # a worker thread so the event loop (and this socket) stays
                # responsive instead of freezing every connection.
                user_text = await asyncio.to_thread(
                    speech_service.transcribe, audio_path
                )
            finally:
                if os.path.exists(audio_path):
                    os.remove(audio_path)

            if not user_text.strip():
                await send_text_message(websocket, "[没听清你说什么...]<br>")
                continue

            await send_text_message(websocket, f"你说:{user_text}<br>")
            await send_text_message(websocket, "[🧠 正在看图思考...]<br>")

            ai_response = await agent_service.reply(user_text, image_b64)
            await send_text_message(websocket, f"AI主播:{ai_response}<br><br>")

            await send_text_message(websocket, "[🗣️ 正在生成语音...]<br>")
            audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
            await send_audio_message(websocket, audio_data_url)

    except WebSocketDisconnect:
        print("❌ 前端页面已关闭或断开连接")
    except Exception as exc:
        # Boundary handler: log the failure and let the connection close.
        print(f"⚠️ 发生错误: {exc}")


# --- server/ws_messages.py ---
"""Helpers for the JSON messages sent over the avatar WebSocket."""
import json

from fastapi import WebSocket


async def send_text_message(websocket: WebSocket, content: str) -> None:
    """Send a chat/status text fragment to the client."""
    await websocket.send_text(json.dumps({"type": "text", "content": content}))


async def send_audio_message(websocket: WebSocket, audio_data_url: str) -> None:
    """Send synthesized speech to the client as a base64 data URL."""
    await websocket.send_text(json.dumps({"type": "audio", "content": audio_data_url}))