初始化

2026-03-05 18:26:08 +08:00
commit 5073036034
22 changed files with 504 additions and 0 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,4 @@
 {
    "python-envs.defaultEnvManager": "ms-python.python:conda",
    "python-envs.defaultPackageManager": "ms-python.python:conda"
 }
--- a/pycache/main.cpython-312.pyc
+++ b/pycache/main.cpython-312.pyc
--- a/index.html
+++ b/index.html
@@ -0,0 +1,190 @@
 <!DOCTYPE html>
 <html lang="zh-CN">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>VLM 虚拟主播</title>
    <style>
        body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; background-color: #f0f2f5; margin-top: 50px; }
        .container { display: flex; gap: 20px; }
        .video-box { background: #000; border-radius: 10px; overflow: hidden; width: 320px; height: 240px; position: relative; }
        video { width: 100%; height: 100%; object-fit: cover; }
        .avatar-box { width: 320px; height: 240px; background: #fff; border-radius: 10px; display: flex; align-items: center; justify-content: center; box-shadow: 0 4px 12px rgba(0,0,0,0.1); flex-direction: column; }
        .avatar-box img { width: 100px; height: 100px; border-radius: 50%; margin-bottom: 10px; }
        .controls { margin-top: 30px; }
        button { padding: 15px 30px; font-size: 18px; border: none; border-radius: 25px; cursor: pointer; background-color: #007bff; color: white; transition: background 0.2s; }
        button:active { background-color: #0056b3; }
        button:disabled { background-color: #ccc; cursor: not-allowed; }
        #status { margin-top: 15px; color: #555; font-weight: bold; }
        #transcript { margin-top: 20px; width: 600px; text-align: center; color: #333; }
    </style>
 </head>
 <body>
    <h2>🤖 多模态虚拟主播 (VLM)</h2>
    <div class="container">
        <div class="video-box">
            <video id="userVideo" autoplay muted playsinline></video>
        </div>
        <div class="avatar-box" id="avatarBox">
            <img src="https://api.dicebear.com/7.x/bottts/svg?seed=Felix" alt="Avatar" id="avatarImg">
            <span id="avatarStatus">等待互动...</span>
        </div>
    </div>
    <canvas id="canvas" style="display:none;"></canvas>
    <div class="controls">
        <button id="talkBtn" disabled>连接中...</button>
    </div>
    <div id="status">正在准备摄像头和麦克风...</div>
    <div id="transcript"></div>
    <script>
        // DOM handles used throughout the script.
        const videoElement = document.getElementById('userVideo');
        const canvasElement = document.getElementById('canvas');
        const talkBtn = document.getElementById('talkBtn');
        const statusText = document.getElementById('status');
        const transcriptText = document.getElementById('transcript');
        const avatarStatus = document.getElementById('avatarStatus');
        // Shared mutable state: the backend socket, the audio recorder, the
        // chunks collected for the current recording, and the recording flag.
        let ws;
        let mediaRecorder;
        let audioChunks = [];
        let isRecording = false;
        // 1. Acquire the camera/microphone and wire up the audio recorder
        async function initMedia() {
            // Keep every non-empty chunk the recorder emits.
            const collectChunk = ({ data }) => {
                if (data.size > 0) {
                    audioChunks.push(data);
                }
            };
            // When a recording stops: bundle the audio, snapshot the camera,
            // encode the audio as Base64 and hand both to the backend.
            const finishRecording = async () => {
                const recordedAudio = new Blob(audioChunks, { type: 'audio/webm' });
                audioChunks = []; // reset for the next recording
                const snapshot = captureFrame();
                const encodedAudio = await blobToBase64(recordedAudio);
                sendToServer(encodedAudio, snapshot);
            };
            try {
                const mediaStream = await navigator.mediaDevices.getUserMedia({ video: true, audio: true });
                videoElement.srcObject = mediaStream;
                mediaRecorder = new MediaRecorder(mediaStream);
                mediaRecorder.ondataavailable = collectChunk;
                mediaRecorder.onstop = finishRecording;
                statusText.innerText = "设备已就绪，正在连接服务器...";
                initWebSocket();
            } catch (err) {
                statusText.innerText = "获取摄像头/麦克风失败，请允许权限！";
                console.error(err);
            }
        }
        // 2. Open the WebSocket connection to the backend
        function initWebSocket() {
            // The FastAPI backend is assumed to listen on port 8000.
            ws = new WebSocket('ws://localhost:8000/ws');
            ws.onopen = () => {
                talkBtn.disabled = false;
                talkBtn.innerText = "按住说话 🎙️";
                statusText.innerText = "✅ 已连接到大脑！长按按钮说话。";
            };
            ws.onmessage = async (event) => {
                const { type, content } = JSON.parse(event.data);
                switch (type) {
                    case 'text':
                        // Text (HTML fragments) sent by the backend
                        transcriptText.innerHTML += content;
                        break;
                    case 'audio':
                        // TTS audio sent by the backend as a Base64 data URL
                        playAudio(content);
                        break;
                }
            };
            ws.onclose = () => {
                talkBtn.disabled = true;
                statusText.innerText = "❌ 与服务器断开连接";
            };
        }
        // 3. Grab the current video frame as a Base64 JPEG data URL
        function captureFrame() {
            // Downscale to 320x240 to lighten the load on the backend.
            const [frameWidth, frameHeight] = [320, 240];
            canvasElement.width = frameWidth;
            canvasElement.height = frameHeight;
            const ctx = canvasElement.getContext('2d');
            ctx.drawImage(videoElement, 0, 0, frameWidth, frameHeight);
            // Encode as JPEG at quality 0.8.
            return canvasElement.toDataURL('image/jpeg', 0.8);
        }
        // 4. Helper: read a Blob and resolve with its Base64 data URL.
        // Rejects with the reader's error when the read fails, instead of
        // silently resolving with a null result.
        function blobToBase64(blob) {
            return new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onload = () => resolve(reader.result);
                reader.onerror = () => reject(reader.error);
                reader.readAsDataURL(blob);
            });
        }
        // 5. Send the recorded audio and the captured frame to the backend
        function sendToServer(audioB64, imageB64) {
            // send() discards data on a closing/closed socket, which would leave
            // the UI stuck on "thinking"; report the disconnect instead.
            if (!ws || ws.readyState !== WebSocket.OPEN) {
                statusText.innerText = "❌ 与服务器断开连接";
                return;
            }
            statusText.innerText = "🧠 正在思考...";
            avatarStatus.innerText = "思考中 🤔";
            transcriptText.innerHTML = "<strong>AI: </strong>"; // reset the transcript for the new reply
            const payload = {
                type: "user_input",
                audio: audioB64,
                image: imageB64
            };
            ws.send(JSON.stringify(payload));
        }
        // 6. Play the audio (Base64 data URL) returned by the backend
        function playAudio(base64Audio) {
            const audio = new Audio(base64Audio);
            // Register the completion handler before playback starts.
            audio.onended = () => {
                avatarStatus.innerText = "等待互动...";
                statusText.innerText = "✅ 播放完毕，可继续对话。";
            };
            avatarStatus.innerText = "说话中 🗣️";
            // play() returns a promise that rejects when playback cannot start
            // (autoplay policy, undecodable data); reset the UI in that case
            // rather than leaving it on "speaking".
            audio.play().catch((err) => {
                console.error(err);
                avatarStatus.innerText = "等待互动...";
                statusText.innerText = "⚠️ 音频播放失败，可继续对话。";
            });
        }
        // 7. Push-to-talk bindings (press and hold to record, release to send)
        function startRecording() {
            // Ignore presses while disabled, already recording, or before the recorder exists.
            if (talkBtn.disabled || isRecording || !mediaRecorder) return;
            mediaRecorder.start();
            isRecording = true;
            talkBtn.innerText = "松开发送 ⬆️";
            statusText.innerText = "🎙️ 正在录音...并准备抓取画面...";
        }
        function stopRecording() {
            if (!isRecording) return;
            mediaRecorder.stop();
            isRecording = false;
            talkBtn.innerText = "按住说话 🎙️";
        }
        talkBtn.addEventListener('mousedown', startRecording);
        // Listen on window so releasing the mouse outside the button still ends
        // the recording (a button-only mouseup would leave it running forever).
        window.addEventListener('mouseup', stopRecording);
        // Touch support; preventDefault suppresses the emulated mouse events so
        // a single tap is not handled twice.
        talkBtn.addEventListener('touchstart', (event) => {
            event.preventDefault();
            startRecording();
        }, { passive: false });
        talkBtn.addEventListener('touchend', stopRecording);
        talkBtn.addEventListener('touchcancel', stopRecording);
        // Start the application
        initMedia();
    </script>
 </body>
 </html>
--- a/main.py
+++ b/main.py
@@ -0,0 +1,8 @@
 import uvicorn
 from server.app import app
 from server.config import SERVER_HOST, SERVER_PORT
 # Serve the FastAPI app with uvicorn only when this file is run as a script.
 if __name__ == "__main__":
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
--- a/server/init.py
+++ b/server/init.py
@@ -0,0 +1,3 @@
 from .app import app
 __all__ = ["app"]
--- a/server/pycache/init.cpython-312.pyc
+++ b/server/pycache/init.cpython-312.pyc
--- a/server/pycache/agent_service.cpython-312.pyc
+++ b/server/pycache/agent_service.cpython-312.pyc
--- a/server/pycache/app.cpython-312.pyc
+++ b/server/pycache/app.cpython-312.pyc
--- a/server/pycache/config.cpython-312.pyc
+++ b/server/pycache/config.cpython-312.pyc
--- a/server/pycache/mcp_tools.cpython-312.pyc
+++ b/server/pycache/mcp_tools.cpython-312.pyc
--- a/server/pycache/speech.cpython-312.pyc
+++ b/server/pycache/speech.cpython-312.pyc
--- a/server/pycache/ws.cpython-312.pyc
+++ b/server/pycache/ws.cpython-312.pyc
--- a/server/pycache/ws_messages.cpython-312.pyc
+++ b/server/pycache/ws_messages.cpython-312.pyc
--- a/server/agent_service.py
+++ b/server/agent_service.py
@@ -0,0 +1,56 @@
 import asyncio
 from autogen_agentchat.agents import AssistantAgent
 from autogen_agentchat.messages import MultiModalMessage, TextMessage
 from autogen_core import Image
 from autogen_core.models import ModelFamily
 from autogen_ext.models.ollama import OllamaChatCompletionClient
 from . import config
 from .mcp_tools import load_mcp_tools
 class AvatarAgentService:
    """Answer multimodal (text + image) prompts through one lazily built agent."""

    def __init__(self) -> None:
        # Capability flags declared for the model served through Ollama.
        model_capabilities = {
            "vision": True,
            "function_calling": True,
            "json_output": True,
            "family": ModelFamily.UNKNOWN,
            "structured_output": True,
        }
        self._model_client = OllamaChatCompletionClient(
            model=config.OLLAMA_MODEL,
            model_info=model_capabilities,
        )
        # The agent is created on first use; the lock guards that creation.
        self._agent: AssistantAgent | None = None
        self._agent_lock = asyncio.Lock()

    async def _create_agent(self) -> AssistantAgent:
        """Build the "avatar" assistant, attaching MCP tools when any were loaded."""
        mcp_tools = await load_mcp_tools()
        has_tools = bool(mcp_tools)
        return AssistantAgent(
            name="avatar",
            model_client=self._model_client,
            system_message=config.SYSTEM_MESSAGE,
            tools=mcp_tools or None,
            reflect_on_tool_use=has_tools,
        )

    async def _get_agent(self) -> AssistantAgent:
        """Return the shared agent, creating it on the first call."""
        if self._agent is None:
            async with self._agent_lock:
                # Re-check under the lock: another task may have created it
                # while this one was waiting.
                if self._agent is None:
                    self._agent = await self._create_agent()
        return self._agent

    async def reply(self, user_text: str, image_b64: str) -> str:
        """Run the agent on the text and Base64 image and return its reply.

        Returns the content of the last ``TextMessage`` whose source is
        ``"avatar"``, or an empty string when the stream yields none.
        """
        assistant = await self._get_agent()
        snapshot = Image.from_base64(image_b64)
        task = MultiModalMessage(source="user", content=[user_text, snapshot])
        latest_reply = ""
        async for event in assistant.run_stream(task=task):
            if not isinstance(event, TextMessage):
                continue
            if event.source == "avatar":
                latest_reply = event.content
        return latest_reply
--- a/server/app.py
+++ b/server/app.py
@@ -0,0 +1,6 @@
 from fastapi import FastAPI
 from .ws import router as ws_router
 # Create the FastAPI application and mount the WebSocket router on it.
 app = FastAPI()
 app.include_router(ws_router)
--- a/server/config.py
+++ b/server/config.py
@@ -0,0 +1,44 @@
 import os
 import shlex
 def _env_bool(name: str, default: bool) -> bool:
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "on"}
 def _env_args(name: str, default: str = "") -> list[str]:
    value = os.getenv(name, default)
    if not value.strip():
        return []
    return shlex.split(value)
 # System prompt handed to the assistant agent.
 SYSTEM_MESSAGE = (
    "你是一个友好、幽默的AI虚拟主播。你可以看到用户摄像头传来的画面，也能听到他们的话。"
    "请用简短、自然、热情的中文口语回答，每次回答控制在两三句话以内，不要输出任何 Markdown 格式。"
    "当用户询问实时天气、最新新闻或网页信息时，优先使用可用工具先查询再回答。"
 )
 # Speech-recognition (Whisper) settings.
 WHISPER_MODEL_NAME = "base"
 WHISPER_DEVICE = "cpu"
 WHISPER_COMPUTE_TYPE = "int8"
 WHISPER_LANGUAGE = "zh"
 WHISPER_BEAM_SIZE = 5
 # Voice name used for text-to-speech.
 TTS_VOICE = "zh-CN-XiaoxiaoNeural"
 # Model name requested from Ollama.
 OLLAMA_MODEL = "qwen3-vl:latest"
 # Address and port the HTTP/WebSocket server binds to.
 SERVER_HOST = "0.0.0.0"
 SERVER_PORT = 8000
 # MCP tool settings, all overridable through environment variables.
 # Tools are enabled by default; a server is only used when its command is non-empty.
 ENABLE_MCP_TOOLS = _env_bool("ENABLE_MCP_TOOLS", True)
 MCP_SERVER_READ_TIMEOUT_SECONDS = float(os.getenv("MCP_SERVER_READ_TIMEOUT_SECONDS", "30"))
 MCP_WEATHER_SERVER_COMMAND = os.getenv("MCP_WEATHER_SERVER_COMMAND", "")
 MCP_WEATHER_SERVER_ARGS = _env_args("MCP_WEATHER_SERVER_ARGS")
 MCP_WEBSEARCH_SERVER_COMMAND = os.getenv("MCP_WEBSEARCH_SERVER_COMMAND", "")
 MCP_WEBSEARCH_SERVER_ARGS = _env_args("MCP_WEBSEARCH_SERVER_ARGS")
--- a/server/mcp_tools.py
+++ b/server/mcp_tools.py
@@ -0,0 +1,73 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any
 from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
 from . import config
@dataclass(frozen=True)
 class MCPServerConfig:
    """Immutable description of one MCP server to launch over stdio."""
    name: str  # label used in log messages
    command: str  # passed as the command of the stdio server parameters
    args: list[str]  # passed as the args of the stdio server parameters
 def _configured_servers() -> list[MCPServerConfig]:
    """List the MCP servers enabled by configuration.

    Returns an empty list when MCP tools are disabled. Otherwise the
    weather and web-search servers are included, in that order, each only
    if its command setting is non-empty.
    """
    if not config.ENABLE_MCP_TOOLS:
        return []
    candidates = (
        ("weather", config.MCP_WEATHER_SERVER_COMMAND, config.MCP_WEATHER_SERVER_ARGS),
        ("websearch", config.MCP_WEBSEARCH_SERVER_COMMAND, config.MCP_WEBSEARCH_SERVER_ARGS),
    )
    return [
        MCPServerConfig(name=label, command=command, args=arguments)
        for label, command, arguments in candidates
        if command
    ]
 async def load_mcp_tools() -> list[Any]:
    """Load tools from every configured MCP server.

    Servers are queried in configuration order. A server that fails to load
    is logged and skipped, and a tool whose name was already seen is logged
    and skipped, so the returned list has unique tool names. Returns an
    empty list when no server is configured or no tool could be loaded.
    """
    servers = _configured_servers()
    if not servers:
        print("ℹ️ MCP 工具未配置，跳过加载。")
        return []
    # Insertion-ordered mapping: keeps first-seen order and detects duplicates.
    tools_by_name: dict[str, Any] = {}
    for server in servers:
        params = StdioServerParams(
            command=server.command,
            args=server.args,
            read_timeout_seconds=config.MCP_SERVER_READ_TIMEOUT_SECONDS,
        )
        try:
            discovered = await mcp_server_tools(params)
            for tool in discovered:
                if tool.name in tools_by_name:
                    print(f"⚠️ MCP 工具重名，已跳过: {tool.name}")
                    continue
                tools_by_name[tool.name] = tool
            print(f"✅ MCP 服务已加载: {server.name} ({len(discovered)} tools)")
        except Exception as exc:
            # Best effort: one failing server must not block the others.
            print(f"⚠️ MCP 服务加载失败: {server.name}, error={exc}")
    loaded = list(tools_by_name.values())
    if not loaded:
        print("ℹ️ 未加载到任何 MCP 工具。")
    else:
        print(f"✅ MCP 工具总数: {len(loaded)}")
    return loaded
--- a/server/speech.py
+++ b/server/speech.py
@@ -0,0 +1,35 @@
 import base64
 import edge_tts
 from faster_whisper import WhisperModel
 from . import config
 class SpeechService:
    """Speech-to-text with a local Whisper model and text-to-speech with edge-tts."""

    def __init__(self) -> None:
        """Load the Whisper model described by the config module."""
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕！")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe the audio file at ``audio_path`` and return the joined text.

        This call is synchronous; the segment texts are concatenated without
        a separator.
        """
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize ``text`` and return it as a ``data:audio/mp3;base64,...`` URL."""
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)
        # Collect the audio chunks and join once; repeated ``bytes +=``
        # re-copies the whole buffer on every chunk.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]
        audio_b64 = base64.b64encode(b"".join(audio_chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"
--- a/server/ws.py
+++ b/server/ws.py
@@ -0,0 +1,67 @@
 import base64
 import json
 import os
 import tempfile
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from .agent_service import AvatarAgentService
 from .speech import SpeechService
 from .ws_messages import send_audio_message, send_text_message
 router = APIRouter()
 speech_service = SpeechService()
 agent_service = AvatarAgentService()
 def _save_audio_to_temp_file(audio_b64: str) -> str:
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    try:
        temp_file.write(base64.b64decode(audio_b64))
        return temp_file.name
    finally:
        temp_file.close()
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Serve one client: transcribe its audio, ask the agent, reply with text and speech.

    Each ``user_input`` message carries Base64 (optionally data-URL prefixed)
    ``audio`` and ``image`` fields. Progress and results are pushed back as
    text frames containing HTML fragments, followed by one audio frame.
    The loop ends when the client disconnects or any error occurs.
    """
    import asyncio
    import html

    await websocket.accept()
    print("✅ WebSocket 连接成功！准备就绪。")
    try:
        while True:
            message_text = await websocket.receive_text()
            data = json.loads(message_text)
            if data.get("type") != "user_input":
                continue
            # Drop any "data:...;base64," prefix and keep only the payload.
            audio_b64 = data["audio"].split(",")[-1]
            image_b64 = data["image"].split(",")[-1]
            audio_path = _save_audio_to_temp_file(audio_b64)
            try:
                await send_text_message(websocket, "<i>[👂 正在辨识语音...]</i><br>")
                # Transcription is synchronous and CPU-heavy; run it in a worker
                # thread so the event loop keeps serving other connections.
                user_text = await asyncio.to_thread(speech_service.transcribe, audio_path)
            finally:
                if os.path.exists(audio_path):
                    os.remove(audio_path)
            if not user_text.strip():
                await send_text_message(websocket, "<i>[没听清你说什么...]</i><br>")
                continue
            # The client inserts these frames as HTML, so escape the text that
            # comes from speech recognition and from the model.
            await send_text_message(websocket, f"<b>你说：</b>{html.escape(user_text)}<br>")
            await send_text_message(websocket, "<i>[🧠 正在看图思考...]</i><br>")
            ai_response = await agent_service.reply(user_text, image_b64)
            await send_text_message(websocket, f"<b>AI主播：</b>{html.escape(ai_response)}<br><br>")
            if not ai_response.strip():
                # Nothing to speak; skip synthesis instead of sending empty text to TTS.
                continue
            await send_text_message(websocket, "<i>[🗣️ 正在生成语音...]</i><br>")
            audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
            await send_audio_message(websocket, audio_data_url)
    except WebSocketDisconnect:
        print("❌ 前端页面已关闭或断开连接")
    except Exception as exc:
        # Top-level boundary for this connection: log and end the handler.
        print(f"⚠️ 发生错误: {exc}")
--- a/server/ws_messages.py
+++ b/server/ws_messages.py
@@ -0,0 +1,18 @@
 import json
 from fastapi import WebSocket
 async def send_text_message(websocket: WebSocket, content: str) -> None:
    """Send ``content`` to the client as a JSON frame of type ``"text"``."""
    frame = {"type": "text", "content": content}
    await websocket.send_text(json.dumps(frame))
 async def send_audio_message(websocket: WebSocket, audio_data_url: str) -> None:
    """Send an audio data URL to the client as a JSON frame of type ``"audio"``."""
    frame = json.dumps({"type": "audio", "content": audio_data_url})
    await websocket.send_text(frame)
--- a/temp_audio.webm
+++ b/temp_audio.webm
--- a/test_audio.webm
+++ b/test_audio.webm