初始化

This commit is contained in:
gouhanke
2026-03-05 18:26:08 +08:00
commit 5073036034
22 changed files with 504 additions and 0 deletions

4
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,4 @@
{
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda"
}

Binary file not shown.

190
index.html Normal file
View File

@@ -0,0 +1,190 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>VLM 虚拟主播</title>
<style>
body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; background-color: #f0f2f5; margin-top: 50px; }
.container { display: flex; gap: 20px; }
.video-box { background: #000; border-radius: 10px; overflow: hidden; width: 320px; height: 240px; position: relative; }
video { width: 100%; height: 100%; object-fit: cover; }
.avatar-box { width: 320px; height: 240px; background: #fff; border-radius: 10px; display: flex; align-items: center; justify-content: center; box-shadow: 0 4px 12px rgba(0,0,0,0.1); flex-direction: column; }
.avatar-box img { width: 100px; height: 100px; border-radius: 50%; margin-bottom: 10px; }
.controls { margin-top: 30px; }
button { padding: 15px 30px; font-size: 18px; border: none; border-radius: 25px; cursor: pointer; background-color: #007bff; color: white; transition: background 0.2s; }
button:active { background-color: #0056b3; }
button:disabled { background-color: #ccc; cursor: not-allowed; }
#status { margin-top: 15px; color: #555; font-weight: bold; }
#transcript { margin-top: 20px; width: 600px; text-align: center; color: #333; }
</style>
</head>
<body>
<h2>🤖 多模态虚拟主播 (VLM)</h2>
<div class="container">
<div class="video-box">
<video id="userVideo" autoplay muted playsinline></video>
</div>
<div class="avatar-box" id="avatarBox">
<img src="https://api.dicebear.com/7.x/bottts/svg?seed=Felix" alt="Avatar" id="avatarImg">
<span id="avatarStatus">等待互动...</span>
</div>
</div>
<canvas id="canvas" style="display:none;"></canvas>
<div class="controls">
<button id="talkBtn" disabled>连接中...</button>
</div>
<div id="status">正在准备摄像头和麦克风...</div>
<div id="transcript"></div>
<script>
// --- DOM handles used throughout the page ---
const videoElement = document.getElementById('userVideo');
const canvasElement = document.getElementById('canvas');
const talkBtn = document.getElementById('talkBtn');
const statusText = document.getElementById('status');
const transcriptText = document.getElementById('transcript');
const avatarStatus = document.getElementById('avatarStatus');
// --- Connection / recording state ---
let ws;                 // WebSocket to the FastAPI backend (set in initWebSocket)
let mediaRecorder;      // MediaRecorder over the mic stream (set in initMedia)
let audioChunks = [];   // accumulated audio blobs for the current recording
let isRecording = false;
// 1. Initialize the webcam and microphone, then wire up the audio recorder.
//    On success this also kicks off the WebSocket connection; on failure the
//    user is asked to grant permissions.
async function initMedia() {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: true });
        videoElement.srcObject = stream;
        // Set up the audio recorder over the same stream.
        mediaRecorder = new MediaRecorder(stream);
        mediaRecorder.ondataavailable = event => {
            if (event.data.size > 0) audioChunks.push(event.data);
        };
        mediaRecorder.onstop = async () => {
            const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
            audioChunks = []; // reset for the next recording
            // When recording stops, grab the current camera frame...
            const imageBase64 = captureFrame();
            // ...encode the audio as base64...
            const audioBase64 = await blobToBase64(audioBlob);
            // ...and ship both to the backend together.
            sendToServer(audioBase64, imageBase64);
        };
        statusText.innerText = "设备已就绪,正在连接服务器...";
        initWebSocket();
    } catch (err) {
        statusText.innerText = "获取摄像头/麦克风失败,请允许权限!";
        console.error(err);
    }
}
// 2. Open the WebSocket connection to the backend and install handlers.
//    Incoming frames are JSON: {type:"text"|"audio", content:string}.
function initWebSocket() {
    // The FastAPI backend is assumed to run on port 8000.
    ws = new WebSocket('ws://localhost:8000/ws');
    ws.onopen = () => {
        statusText.innerText = "✅ 已连接到大脑!长按按钮说话。";
        talkBtn.innerText = "按住说话 🎙️";
        talkBtn.disabled = false;
    };
    ws.onmessage = async (event) => {
        const response = JSON.parse(event.data);
        if (response.type === 'text') {
            // Text streamed back from the backend (may be HTML fragments).
            transcriptText.innerHTML += response.content;
        } else if (response.type === 'audio') {
            // TTS audio returned by the backend as a base64 data URL.
            playAudio(response.content);
        }
    };
    ws.onclose = () => {
        statusText.innerText = "❌ 与服务器断开连接";
        talkBtn.disabled = true;
    };
}
// 3. Snapshot the current webcam frame as a base64 JPEG data URL.
//    The frame is downscaled to 320x240 and JPEG-compressed (quality 0.8)
//    to keep the payload sent to the backend small.
function captureFrame() {
    const ctx = canvasElement.getContext('2d');
    canvasElement.width = 320;
    canvasElement.height = 240;
    ctx.drawImage(videoElement, 0, 0, 320, 240);
    return canvasElement.toDataURL('image/jpeg', 0.8);
}
// 4. Helper: convert a Blob to a base64 data URL.
//    Fix: the original promise ignored read failures (the reject callback was
//    discarded), so a FileReader error would leave the promise pending forever
//    and silently stall the send pipeline. It now rejects on error so callers
//    can observe the failure.
function blobToBase64(blob) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onloadend = () => resolve(reader.result);
        reader.onerror = () => reject(reader.error);
        reader.readAsDataURL(blob);
    });
}
// 5. Ship the recorded audio and the captured frame to the backend.
//    Also flips the UI into "thinking" mode and clears the transcript area
//    so the streamed AI reply can be appended to it.
function sendToServer(audioB64, imageB64) {
    statusText.innerText = "🧠 正在思考...";
    avatarStatus.innerText = "思考中 🤔";
    transcriptText.innerHTML = "<strong>AI: </strong>"; // reset area for the AI reply
    ws.send(JSON.stringify({
        type: "user_input",
        audio: audioB64,
        image: imageB64
    }));
}
// 6. Play the TTS reply (a base64 data URL) and restore the idle UI
//    state once playback finishes.
function playAudio(base64Audio) {
    avatarStatus.innerText = "说话中 🗣️";
    const player = new Audio(base64Audio);
    player.onended = () => {
        avatarStatus.innerText = "等待互动...";
        statusText.innerText = "✅ 播放完毕,可继续对话。";
    };
    player.play();
}
// 7. Push-to-talk bindings (mouse down/up works like a walkie-talkie).
//    NOTE(review): touch events and mouseleave are not handled — on mobile,
//    or if the pointer leaves the button while pressed, recording may not
//    stop; confirm whether desktop-only use is intended.
talkBtn.addEventListener('mousedown', () => {
    if (!isRecording) {
        mediaRecorder.start();
        isRecording = true;
        talkBtn.innerText = "松开发送 ⬆️";
        statusText.innerText = "🎙️ 正在录音...并准备抓取画面...";
    }
});
talkBtn.addEventListener('mouseup', () => {
    if (isRecording) {
        mediaRecorder.stop();
        isRecording = false;
        talkBtn.innerText = "按住说话 🎙️";
    }
});
// Boot the app.
initMedia();
</script>
</body>
</html>

8
main.py Normal file
View File

@@ -0,0 +1,8 @@
import uvicorn
from server.app import app
from server.config import SERVER_HOST, SERVER_PORT
if __name__ == "__main__":
    # Dev entry point: run the FastAPI app with uvicorn on the configured
    # host/port (see server/config.py).
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

3
server/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .app import app
__all__ = ["app"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

56
server/agent_service.py Normal file
View File

@@ -0,0 +1,56 @@
import asyncio
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import MultiModalMessage, TextMessage
from autogen_core import Image
from autogen_core.models import ModelFamily
from autogen_ext.models.ollama import OllamaChatCompletionClient
from . import config
from .mcp_tools import load_mcp_tools
class AvatarAgentService:
    """Lazily-built AutoGen ``AssistantAgent`` backed by an Ollama vision model.

    The agent is created on first use because tool loading (MCP servers) is
    asynchronous; creation is guarded by a lock so concurrent callers share
    one agent instance.
    """

    def __init__(self) -> None:
        # Ollama client for the configured VLM. Capabilities are declared
        # explicitly because the model family is unknown to autogen.
        self._model_client = OllamaChatCompletionClient(
            model=config.OLLAMA_MODEL,
            model_info={
                "vision": True,
                "function_calling": True,
                "json_output": True,
                "family": ModelFamily.UNKNOWN,
                "structured_output": True,
            },
        )
        # Created on demand in _get_agent().
        self._agent: AssistantAgent | None = None
        self._agent_lock = asyncio.Lock()

    async def _create_agent(self) -> AssistantAgent:
        """Build the assistant agent, attaching any configured MCP tools."""
        tools = await load_mcp_tools()
        return AssistantAgent(
            name="avatar",
            model_client=self._model_client,
            system_message=config.SYSTEM_MESSAGE,
            # AssistantAgent expects None rather than an empty tool list.
            tools=tools or None,
            reflect_on_tool_use=bool(tools),
        )

    async def _get_agent(self) -> AssistantAgent:
        """Return the singleton agent, creating it once (double-checked lock)."""
        if self._agent is not None:
            return self._agent
        async with self._agent_lock:
            if self._agent is None:
                self._agent = await self._create_agent()
        return self._agent

    async def reply(self, user_text: str, image_b64: str) -> str:
        """Answer *user_text* given a base64-encoded webcam frame.

        Returns the agent's final text reply (empty string if none was seen).
        NOTE(review): this keeps only the last TextMessage with source
        "avatar" from the stream — presumably the final answer; confirm
        against autogen's run_stream semantics.
        """
        agent = await self._get_agent()
        user_image = Image.from_base64(image_b64)
        multimodal_task = MultiModalMessage(source="user", content=[user_text, user_image])
        ai_response = ""
        async for message in agent.run_stream(task=multimodal_task):
            if isinstance(message, TextMessage) and message.source == "avatar":
                ai_response = message.content
        return ai_response

6
server/app.py Normal file
View File

@@ -0,0 +1,6 @@
from fastapi import FastAPI
from .ws import router as ws_router
# FastAPI application; the /ws websocket endpoint lives in server/ws.py.
app = FastAPI()
app.include_router(ws_router)

44
server/config.py Normal file
View File

@@ -0,0 +1,44 @@
import os
import shlex
def _env_bool(name: str, default: bool) -> bool:
value = os.getenv(name)
if value is None:
return default
return value.strip().lower() in {"1", "true", "yes", "on"}
def _env_args(name: str, default: str = "") -> list[str]:
value = os.getenv(name, default)
if not value.strip():
return []
return shlex.split(value)
# System prompt for the avatar agent. Kept verbatim (Chinese persona) —
# runtime behavior depends on its exact wording.
SYSTEM_MESSAGE = (
    "你是一个友好、幽默的AI虚拟主播。你可以看到用户摄像头传来的画面也能听到他们的话。"
    "请用简短、自然、热情的中文口语回答,每次回答控制在两三句话以内,不要输出任何 Markdown 格式。"
    "当用户询问实时天气、最新新闻或网页信息时,优先使用可用工具先查询再回答。"
)
# faster-whisper STT settings: "base" model on CPU with int8 quantization.
WHISPER_MODEL_NAME = "base"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
WHISPER_LANGUAGE = "zh"
WHISPER_BEAM_SIZE = 5
# edge-tts voice used when synthesizing replies.
TTS_VOICE = "zh-CN-XiaoxiaoNeural"
# Ollama vision-language model served locally.
OLLAMA_MODEL = "qwen3-vl:latest"
# Bind address for uvicorn (see main.py).
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8000
# MCP tooling: enabled by default; each server is launched over stdio using
# a command + args taken from the environment (empty command = disabled).
ENABLE_MCP_TOOLS = _env_bool("ENABLE_MCP_TOOLS", True)
MCP_SERVER_READ_TIMEOUT_SECONDS = float(os.getenv("MCP_SERVER_READ_TIMEOUT_SECONDS", "30"))
MCP_WEATHER_SERVER_COMMAND = os.getenv("MCP_WEATHER_SERVER_COMMAND", "")
MCP_WEATHER_SERVER_ARGS = _env_args("MCP_WEATHER_SERVER_ARGS")
MCP_WEBSEARCH_SERVER_COMMAND = os.getenv("MCP_WEBSEARCH_SERVER_COMMAND", "")
MCP_WEBSEARCH_SERVER_ARGS = _env_args("MCP_WEBSEARCH_SERVER_ARGS")

73
server/mcp_tools.py Normal file
View File

@@ -0,0 +1,73 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
from . import config
@dataclass(frozen=True)
class MCPServerConfig:
    """Immutable description of one stdio-launched MCP server.

    NOTE(review): ``args`` is a list, so instances are not hashable despite
    ``frozen=True``; fine as long as configs are never used as set/dict keys.
    """

    name: str        # label used only in log messages
    command: str     # executable to spawn
    args: list[str]  # argv passed to the command
def _configured_servers() -> list[MCPServerConfig]:
    """Collect MCP server definitions from config.

    Returns an empty list when MCP tooling is disabled; otherwise includes
    only servers whose launch command is non-empty (weather first, then
    websearch).
    """
    if not config.ENABLE_MCP_TOOLS:
        return []
    candidates = [
        MCPServerConfig(
            name="weather",
            command=config.MCP_WEATHER_SERVER_COMMAND,
            args=config.MCP_WEATHER_SERVER_ARGS,
        ),
        MCPServerConfig(
            name="websearch",
            command=config.MCP_WEBSEARCH_SERVER_COMMAND,
            args=config.MCP_WEBSEARCH_SERVER_ARGS,
        ),
    ]
    return [server for server in candidates if server.command]
async def load_mcp_tools() -> list[Any]:
    """Start each configured MCP server over stdio and gather its tools.

    Tools are de-duplicated by name across servers; a server that fails to
    start is logged and skipped rather than aborting the whole load.
    """
    servers = _configured_servers()
    if not servers:
        print(" MCP 工具未配置,跳过加载。")
        return []
    tools: list[Any] = []
    seen_names: set[str] = set()
    for srv in servers:
        params = StdioServerParams(
            command=srv.command,
            args=srv.args,
            read_timeout_seconds=config.MCP_SERVER_READ_TIMEOUT_SECONDS,
        )
        try:
            discovered = await mcp_server_tools(params)
        except Exception as exc:
            # Best-effort: log and continue with the remaining servers.
            print(f"⚠️ MCP 服务加载失败: {srv.name}, error={exc}")
            continue
        for tool in discovered:
            if tool.name in seen_names:
                print(f"⚠️ MCP 工具重名,已跳过: {tool.name}")
                continue
            seen_names.add(tool.name)
            tools.append(tool)
        print(f"✅ MCP 服务已加载: {srv.name} ({len(discovered)} tools)")
    if tools:
        print(f"✅ MCP 工具总数: {len(tools)}")
    else:
        print(" 未加载到任何 MCP 工具。")
    return tools

35
server/speech.py Normal file
View File

@@ -0,0 +1,35 @@
import base64
import edge_tts
from faster_whisper import WhisperModel
from . import config
class SpeechService:
    """Local speech-to-text (faster-whisper) plus cloud TTS (edge-tts)."""

    def __init__(self) -> None:
        # Loading the Whisper model is slow on first run (weights download).
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Run Whisper over the audio file and return the concatenated text."""
        segments, _info = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        pieces = [seg.text for seg in segments]
        return "".join(pieces)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize *text* with edge-tts and return it as an mp3 data URL."""
        stream = edge_tts.Communicate(text, config.TTS_VOICE)
        chunks: list[bytes] = []
        async for item in stream.stream():
            if item["type"] == "audio":
                chunks.append(item["data"])
        encoded = base64.b64encode(b"".join(chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{encoded}"

67
server/ws.py Normal file
View File

@@ -0,0 +1,67 @@
import base64
import json
import os
import tempfile
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from .agent_service import AvatarAgentService
from .speech import SpeechService
from .ws_messages import send_audio_message, send_text_message
router = APIRouter()
# Module-level singletons: the Whisper model is loaded once at import time,
# and the agent service lazily builds its AutoGen agent on first request.
speech_service = SpeechService()
agent_service = AvatarAgentService()
def _save_audio_to_temp_file(audio_b64: str) -> str:
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
try:
temp_file.write(base64.b64decode(audio_b64))
return temp_file.name
finally:
temp_file.close()
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
await websocket.accept()
print("✅ WebSocket 连接成功!准备就绪。")
try:
while True:
message_text = await websocket.receive_text()
data = json.loads(message_text)
if data.get("type") != "user_input":
continue
audio_b64 = data["audio"].split(",")[-1]
image_b64 = data["image"].split(",")[-1]
audio_path = _save_audio_to_temp_file(audio_b64)
try:
await send_text_message(websocket, "<i>[👂 正在辨识语音...]</i><br>")
user_text = speech_service.transcribe(audio_path)
finally:
if os.path.exists(audio_path):
os.remove(audio_path)
if not user_text.strip():
await send_text_message(websocket, "<i>[没听清你说什么...]</i><br>")
continue
await send_text_message(websocket, f"<b>你说:</b>{user_text}<br>")
await send_text_message(websocket, "<i>[🧠 正在看图思考...]</i><br>")
ai_response = await agent_service.reply(user_text, image_b64)
await send_text_message(websocket, f"<b>AI主播</b>{ai_response}<br><br>")
await send_text_message(websocket, "<i>[🗣️ 正在生成语音...]</i><br>")
audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
await send_audio_message(websocket, audio_data_url)
except WebSocketDisconnect:
print("❌ 前端页面已关闭或断开连接")
except Exception as exc:
print(f"⚠️ 发生错误: {exc}")

18
server/ws_messages.py Normal file
View File

@@ -0,0 +1,18 @@
import json
from fastapi import WebSocket
async def send_text_message(websocket: WebSocket, content: str) -> None:
    """Send a text-type JSON frame to the client."""
    payload = {"type": "text", "content": content}
    await websocket.send_text(json.dumps(payload))
async def send_audio_message(websocket: WebSocket, audio_data_url: str) -> None:
    """Send an audio-type JSON frame (base64 data URL payload) to the client."""
    message = {"type": "audio", "content": audio_data_url}
    await websocket.send_text(json.dumps(message))

BIN
temp_audio.webm Normal file

Binary file not shown.

BIN
test_audio.webm Normal file

Binary file not shown.